author    qinxialei <xialeiqin@gmail.com>  2020-10-29 11:26:59 +0800
committer qinxialei <xialeiqin@gmail.com>  2020-10-29 11:26:59 +0800
commit    e8d277081293b6fb2a5d469616baaa7a06f52496 (patch)
tree      1179bb07d3927d1837d4a90bd81b2034c4c696a9
Import Upstream version 0.16.0
-rw-r--r--  .gitignore  2
-rw-r--r--  AUTHORS  6
-rw-r--r--  CMakeLists.txt  126
-rw-r--r--  CONTRIBUTING.md  27
-rw-r--r--  LICENSE  202
-rw-r--r--  README.md  168
-rw-r--r--  cmake/libgav1-config.cmake.template  2
-rw-r--r--  cmake/libgav1.pc.template  11
-rw-r--r--  cmake/libgav1_build_definitions.cmake  150
-rw-r--r--  cmake/libgav1_cpu_detection.cmake  49
-rw-r--r--  cmake/libgav1_flags.cmake  251
-rw-r--r--  cmake/libgav1_helpers.cmake  134
-rw-r--r--  cmake/libgav1_install.cmake  60
-rw-r--r--  cmake/libgav1_intrinsics.cmake  135
-rw-r--r--  cmake/libgav1_options.cmake  55
-rw-r--r--  cmake/libgav1_sanitizer.cmake  45
-rw-r--r--  cmake/libgav1_targets.cmake  347
-rw-r--r--  cmake/libgav1_variables.cmake  78
-rw-r--r--  cmake/toolchains/aarch64-linux-gnu.cmake  28
-rw-r--r--  cmake/toolchains/android.cmake  53
-rw-r--r--  cmake/toolchains/arm-linux-gnueabihf.cmake  29
-rw-r--r--  codereview.settings  4
-rw-r--r--  examples/file_reader.cc  186
-rw-r--r--  examples/file_reader.h  100
-rw-r--r--  examples/file_reader_constants.cc  23
-rw-r--r--  examples/file_reader_constants.h  39
-rw-r--r--  examples/file_reader_factory.cc  51
-rw-r--r--  examples/file_reader_factory.h  51
-rw-r--r--  examples/file_reader_interface.h  63
-rw-r--r--  examples/file_writer.cc  183
-rw-r--r--  examples/file_writer.h  102
-rw-r--r--  examples/gav1_decode.cc  452
-rw-r--r--  examples/gav1_decode_cv_pixel_buffer_pool.cc  278
-rw-r--r--  examples/gav1_decode_cv_pixel_buffer_pool.h  73
-rw-r--r--  examples/ivf_parser.cc  96
-rw-r--r--  examples/ivf_parser.h  57
-rw-r--r--  examples/libgav1_examples.cmake  63
-rw-r--r--  examples/logging.h  65
-rw-r--r--  src/buffer_pool.cc  218
-rw-r--r--  src/buffer_pool.h  399
-rw-r--r--  src/decoder.cc  119
-rw-r--r--  src/decoder_impl.cc  1661
-rw-r--r--  src/decoder_impl.h  266
-rw-r--r--  src/decoder_settings.cc  33
-rw-r--r--  src/decoder_state.h  89
-rw-r--r--  src/dsp/arm/average_blend_neon.cc  146
-rw-r--r--  src/dsp/arm/average_blend_neon.h  36
-rw-r--r--  src/dsp/arm/cdef_neon.cc  697
-rw-r--r--  src/dsp/arm/cdef_neon.h  38
-rw-r--r--  src/dsp/arm/common_neon.h  777
-rw-r--r--  src/dsp/arm/convolve_neon.cc  3105
-rw-r--r--  src/dsp/arm/convolve_neon.h  50
-rw-r--r--  src/dsp/arm/distance_weighted_blend_neon.cc  203
-rw-r--r--  src/dsp/arm/distance_weighted_blend_neon.h  39
-rw-r--r--  src/dsp/arm/film_grain_neon.cc  1188
-rw-r--r--  src/dsp/arm/film_grain_neon.h  47
-rw-r--r--  src/dsp/arm/intra_edge_neon.cc  301
-rw-r--r--  src/dsp/arm/intra_edge_neon.h  39
-rw-r--r--  src/dsp/arm/intrapred_cfl_neon.cc  479
-rw-r--r--  src/dsp/arm/intrapred_directional_neon.cc  926
-rw-r--r--  src/dsp/arm/intrapred_filter_intra_neon.cc  176
-rw-r--r--  src/dsp/arm/intrapred_neon.cc  1144
-rw-r--r--  src/dsp/arm/intrapred_neon.h  418
-rw-r--r--  src/dsp/arm/intrapred_smooth_neon.cc  616
-rw-r--r--  src/dsp/arm/inverse_transform_neon.cc  3128
-rw-r--r--  src/dsp/arm/inverse_transform_neon.h  52
-rw-r--r--  src/dsp/arm/loop_filter_neon.cc  1190
-rw-r--r--  src/dsp/arm/loop_filter_neon.h  53
-rw-r--r--  src/dsp/arm/loop_restoration_neon.cc  1901
-rw-r--r--  src/dsp/arm/loop_restoration_neon.h  40
-rw-r--r--  src/dsp/arm/mask_blend_neon.cc  444
-rw-r--r--  src/dsp/arm/mask_blend_neon.h  41
-rw-r--r--  src/dsp/arm/motion_field_projection_neon.cc  393
-rw-r--r--  src/dsp/arm/motion_field_projection_neon.h  39
-rw-r--r--  src/dsp/arm/motion_vector_search_neon.cc  267
-rw-r--r--  src/dsp/arm/motion_vector_search_neon.h  39
-rw-r--r--  src/dsp/arm/obmc_neon.cc  392
-rw-r--r--  src/dsp/arm/obmc_neon.h  38
-rw-r--r--  src/dsp/arm/super_res_neon.cc  166
-rw-r--r--  src/dsp/arm/super_res_neon.h  37
-rw-r--r--  src/dsp/arm/warp_neon.cc  453
-rw-r--r--  src/dsp/arm/warp_neon.h  37
-rw-r--r--  src/dsp/arm/weight_mask_neon.cc  463
-rw-r--r--  src/dsp/arm/weight_mask_neon.h  52
-rw-r--r--  src/dsp/average_blend.cc  101
-rw-r--r--  src/dsp/average_blend.h  47
-rw-r--r--  src/dsp/cdef.cc  306
-rw-r--r--  src/dsp/cdef.h  47
-rw-r--r--  src/dsp/cdef.inc  29
-rw-r--r--  src/dsp/common.h  82
-rw-r--r--  src/dsp/constants.cc  103
-rw-r--r--  src/dsp/constants.h  71
-rw-r--r--  src/dsp/convolve.cc  876
-rw-r--r--  src/dsp/convolve.h  49
-rw-r--r--  src/dsp/convolve.inc  50
-rw-r--r--  src/dsp/distance_weighted_blend.cc  101
-rw-r--r--  src/dsp/distance_weighted_blend.h  47
-rw-r--r--  src/dsp/dsp.cc  150
-rw-r--r--  src/dsp/dsp.h  910
-rw-r--r--  src/dsp/film_grain.cc  870
-rw-r--r--  src/dsp/film_grain.h  39
-rw-r--r--  src/dsp/film_grain_common.h  78
-rw-r--r--  src/dsp/intra_edge.cc  115
-rw-r--r--  src/dsp/intra_edge.h  48
-rw-r--r--  src/dsp/intrapred.cc  2911
-rw-r--r--  src/dsp/intrapred.h  49
-rw-r--r--  src/dsp/inverse_transform.cc  1636
-rw-r--r--  src/dsp/inverse_transform.h  47
-rw-r--r--  src/dsp/inverse_transform.inc  64
-rw-r--r--  src/dsp/libgav1_dsp.cmake  176
-rw-r--r--  src/dsp/loop_filter.cc  616
-rw-r--r--  src/dsp/loop_filter.h  47
-rw-r--r--  src/dsp/loop_restoration.cc  936
-rw-r--r--  src/dsp/loop_restoration.h  85
-rw-r--r--  src/dsp/mask_blend.cc  207
-rw-r--r--  src/dsp/mask_blend.h  49
-rw-r--r--  src/dsp/motion_field_projection.cc  138
-rw-r--r--  src/dsp/motion_field_projection.h  48
-rw-r--r--  src/dsp/motion_vector_search.cc  211
-rw-r--r--  src/dsp/motion_vector_search.h  49
-rw-r--r--  src/dsp/obmc.cc  125
-rw-r--r--  src/dsp/obmc.h  47
-rw-r--r--  src/dsp/obmc.inc  32
-rw-r--r--  src/dsp/super_res.cc  109
-rw-r--r--  src/dsp/super_res.h  47
-rw-r--r--  src/dsp/warp.cc  475
-rw-r--r--  src/dsp/warp.h  47
-rw-r--r--  src/dsp/weight_mask.cc  227
-rw-r--r--  src/dsp/weight_mask.h  47
-rw-r--r--  src/dsp/x86/average_blend_sse4.cc  156
-rw-r--r--  src/dsp/x86/average_blend_sse4.h  41
-rw-r--r--  src/dsp/x86/cdef_sse4.cc  728
-rw-r--r--  src/dsp/x86/cdef_sse4.h  45
-rw-r--r--  src/dsp/x86/common_avx2.h  138
-rw-r--r--  src/dsp/x86/common_sse4.h  265
-rw-r--r--  src/dsp/x86/convolve_avx2.cc  534
-rw-r--r--  src/dsp/x86/convolve_avx2.h  43
-rw-r--r--  src/dsp/x86/convolve_sse4.cc  2830
-rw-r--r--  src/dsp/x86/convolve_sse4.h  75
-rw-r--r--  src/dsp/x86/distance_weighted_blend_sse4.cc  230
-rw-r--r--  src/dsp/x86/distance_weighted_blend_sse4.h  41
-rw-r--r--  src/dsp/x86/intra_edge_sse4.cc  270
-rw-r--r--  src/dsp/x86/intra_edge_sse4.h  46
-rw-r--r--  src/dsp/x86/intrapred_cfl_sse4.cc  976
-rw-r--r--  src/dsp/x86/intrapred_smooth_sse4.cc  2662
-rw-r--r--  src/dsp/x86/intrapred_sse4.cc  3535
-rw-r--r--  src/dsp/x86/intrapred_sse4.h  1060
-rw-r--r--  src/dsp/x86/inverse_transform_sse4.cc  3086
-rw-r--r--  src/dsp/x86/inverse_transform_sse4.h  89
-rw-r--r--  src/dsp/x86/loop_filter_sse4.cc  2256
-rw-r--r--  src/dsp/x86/loop_filter_sse4.h  119
-rw-r--r--  src/dsp/x86/loop_restoration_10bit_avx2.cc  592
-rw-r--r--  src/dsp/x86/loop_restoration_10bit_sse4.cc  551
-rw-r--r--  src/dsp/x86/loop_restoration_avx2.cc  2902
-rw-r--r--  src/dsp/x86/loop_restoration_avx2.h  52
-rw-r--r--  src/dsp/x86/loop_restoration_sse4.cc  2549
-rw-r--r--  src/dsp/x86/loop_restoration_sse4.h  52
-rw-r--r--  src/dsp/x86/mask_blend_sse4.cc  447
-rw-r--r--  src/dsp/x86/mask_blend_sse4.h  60
-rw-r--r--  src/dsp/x86/motion_field_projection_sse4.cc  397
-rw-r--r--  src/dsp/x86/motion_field_projection_sse4.h  41
-rw-r--r--  src/dsp/x86/motion_vector_search_sse4.cc  262
-rw-r--r--  src/dsp/x86/motion_vector_search_sse4.h  41
-rw-r--r--  src/dsp/x86/obmc_sse4.cc  329
-rw-r--r--  src/dsp/x86/obmc_sse4.h  43
-rw-r--r--  src/dsp/x86/super_res_sse4.cc  166
-rw-r--r--  src/dsp/x86/super_res_sse4.h  38
-rw-r--r--  src/dsp/x86/transpose_sse4.h  307
-rw-r--r--  src/dsp/x86/warp_sse4.cc  525
-rw-r--r--  src/dsp/x86/warp_sse4.h  44
-rw-r--r--  src/dsp/x86/weight_mask_sse4.cc  464
-rw-r--r--  src/dsp/x86/weight_mask_sse4.h  104
-rw-r--r--  src/film_grain.cc  817
-rw-r--r--  src/film_grain.h  193
-rw-r--r--  src/frame_buffer.cc  151
-rw-r--r--  src/frame_buffer_utils.h  78
-rw-r--r--  src/frame_scratch_buffer.h  113
-rw-r--r--  src/gav1/decoder.h  148
-rw-r--r--  src/gav1/decoder_buffer.h  279
-rw-r--r--  src/gav1/decoder_settings.h  144
-rw-r--r--  src/gav1/frame_buffer.h  177
-rw-r--r--  src/gav1/status_code.h  118
-rw-r--r--  src/gav1/symbol_visibility.h  88
-rw-r--r--  src/gav1/version.h  71
-rw-r--r--  src/inter_intra_masks.inc  581
-rw-r--r--  src/internal_frame_buffer_list.cc  122
-rw-r--r--  src/internal_frame_buffer_list.h  81
-rw-r--r--  src/libgav1_decoder.cmake  157
-rw-r--r--  src/loop_restoration_info.cc  240
-rw-r--r--  src/loop_restoration_info.h  104
-rw-r--r--  src/motion_vector.cc  1001
-rw-r--r--  src/motion_vector.h  59
-rw-r--r--  src/obu_parser.cc  2885
-rw-r--r--  src/obu_parser.h  406
-rw-r--r--  src/post_filter.h  565
-rw-r--r--  src/post_filter/cdef.cc  660
-rw-r--r--  src/post_filter/deblock.cc  523
-rw-r--r--  src/post_filter/deblock_thresholds.inc  85
-rw-r--r--  src/post_filter/loop_restoration.cc  172
-rw-r--r--  src/post_filter/post_filter.cc  601
-rw-r--r--  src/post_filter/super_res.cc  199
-rw-r--r--  src/prediction_mask.cc  236
-rw-r--r--  src/prediction_mask.h  41
-rw-r--r--  src/quantizer.cc  269
-rw-r--r--  src/quantizer.h  74
-rw-r--r--  src/quantizer_tables.inc  3080
-rw-r--r--  src/reconstruction.cc  190
-rw-r--r--  src/reconstruction.h  54
-rw-r--r--  src/residual_buffer_pool.cc  142
-rw-r--r--  src/residual_buffer_pool.h  203
-rw-r--r--  src/scan_tables.inc  440
-rw-r--r--  src/status_code.cc  57
-rw-r--r--  src/symbol_decoder_context.cc  322
-rw-r--r--  src/symbol_decoder_context.h  301
-rw-r--r--  src/symbol_decoder_context_cdfs.inc  2509
-rw-r--r--  src/threading_strategy.cc  222
-rw-r--r--  src/threading_strategy.h  131
-rw-r--r--  src/tile.h  914
-rw-r--r--  src/tile/bitstream/mode_info.cc  1303
-rw-r--r--  src/tile/bitstream/palette.cc  319
-rw-r--r--  src/tile/bitstream/partition.cc  148
-rw-r--r--  src/tile/bitstream/transform_size.cc  222
-rw-r--r--  src/tile/prediction.cc  1361
-rw-r--r--  src/tile/tile.cc  2573
-rw-r--r--  src/tile_scratch_buffer.cc  26
-rw-r--r--  src/tile_scratch_buffer.h  160
-rw-r--r--  src/utils/array_2d.h  131
-rw-r--r--  src/utils/bit_mask_set.h  79
-rw-r--r--  src/utils/bit_reader.cc  117
-rw-r--r--  src/utils/bit_reader.h  49
-rw-r--r--  src/utils/block_parameters_holder.cc  107
-rw-r--r--  src/utils/block_parameters_holder.h  85
-rw-r--r--  src/utils/blocking_counter.h  97
-rw-r--r--  src/utils/common.h  534
-rw-r--r--  src/utils/compiler_attributes.h  181
-rw-r--r--  src/utils/constants.cc  874
-rw-r--r--  src/utils/constants.h  744
-rw-r--r--  src/utils/cpu.cc  84
-rw-r--r--  src/utils/cpu.h  107
-rw-r--r--  src/utils/dynamic_buffer.h  82
-rw-r--r--  src/utils/entropy_decoder.cc  1117
-rw-r--r--  src/utils/entropy_decoder.h  123
-rw-r--r--  src/utils/executor.cc  21
-rw-r--r--  src/utils/executor.h  36
-rw-r--r--  src/utils/libgav1_utils.cmake  72
-rw-r--r--  src/utils/logging.cc  65
-rw-r--r--  src/utils/logging.h  85
-rw-r--r--  src/utils/memory.h  237
-rw-r--r--  src/utils/parameter_tree.cc  133
-rw-r--r--  src/utils/parameter_tree.h  113
-rw-r--r--  src/utils/queue.h  105
-rw-r--r--  src/utils/raw_bit_reader.cc  224
-rw-r--r--  src/utils/raw_bit_reader.h  78
-rw-r--r--  src/utils/reference_info.h  92
-rw-r--r--  src/utils/segmentation.cc  31
-rw-r--r--  src/utils/segmentation.h  32
-rw-r--r--  src/utils/segmentation_map.cc  49
-rw-r--r--  src/utils/segmentation_map.h  71
-rw-r--r--  src/utils/stack.h  59
-rw-r--r--  src/utils/threadpool.cc  323
-rw-r--r--  src/utils/threadpool.h  167
-rw-r--r--  src/utils/types.h  525
-rw-r--r--  src/utils/unbounded_queue.h  245
-rw-r--r--  src/utils/vector.h  352
-rw-r--r--  src/version.cc  39
-rw-r--r--  src/warp_prediction.cc  244
-rw-r--r--  src/warp_prediction.h  40
-rw-r--r--  src/yuv_buffer.cc  201
-rw-r--r--  src/yuv_buffer.h  183
-rw-r--r--  tests/fuzzer/decoder_fuzzer.cc  87
-rw-r--r--  tests/fuzzer/decoder_fuzzer_frame_parallel.cc  139
-rw-r--r--  tests/fuzzer/fuzzer_temp_file.h  148
-rw-r--r--  tests/fuzzer/obu_parser_fuzzer.cc  89
273 files changed, 102925 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..87ccf24
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/build
+/third_party
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..d92ea0a
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,6 @@
+# This is the list of libgav1 authors for copyright purposes.
+#
+# This does not necessarily list everyone who has contributed code, since in
+# some cases, their employer may be the copyright holder. To see the full list
+# of contributors, see the revision history in source control.
+Google LLC
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..5d00ae6
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,126 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# libgav1 requires modern CMake.
+cmake_minimum_required(VERSION 3.7.1 FATAL_ERROR)
+
+# libgav1 requires C++11.
+set(CMAKE_CXX_STANDARD 11)
+set(ABSL_CXX_STANDARD 11)
+
+project(libgav1 CXX)
+
+set(libgav1_root "${CMAKE_CURRENT_SOURCE_DIR}")
+set(libgav1_build "${CMAKE_BINARY_DIR}")
+
+if("${libgav1_root}" STREQUAL "${libgav1_build}")
+ message(
+ FATAL_ERROR
+ "Building from within the libgav1 source tree is not supported.\n"
+ "Hint: Run these commands\n" "$ rm -rf CMakeCache.txt CMakeFiles\n"
+ "$ mkdir -p ../libgav1_build\n" "$ cd ../libgav1_build\n"
+ "And re-run CMake from the libgav1_build directory.")
+endif()
+
+set(libgav1_examples "${libgav1_root}/examples")
+set(libgav1_source "${libgav1_root}/src")
+
+include(FindThreads)
+
+include("${libgav1_examples}/libgav1_examples.cmake")
+include("${libgav1_root}/cmake/libgav1_build_definitions.cmake")
+include("${libgav1_root}/cmake/libgav1_cpu_detection.cmake")
+include("${libgav1_root}/cmake/libgav1_flags.cmake")
+include("${libgav1_root}/cmake/libgav1_helpers.cmake")
+include("${libgav1_root}/cmake/libgav1_install.cmake")
+include("${libgav1_root}/cmake/libgav1_intrinsics.cmake")
+include("${libgav1_root}/cmake/libgav1_options.cmake")
+include("${libgav1_root}/cmake/libgav1_sanitizer.cmake")
+include("${libgav1_root}/cmake/libgav1_targets.cmake")
+include("${libgav1_root}/cmake/libgav1_variables.cmake")
+include("${libgav1_source}/dsp/libgav1_dsp.cmake")
+include("${libgav1_source}/libgav1_decoder.cmake")
+include("${libgav1_source}/utils/libgav1_utils.cmake")
+
+libgav1_option(NAME LIBGAV1_ENABLE_OPTIMIZATIONS HELPSTRING
+ "Enables optimized code." VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_AVX2 HELPSTRING
+ "Enables avx2 optimizations." VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_NEON HELPSTRING "Enables neon optimizations."
+ VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_SSE4_1 HELPSTRING
+ "Enables sse4.1 optimizations." VALUE ON)
+libgav1_option(
+ NAME LIBGAV1_VERBOSE HELPSTRING
+ "Enables verbose build system output. Higher numbers are more verbose." VALUE
+ OFF)
+
+if(NOT CMAKE_BUILD_TYPE)
+ set(CMAKE_BUILD_TYPE Release)
+endif()
+
+libgav1_optimization_detect()
+libgav1_set_build_definitions()
+libgav1_set_cxx_flags()
+libgav1_configure_sanitizer()
+
+# Supported bit depth.
+libgav1_track_configuration_variable(LIBGAV1_MAX_BITDEPTH)
+
+# C++ and linker flags.
+libgav1_track_configuration_variable(LIBGAV1_CXX_FLAGS)
+libgav1_track_configuration_variable(LIBGAV1_EXE_LINKER_FLAGS)
+
+# Sanitizer integration.
+libgav1_track_configuration_variable(LIBGAV1_SANITIZE)
+
+# Generated source file directory.
+libgav1_track_configuration_variable(LIBGAV1_GENERATED_SOURCES_DIRECTORY)
+
+# Controls use of std::mutex and absl::Mutex in ThreadPool.
+libgav1_track_configuration_variable(LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+
+if(LIBGAV1_VERBOSE)
+ libgav1_dump_cmake_flag_variables()
+ libgav1_dump_tracked_configuration_variables()
+ libgav1_dump_options()
+endif()
+
+set(libgav1_abseil_build "${libgav1_build}/abseil")
+set(libgav1_gtest_build "${libgav1_build}/gtest")
+
+# Compiler/linker flags must be lists, but come in from the environment as
+# strings. Break them up:
+if(NOT "${LIBGAV1_CXX_FLAGS}" STREQUAL "")
+ separate_arguments(LIBGAV1_CXX_FLAGS)
+endif()
+if(NOT "${LIBGAV1_EXE_LINKER_FLAGS}" STREQUAL "")
+ separate_arguments(LIBGAV1_EXE_LINKER_FLAGS)
+endif()
+
+add_subdirectory("${libgav1_root}/third_party/abseil-cpp"
+ "${libgav1_abseil_build}" EXCLUDE_FROM_ALL)
+
+libgav1_reset_target_lists()
+libgav1_add_dsp_targets()
+libgav1_add_decoder_targets()
+libgav1_add_examples_targets()
+libgav1_add_utils_targets()
+libgav1_setup_install_target()
+
+if(LIBGAV1_VERBOSE)
+ libgav1_dump_cmake_flag_variables()
+ libgav1_dump_tracked_configuration_variables()
+ libgav1_dump_options()
+endif()
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..69140ff
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,27 @@
+# How to Contribute
+
+We'd love to accept your patches and contributions to this project. There are
+just a few small guidelines you need to follow.
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted one
+(even if it was for a different project), you probably don't need to do it
+again.
+
+## Code reviews
+
+All submissions, including submissions by project members, require review. We
+use a [Gerrit](https://www.gerritcodereview.com) instance hosted at
+https://chromium-review.googlesource.com for this purpose.
+
+## Community Guidelines
+
+This project follows
+[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8ab8eab
--- /dev/null
+++ b/README.md
@@ -0,0 +1,168 @@
+# libgav1 -- an AV1 decoder
+
+libgav1 is a Main profile (0) & High profile (1) compliant AV1 decoder. More
+information on the AV1 video format can be found at
+[aomedia.org](https://aomedia.org).
+
+[TOC]
+
+## Building
+
+### Prerequisites
+
+1. A C++11 compiler. gcc 6+, clang 7+ or Microsoft Visual Studio 2017+ are
+ recommended.
+
+2. [CMake >= 3.7.1](https://cmake.org/download/)
+
+3. [Abseil](https://abseil.io)
+
+ From within the libgav1 directory:
+
+ ```shell
+ $ git clone https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp
+ ```
+
+### Compile
+
+```shell
+ $ mkdir build && cd build
+ $ cmake -G "Unix Makefiles" ..
+ $ make
+```
+
+Configuration options:
+
+* `LIBGAV1_MAX_BITDEPTH`: defines the maximum supported bitdepth (8, 10;
+ default: 10).
+* `LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS`: define to a non-zero value to disable
+ [symbol reduction](#symbol-reduction) in an optimized build to keep all
+ versions of dsp functions available. Automatically defined in
+ `src/dsp/dsp.h` if unset.
+* `LIBGAV1_ENABLE_AVX2`: define to a non-zero value to enable avx2
+ optimizations. Automatically defined in `src/utils/cpu.h` if unset.
+* `LIBGAV1_ENABLE_NEON`: define to a non-zero value to enable NEON
+ optimizations. Automatically defined in `src/utils/cpu.h` if unset.
+* `LIBGAV1_ENABLE_SSE4_1`: define to a non-zero value to enable sse4.1
+ optimizations. Automatically defined in `src/utils/cpu.h` if unset. Note
+ setting this to 0 will also disable AVX2.
+* `LIBGAV1_ENABLE_LOGGING`: define to 0/1 to control debug logging.
+ Automatically defined in `src/utils/logging.h` if unset.
+* `LIBGAV1_EXAMPLES_ENABLE_LOGGING`: define to 0/1 to control error logging in
+ the examples. Automatically defined in `examples/logging.h` if unset.
+* `LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK`: define to 1 to enable transform
+ coefficient range checks.
+* `LIBGAV1_LOG_LEVEL`: controls the maximum allowed log level, see `enum
+ LogSeverity` in `src/utils/logging.h`. Automatically defined in
+ `src/utils/logging.cc` if unset.
+* `LIBGAV1_THREADPOOL_USE_STD_MUTEX`: controls use of std::mutex and
+ absl::Mutex in ThreadPool. Defining this to 1 will remove any Abseil
+ dependency from the core library. Automatically defined in
+ `src/utils/threadpool.h` if unset.
+* `LIBGAV1_MAX_THREADS`: sets the number of threads that the library is
+ allowed to create. Has to be an integer > 0. Otherwise this is ignored.
+ The default value is 128.
+* `LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER`: the threshold multiplier that
+ is used to determine when to use frame parallel decoding. Frame parallel
+ decoding will be used if |threads| > |tile_count| * this multiplier. Has to
+ be an integer > 0. The default value is 4. This is an advanced setting
+ intended for testing purposes.
+
+For additional options see:
+
+```shell
+ $ cmake .. -LH
+```
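+
+As a sketch, configuration options that the CMake files read (for example
+`LIBGAV1_MAX_BITDEPTH`, `LIBGAV1_ENABLE_AVX2` and
+`LIBGAV1_THREADPOOL_USE_STD_MUTEX`) can be passed at configure time with `-D`;
+the values below are only illustrative:
+
+```shell
+ $ cmake .. -DLIBGAV1_MAX_BITDEPTH=8 -DLIBGAV1_ENABLE_AVX2=0 \
+     -DLIBGAV1_THREADPOOL_USE_STD_MUTEX=1
+ $ make
+```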
+
+## Testing
+
+* `gav1_decode` can be used to decode IVF files, see `gav1_decode --help` for
+ options. Note: tools like [FFmpeg](https://ffmpeg.org) can be used to
+ convert other container formats to IVF.
+
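+As a sketch of the FFmpeg note above (assuming FFmpeg is installed; the file
+names are placeholders):
+
+```shell
+ # Remux an AV1 stream from WebM into IVF without re-encoding.
+ $ ffmpeg -i clip.webm -c:v copy -f ivf clip.ivf
+ # List gav1_decode's options before decoding.
+ $ ./gav1_decode --help
+```
+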
+## Development
+
+### Contributing
+
+See [CONTRIBUTING.md](CONTRIBUTING.md) for details on how to submit patches.
+
+### Style
+
+libgav1 follows the
+[Google C++ style guide](https://google.github.io/styleguide/cppguide.html) with
+formatting enforced by `clang-format`.
+
+### Comments
+
+Comments of the form '`// X.Y(.Z).`', '`Section X.Y(.Z).`' or '`... in the
+spec`' reference the relevant section(s) in the
+[AV1 specification](http://aomediacodec.github.io/av1-spec/av1-spec.pdf).
+
+### DSP structure
+
+* `src/dsp/dsp.cc` defines the main entry point: `libgav1::dsp::DspInit()`.
+ This handles cpu-detection and initializing each logical unit which populate
+ `libgav1::dsp::Dsp` function tables.
+* `src/dsp/dsp.h` contains function and type definitions for all logical units
+ (e.g., intra-predictors)
+* `src/utils/cpu.h` contains definitions for cpu-detection
+* base implementations are located in `src/dsp/*.{h,cc}` with platform
+ specific optimizations in sub-folders
+* unit tests define `DISABLED_Speed` test(s) to allow timing of individual
+ functions
+
+#### Symbol reduction
+
+Based on the build configuration unneeded lesser optimizations are removed using
+a hierarchical include and define system. Each logical unit in `src/dsp` should
+include all platform specific headers in descending order to allow higher level
+optimizations to disable lower level ones. See `src/dsp/loop_filter.h` for an
+example.
+
+Each function receives a new define which can be checked in platform specific
+headers. The format is: `LIBGAV1_<Dsp-table>_FunctionName` or
+`LIBGAV1_<Dsp-table>_[sub-table-index1][...-indexN]`, e.g.,
+`LIBGAV1_Dsp8bpp_AverageBlend`,
+`LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc`. The Dsp-table name is of
+the form `Dsp<bitdepth>bpp` e.g. `Dsp10bpp` for bitdepth == 10 (bpp stands for
+bits per pixel). The indices correspond to enum values used as lookups with
+leading 'k' removed. Platform specific headers then should first check if the
+symbol is defined and if not set the value to the corresponding
+`LIBGAV1_CPU_<arch>` value from `src/utils/cpu.h`.
+
+```
+ #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc
+ #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+ #endif
+```
+
+Within each module the code should check if the symbol is defined to its
+specific architecture or forced via `LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS` before
+defining the function. The `DSP_ENABLED_(8|10)BPP_*` macros are available to
+simplify this check for optimized code.
+
+```
+ #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
+ ...
+
+ // In unoptimized code use the following structure; there's no equivalent
+ // define for LIBGAV1_CPU_C as it would require duplicating the function
+ // defines used in optimized code for only a small benefit to this
+ // boilerplate.
+ #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ ...
+ #else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcFill
+ ...
+```
+
+## Bugs
+
+Please report all bugs to the issue tracker:
+https://issuetracker.google.com/issues/new?component=750480&template=1355007
+
+## Discussion
+
+Email: gav1-devel@googlegroups.com
+
+Web: https://groups.google.com/forum/#!forum/gav1-devel
diff --git a/cmake/libgav1-config.cmake.template b/cmake/libgav1-config.cmake.template
new file mode 100644
index 0000000..dc253d3
--- /dev/null
+++ b/cmake/libgav1-config.cmake.template
@@ -0,0 +1,2 @@
+set(LIBGAV1_INCLUDE_DIRS "@LIBGAV1_INCLUDE_DIRS@")
+set(LIBGAV1_LIBRARIES "gav1")
diff --git a/cmake/libgav1.pc.template b/cmake/libgav1.pc.template
new file mode 100644
index 0000000..c571a43
--- /dev/null
+++ b/cmake/libgav1.pc.template
@@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: @PROJECT_NAME@
+Description: AV1 decoder library (@LIBGAV1_MAX_BITDEPTH@-bit).
+Version: @LIBGAV1_VERSION@
+Cflags: -I${includedir}
+Libs: -L${libdir} -lgav1
+Libs.private: @CMAKE_THREAD_LIBS_INIT@
diff --git a/cmake/libgav1_build_definitions.cmake b/cmake/libgav1_build_definitions.cmake
new file mode 100644
index 0000000..b170e7e
--- /dev/null
+++ b/cmake/libgav1_build_definitions.cmake
@@ -0,0 +1,150 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_ 1)
+
+macro(libgav1_set_build_definitions)
+ string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type_lowercase)
+
+ libgav1_load_version_info()
+ set(LIBGAV1_SOVERSION 0)
+
+ list(APPEND libgav1_include_paths "${libgav1_root}" "${libgav1_root}/src"
+ "${libgav1_build}" "${libgav1_root}/third_party/abseil-cpp")
+ list(APPEND libgav1_gtest_include_paths
+ "third_party/googletest/googlemock/include"
+ "third_party/googletest/googletest/include"
+ "third_party/googletest/googletest")
+ list(APPEND libgav1_test_include_paths ${libgav1_include_paths}
+ ${libgav1_gtest_include_paths})
+ list(APPEND libgav1_defines "LIBGAV1_CMAKE=1"
+ "LIBGAV1_FLAGS_SRCDIR=\"${libgav1_root}\""
+ "LIBGAV1_FLAGS_TMPDIR=\"/tmp\"")
+
+ if(MSVC OR WIN32)
+ list(APPEND libgav1_defines "_CRT_SECURE_NO_DEPRECATE=1" "NOMINMAX=1")
+ endif()
+
+ if(ANDROID)
+ if(CMAKE_ANDROID_ARCH_ABI STREQUAL "armeabi-v7a")
+ set(CMAKE_ANDROID_ARM_MODE ON)
+ endif()
+
+ if(build_type_lowercase MATCHES "rel")
+ list(APPEND libgav1_base_cxx_flags "-fno-stack-protector")
+ endif()
+ endif()
+
+ list(APPEND libgav1_base_cxx_flags "-Wall" "-Wextra" "-Wmissing-declarations"
+ "-Wno-sign-compare" "-fvisibility=hidden"
+ "-fvisibility-inlines-hidden")
+
+ if(BUILD_SHARED_LIBS)
+ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+ set(libgav1_dependency libgav1_shared)
+ else()
+ set(libgav1_dependency libgav1_static)
+ endif()
+
+ list(APPEND libgav1_clang_cxx_flags "-Wextra-semi" "-Wmissing-prototypes"
+ "-Wshorten-64-to-32")
+
+ if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6")
+ # Quiet warnings in copy-list-initialization where {} elision has always
+ # been allowed.
+ list(APPEND libgav1_clang_cxx_flags "-Wno-missing-braces")
+ endif()
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8)
+ list(APPEND libgav1_clang_cxx_flags "-Wextra-semi-stmt")
+ endif()
+ endif()
+
+ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "7")
+ # Quiet warnings due to potential snprintf() truncation in threadpool.cc.
+ list(APPEND libgav1_base_cxx_flags "-Wno-format-truncation")
+
+ if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7")
+ # Quiet gcc 6 vs 7 abi warnings:
+ # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728
+ list(APPEND libgav1_base_cxx_flags "-Wno-psabi")
+ list(APPEND ABSL_GCC_FLAGS "-Wno-psabi")
+ endif()
+ endif()
+ endif()
+
+ if(build_type_lowercase MATCHES "rel")
+ # TODO(tomfinegan): this value is only a concern for the core library and
+ # can be made smaller if the test targets are avoided.
+ list(APPEND libgav1_base_cxx_flags "-Wstack-usage=196608")
+ endif()
+
+ list(APPEND libgav1_msvc_cxx_flags
+ # Warning level 3.
+ "/W3"
+ # Disable warning C4018:
+ # '<comparison operator>' signed/unsigned mismatch
+ "/wd4018"
+ # Disable warning C4244:
+ # 'argument': conversion from '<double/int>' to
+ # '<float/smaller int type>', possible loss of data
+ "/wd4244"
+ # Disable warning C4267:
+ # '=': conversion from '<double/int>' to
+ # '<float/smaller int type>', possible loss of data
+ "/wd4267"
+ # Disable warning C4309:
+ # 'argument': truncation of constant value
+ "/wd4309"
+ # Disable warning C4551:
+ # function call missing argument list
+ "/wd4551")
+
+ if(BUILD_SHARED_LIBS)
+ list(APPEND libgav1_msvc_cxx_flags
+ # Disable warning C4251:
+ # 'libgav1::DecoderImpl class member' needs to have
+ # dll-interface to be used by clients of class
+ # 'libgav1::Decoder'.
+ "/wd4251")
+ endif()
+
+ if(NOT LIBGAV1_MAX_BITDEPTH)
+ set(LIBGAV1_MAX_BITDEPTH 10)
+ elseif(NOT LIBGAV1_MAX_BITDEPTH EQUAL 8 AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 10)
+ libgav1_die("LIBGAV1_MAX_BITDEPTH must be 8 or 10.")
+ endif()
+
+ list(APPEND libgav1_defines "LIBGAV1_MAX_BITDEPTH=${LIBGAV1_MAX_BITDEPTH}")
+
+ if(DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+ if(NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX EQUAL 0
+ AND NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX EQUAL 1)
+ libgav1_die("LIBGAV1_THREADPOOL_USE_STD_MUTEX must be 0 or 1.")
+ endif()
+
+ list(APPEND libgav1_defines
+ "LIBGAV1_THREADPOOL_USE_STD_MUTEX=${LIBGAV1_THREADPOOL_USE_STD_MUTEX}")
+ endif()
+
+ # Source file names ending in these suffixes will have the appropriate
+ # compiler flags added to their compile commands to enable intrinsics.
+ set(libgav1_avx2_source_file_suffix "avx2.cc")
+ set(libgav1_neon_source_file_suffix "neon.cc")
+ set(libgav1_sse4_source_file_suffix "sse4.cc")
+endmacro()
diff --git a/cmake/libgav1_cpu_detection.cmake b/cmake/libgav1_cpu_detection.cmake
new file mode 100644
index 0000000..e17e27c
--- /dev/null
+++ b/cmake/libgav1_cpu_detection.cmake
@@ -0,0 +1,49 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_ 1)
+
+# Detect optimizations available for the current target CPU.
+macro(libgav1_optimization_detect)
+ if(LIBGAV1_ENABLE_OPTIMIZATIONS)
+ string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" cpu_lowercase)
+ if(cpu_lowercase MATCHES "^arm|^aarch64")
+ set(libgav1_have_neon ON)
+ elseif(cpu_lowercase MATCHES "^x86|amd64")
+ set(libgav1_have_avx2 ON)
+ set(libgav1_have_sse4 ON)
+ endif()
+ endif()
+
+ if(libgav1_have_avx2 AND LIBGAV1_ENABLE_AVX2)
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_AVX2=1")
+ else()
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_AVX2=0")
+ endif()
+
+ if(libgav1_have_neon AND LIBGAV1_ENABLE_NEON)
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_NEON=1")
+ else()
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_NEON=0")
+ endif()
+
+ if(libgav1_have_sse4 AND LIBGAV1_ENABLE_SSE4_1)
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_SSE4_1=1")
+ else()
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_SSE4_1=0")
+ endif()
+endmacro()
diff --git a/cmake/libgav1_flags.cmake b/cmake/libgav1_flags.cmake
new file mode 100644
index 0000000..2d8d9a6
--- /dev/null
+++ b/cmake/libgav1_flags.cmake
@@ -0,0 +1,251 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_ 1)
+
+include(CheckCXXCompilerFlag)
+include(CheckCXXSourceCompiles)
+
+# Adds compiler flags specified by FLAGS to the sources specified by SOURCES:
+#
+# libgav1_set_compiler_flags_for_sources(SOURCES <sources> FLAGS <flags>)
+macro(libgav1_set_compiler_flags_for_sources)
+ unset(compiler_SOURCES)
+ unset(compiler_FLAGS)
+ unset(optional_args)
+ unset(single_value_args)
+ set(multi_value_args SOURCES FLAGS)
+ cmake_parse_arguments(compiler "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT (compiler_SOURCES AND compiler_FLAGS))
+ libgav1_die("libgav1_set_compiler_flags_for_sources: SOURCES and "
+ "FLAGS required.")
+ endif()
+
+ set_source_files_properties(${compiler_SOURCES} PROPERTIES COMPILE_FLAGS
+ ${compiler_FLAGS})
+
+ if(LIBGAV1_VERBOSE GREATER 1)
+ foreach(source ${compiler_SOURCES})
+ foreach(flag ${compiler_FLAGS})
+ message("libgav1_set_compiler_flags_for_sources: source:${source} "
+ "flag:${flag}")
+ endforeach()
+ endforeach()
+ endif()
+endmacro()
+
+# Tests compiler flags stored in list(s) specified by FLAG_LIST_VAR_NAMES, adds
+# flags to $LIBGAV1_CXX_FLAGS when tests pass. Terminates configuration if
+# FLAG_REQUIRED is specified and any flag check fails.
+#
+# ~~~
+# libgav1_test_cxx_flag(<FLAG_LIST_VAR_NAMES <flag list variable(s)>>
+# [FLAG_REQUIRED])
+# ~~~
+macro(libgav1_test_cxx_flag)
+ unset(cxx_test_FLAG_LIST_VAR_NAMES)
+ unset(cxx_test_FLAG_REQUIRED)
+ unset(single_value_args)
+ set(optional_args FLAG_REQUIRED)
+ set(multi_value_args FLAG_LIST_VAR_NAMES)
+ cmake_parse_arguments(cxx_test "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT cxx_test_FLAG_LIST_VAR_NAMES)
+ libgav1_die("libgav1_test_cxx_flag: FLAG_LIST_VAR_NAMES required")
+ endif()
+
+ unset(cxx_flags)
+ foreach(list_var ${cxx_test_FLAG_LIST_VAR_NAMES})
+ if(LIBGAV1_VERBOSE)
+ message("libgav1_test_cxx_flag: adding ${list_var} to cxx_flags")
+ endif()
+ list(APPEND cxx_flags ${${list_var}})
+ endforeach()
+
+ if(LIBGAV1_VERBOSE)
+ message("CXX test: all flags: ${cxx_flags}")
+ endif()
+
+ unset(all_cxx_flags)
+ list(APPEND all_cxx_flags ${LIBGAV1_CXX_FLAGS} ${cxx_flags})
+
+ # Turn off output from check_cxx_source_compiles. Print status directly
+ # instead since the logging messages from check_cxx_source_compiles can be
+ # quite confusing.
+ set(CMAKE_REQUIRED_QUIET TRUE)
+
+ # Run the actual compile test.
+ unset(libgav1_all_cxx_flags_pass CACHE)
+ message("--- Running combined CXX flags test, flags: ${all_cxx_flags}")
+ check_cxx_compiler_flag("${all_cxx_flags}" libgav1_all_cxx_flags_pass)
+
+ if(cxx_test_FLAG_REQUIRED AND NOT libgav1_all_cxx_flags_pass)
+ libgav1_die("Flag test failed for required flag(s): "
+ "${all_cxx_flags} and FLAG_REQUIRED specified.")
+ endif()
+
+ if(libgav1_all_cxx_flags_pass)
+ # Test passed: update the global flag list used by the libgav1 target
+ # creation wrappers.
+ set(LIBGAV1_CXX_FLAGS ${cxx_flags})
+ list(REMOVE_DUPLICATES LIBGAV1_CXX_FLAGS)
+
+ if(LIBGAV1_VERBOSE)
+ message("LIBGAV1_CXX_FLAGS=${LIBGAV1_CXX_FLAGS}")
+ endif()
+
+ message("--- Passed combined CXX flags test")
+ else()
+ message("--- Failed combined CXX flags test, testing flags individually.")
+
+ if(cxx_flags)
+ message("--- Testing flags from $cxx_flags: " "${cxx_flags}")
+ foreach(cxx_flag ${cxx_flags})
+ # Between 3.17.0 and 3.18.2 check_cxx_compiler_flag() sets a normal
+ # variable at parent scope while check_cxx_source_compiles() continues
+ # to set an internal cache variable, so we unset both to avoid the
+ # failure / success state persisting between checks. See
+ # https://gitlab.kitware.com/cmake/cmake/-/issues/21207.
+ unset(cxx_flag_test_passed)
+ unset(cxx_flag_test_passed CACHE)
+ message("--- Testing flag: ${cxx_flag}")
+ check_cxx_compiler_flag("${cxx_flag}" cxx_flag_test_passed)
+
+ if(cxx_flag_test_passed)
+ message("--- Passed test for ${cxx_flag}")
+ else()
+ list(REMOVE_ITEM cxx_flags ${cxx_flag})
+ message("--- Failed test for ${cxx_flag}, flag removed.")
+ endif()
+ endforeach()
+
+ set(LIBGAV1_CXX_FLAGS ${cxx_flags})
+ endif()
+ endif()
+
+ if(LIBGAV1_CXX_FLAGS)
+ list(REMOVE_DUPLICATES LIBGAV1_CXX_FLAGS)
+ endif()
+endmacro()
+
+# Tests executable linker flags stored in list specified by FLAG_LIST_VAR_NAME,
+# adds flags to $LIBGAV1_EXE_LINKER_FLAGS when test passes. Terminates
+# configuration when flag check fails. libgav1_set_cxx_flags() must be called
+# before calling this macro because it assumes $LIBGAV1_CXX_FLAGS contains only
+# valid CXX flags.
+#
+# libgav1_test_exe_linker_flag(<FLAG_LIST_VAR_NAME <flag list variable)>)
+macro(libgav1_test_exe_linker_flag)
+ unset(link_FLAG_LIST_VAR_NAME)
+ unset(optional_args)
+ unset(multi_value_args)
+ set(single_value_args FLAG_LIST_VAR_NAME)
+ cmake_parse_arguments(link "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT link_FLAG_LIST_VAR_NAME)
+ libgav1_die("libgav1_test_link_flag: FLAG_LIST_VAR_NAME required")
+ endif()
+
+ libgav1_set_and_stringify(DEST linker_flags SOURCE_VARS
+ ${link_FLAG_LIST_VAR_NAME})
+
+ if(LIBGAV1_VERBOSE)
+ message("EXE LINKER test: all flags: ${linker_flags}")
+ endif()
+
+ # Tests of $LIBGAV1_CXX_FLAGS have already passed. Include them with the
+ # linker test.
+ libgav1_set_and_stringify(DEST CMAKE_REQUIRED_FLAGS SOURCE_VARS
+ LIBGAV1_CXX_FLAGS)
+
+ # Cache the global exe linker flags.
+ if(CMAKE_EXE_LINKER_FLAGS)
+ set(cached_CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS})
+ libgav1_set_and_stringify(DEST CMAKE_EXE_LINKER_FLAGS SOURCE
+ ${linker_flags})
+ endif()
+
+ libgav1_set_and_stringify(DEST CMAKE_EXE_LINKER_FLAGS SOURCE ${linker_flags}
+ ${CMAKE_EXE_LINKER_FLAGS})
+
+ # Turn off output from check_cxx_source_compiles. Print status directly
+ # instead since the logging messages from check_cxx_source_compiles can be
+ # quite confusing.
+ set(CMAKE_REQUIRED_QUIET TRUE)
+
+ message("--- Running EXE LINKER test for flags: ${linker_flags}")
+
+ unset(linker_flag_test_passed CACHE)
+ set(libgav1_cxx_main "\nint main() { return 0; }")
+ check_cxx_source_compiles("${libgav1_cxx_main}" linker_flag_test_passed)
+
+ if(NOT linker_flag_test_passed)
+ libgav1_die("EXE LINKER test failed.")
+ endif()
+
+ message("--- Passed EXE LINKER flag test.")
+
+ # Restore cached global exe linker flags.
+ if(cached_CMAKE_EXE_LINKER_FLAGS)
+ set(CMAKE_EXE_LINKER_FLAGS cached_CMAKE_EXE_LINKER_FLAGS)
+ else()
+ unset(CMAKE_EXE_LINKER_FLAGS)
+ endif()
+endmacro()
+
+# Runs the libgav1 compiler tests. This macro builds up the list of list var(s)
+# that is passed to libgav1_test_cxx_flag().
+#
+# Note: libgav1_set_build_definitions() must be called before this macro.
+macro(libgav1_set_cxx_flags)
+ unset(cxx_flag_lists)
+
+ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+ list(APPEND cxx_flag_lists libgav1_base_cxx_flags)
+ endif()
+
+ # Append clang flags after the base set to allow -Wno* overrides to take
+ # effect. Some of the base flags may enable a large set of warnings, e.g.,
+ # -Wall.
+ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ list(APPEND cxx_flag_lists libgav1_clang_cxx_flags)
+ endif()
+
+ if(MSVC)
+ list(APPEND cxx_flag_lists libgav1_msvc_cxx_flags)
+ endif()
+
+ if(LIBGAV1_VERBOSE)
+ if(cxx_flag_lists)
+ libgav1_set_and_stringify(DEST cxx_flags SOURCE_VARS ${cxx_flag_lists})
+ message("libgav1_set_cxx_flags: internal CXX flags: ${cxx_flags}")
+ endif()
+ endif()
+
+ if(LIBGAV1_CXX_FLAGS)
+ list(APPEND cxx_flag_lists LIBGAV1_CXX_FLAGS)
+ if(LIBGAV1_VERBOSE)
+ message("libgav1_set_cxx_flags: user CXX flags: ${LIBGAV1_CXX_FLAGS}")
+ endif()
+ endif()
+
+ libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES ${cxx_flag_lists})
+endmacro()
diff --git a/cmake/libgav1_helpers.cmake b/cmake/libgav1_helpers.cmake
new file mode 100644
index 0000000..76d8d67
--- /dev/null
+++ b/cmake/libgav1_helpers.cmake
@@ -0,0 +1,134 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_ 1)
+
+# Kills build generation using message(FATAL_ERROR) and outputs all data passed
+# to the console via use of $ARGN.
+macro(libgav1_die)
+ message(FATAL_ERROR ${ARGN})
+endmacro()
+
+# Converts semi-colon delimited list variable(s) to string. Output is written to
+# variable supplied via the DEST parameter. Input is from an expanded variable
+# referenced by SOURCE and/or variable(s) referenced by SOURCE_VARS.
+macro(libgav1_set_and_stringify)
+ set(optional_args)
+ set(single_value_args DEST SOURCE_VAR)
+ set(multi_value_args SOURCE SOURCE_VARS)
+ cmake_parse_arguments(sas "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT sas_DEST OR NOT (sas_SOURCE OR sas_SOURCE_VARS))
+ libgav1_die("libgav1_set_and_stringify: DEST and at least one of SOURCE "
+ "SOURCE_VARS required.")
+ endif()
+
+ unset(${sas_DEST})
+
+ if(sas_SOURCE)
+ # $sas_SOURCE is one or more expanded variables, just copy the values to
+ # $sas_DEST.
+ set(${sas_DEST} "${sas_SOURCE}")
+ endif()
+
+ if(sas_SOURCE_VARS)
+ # $sas_SOURCE_VARS is one or more variable names. Each iteration expands a
+ # variable and appends it to $sas_DEST.
+ foreach(source_var ${sas_SOURCE_VARS})
+ set(${sas_DEST} "${${sas_DEST}} ${${source_var}}")
+ endforeach()
+
+ # Because $sas_DEST can be empty when entering this scope leading whitespace
+ # can be introduced to $sas_DEST on the first iteration of the above loop.
+ # Remove it:
+ string(STRIP "${${sas_DEST}}" ${sas_DEST})
+ endif()
+
+ # Lists in CMake are simply semicolon delimited strings, so stringification is
+ # just a find and replace of the semicolon.
+ string(REPLACE ";" " " ${sas_DEST} "${${sas_DEST}}")
+
+ if(LIBGAV1_VERBOSE GREATER 1)
+ message("libgav1_set_and_stringify: ${sas_DEST}=${${sas_DEST}}")
+ endif()
+endmacro()
+
+# Creates a dummy source file in $LIBGAV1_GENERATED_SOURCES_DIRECTORY and adds
+# it to the specified target. Optionally adds its path to a list variable.
+#
+# libgav1_create_dummy_source_file(<TARGET <target> BASENAME <basename of file>>
+# [LISTVAR <list variable>])
+macro(libgav1_create_dummy_source_file)
+ set(optional_args)
+ set(single_value_args TARGET BASENAME LISTVAR)
+ set(multi_value_args)
+ cmake_parse_arguments(cdsf "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT cdsf_TARGET OR NOT cdsf_BASENAME)
+ libgav1_die(
+ "libgav1_create_dummy_source_file: TARGET and BASENAME required.")
+ endif()
+
+ if(NOT LIBGAV1_GENERATED_SOURCES_DIRECTORY)
+ set(LIBGAV1_GENERATED_SOURCES_DIRECTORY "${libgav1_build}/gen_src")
+ endif()
+
+ set(dummy_source_dir "${LIBGAV1_GENERATED_SOURCES_DIRECTORY}")
+ set(dummy_source_file
+ "${dummy_source_dir}/libgav1_${cdsf_TARGET}_${cdsf_BASENAME}.cc")
+ set(dummy_source_code
+ "// Generated file. DO NOT EDIT!\n"
+ "// C++ source file created for target ${cdsf_TARGET}. \n"
+ "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void);\n"
+ "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void) {}\n")
+ file(WRITE "${dummy_source_file}" "${dummy_source_code}")
+
+ target_sources(${cdsf_TARGET} PRIVATE ${dummy_source_file})
+
+ if(cdsf_LISTVAR)
+ list(APPEND ${cdsf_LISTVAR} "${dummy_source_file}")
+ endif()
+endmacro()
+
+# Loads the version components from $libgav1_source/gav1/version.h and sets the
+# corresponding CMake variables:
+# - LIBGAV1_MAJOR_VERSION
+# - LIBGAV1_MINOR_VERSION
+# - LIBGAV1_PATCH_VERSION
+# - LIBGAV1_VERSION, which is:
+# - $LIBGAV1_MAJOR_VERSION.$LIBGAV1_MINOR_VERSION.$LIBGAV1_PATCH_VERSION
+macro(libgav1_load_version_info)
+ file(STRINGS "${libgav1_source}/gav1/version.h" version_file_strings)
+ foreach(str ${version_file_strings})
+ if(str MATCHES "#define LIBGAV1_")
+ if(str MATCHES "#define LIBGAV1_MAJOR_VERSION ")
+ string(REPLACE "#define LIBGAV1_MAJOR_VERSION " "" LIBGAV1_MAJOR_VERSION
+ "${str}")
+ elseif(str MATCHES "#define LIBGAV1_MINOR_VERSION ")
+ string(REPLACE "#define LIBGAV1_MINOR_VERSION " "" LIBGAV1_MINOR_VERSION
+ "${str}")
+ elseif(str MATCHES "#define LIBGAV1_PATCH_VERSION ")
+ string(REPLACE "#define LIBGAV1_PATCH_VERSION " "" LIBGAV1_PATCH_VERSION
+ "${str}")
+ endif()
+ endif()
+ endforeach()
+ set(LIBGAV1_VERSION "${LIBGAV1_MAJOR_VERSION}.${LIBGAV1_MINOR_VERSION}")
+ set(LIBGAV1_VERSION "${LIBGAV1_VERSION}.${LIBGAV1_PATCH_VERSION}")
+endmacro()
diff --git a/cmake/libgav1_install.cmake b/cmake/libgav1_install.cmake
new file mode 100644
index 0000000..b7f6006
--- /dev/null
+++ b/cmake/libgav1_install.cmake
@@ -0,0 +1,60 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_ 1)
+
+# Sets up the Libgav1 install targets. Must be called after the static library
+# target is created.
+macro(libgav1_setup_install_target)
+ if(NOT (MSVC OR XCODE))
+ include(GNUInstallDirs)
+
+ # pkg-config: libgav1.pc
+ set(prefix "${CMAKE_INSTALL_PREFIX}")
+ set(exec_prefix "\${prefix}")
+ set(libdir "\${prefix}/${CMAKE_INSTALL_LIBDIR}")
+ set(includedir "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
+ set(libgav1_lib_name "libgav1")
+
+ configure_file("${libgav1_root}/cmake/libgav1.pc.template"
+ "${libgav1_build}/libgav1.pc" @ONLY NEWLINE_STYLE UNIX)
+ install(FILES "${libgav1_build}/libgav1.pc"
+ DESTINATION "${prefix}/${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+
+ # CMake config: libgav1-config.cmake
+ set(LIBGAV1_INCLUDE_DIRS "${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
+ configure_file("${libgav1_root}/cmake/libgav1-config.cmake.template"
+ "${libgav1_build}/libgav1-config.cmake" @ONLY
+ NEWLINE_STYLE UNIX)
+ install(
+ FILES "${libgav1_build}/libgav1-config.cmake"
+ DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_DATAROOTDIR}/cmake")
+
+ install(
+ FILES ${libgav1_api_includes}
+ DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/gav1")
+
+ install(TARGETS gav1_decode DESTINATION
+ "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}")
+ install(TARGETS libgav1_static DESTINATION
+ "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
+ if(BUILD_SHARED_LIBS)
+ install(TARGETS libgav1_shared DESTINATION
+ "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
+ endif()
+ endif()
+endmacro()
diff --git a/cmake/libgav1_intrinsics.cmake b/cmake/libgav1_intrinsics.cmake
new file mode 100644
index 0000000..a2e9ddb
--- /dev/null
+++ b/cmake/libgav1_intrinsics.cmake
@@ -0,0 +1,135 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_ 1)
+
+# Returns the compiler flag for the SIMD intrinsics suffix specified by the
+# SUFFIX argument via the variable specified by the VARIABLE argument:
+# libgav1_get_intrinsics_flag_for_suffix(SUFFIX <suffix> VARIABLE <var name>)
+macro(libgav1_get_intrinsics_flag_for_suffix)
+ unset(intrinsics_SUFFIX)
+ unset(intrinsics_VARIABLE)
+ unset(optional_args)
+ unset(multi_value_args)
+ set(single_value_args SUFFIX VARIABLE)
+ cmake_parse_arguments(intrinsics "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT (intrinsics_SUFFIX AND intrinsics_VARIABLE))
+ message(FATAL_ERROR "libgav1_get_intrinsics_flag_for_suffix: SUFFIX and "
+ "VARIABLE required.")
+ endif()
+
+ if(intrinsics_SUFFIX MATCHES "neon")
+ if(NOT MSVC)
+ set(${intrinsics_VARIABLE} "${LIBGAV1_NEON_INTRINSICS_FLAG}")
+ endif()
+ elseif(intrinsics_SUFFIX MATCHES "avx2")
+ if(MSVC)
+ set(${intrinsics_VARIABLE} "/arch:AVX2")
+ else()
+ set(${intrinsics_VARIABLE} "-mavx2")
+ endif()
+ elseif(intrinsics_SUFFIX MATCHES "sse4")
+ if(NOT MSVC)
+ set(${intrinsics_VARIABLE} "-msse4.1")
+ endif()
+ else()
+    message(FATAL_ERROR "libgav1_get_intrinsics_flag_for_suffix: Unknown "
+                        "intrinsics suffix: ${intrinsics_SUFFIX}")
+ endif()
+
+ if(LIBGAV1_VERBOSE GREATER 1)
+ message("libgav1_get_intrinsics_flag_for_suffix: "
+ "suffix:${intrinsics_SUFFIX} flag:${${intrinsics_VARIABLE}}")
+ endif()
+endmacro()
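A usage sketch, assuming the suffix string matches the
libgav1_*_source_file_suffix values used elsewhere in the build; the output
variable name sse4_flag is invented for the example.

    libgav1_get_intrinsics_flag_for_suffix(SUFFIX sse4 VARIABLE sse4_flag)
    # On non-MSVC compilers sse4_flag is now "-msse4.1"; on MSVC it is left
    # unchanged, since MSVC needs no dedicated flag for SSE4.1 intrinsics.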
+
+# Processes the source files specified by SOURCES and adds intrinsics flags to
+# them as necessary for the target specified by TARGET:
+# libgav1_process_intrinsics_sources(TARGET <target> SOURCES <sources>)
+#
+# Detects the need for intrinsics flags using the source file name suffix.
+# Currently supports AVX2, SSE4.1 and NEON.
+macro(libgav1_process_intrinsics_sources)
+ unset(arg_TARGET)
+ unset(arg_SOURCES)
+ unset(optional_args)
+ set(single_value_args TARGET)
+ set(multi_value_args SOURCES)
+ cmake_parse_arguments(arg "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+ if(NOT (arg_TARGET AND arg_SOURCES))
+ message(FATAL_ERROR "libgav1_process_intrinsics_sources: TARGET and "
+ "SOURCES required.")
+ endif()
+
+ if(LIBGAV1_ENABLE_AVX2 AND libgav1_have_avx2)
+ unset(avx2_sources)
+ list(APPEND avx2_sources ${arg_SOURCES})
+
+ list(FILTER avx2_sources INCLUDE REGEX
+ "${libgav1_avx2_source_file_suffix}$")
+
+ if(avx2_sources)
+ unset(avx2_flags)
+ libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+ ${libgav1_avx2_source_file_suffix}
+ VARIABLE avx2_flags)
+ if(avx2_flags)
+ libgav1_set_compiler_flags_for_sources(SOURCES ${avx2_sources} FLAGS
+ ${avx2_flags})
+ endif()
+ endif()
+ endif()
+
+ if(LIBGAV1_ENABLE_SSE4_1 AND libgav1_have_sse4)
+ unset(sse4_sources)
+ list(APPEND sse4_sources ${arg_SOURCES})
+
+ list(FILTER sse4_sources INCLUDE REGEX
+ "${libgav1_sse4_source_file_suffix}$")
+
+ if(sse4_sources)
+ unset(sse4_flags)
+ libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+ ${libgav1_sse4_source_file_suffix}
+ VARIABLE sse4_flags)
+ if(sse4_flags)
+ libgav1_set_compiler_flags_for_sources(SOURCES ${sse4_sources} FLAGS
+ ${sse4_flags})
+ endif()
+ endif()
+ endif()
+
+ if(LIBGAV1_ENABLE_NEON AND libgav1_have_neon)
+ unset(neon_sources)
+ list(APPEND neon_sources ${arg_SOURCES})
+ list(FILTER neon_sources INCLUDE REGEX
+ "${libgav1_neon_source_file_suffix}$")
+
+ if(neon_sources AND LIBGAV1_NEON_INTRINSICS_FLAG)
+ unset(neon_flags)
+ libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+ ${libgav1_neon_source_file_suffix}
+ VARIABLE neon_flags)
+ if(neon_flags)
+ libgav1_set_compiler_flags_for_sources(SOURCES ${neon_sources} FLAGS
+ ${neon_flags})
+ endif()
+ endif()
+ endif()
+endmacro()
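A sketch of a typical call, mirroring how libgav1_add_executable() and
libgav1_add_library() below invoke this macro; the target and list names are
placeholders.

    libgav1_process_intrinsics_sources(TARGET libgav1_example_dsp
                                       SOURCES ${example_dsp_sources})
    # Sources whose names end in the configured AVX2, SSE4.1 or NEON suffixes
    # receive the matching compiler flag, provided the corresponding
    # LIBGAV1_ENABLE_* option and CPU support are enabled.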
diff --git a/cmake/libgav1_options.cmake b/cmake/libgav1_options.cmake
new file mode 100644
index 0000000..6327bee
--- /dev/null
+++ b/cmake/libgav1_options.cmake
@@ -0,0 +1,55 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_ 1)
+
+# Simple wrapper for CMake's builtin option command that tracks libgav1's build
+# options in the list variable $libgav1_options.
+macro(libgav1_option)
+ unset(option_NAME)
+ unset(option_HELPSTRING)
+ unset(option_VALUE)
+ unset(optional_args)
+ unset(multi_value_args)
+ set(single_value_args NAME HELPSTRING VALUE)
+ cmake_parse_arguments(option "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT (option_NAME AND option_HELPSTRING AND DEFINED option_VALUE))
+    message(FATAL_ERROR "libgav1_option: NAME, HELPSTRING and VALUE required.")
+ endif()
+
+ option(${option_NAME} ${option_HELPSTRING} ${option_VALUE})
+
+ if(LIBGAV1_VERBOSE GREATER 2)
+ message("--------- libgav1_option ---------\n"
+ "option_NAME=${option_NAME}\n"
+ "option_HELPSTRING=${option_HELPSTRING}\n"
+ "option_VALUE=${option_VALUE}\n"
+ "------------------------------------------\n")
+ endif()
+
+ list(APPEND libgav1_options ${option_NAME})
+ list(REMOVE_DUPLICATES libgav1_options)
+endmacro()
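A usage sketch; the option name and default value shown here are invented for
the example.

    libgav1_option(NAME LIBGAV1_EXAMPLE_ENABLE_EXTRA_CHECKS
                   HELPSTRING "Enables extra example checks."
                   VALUE OFF)
    # The option is created via CMake's option() command and recorded in
    # ${libgav1_options}, so libgav1_dump_options() below will print it.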
+
+# Dumps the $libgav1_options list via CMake message command.
+macro(libgav1_dump_options)
+ foreach(option_name ${libgav1_options})
+ message("${option_name}: ${${option_name}}")
+ endforeach()
+endmacro()
diff --git a/cmake/libgav1_sanitizer.cmake b/cmake/libgav1_sanitizer.cmake
new file mode 100644
index 0000000..4bb2263
--- /dev/null
+++ b/cmake/libgav1_sanitizer.cmake
@@ -0,0 +1,45 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_ 1)
+
+macro(libgav1_configure_sanitizer)
+ if(LIBGAV1_SANITIZE AND NOT MSVC)
+ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ if(LIBGAV1_SANITIZE MATCHES "cfi")
+ list(APPEND LIBGAV1_CXX_FLAGS "-flto" "-fno-sanitize-trap=cfi")
+ list(APPEND LIBGAV1_EXE_LINKER_FLAGS "-flto" "-fno-sanitize-trap=cfi"
+ "-fuse-ld=gold")
+ endif()
+
+ if(${CMAKE_SIZEOF_VOID_P} EQUAL 4
+ AND LIBGAV1_SANITIZE MATCHES "integer|undefined")
+ list(APPEND LIBGAV1_EXE_LINKER_FLAGS "--rtlib=compiler-rt" "-lgcc_s")
+ endif()
+ endif()
+
+ list(APPEND LIBGAV1_CXX_FLAGS "-fsanitize=${LIBGAV1_SANITIZE}")
+ list(APPEND LIBGAV1_EXE_LINKER_FLAGS "-fsanitize=${LIBGAV1_SANITIZE}")
+
+ # Make sanitizer callstacks accurate.
+ list(APPEND LIBGAV1_CXX_FLAGS "-fno-omit-frame-pointer"
+ "-fno-optimize-sibling-calls")
+
+ libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES LIBGAV1_CXX_FLAGS FLAG_REQUIRED)
+ libgav1_test_exe_linker_flag(FLAG_LIST_VAR_NAME LIBGAV1_EXE_LINKER_FLAGS)
+ endif()
+endmacro()
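A configuration sketch, assuming a Clang or GCC build; LIBGAV1_SANITIZE is
normally supplied on the cmake command line, and "address" is only one example
value.

    set(LIBGAV1_SANITIZE "address")
    libgav1_configure_sanitizer()
    # LIBGAV1_CXX_FLAGS and LIBGAV1_EXE_LINKER_FLAGS now carry
    # -fsanitize=address plus the frame-pointer flags, assuming the compiler
    # and linker accept them.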
diff --git a/cmake/libgav1_targets.cmake b/cmake/libgav1_targets.cmake
new file mode 100644
index 0000000..78b4865
--- /dev/null
+++ b/cmake/libgav1_targets.cmake
@@ -0,0 +1,347 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_
+set(LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_ 1)
+
+# Resets list variables used to track libgav1 targets.
+macro(libgav1_reset_target_lists)
+ unset(libgav1_targets)
+ unset(libgav1_exe_targets)
+ unset(libgav1_lib_targets)
+ unset(libgav1_objlib_targets)
+ unset(libgav1_sources)
+ unset(libgav1_test_targets)
+endmacro()
+
+# Creates an executable target. The target name is passed as a parameter to the
+# NAME argument, and the sources are passed as a parameter to the SOURCES
+# argument:
+# libgav1_add_executable(NAME <name> SOURCES <sources> [optional args])
+#
+# Optional args:
+# cmake-format: off
+# - OUTPUT_NAME: Override output file basename. Target basename defaults to
+# NAME.
+# - TEST: Flag. Presence means treat executable as a test.
+# - DEFINES: List of preprocessor macro definitions.
+# - INCLUDES: list of include directories for the target.
+# - COMPILE_FLAGS: list of compiler flags for the target.
+# - LINK_FLAGS: List of linker flags for the target.
+# - OBJLIB_DEPS: List of CMake object library target dependencies.
+# - LIB_DEPS: List of CMake library dependencies.
+# cmake-format: on
+#
+# Sources passed to this macro are added to $libgav1_test_sources when TEST is
+# specified. Otherwise sources are added to $libgav1_sources.
+#
+# Targets passed to this macro are always added to $libgav1_targets. When TEST
+# is specified, targets are also added to the list $libgav1_test_targets.
+# Otherwise targets are added to $libgav1_exe_targets.
+macro(libgav1_add_executable)
+ unset(exe_TEST)
+ unset(exe_TEST_DEFINES_MAIN)
+ unset(exe_NAME)
+ unset(exe_OUTPUT_NAME)
+ unset(exe_SOURCES)
+ unset(exe_DEFINES)
+ unset(exe_INCLUDES)
+ unset(exe_COMPILE_FLAGS)
+ unset(exe_LINK_FLAGS)
+ unset(exe_OBJLIB_DEPS)
+ unset(exe_LIB_DEPS)
+ set(optional_args TEST)
+ set(single_value_args NAME OUTPUT_NAME)
+ set(multi_value_args SOURCES DEFINES INCLUDES COMPILE_FLAGS LINK_FLAGS
+ OBJLIB_DEPS LIB_DEPS)
+
+ cmake_parse_arguments(exe "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(LIBGAV1_VERBOSE GREATER 1)
+ message("--------- libgav1_add_executable ---------\n"
+ "exe_TEST=${exe_TEST}\n"
+ "exe_TEST_DEFINES_MAIN=${exe_TEST_DEFINES_MAIN}\n"
+ "exe_NAME=${exe_NAME}\n"
+ "exe_OUTPUT_NAME=${exe_OUTPUT_NAME}\n"
+ "exe_SOURCES=${exe_SOURCES}\n"
+ "exe_DEFINES=${exe_DEFINES}\n"
+ "exe_INCLUDES=${exe_INCLUDES}\n"
+ "exe_COMPILE_FLAGS=${exe_COMPILE_FLAGS}\n"
+ "exe_LINK_FLAGS=${exe_LINK_FLAGS}\n"
+ "exe_OBJLIB_DEPS=${exe_OBJLIB_DEPS}\n"
+ "exe_LIB_DEPS=${exe_LIB_DEPS}\n"
+ "------------------------------------------\n")
+ endif()
+
+ if(NOT (exe_NAME AND exe_SOURCES))
+ message(FATAL_ERROR "libgav1_add_executable: NAME and SOURCES required.")
+ endif()
+
+ list(APPEND libgav1_targets ${exe_NAME})
+ if(exe_TEST)
+ list(APPEND libgav1_test_targets ${exe_NAME})
+ list(APPEND libgav1_test_sources ${exe_SOURCES})
+ else()
+ list(APPEND libgav1_exe_targets ${exe_NAME})
+ list(APPEND libgav1_sources ${exe_SOURCES})
+ endif()
+
+ add_executable(${exe_NAME} ${exe_SOURCES})
+
+ if(exe_OUTPUT_NAME)
+ set_target_properties(${exe_NAME} PROPERTIES OUTPUT_NAME ${exe_OUTPUT_NAME})
+ endif()
+
+ libgav1_process_intrinsics_sources(TARGET ${exe_NAME} SOURCES ${exe_SOURCES})
+
+ if(exe_DEFINES)
+ target_compile_definitions(${exe_NAME} PRIVATE ${exe_DEFINES})
+ endif()
+
+ if(exe_INCLUDES)
+ target_include_directories(${exe_NAME} PRIVATE ${exe_INCLUDES})
+ endif()
+
+ if(exe_COMPILE_FLAGS OR LIBGAV1_CXX_FLAGS)
+ target_compile_options(${exe_NAME}
+ PRIVATE ${exe_COMPILE_FLAGS} ${LIBGAV1_CXX_FLAGS})
+ endif()
+
+ if(exe_LINK_FLAGS OR LIBGAV1_EXE_LINKER_FLAGS)
+ set_target_properties(${exe_NAME}
+ PROPERTIES LINK_FLAGS ${exe_LINK_FLAGS}
+ ${LIBGAV1_EXE_LINKER_FLAGS})
+ endif()
+
+ if(exe_OBJLIB_DEPS)
+ foreach(objlib_dep ${exe_OBJLIB_DEPS})
+ target_sources(${exe_NAME} PRIVATE $<TARGET_OBJECTS:${objlib_dep}>)
+ endforeach()
+ endif()
+
+ if(CMAKE_THREAD_LIBS_INIT)
+ list(APPEND exe_LIB_DEPS ${CMAKE_THREAD_LIBS_INIT})
+ endif()
+
+ if(BUILD_SHARED_LIBS AND (MSVC OR WIN32))
+    target_compile_definitions(${exe_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0")
+ endif()
+
+ if(exe_LIB_DEPS)
+ unset(exe_static)
+ if("${CMAKE_EXE_LINKER_FLAGS} ${LIBGAV1_EXE_LINKER_FLAGS}" MATCHES "static")
+ set(exe_static ON)
+ endif()
+
+ if(exe_static AND CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+ # Third party dependencies can introduce dependencies on system and test
+ # libraries. Since the target created here is an executable, and CMake
+ # does not provide a method of controlling order of link dependencies,
+ # wrap all of the dependencies of this target in start/end group flags to
+ # ensure that dependencies of third party targets can be resolved when
+ # those dependencies happen to be resolved by dependencies of the current
+ # target.
+ list(INSERT exe_LIB_DEPS 0 -Wl,--start-group)
+ list(APPEND exe_LIB_DEPS -Wl,--end-group)
+ endif()
+ target_link_libraries(${exe_NAME} PRIVATE ${exe_LIB_DEPS})
+ endif()
+endmacro()
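A sketch of a hypothetical caller; the target name and source file are
illustrative, while libgav1_static is the static library target created
elsewhere in this build.

    libgav1_add_executable(NAME gav1_example_tool
                           SOURCES example_tool_main.cc
                           INCLUDES "${libgav1_root}" "${libgav1_build}"
                           LIB_DEPS libgav1_static)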
+
+# Creates a library target of the specified type. The target name is passed as a
+# parameter to the NAME argument, the type as a parameter to the TYPE argument,
+# and the sources are passed as a parameter to the SOURCES argument:
+# libgav1_add_library(NAME <name> TYPE <type> SOURCES <sources> [optional args])
+#
+# Optional args:
+# cmake-format: off
+# - OUTPUT_NAME: Override output file basename. Target basename defaults to
+# NAME. OUTPUT_NAME is ignored when BUILD_SHARED_LIBS is enabled and CMake
+# is generating a build for which MSVC or WIN32 are true. This is to avoid
+# output basename collisions with DLL import libraries.
+# - TEST: Flag. Presence means treat library as a test.
+# - DEFINES: List of preprocessor macro definitions.
+# - INCLUDES: list of include directories for the target.
+# - COMPILE_FLAGS: list of compiler flags for the target.
+# - LINK_FLAGS: List of linker flags for the target.
+# - OBJLIB_DEPS: List of CMake object library target dependencies.
+# - LIB_DEPS: List of CMake library dependencies.
+# - PUBLIC_INCLUDES: List of include paths to export to dependents.
+# cmake-format: on
+#
+# Sources passed to the macro are added to the lists tracking libgav1 sources:
+# cmake-format: off
+# - When TEST is specified sources are added to $libgav1_test_sources.
+# - Otherwise sources are added to $libgav1_sources.
+# cmake-format: on
+#
+# Targets passed to this macro are added to the lists tracking libgav1 targets:
+# cmake-format: off
+# - Targets are always added to $libgav1_targets.
+# - When the TEST flag is specified, targets are added to
+# $libgav1_test_targets.
+# - When TEST is not specified:
+# - Libraries of type SHARED are added to $libgav1_dylib_targets.
+# - Libraries of type OBJECT are added to $libgav1_objlib_targets.
+# - Libraries of type STATIC are added to $libgav1_lib_targets.
+# cmake-format: on
+macro(libgav1_add_library)
+ unset(lib_TEST)
+ unset(lib_NAME)
+ unset(lib_OUTPUT_NAME)
+ unset(lib_TYPE)
+ unset(lib_SOURCES)
+ unset(lib_DEFINES)
+ unset(lib_INCLUDES)
+ unset(lib_COMPILE_FLAGS)
+ unset(lib_LINK_FLAGS)
+ unset(lib_OBJLIB_DEPS)
+ unset(lib_LIB_DEPS)
+ unset(lib_PUBLIC_INCLUDES)
+ set(optional_args TEST)
+ set(single_value_args NAME OUTPUT_NAME TYPE)
+ set(multi_value_args SOURCES DEFINES INCLUDES COMPILE_FLAGS LINK_FLAGS
+ OBJLIB_DEPS LIB_DEPS PUBLIC_INCLUDES)
+
+ cmake_parse_arguments(lib "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(LIBGAV1_VERBOSE GREATER 1)
+ message("--------- libgav1_add_library ---------\n"
+ "lib_TEST=${lib_TEST}\n"
+ "lib_NAME=${lib_NAME}\n"
+ "lib_OUTPUT_NAME=${lib_OUTPUT_NAME}\n"
+ "lib_TYPE=${lib_TYPE}\n"
+ "lib_SOURCES=${lib_SOURCES}\n"
+ "lib_DEFINES=${lib_DEFINES}\n"
+ "lib_INCLUDES=${lib_INCLUDES}\n"
+ "lib_COMPILE_FLAGS=${lib_COMPILE_FLAGS}\n"
+ "lib_LINK_FLAGS=${lib_LINK_FLAGS}\n"
+ "lib_OBJLIB_DEPS=${lib_OBJLIB_DEPS}\n"
+ "lib_LIB_DEPS=${lib_LIB_DEPS}\n"
+ "lib_PUBLIC_INCLUDES=${lib_PUBLIC_INCLUDES}\n"
+ "---------------------------------------\n")
+ endif()
+
+ if(NOT (lib_NAME AND lib_TYPE AND lib_SOURCES))
+ message(FATAL_ERROR "libgav1_add_library: NAME, TYPE and SOURCES required.")
+ endif()
+
+ list(APPEND libgav1_targets ${lib_NAME})
+ if(lib_TEST)
+ list(APPEND libgav1_test_targets ${lib_NAME})
+ list(APPEND libgav1_test_sources ${lib_SOURCES})
+ else()
+ list(APPEND libgav1_sources ${lib_SOURCES})
+ if(lib_TYPE STREQUAL OBJECT)
+ list(APPEND libgav1_objlib_targets ${lib_NAME})
+ elseif(lib_TYPE STREQUAL SHARED)
+ list(APPEND libgav1_dylib_targets ${lib_NAME})
+ elseif(lib_TYPE STREQUAL STATIC)
+ list(APPEND libgav1_lib_targets ${lib_NAME})
+ else()
+ message(WARNING "libgav1_add_library: Unhandled type: ${lib_TYPE}")
+ endif()
+ endif()
+
+ add_library(${lib_NAME} ${lib_TYPE} ${lib_SOURCES})
+ libgav1_process_intrinsics_sources(TARGET ${lib_NAME} SOURCES ${lib_SOURCES})
+
+ if(lib_OUTPUT_NAME)
+ if(NOT (BUILD_SHARED_LIBS AND (MSVC OR WIN32)))
+ set_target_properties(${lib_NAME}
+ PROPERTIES OUTPUT_NAME ${lib_OUTPUT_NAME})
+ endif()
+ endif()
+
+ if(lib_DEFINES)
+ target_compile_definitions(${lib_NAME} PRIVATE ${lib_DEFINES})
+ endif()
+
+ if(lib_INCLUDES)
+ target_include_directories(${lib_NAME} PRIVATE ${lib_INCLUDES})
+ endif()
+
+ if(lib_PUBLIC_INCLUDES)
+ target_include_directories(${lib_NAME} PUBLIC ${lib_PUBLIC_INCLUDES})
+ endif()
+
+ if(lib_COMPILE_FLAGS OR LIBGAV1_CXX_FLAGS)
+ target_compile_options(${lib_NAME}
+ PRIVATE ${lib_COMPILE_FLAGS} ${LIBGAV1_CXX_FLAGS})
+ endif()
+
+ if(lib_LINK_FLAGS)
+ set_target_properties(${lib_NAME} PROPERTIES LINK_FLAGS ${lib_LINK_FLAGS})
+ endif()
+
+ if(lib_OBJLIB_DEPS)
+ foreach(objlib_dep ${lib_OBJLIB_DEPS})
+ target_sources(${lib_NAME} PRIVATE $<TARGET_OBJECTS:${objlib_dep}>)
+ endforeach()
+ endif()
+
+ if(lib_LIB_DEPS)
+ if(lib_TYPE STREQUAL STATIC)
+ set(link_type PUBLIC)
+ else()
+ set(link_type PRIVATE)
+ if(lib_TYPE STREQUAL SHARED AND CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+ # The libgav1 shared object uses the static libgav1 as input to turn it
+ # into a shared object. Include everything from the static library in
+ # the shared object.
+ if(APPLE)
+ list(INSERT lib_LIB_DEPS 0 -Wl,-force_load)
+ else()
+ list(INSERT lib_LIB_DEPS 0 -Wl,--whole-archive)
+ list(APPEND lib_LIB_DEPS -Wl,--no-whole-archive)
+ endif()
+ endif()
+ endif()
+ target_link_libraries(${lib_NAME} ${link_type} ${lib_LIB_DEPS})
+ endif()
+
+ if(NOT MSVC AND lib_NAME MATCHES "^lib")
+ # Non-MSVC generators prepend lib to static lib target file names. Libgav1
+ # already includes lib in its name. Avoid naming output files liblib*.
+ set_target_properties(${lib_NAME} PROPERTIES PREFIX "")
+ endif()
+
+ if(lib_TYPE STREQUAL SHARED AND NOT MSVC)
+ set_target_properties(${lib_NAME} PROPERTIES SOVERSION ${LIBGAV1_SOVERSION})
+ endif()
+
+ if(BUILD_SHARED_LIBS AND (MSVC OR WIN32))
+ if(lib_TYPE STREQUAL SHARED)
+ target_compile_definitions(${lib_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=1")
+ else()
+ target_compile_definitions(${lib_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0")
+ endif()
+ endif()
+
+ # Determine if $lib_NAME is a header only target.
+ set(sources_list ${lib_SOURCES})
+ list(FILTER sources_list INCLUDE REGEX cc$)
+ if(NOT sources_list)
+ if(NOT XCODE)
+ # This is a header only target. Tell CMake the link language.
+ set_target_properties(${lib_NAME} PROPERTIES LINKER_LANGUAGE CXX)
+ else()
+ # The Xcode generator ignores LINKER_LANGUAGE. Add a dummy cc file.
+ libgav1_create_dummy_source_file(TARGET ${lib_NAME} BASENAME ${lib_NAME})
+ endif()
+ endif()
+endmacro()
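A sketch of a hypothetical object-library target; the target name and source
files are placeholders.

    libgav1_add_library(NAME libgav1_example_utils
                        TYPE OBJECT
                        SOURCES example_utils.cc example_utils.h
                        INCLUDES "${libgav1_root}")
    # The target lands in ${libgav1_objlib_targets}, and a dependent target can
    # pull in its objects via OBJLIB_DEPS libgav1_example_utils.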
diff --git a/cmake/libgav1_variables.cmake b/cmake/libgav1_variables.cmake
new file mode 100644
index 0000000..0dd0f37
--- /dev/null
+++ b/cmake/libgav1_variables.cmake
@@ -0,0 +1,78 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_ 1)
+
+# Halts generation when $variable_name does not refer to a directory that
+# exists.
+macro(libgav1_variable_must_be_directory variable_name)
+ if("${variable_name}" STREQUAL "")
+ message(
+ FATAL_ERROR
+ "Empty variable_name passed to libgav1_variable_must_be_directory.")
+ endif()
+
+ if("${${variable_name}}" STREQUAL "")
+ message(
+ FATAL_ERROR
+ "Empty variable ${variable_name} is required to build libgav1.")
+ endif()
+
+ if(NOT IS_DIRECTORY "${${variable_name}}")
+ message(
+ FATAL_ERROR
+ "${variable_name}, which is ${${variable_name}}, does not refer to a\n"
+ "directory.")
+ endif()
+endmacro()
+
+# Adds $var_name to the tracked variables list.
+macro(libgav1_track_configuration_variable var_name)
+ if(LIBGAV1_VERBOSE GREATER 2)
+ message("---- libgav1_track_configuration_variable ----\n"
+ "var_name=${var_name}\n"
+ "----------------------------------------------\n")
+ endif()
+
+ list(APPEND libgav1_configuration_variables ${var_name})
+ list(REMOVE_DUPLICATES libgav1_configuration_variables)
+endmacro()
+
+# Logs current C++ and executable linker flags via CMake's message command.
+macro(libgav1_dump_cmake_flag_variables)
+ unset(flag_variables)
+ list(APPEND flag_variables "CMAKE_CXX_FLAGS_INIT" "CMAKE_CXX_FLAGS"
+ "CMAKE_EXE_LINKER_FLAGS_INIT" "CMAKE_EXE_LINKER_FLAGS")
+ if(CMAKE_BUILD_TYPE)
+ list(APPEND flag_variables "CMAKE_BUILD_TYPE"
+ "CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}_INIT"
+ "CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}"
+ "CMAKE_EXE_LINKER_FLAGS_${CMAKE_BUILD_TYPE}_INIT"
+ "CMAKE_EXE_LINKER_FLAGS_${CMAKE_BUILD_TYPE}")
+ endif()
+ foreach(flag_variable ${flag_variables})
+ message("${flag_variable}:${${flag_variable}}")
+ endforeach()
+endmacro()
+
+# Dumps the variables tracked in $libgav1_configuration_variables via CMake's
+# message command.
+macro(libgav1_dump_tracked_configuration_variables)
+ foreach(config_variable ${libgav1_configuration_variables})
+ message("${config_variable}:${${config_variable}}")
+ endforeach()
+endmacro()
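A usage sketch pairing these helpers; LIBGAV1_ANDROID_NDK_PATH is used only as
an example of a directory-valued variable.

    libgav1_variable_must_be_directory(LIBGAV1_ANDROID_NDK_PATH)
    libgav1_track_configuration_variable(LIBGAV1_ANDROID_NDK_PATH)
    # Later, at the end of configuration:
    libgav1_dump_tracked_configuration_variables()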
diff --git a/cmake/toolchains/aarch64-linux-gnu.cmake b/cmake/toolchains/aarch64-linux-gnu.cmake
new file mode 100644
index 0000000..7ffe397
--- /dev/null
+++ b/cmake/toolchains/aarch64-linux-gnu.cmake
@@ -0,0 +1,28 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_
+set(LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+ set(CROSS aarch64-linux-gnu-)
+endif()
+
+set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(CMAKE_CXX_FLAGS_INIT "-march=armv8-a")
+set(CMAKE_SYSTEM_PROCESSOR "aarch64")
diff --git a/cmake/toolchains/android.cmake b/cmake/toolchains/android.cmake
new file mode 100644
index 0000000..492957b
--- /dev/null
+++ b/cmake/toolchains/android.cmake
@@ -0,0 +1,53 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_TOOLCHAINS_ANDROID_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_TOOLCHAINS_ANDROID_CMAKE_
+
+# Additional ANDROID_* settings are available, see:
+# https://developer.android.com/ndk/guides/cmake#variables
+
+if(NOT ANDROID_PLATFORM)
+ set(ANDROID_PLATFORM android-21)
+endif()
+
+# Choose target architecture with:
+#
+# -DANDROID_ABI={armeabi-v7a,armeabi-v7a with NEON,arm64-v8a,x86,x86_64}
+if(NOT ANDROID_ABI)
+ set(ANDROID_ABI arm64-v8a)
+endif()
+
+# Force arm mode for 32-bit targets (instead of the default thumb) to improve
+# performance.
+if(NOT ANDROID_ARM_MODE)
+ set(ANDROID_ARM_MODE arm)
+endif()
+
+# Toolchain files don't have access to cached variables:
+# https://gitlab.kitware.com/cmake/cmake/issues/16170. Set an intermediate
+# environment variable the first time this file is loaded.
+if(LIBGAV1_ANDROID_NDK_PATH)
+ set(ENV{LIBGAV1_ANDROID_NDK_PATH} "${LIBGAV1_ANDROID_NDK_PATH}")
+else()
+ set(LIBGAV1_ANDROID_NDK_PATH "$ENV{LIBGAV1_ANDROID_NDK_PATH}")
+endif()
+
+if(NOT LIBGAV1_ANDROID_NDK_PATH)
+ message(FATAL_ERROR "LIBGAV1_ANDROID_NDK_PATH not set.")
+ return()
+endif()
+
+include("${LIBGAV1_ANDROID_NDK_PATH}/build/cmake/android.toolchain.cmake")
diff --git a/cmake/toolchains/arm-linux-gnueabihf.cmake b/cmake/toolchains/arm-linux-gnueabihf.cmake
new file mode 100644
index 0000000..8051f0d
--- /dev/null
+++ b/cmake/toolchains/arm-linux-gnueabihf.cmake
@@ -0,0 +1,29 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_
+set(LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+ set(CROSS arm-linux-gnueabihf-)
+endif()
+
+set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(CMAKE_CXX_FLAGS_INIT "-march=armv7-a -marm")
+set(CMAKE_SYSTEM_PROCESSOR "armv7")
+set(LIBGAV1_NEON_INTRINSICS_FLAG "-mfpu=neon")
diff --git a/codereview.settings b/codereview.settings
new file mode 100644
index 0000000..ccba2ee
--- /dev/null
+++ b/codereview.settings
@@ -0,0 +1,4 @@
+# This file is used by git cl to get repository specific information.
+GERRIT_HOST: True
+CODE_REVIEW_SERVER: chromium-review.googlesource.com
+GERRIT_SQUASH_UPLOADS: False
diff --git a/examples/file_reader.cc b/examples/file_reader.cc
new file mode 100644
index 0000000..b096722
--- /dev/null
+++ b/examples/file_reader.cc
@@ -0,0 +1,186 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <new>
+#include <string>
+#include <vector>
+
+#if defined(_WIN32)
+#include <fcntl.h>
+#include <io.h>
+#endif
+
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_factory.h"
+#include "examples/file_reader_interface.h"
+#include "examples/ivf_parser.h"
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+FILE* SetBinaryMode(FILE* stream) {
+#if defined(_WIN32)
+ _setmode(_fileno(stream), _O_BINARY);
+#endif
+ return stream;
+}
+
+} // namespace
+
+bool FileReader::registered_in_factory_ =
+ FileReaderFactory::RegisterReader(FileReader::Open);
+
+FileReader::~FileReader() {
+ if (owns_file_) fclose(file_);
+}
+
+std::unique_ptr<FileReaderInterface> FileReader::Open(
+ const std::string& file_name, const bool error_tolerant) {
+ if (file_name.empty()) return nullptr;
+
+ FILE* raw_file_ptr;
+
+ bool owns_file = true;
+ if (file_name == "-") {
+ raw_file_ptr = SetBinaryMode(stdin);
+ owns_file = false; // stdin is owned by the Standard C Library.
+ } else {
+ raw_file_ptr = fopen(file_name.c_str(), "rb");
+ }
+
+ if (raw_file_ptr == nullptr) {
+ return nullptr;
+ }
+
+ std::unique_ptr<FileReader> file(
+ new (std::nothrow) FileReader(raw_file_ptr, owns_file, error_tolerant));
+ if (file == nullptr) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory");
+ if (owns_file) fclose(raw_file_ptr);
+ return nullptr;
+ }
+
+ if (!file->ReadIvfFileHeader()) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Unsupported file type");
+ return nullptr;
+ }
+
+ return file;
+}
+
+// IVF Frame Header format, from https://wiki.multimedia.cx/index.php/IVF
+// bytes 0-3 size of frame in bytes (not including the 12-byte header)
+// bytes 4-11 64-bit presentation timestamp
+// bytes 12.. frame data
+bool FileReader::ReadTemporalUnit(std::vector<uint8_t>* const tu_data,
+ int64_t* const timestamp) {
+ if (tu_data == nullptr) return false;
+ tu_data->clear();
+
+ uint8_t header_buffer[kIvfFrameHeaderSize];
+ const size_t num_read = fread(header_buffer, 1, kIvfFrameHeaderSize, file_);
+
+ if (IsEndOfFile()) {
+ if (num_read != 0) {
+ LIBGAV1_EXAMPLES_LOG_ERROR(
+ "Cannot read IVF frame header: Not enough data available");
+ return false;
+ }
+
+ return true;
+ }
+
+ IvfFrameHeader ivf_frame_header;
+ if (!ParseIvfFrameHeader(header_buffer, &ivf_frame_header)) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Could not parse IVF frame header");
+ if (error_tolerant_) {
+ ivf_frame_header.frame_size =
+ std::min(ivf_frame_header.frame_size, size_t{kMaxTemporalUnitSize});
+ } else {
+ return false;
+ }
+ }
+
+ if (timestamp != nullptr) *timestamp = ivf_frame_header.timestamp;
+
+ tu_data->resize(ivf_frame_header.frame_size);
+ const size_t size_read =
+ fread(tu_data->data(), 1, ivf_frame_header.frame_size, file_);
+ if (size_read != ivf_frame_header.frame_size) {
+ LIBGAV1_EXAMPLES_LOG_ERROR(
+ "Unexpected EOF or I/O error reading frame data");
+ if (error_tolerant_) {
+ tu_data->resize(size_read);
+ } else {
+ return false;
+ }
+ }
+ return true;
+}
+
+// Attempt to read an IVF file header. Returns true for success, and false for
+// failure.
+//
+// IVF File Header format, from https://wiki.multimedia.cx/index.php/IVF
+// bytes 0-3 signature: 'DKIF'
+// bytes 4-5 version (should be 0)
+// bytes 6-7 length of header in bytes
+// bytes 8-11 codec FourCC (e.g., 'VP80')
+// bytes 12-13 width in pixels
+// bytes 14-15 height in pixels
+// bytes 16-19 frame rate
+// bytes 20-23 time scale
+// bytes 24-27 number of frames in file
+// bytes 28-31 unused
+//
+// Note: The rate and scale fields correspond to the numerator and denominator
+// of frame rate (fps) or time base (the reciprocal of frame rate) as follows:
+//
+// bytes 16-19 frame rate timebase.den framerate.numerator
+// bytes 20-23 time scale timebase.num framerate.denominator
+bool FileReader::ReadIvfFileHeader() {
+ uint8_t header_buffer[kIvfFileHeaderSize];
+ const size_t num_read = fread(header_buffer, 1, kIvfFileHeaderSize, file_);
+ if (num_read != kIvfFileHeaderSize) {
+ LIBGAV1_EXAMPLES_LOG_ERROR(
+ "Cannot read IVF header: Not enough data available");
+ return false;
+ }
+
+ IvfFileHeader ivf_file_header;
+ if (!ParseIvfFileHeader(header_buffer, &ivf_file_header)) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Could not parse IVF file header");
+ if (error_tolerant_) {
+ ivf_file_header = {};
+ } else {
+ return false;
+ }
+ }
+
+ width_ = ivf_file_header.width;
+ height_ = ivf_file_header.height;
+ frame_rate_ = ivf_file_header.frame_rate_numerator;
+ time_scale_ = ivf_file_header.frame_rate_denominator;
+ type_ = kFileTypeIvf;
+
+ return true;
+}
+
+} // namespace libgav1
diff --git a/examples/file_reader.h b/examples/file_reader.h
new file mode 100644
index 0000000..c342a20
--- /dev/null
+++ b/examples/file_reader.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "examples/file_reader_interface.h"
+
+namespace libgav1 {
+
+// Temporal Unit based file reader class. Currently supports only IVF files.
+class FileReader : public FileReaderInterface {
+ public:
+ enum FileType {
+ kFileTypeUnknown,
+ kFileTypeIvf,
+ };
+
+ // Creates and returns a FileReader that reads from |file_name|.
+  // If |error_tolerant| is true, format and read errors are ignored and
+  // ReadTemporalUnit() may return truncated data.
+ // Returns nullptr when the file does not exist, cannot be read, or is not an
+ // IVF file.
+ static std::unique_ptr<FileReaderInterface> Open(const std::string& file_name,
+ bool error_tolerant = false);
+
+ FileReader() = delete;
+ FileReader(const FileReader&) = delete;
+ FileReader& operator=(const FileReader&) = delete;
+
+ // Closes |file_|.
+ ~FileReader() override;
+
+ // Reads a temporal unit from |file_| and writes the data to |tu_data|.
+ // Returns true when:
+ // - A temporal unit is read successfully, or
+ // - At end of file.
+ // When ReadTemporalUnit() is called at the end of the file, it will return
+ // true without writing any data to |tu_data|.
+ //
+ // The |timestamp| pointer is optional: callers not interested in timestamps
+ // can pass nullptr. When |timestamp| is not a nullptr, this function returns
+ // the presentation timestamp from the IVF frame header.
+ /*LIBGAV1_MUST_USE_RESULT*/ bool ReadTemporalUnit(
+ std::vector<uint8_t>* tu_data, int64_t* timestamp) override;
+
+ /*LIBGAV1_MUST_USE_RESULT*/ bool IsEndOfFile() const override {
+ return feof(file_) != 0;
+ }
+
+ // The values returned by these accessors are strictly informative. No
+ // validation is performed when they are read from the IVF file header.
+ size_t width() const override { return width_; }
+ size_t height() const override { return height_; }
+ size_t frame_rate() const override { return frame_rate_; }
+ size_t time_scale() const override { return time_scale_; }
+
+ private:
+ FileReader(FILE* file, bool owns_file, bool error_tolerant)
+ : file_(file), owns_file_(owns_file), error_tolerant_(error_tolerant) {}
+
+ bool ReadIvfFileHeader();
+
+ FILE* file_ = nullptr;
+ size_t width_ = 0;
+ size_t height_ = 0;
+ size_t frame_rate_ = 0;
+ size_t time_scale_ = 0;
+ FileType type_ = kFileTypeUnknown;
+ // True if this object owns file_ and is responsible for closing it when
+ // done.
+ const bool owns_file_;
+ const bool error_tolerant_;
+
+ static bool registered_in_factory_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_FILE_READER_H_
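A minimal sketch of driving this reader from a caller; the input file name is a
placeholder and error handling is reduced to early returns.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    #include "examples/file_reader.h"

    int main() {
      auto reader = libgav1::FileReader::Open("input.ivf");
      if (reader == nullptr) return 1;
      std::vector<uint8_t> temporal_unit;
      int64_t timestamp = 0;
      while (!reader->IsEndOfFile()) {
        if (!reader->ReadTemporalUnit(&temporal_unit, &timestamp)) return 1;
        if (temporal_unit.empty()) break;  // Reached the end of the file.
        std::printf("Read %zu bytes, timestamp %lld\n", temporal_unit.size(),
                    static_cast<long long>(timestamp));
      }
      return 0;
    }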
diff --git a/examples/file_reader_constants.cc b/examples/file_reader_constants.cc
new file mode 100644
index 0000000..8439071
--- /dev/null
+++ b/examples/file_reader_constants.cc
@@ -0,0 +1,23 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader_constants.h"
+
+namespace libgav1 {
+
+const char kIvfSignature[4] = {'D', 'K', 'I', 'F'};
+const char kAv1FourCcUpper[4] = {'A', 'V', '0', '1'};
+const char kAv1FourCcLower[4] = {'a', 'v', '0', '1'};
+
+} // namespace libgav1
diff --git a/examples/file_reader_constants.h b/examples/file_reader_constants.h
new file mode 100644
index 0000000..00922b4
--- /dev/null
+++ b/examples/file_reader_constants.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
+
+namespace libgav1 {
+
+enum {
+ kIvfHeaderVersion = 0,
+ kIvfFrameHeaderSize = 12,
+ kIvfFileHeaderSize = 32,
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ kMaxTemporalUnitSize = 512 * 1024,
+#else
+ kMaxTemporalUnitSize = 256 * 1024 * 1024,
+#endif
+};
+
+extern const char kIvfSignature[4];
+extern const char kAv1FourCcUpper[4];
+extern const char kAv1FourCcLower[4];
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
diff --git a/examples/file_reader_factory.cc b/examples/file_reader_factory.cc
new file mode 100644
index 0000000..d5260eb
--- /dev/null
+++ b/examples/file_reader_factory.cc
@@ -0,0 +1,51 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader_factory.h"
+
+#include <new>
+
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+std::vector<FileReaderFactory::OpenFunction>* GetFileReaderOpenFunctions() {
+ static auto* open_functions =
+ new (std::nothrow) std::vector<FileReaderFactory::OpenFunction>();
+ return open_functions;
+}
+
+} // namespace
+
+bool FileReaderFactory::RegisterReader(OpenFunction open_function) {
+ if (open_function == nullptr) return false;
+ auto* open_functions = GetFileReaderOpenFunctions();
+ const size_t num_readers = open_functions->size();
+ open_functions->push_back(open_function);
+ return open_functions->size() == num_readers + 1;
+}
+
+std::unique_ptr<FileReaderInterface> FileReaderFactory::OpenReader(
+ const std::string& file_name, const bool error_tolerant /*= false*/) {
+ for (auto* open_function : *GetFileReaderOpenFunctions()) {
+ auto reader = open_function(file_name, error_tolerant);
+ if (reader == nullptr) continue;
+ return reader;
+ }
+ LIBGAV1_EXAMPLES_LOG_ERROR("No file reader able to open input");
+ return nullptr;
+}
+
+} // namespace libgav1
diff --git a/examples/file_reader_factory.h b/examples/file_reader_factory.h
new file mode 100644
index 0000000..0f53484
--- /dev/null
+++ b/examples/file_reader_factory.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
+
+#include <memory>
+#include <string>
+
+#include "examples/file_reader_interface.h"
+
+namespace libgav1 {
+
+class FileReaderFactory {
+ public:
+ using OpenFunction = std::unique_ptr<FileReaderInterface> (*)(
+ const std::string& file_name, bool error_tolerant);
+
+ FileReaderFactory() = delete;
+ FileReaderFactory(const FileReaderFactory&) = delete;
+ FileReaderFactory& operator=(const FileReaderFactory&) = delete;
+ ~FileReaderFactory() = default;
+
+ // Registers the OpenFunction for a FileReaderInterface and returns true when
+ // registration succeeds.
+ static bool RegisterReader(OpenFunction open_function);
+
+ // Passes |file_name| to each OpenFunction until one succeeds. Returns nullptr
+ // when no reader is found for |file_name|. Otherwise a FileReaderInterface is
+ // returned. If |error_tolerant| is true and the reader supports it, some
+ // format and read errors may be ignored and partial data returned.
+ static std::unique_ptr<FileReaderInterface> OpenReader(
+ const std::string& file_name, bool error_tolerant = false);
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
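A sketch of how an additional reader could hook into the factory; the "AnnexB"
name is hypothetical, and the open function below deliberately returns nullptr
so the factory falls through to the next registered reader.

    #include <memory>
    #include <string>

    #include "examples/file_reader_factory.h"
    #include "examples/file_reader_interface.h"

    namespace {

    std::unique_ptr<libgav1::FileReaderInterface> OpenExampleAnnexBReader(
        const std::string& file_name, bool error_tolerant) {
      // A real reader would probe |file_name| here and return an instance of a
      // FileReaderInterface subclass on success.
      static_cast<void>(file_name);
      static_cast<void>(error_tolerant);
      return nullptr;
    }

    // Registered at static-initialization time, mirroring
    // FileReader::registered_in_factory_ in examples/file_reader.h.
    const bool example_reader_registered =
        libgav1::FileReaderFactory::RegisterReader(OpenExampleAnnexBReader);

    }  // namespace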
diff --git a/examples/file_reader_interface.h b/examples/file_reader_interface.h
new file mode 100644
index 0000000..d8f7030
--- /dev/null
+++ b/examples/file_reader_interface.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+namespace libgav1 {
+
+class FileReaderInterface {
+ public:
+ FileReaderInterface() = default;
+ FileReaderInterface(const FileReaderInterface&) = delete;
+ FileReaderInterface& operator=(const FileReaderInterface&) = delete;
+
+ FileReaderInterface(FileReaderInterface&&) = default;
+ FileReaderInterface& operator=(FileReaderInterface&&) = default;
+
+ // Closes the file.
+ virtual ~FileReaderInterface() = default;
+
+ // Reads a temporal unit from the file and writes the data to |tu_data|.
+ // Returns true when:
+ // - A temporal unit is read successfully, or
+ // - At end of file.
+ // When ReadTemporalUnit() is called at the end of the file, it will return
+ // true without writing any data to |tu_data|.
+ //
+ // The |timestamp| pointer is optional: callers not interested in timestamps
+ // can pass nullptr. When |timestamp| is not a nullptr, this function returns
+ // the presentation timestamp of the temporal unit.
+ /*LIBGAV1_MUST_USE_RESULT*/ virtual bool ReadTemporalUnit(
+ std::vector<uint8_t>* tu_data, int64_t* timestamp) = 0;
+
+ /*LIBGAV1_MUST_USE_RESULT*/ virtual bool IsEndOfFile() const = 0;
+
+ // The values returned by these accessors are strictly informative. No
+ // validation is performed when they are read from file.
+ virtual size_t width() const = 0;
+ virtual size_t height() const = 0;
+ virtual size_t frame_rate() const = 0;
+ virtual size_t time_scale() const = 0;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
diff --git a/examples/file_writer.cc b/examples/file_writer.cc
new file mode 100644
index 0000000..54afe14
--- /dev/null
+++ b/examples/file_writer.cc
@@ -0,0 +1,183 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_writer.h"
+
+#include <cerrno>
+#include <cstdio>
+#include <cstring>
+#include <new>
+#include <string>
+
+#if defined(_WIN32)
+#include <fcntl.h>
+#include <io.h>
+#endif
+
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+FILE* SetBinaryMode(FILE* stream) {
+#if defined(_WIN32)
+ _setmode(_fileno(stream), _O_BINARY);
+#endif
+ return stream;
+}
+
+std::string GetY4mColorSpaceString(
+ const FileWriter::Y4mParameters& y4m_parameters) {
+ std::string color_space_string;
+ switch (y4m_parameters.image_format) {
+ case kImageFormatMonochrome400:
+ color_space_string = "mono";
+ break;
+ case kImageFormatYuv420:
+ if (y4m_parameters.bitdepth == 8) {
+ if (y4m_parameters.chroma_sample_position ==
+ kChromaSamplePositionVertical) {
+ color_space_string = "420mpeg2";
+ } else if (y4m_parameters.chroma_sample_position ==
+ kChromaSamplePositionColocated) {
+ color_space_string = "420";
+ } else {
+ color_space_string = "420jpeg";
+ }
+ } else {
+ color_space_string = "420";
+ }
+ break;
+ case kImageFormatYuv422:
+ color_space_string = "422";
+ break;
+ case kImageFormatYuv444:
+ color_space_string = "444";
+ break;
+ }
+
+ if (y4m_parameters.bitdepth > 8) {
+ const bool monochrome =
+ y4m_parameters.image_format == kImageFormatMonochrome400;
+ if (!monochrome) color_space_string += "p";
+ color_space_string += std::to_string(y4m_parameters.bitdepth);
+ }
+
+ return color_space_string;
+}
+
+} // namespace
+
+FileWriter::~FileWriter() { fclose(file_); }
+
+std::unique_ptr<FileWriter> FileWriter::Open(
+ const std::string& file_name, FileType file_type,
+ const Y4mParameters* const y4m_parameters) {
+ if (file_name.empty() ||
+ (file_type == kFileTypeY4m && y4m_parameters == nullptr) ||
+ (file_type != kFileTypeRaw && file_type != kFileTypeY4m)) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Invalid parameters");
+ return nullptr;
+ }
+
+ FILE* raw_file_ptr;
+
+ if (file_name == "-") {
+ raw_file_ptr = SetBinaryMode(stdout);
+ } else {
+ raw_file_ptr = fopen(file_name.c_str(), "wb");
+ }
+
+ if (raw_file_ptr == nullptr) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Unable to open output file");
+ return nullptr;
+ }
+
+ std::unique_ptr<FileWriter> file(new (std::nothrow) FileWriter(raw_file_ptr));
+ if (file == nullptr) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory");
+ fclose(raw_file_ptr);
+ return nullptr;
+ }
+
+ if (file_type == kFileTypeY4m && !file->WriteY4mFileHeader(*y4m_parameters)) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Error writing Y4M file header");
+ return nullptr;
+ }
+
+ file->file_type_ = file_type;
+ return file;
+}
+
+bool FileWriter::WriteFrame(const DecoderBuffer& frame_buffer) {
+ if (file_type_ == kFileTypeY4m) {
+ const char kY4mFrameHeader[] = "FRAME\n";
+ if (fwrite(kY4mFrameHeader, 1, strlen(kY4mFrameHeader), file_) !=
+ strlen(kY4mFrameHeader)) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Error writing Y4M frame header");
+ return false;
+ }
+ }
+
+ const size_t pixel_size =
+ (frame_buffer.bitdepth == 8) ? sizeof(uint8_t) : sizeof(uint16_t);
+ for (int plane_index = 0; plane_index < frame_buffer.NumPlanes();
+ ++plane_index) {
+ const int height = frame_buffer.displayed_height[plane_index];
+ const int width = frame_buffer.displayed_width[plane_index];
+ const int stride = frame_buffer.stride[plane_index];
+ const uint8_t* const plane_pointer = frame_buffer.plane[plane_index];
+ for (int row = 0; row < height; ++row) {
+ const uint8_t* const row_pointer = &plane_pointer[row * stride];
+ if (fwrite(row_pointer, pixel_size, width, file_) !=
+ static_cast<size_t>(width)) {
+ char error_string[256];
+ snprintf(error_string, sizeof(error_string),
+ "File write failed: %s (errno=%d)", strerror(errno), errno);
+ LIBGAV1_EXAMPLES_LOG_ERROR(error_string);
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+// Writes Y4M file header to |file_| and returns true when successful.
+//
+// A Y4M file begins with a plaintext file signature of 'YUV4MPEG2 '.
+//
+// Following the signature is any number of optional parameters preceded by a
+// space. We always write:
+//
+// Width: 'W' followed by image width in pixels.
+// Height: 'H' followed by image height in pixels.
+// Frame Rate: 'F' followed by frames/second in the form numerator:denominator.
+// Interlacing: 'I' followed by 'p' for progressive.
+// Color space: 'C' followed by a string representation of the color space.
+//
+// More info here: https://wiki.multimedia.cx/index.php/YUV4MPEG2
+bool FileWriter::WriteY4mFileHeader(const Y4mParameters& y4m_parameters) {
+ std::string y4m_header = "YUV4MPEG2";
+ y4m_header += " W" + std::to_string(y4m_parameters.width);
+ y4m_header += " H" + std::to_string(y4m_parameters.height);
+ y4m_header += " F" + std::to_string(y4m_parameters.frame_rate_numerator) +
+ ":" + std::to_string(y4m_parameters.frame_rate_denominator);
+ y4m_header += " Ip C" + GetY4mColorSpaceString(y4m_parameters);
+ y4m_header += "\n";
+ return fwrite(y4m_header.c_str(), 1, y4m_header.length(), file_) ==
+ y4m_header.length();
+}
+
+} // namespace libgav1
diff --git a/examples/file_writer.h b/examples/file_writer.h
new file mode 100644
index 0000000..00f6cc3
--- /dev/null
+++ b/examples/file_writer.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_WRITER_H_
+#define LIBGAV1_EXAMPLES_FILE_WRITER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+
+#include "gav1/decoder_buffer.h"
+
+namespace libgav1 {
+
+// Frame based file writer class. Supports only Y4M (YUV4MPEG2) and RAW output.
+class FileWriter {
+ public:
+ enum FileType : uint8_t {
+ kFileTypeRaw,
+ kFileTypeY4m,
+ };
+
+ struct Y4mParameters {
+ Y4mParameters() = default;
+ Y4mParameters(size_t width, size_t height, size_t frame_rate_numerator,
+ size_t frame_rate_denominator,
+ ChromaSamplePosition chroma_sample_position,
+ ImageFormat image_format, size_t bitdepth)
+ : width(width),
+ height(height),
+ frame_rate_numerator(frame_rate_numerator),
+ frame_rate_denominator(frame_rate_denominator),
+ chroma_sample_position(chroma_sample_position),
+ image_format(image_format),
+ bitdepth(bitdepth) {}
+
+ Y4mParameters(const Y4mParameters& rhs) = default;
+ Y4mParameters& operator=(const Y4mParameters& rhs) = default;
+ Y4mParameters(Y4mParameters&& rhs) = default;
+ Y4mParameters& operator=(Y4mParameters&& rhs) = default;
+
+ size_t width = 0;
+ size_t height = 0;
+ size_t frame_rate_numerator = 30;
+ size_t frame_rate_denominator = 1;
+ ChromaSamplePosition chroma_sample_position = kChromaSamplePositionUnknown;
+ ImageFormat image_format = kImageFormatYuv420;
+ size_t bitdepth = 8;
+ };
+
+ // Opens |file_name|. When |file_type| is kFileTypeY4m the Y4M file header is
+ // written out to |file_| before this method returns.
+ //
+ // Returns a FileWriter instance after the file is opened successfully for
+ // kFileTypeRaw files, and after the Y4M file header bytes are written for
+ // kFileTypeY4m files. Returns nullptr upon failure.
+ static std::unique_ptr<FileWriter> Open(const std::string& file_name,
+ FileType type,
+ const Y4mParameters* y4m_parameters);
+
+ FileWriter() = delete;
+ FileWriter(const FileWriter&) = delete;
+ FileWriter& operator=(const FileWriter&) = delete;
+
+ FileWriter(FileWriter&&) = default;
+ FileWriter& operator=(FileWriter&&) = default;
+
+ // Closes |file_|.
+ ~FileWriter();
+
+ // Writes the frame data in |frame_buffer| to |file_|. Returns true after
+ // successful write of |frame_buffer| data.
+ /*LIBGAV1_MUST_USE_RESULT*/ bool WriteFrame(
+ const DecoderBuffer& frame_buffer);
+
+ private:
+ explicit FileWriter(FILE* file) : file_(file) {}
+
+ bool WriteY4mFileHeader(const Y4mParameters& y4m_parameters);
+
+ FILE* file_ = nullptr;
+ FileType file_type_ = kFileTypeRaw;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_FILE_WRITER_H_
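A minimal sketch of driving the interface declared above, assuming frames arrive from a hypothetical GetNextDecodedFrame() helper; the real decode loop is in gav1_decode.cc below.

    // Sketch: writing decoded frames as Y4M through FileWriter.
    #include <memory>
    #include <string>

    #include "examples/file_writer.h"
    #include "gav1/decoder_buffer.h"

    const libgav1::DecoderBuffer* GetNextDecodedFrame();  // hypothetical source

    bool WriteAllFramesAsY4m(const std::string& path) {
      libgav1::FileWriter::Y4mParameters y4m;  // defaults: 30:1 fps, 4:2:0, 8-bit
      y4m.width = 640;
      y4m.height = 360;
      auto writer = libgav1::FileWriter::Open(
          path, libgav1::FileWriter::kFileTypeY4m, &y4m);
      if (writer == nullptr) return false;  // open or Y4M header write failed
      const libgav1::DecoderBuffer* frame;
      while ((frame = GetNextDecodedFrame()) != nullptr) {
        if (!writer->WriteFrame(*frame)) return false;
      }
      return true;  // the underlying FILE* is closed when |writer| is destroyed
    }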
diff --git a/examples/gav1_decode.cc b/examples/gav1_decode.cc
new file mode 100644
index 0000000..4de0ba2
--- /dev/null
+++ b/examples/gav1_decode.cc
@@ -0,0 +1,452 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <deque>
+#include <memory>
+#include <new>
+#include <vector>
+
+#include "absl/strings/numbers.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "examples/file_reader_factory.h"
+#include "examples/file_reader_interface.h"
+#include "examples/file_writer.h"
+#include "gav1/decoder.h"
+
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+#include "examples/gav1_decode_cv_pixel_buffer_pool.h"
+#endif
+
+namespace {
+
+struct Options {
+ const char* input_file_name = nullptr;
+ const char* output_file_name = nullptr;
+ const char* frame_timing_file_name = nullptr;
+ libgav1::FileWriter::FileType output_file_type =
+ libgav1::FileWriter::kFileTypeRaw;
+ uint8_t post_filter_mask = 0x1f;
+ int threads = 1;
+ bool frame_parallel = false;
+ bool output_all_layers = false;
+ int operating_point = 0;
+ int limit = 0;
+ int skip = 0;
+ int verbose = 0;
+};
+
+struct Timing {
+ absl::Duration input;
+ absl::Duration dequeue;
+};
+
+struct FrameTiming {
+ absl::Time enqueue;
+ absl::Time dequeue;
+};
+
+void PrintHelp(FILE* const fout) {
+ fprintf(fout,
+ "Usage: gav1_decode [options] <input file>"
+ " [-o <output file>]\n");
+ fprintf(fout, "\n");
+ fprintf(fout, "Options:\n");
+ fprintf(fout, " -h, --help This help message.\n");
+ fprintf(fout, " --threads <positive integer> (Default 1).\n");
+ fprintf(fout, " --frame_parallel.\n");
+ fprintf(fout,
+ " --limit <integer> Stop decoding after N frames (0 = all).\n");
+ fprintf(fout, " --skip <integer> Skip initial N frames (Default 0).\n");
+ fprintf(fout, " --version.\n");
+ fprintf(fout, " --y4m (Default false).\n");
+ fprintf(fout, " --raw (Default true).\n");
+ fprintf(fout, " -v logging verbosity, can be used multiple times.\n");
+ fprintf(fout, " --all_layers.\n");
+ fprintf(fout,
+ " --operating_point <integer between 0 and 31> (Default 0).\n");
+ fprintf(fout,
+ " --frame_timing <file> Output per-frame timing to <file> in tsv"
+ " format.\n Yields meaningful results only when frame parallel is"
+ " off.\n");
+ fprintf(fout, "\nAdvanced settings:\n");
+ fprintf(fout, " --post_filter_mask <integer> (Default 0x1f).\n");
+ fprintf(fout,
+ " Mask indicating which post filters should be applied to the"
+ " reconstructed\n frame. This may be given as octal, decimal or"
+ " hexadecimal. From LSB:\n");
+ fprintf(fout, " Bit 0: Loop filter (deblocking filter)\n");
+ fprintf(fout, " Bit 1: Cdef\n");
+ fprintf(fout, " Bit 2: SuperRes\n");
+ fprintf(fout, " Bit 3: Loop Restoration\n");
+ fprintf(fout, " Bit 4: Film Grain Synthesis\n");
+}
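For clarity, the bit positions listed in the help text combine as plain bit flags. The enum names in this sketch are illustrative only; gav1_decode passes the raw integer straight through to DecoderSettings::post_filter_mask.

    #include <cstdint>

    // Illustrative names for the --post_filter_mask bits described above.
    enum PostFilterBits : uint8_t {
      kApplyLoopFilter = 1 << 0,       // deblocking filter
      kApplyCdef = 1 << 1,
      kApplySuperRes = 1 << 2,
      kApplyLoopRestoration = 1 << 3,
      kApplyFilmGrain = 1 << 4,
    };

    // --post_filter_mask 0x3 keeps only deblocking and CDEF; the default 0x1f
    // enables all five post filters.
    constexpr uint8_t kDeblockAndCdef = kApplyLoopFilter | kApplyCdef;
    static_assert(kDeblockAndCdef == 0x03, "");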
+
+void ParseOptions(int argc, char* argv[], Options* const options) {
+ for (int i = 1; i < argc; ++i) {
+ int32_t value;
+ if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
+ PrintHelp(stdout);
+ exit(EXIT_SUCCESS);
+ } else if (strcmp(argv[i], "-o") == 0) {
+ if (++i >= argc) {
+ fprintf(stderr, "Missing argument for '-o'\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->output_file_name = argv[i];
+ } else if (strcmp(argv[i], "--frame_timing") == 0) {
+ if (++i >= argc) {
+ fprintf(stderr, "Missing argument for '--frame_timing'\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->frame_timing_file_name = argv[i];
+ } else if (strcmp(argv[i], "--version") == 0) {
+ printf("gav1_decode, a libgav1 based AV1 decoder\n");
+ printf("libgav1 %s\n", libgav1::GetVersionString());
+ printf("max bitdepth: %d\n", libgav1::Decoder::GetMaxBitdepth());
+ printf("build configuration: %s\n", libgav1::GetBuildConfiguration());
+ exit(EXIT_SUCCESS);
+ } else if (strcmp(argv[i], "-v") == 0) {
+ ++options->verbose;
+ } else if (strcmp(argv[i], "--raw") == 0) {
+ options->output_file_type = libgav1::FileWriter::kFileTypeRaw;
+ } else if (strcmp(argv[i], "--y4m") == 0) {
+ options->output_file_type = libgav1::FileWriter::kFileTypeY4m;
+ } else if (strcmp(argv[i], "--threads") == 0) {
+ if (++i >= argc || !absl::SimpleAtoi(argv[i], &value)) {
+ fprintf(stderr, "Missing/Invalid value for --threads.\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->threads = value;
+ } else if (strcmp(argv[i], "--frame_parallel") == 0) {
+ options->frame_parallel = true;
+ } else if (strcmp(argv[i], "--all_layers") == 0) {
+ options->output_all_layers = true;
+ } else if (strcmp(argv[i], "--operating_point") == 0) {
+ if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0 ||
+ value >= 32) {
+ fprintf(stderr, "Missing/Invalid value for --operating_point.\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->operating_point = value;
+ } else if (strcmp(argv[i], "--limit") == 0) {
+ if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0) {
+ fprintf(stderr, "Missing/Invalid value for --limit.\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->limit = value;
+ } else if (strcmp(argv[i], "--skip") == 0) {
+ if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0) {
+ fprintf(stderr, "Missing/Invalid value for --skip.\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->skip = value;
+ } else if (strcmp(argv[i], "--post_filter_mask") == 0) {
+ errno = 0;
+ char* endptr = nullptr;
+ value = (++i >= argc) ? -1
+ // NOLINTNEXTLINE(runtime/deprecated_fn)
+ : static_cast<int32_t>(strtol(argv[i], &endptr, 0));
+ // Only the last 5 bits of the mask can be set.
+ if ((value & ~31) != 0 || errno != 0 || endptr == argv[i]) {
+ fprintf(stderr, "Invalid value for --post_filter_mask.\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->post_filter_mask = value;
+ } else if (strlen(argv[i]) > 1 && argv[i][0] == '-') {
+ fprintf(stderr, "Unknown option '%s'!\n", argv[i]);
+ exit(EXIT_FAILURE);
+ } else {
+ if (options->input_file_name == nullptr) {
+ options->input_file_name = argv[i];
+ } else {
+ fprintf(stderr, "Found invalid parameter: \"%s\".\n", argv[i]);
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ }
+ }
+
+ if (argc < 2 || options->input_file_name == nullptr) {
+ fprintf(stderr, "Input file is required!\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+}
+
+using InputBuffer = std::vector<uint8_t>;
+
+class InputBuffers {
+ public:
+ ~InputBuffers() {
+ for (auto buffer : free_buffers_) {
+ delete buffer;
+ }
+ }
+ InputBuffer* GetFreeBuffer() {
+ if (free_buffers_.empty()) {
+ auto* const buffer = new (std::nothrow) InputBuffer();
+ if (buffer == nullptr) {
+ fprintf(stderr, "Failed to create input buffer.\n");
+ return nullptr;
+ }
+ free_buffers_.push_back(buffer);
+ }
+ InputBuffer* const buffer = free_buffers_.front();
+ free_buffers_.pop_front();
+ return buffer;
+ }
+
+ void ReleaseInputBuffer(InputBuffer* buffer) {
+ free_buffers_.push_back(buffer);
+ }
+
+ private:
+ std::deque<InputBuffer*> free_buffers_;
+};
+
+void ReleaseInputBuffer(void* callback_private_data,
+ void* buffer_private_data) {
+ auto* const input_buffers = static_cast<InputBuffers*>(callback_private_data);
+ input_buffers->ReleaseInputBuffer(
+ static_cast<InputBuffer*>(buffer_private_data));
+}
+
+int CloseFile(FILE* stream) { return (stream == nullptr) ? 0 : fclose(stream); }
+
+} // namespace
+
+int main(int argc, char* argv[]) {
+ Options options;
+ ParseOptions(argc, argv, &options);
+
+ auto file_reader =
+ libgav1::FileReaderFactory::OpenReader(options.input_file_name);
+ if (file_reader == nullptr) {
+ fprintf(stderr, "Cannot open input file!\n");
+ return EXIT_FAILURE;
+ }
+
+ std::unique_ptr<FILE, decltype(&CloseFile)> frame_timing_file(nullptr,
+ &CloseFile);
+ if (options.frame_timing_file_name != nullptr) {
+ frame_timing_file.reset(fopen(options.frame_timing_file_name, "wb"));
+ if (frame_timing_file == nullptr) {
+ fprintf(stderr, "Cannot open frame timing file '%s'!\n",
+ options.frame_timing_file_name);
+ return EXIT_FAILURE;
+ }
+ }
+
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+ // Reference frames + 1 scratch frame (for either the current frame or the
+ // film grain frame).
+ constexpr int kNumBuffers = 8 + 1;
+ std::unique_ptr<Gav1DecodeCVPixelBufferPool> cv_pixel_buffers =
+ Gav1DecodeCVPixelBufferPool::Create(kNumBuffers);
+ if (cv_pixel_buffers == nullptr) {
+ fprintf(stderr, "Cannot create Gav1DecodeCVPixelBufferPool!\n");
+ return EXIT_FAILURE;
+ }
+#endif
+
+ InputBuffers input_buffers;
+ libgav1::Decoder decoder;
+ libgav1::DecoderSettings settings;
+ settings.post_filter_mask = options.post_filter_mask;
+ settings.threads = options.threads;
+ settings.frame_parallel = options.frame_parallel;
+ settings.output_all_layers = options.output_all_layers;
+ settings.operating_point = options.operating_point;
+ settings.blocking_dequeue = true;
+ settings.callback_private_data = &input_buffers;
+ settings.release_input_buffer = ReleaseInputBuffer;
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+ settings.on_frame_buffer_size_changed = Gav1DecodeOnCVPixelBufferSizeChanged;
+ settings.get_frame_buffer = Gav1DecodeGetCVPixelBuffer;
+ settings.release_frame_buffer = Gav1DecodeReleaseCVPixelBuffer;
+ settings.callback_private_data = cv_pixel_buffers.get();
+ settings.release_input_buffer = nullptr;
+  // TODO(vigneshv): Support using frame parallel mode with
+  // CVPixelBufferPool.
+ settings.frame_parallel = false;
+#endif
+ libgav1::StatusCode status = decoder.Init(&settings);
+ if (status != libgav1::kStatusOk) {
+ fprintf(stderr, "Error initializing decoder: %s\n",
+ libgav1::GetErrorString(status));
+ return EXIT_FAILURE;
+ }
+
+ fprintf(stderr, "decoding '%s'\n", options.input_file_name);
+ if (options.verbose > 0 && options.skip > 0) {
+ fprintf(stderr, "skipping %d frame(s).\n", options.skip);
+ }
+
+ int input_frames = 0;
+ int decoded_frames = 0;
+ Timing timing = {};
+ std::vector<FrameTiming> frame_timing;
+ const bool record_frame_timing = frame_timing_file != nullptr;
+ std::unique_ptr<libgav1::FileWriter> file_writer;
+ InputBuffer* input_buffer = nullptr;
+ bool limit_reached = false;
+ bool dequeue_finished = false;
+ const absl::Time decode_loop_start = absl::Now();
+ do {
+ if (input_buffer == nullptr && !file_reader->IsEndOfFile() &&
+ !limit_reached) {
+ input_buffer = input_buffers.GetFreeBuffer();
+ if (input_buffer == nullptr) return EXIT_FAILURE;
+ const absl::Time read_start = absl::Now();
+ if (!file_reader->ReadTemporalUnit(input_buffer,
+ /*timestamp=*/nullptr)) {
+ fprintf(stderr, "Error reading input file.\n");
+ return EXIT_FAILURE;
+ }
+ timing.input += absl::Now() - read_start;
+ }
+
+ if (++input_frames <= options.skip) {
+ input_buffers.ReleaseInputBuffer(input_buffer);
+ input_buffer = nullptr;
+ continue;
+ }
+
+ if (input_buffer != nullptr) {
+ if (input_buffer->empty()) {
+ input_buffers.ReleaseInputBuffer(input_buffer);
+ input_buffer = nullptr;
+ continue;
+ }
+
+ const absl::Time enqueue_start = absl::Now();
+ status = decoder.EnqueueFrame(input_buffer->data(), input_buffer->size(),
+ static_cast<int64_t>(frame_timing.size()),
+ /*buffer_private_data=*/input_buffer);
+ if (status == libgav1::kStatusOk) {
+ if (options.verbose > 1) {
+ fprintf(stderr, "enqueue frame (length %zu)\n", input_buffer->size());
+ }
+ if (record_frame_timing) {
+ FrameTiming enqueue_time = {enqueue_start, absl::UnixEpoch()};
+ frame_timing.emplace_back(enqueue_time);
+ }
+
+ input_buffer = nullptr;
+ // Continue to enqueue frames until we get a kStatusTryAgain status.
+ continue;
+ }
+ if (status != libgav1::kStatusTryAgain) {
+ fprintf(stderr, "Unable to enqueue frame: %s\n",
+ libgav1::GetErrorString(status));
+ return EXIT_FAILURE;
+ }
+ }
+
+ const libgav1::DecoderBuffer* buffer;
+ status = decoder.DequeueFrame(&buffer);
+ if (status == libgav1::kStatusNothingToDequeue) {
+ dequeue_finished = true;
+ continue;
+ }
+ if (status != libgav1::kStatusOk) {
+ fprintf(stderr, "Unable to dequeue frame: %s\n",
+ libgav1::GetErrorString(status));
+ return EXIT_FAILURE;
+ }
+ dequeue_finished = false;
+ if (buffer == nullptr) continue;
+ ++decoded_frames;
+ if (options.verbose > 1) {
+ fprintf(stderr, "buffer dequeued\n");
+ }
+
+ if (record_frame_timing) {
+ frame_timing[static_cast<int>(buffer->user_private_data)].dequeue =
+ absl::Now();
+ }
+
+ if (options.output_file_name != nullptr && file_writer == nullptr) {
+ libgav1::FileWriter::Y4mParameters y4m_parameters;
+ y4m_parameters.width = buffer->displayed_width[0];
+ y4m_parameters.height = buffer->displayed_height[0];
+ y4m_parameters.frame_rate_numerator = file_reader->frame_rate();
+ y4m_parameters.frame_rate_denominator = file_reader->time_scale();
+ y4m_parameters.chroma_sample_position = buffer->chroma_sample_position;
+ y4m_parameters.image_format = buffer->image_format;
+ y4m_parameters.bitdepth = static_cast<size_t>(buffer->bitdepth);
+ file_writer = libgav1::FileWriter::Open(
+ options.output_file_name, options.output_file_type, &y4m_parameters);
+ if (file_writer == nullptr) {
+ fprintf(stderr, "Cannot open output file!\n");
+ return EXIT_FAILURE;
+ }
+ }
+
+ if (!limit_reached && file_writer != nullptr &&
+ !file_writer->WriteFrame(*buffer)) {
+ fprintf(stderr, "Error writing output file.\n");
+ return EXIT_FAILURE;
+ }
+ if (options.limit > 0 && options.limit == decoded_frames) {
+ limit_reached = true;
+ if (input_buffer != nullptr) {
+ input_buffers.ReleaseInputBuffer(input_buffer);
+ }
+ input_buffer = nullptr;
+ }
+ } while (input_buffer != nullptr ||
+ (!file_reader->IsEndOfFile() && !limit_reached) ||
+ !dequeue_finished);
+ timing.dequeue = absl::Now() - decode_loop_start - timing.input;
+
+ if (record_frame_timing) {
+    // Note: in frame parallel mode, the timing below will be skewed by time
+    // spent queueing additional frames and waiting in the output queue for
+    // previous frames, so the values reported won't be that meaningful.
+ fprintf(frame_timing_file.get(), "frame number\tdecode time us\n");
+ for (size_t i = 0; i < frame_timing.size(); ++i) {
+ const int decode_time_us = static_cast<int>(absl::ToInt64Microseconds(
+ frame_timing[i].dequeue - frame_timing[i].enqueue));
+ fprintf(frame_timing_file.get(), "%zu\t%d\n", i, decode_time_us);
+ }
+ }
+
+ if (options.verbose > 0) {
+ fprintf(stderr, "time to read input: %d us\n",
+ static_cast<int>(absl::ToInt64Microseconds(timing.input)));
+ const int decode_time_us =
+ static_cast<int>(absl::ToInt64Microseconds(timing.dequeue));
+ const double decode_fps =
+ (decode_time_us == 0) ? 0.0 : 1.0e6 * decoded_frames / decode_time_us;
+ fprintf(stderr, "time to decode input: %d us (%d frames, %.2f fps)\n",
+ decode_time_us, decoded_frames, decode_fps);
+ }
+
+ return EXIT_SUCCESS;
+}
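Stripped of option parsing, timing, and input-buffer recycling, the enqueue/dequeue protocol used by main() above reduces to roughly the following sketch for a single in-memory temporal unit with default settings; it is an illustration, not a drop-in replacement for the loop above.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    #include "gav1/decoder.h"

    bool DecodeOneTemporalUnit(const std::vector<uint8_t>& data) {
      libgav1::Decoder decoder;
      libgav1::DecoderSettings settings;  // defaults: one thread, not parallel
      if (decoder.Init(&settings) != libgav1::kStatusOk) return false;
      // The third argument is echoed back as DecoderBuffer::user_private_data.
      if (decoder.EnqueueFrame(data.data(), data.size(), 0,
                               /*buffer_private_data=*/nullptr) !=
          libgav1::kStatusOk) {
        return false;
      }
      const libgav1::DecoderBuffer* buffer;
      // Dequeue until the decoder reports that nothing more is available.
      while (decoder.DequeueFrame(&buffer) == libgav1::kStatusOk) {
        if (buffer == nullptr) continue;  // no displayable frame this call
        printf("decoded %dx%d frame\n",
               static_cast<int>(buffer->displayed_width[0]),
               static_cast<int>(buffer->displayed_height[0]));
      }
      return true;
    }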
diff --git a/examples/gav1_decode_cv_pixel_buffer_pool.cc b/examples/gav1_decode_cv_pixel_buffer_pool.cc
new file mode 100644
index 0000000..6aa4e61
--- /dev/null
+++ b/examples/gav1_decode_cv_pixel_buffer_pool.cc
@@ -0,0 +1,278 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/gav1_decode_cv_pixel_buffer_pool.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <new>
+#include <type_traits>
+
+namespace {
+
+struct CFTypeDeleter {
+ void operator()(CFTypeRef cf) const { CFRelease(cf); }
+};
+
+using UniqueCFNumberRef =
+ std::unique_ptr<std::remove_pointer<CFNumberRef>::type, CFTypeDeleter>;
+
+using UniqueCFDictionaryRef =
+ std::unique_ptr<std::remove_pointer<CFDictionaryRef>::type, CFTypeDeleter>;
+
+} // namespace
+
+extern "C" {
+
+libgav1::StatusCode Gav1DecodeOnCVPixelBufferSizeChanged(
+ void* callback_private_data, int bitdepth,
+ libgav1::ImageFormat image_format, int width, int height, int left_border,
+ int right_border, int top_border, int bottom_border, int stride_alignment) {
+ auto* buffer_pool =
+ static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+ return buffer_pool->OnCVPixelBufferSizeChanged(
+ bitdepth, image_format, width, height, left_border, right_border,
+ top_border, bottom_border, stride_alignment);
+}
+
+libgav1::StatusCode Gav1DecodeGetCVPixelBuffer(
+ void* callback_private_data, int bitdepth,
+ libgav1::ImageFormat image_format, int width, int height, int left_border,
+ int right_border, int top_border, int bottom_border, int stride_alignment,
+ libgav1::FrameBuffer* frame_buffer) {
+ auto* buffer_pool =
+ static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+ return buffer_pool->GetCVPixelBuffer(
+ bitdepth, image_format, width, height, left_border, right_border,
+ top_border, bottom_border, stride_alignment, frame_buffer);
+}
+
+void Gav1DecodeReleaseCVPixelBuffer(void* callback_private_data,
+ void* buffer_private_data) {
+ auto* buffer_pool =
+ static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+ buffer_pool->ReleaseCVPixelBuffer(buffer_private_data);
+}
+
+} // extern "C"
+
+// static
+std::unique_ptr<Gav1DecodeCVPixelBufferPool>
+Gav1DecodeCVPixelBufferPool::Create(size_t num_buffers) {
+ std::unique_ptr<Gav1DecodeCVPixelBufferPool> buffer_pool(
+ new (std::nothrow) Gav1DecodeCVPixelBufferPool(num_buffers));
+ return buffer_pool;
+}
+
+Gav1DecodeCVPixelBufferPool::Gav1DecodeCVPixelBufferPool(size_t num_buffers)
+ : num_buffers_(static_cast<int>(num_buffers)) {}
+
+Gav1DecodeCVPixelBufferPool::~Gav1DecodeCVPixelBufferPool() {
+ CVPixelBufferPoolRelease(pool_);
+}
+
+libgav1::StatusCode Gav1DecodeCVPixelBufferPool::OnCVPixelBufferSizeChanged(
+ int bitdepth, libgav1::ImageFormat image_format, int width, int height,
+ int left_border, int right_border, int top_border, int bottom_border,
+ int stride_alignment) {
+ if (bitdepth != 8 || (image_format != libgav1::kImageFormatYuv420 &&
+ image_format != libgav1::kImageFormatMonochrome400)) {
+    fprintf(stderr,
+            "Only bitdepth 8, 4:2:0 or monochrome videos are supported: "
+            "bitdepth %d, image_format: %d.\n",
+            bitdepth, image_format);
+ return libgav1::kStatusUnimplemented;
+ }
+
+ // stride_alignment must be a power of 2.
+ assert((stride_alignment & (stride_alignment - 1)) == 0);
+
+ // The possible keys for CVPixelBufferPool are:
+ // kCVPixelBufferPoolMinimumBufferCountKey
+ // kCVPixelBufferPoolMaximumBufferAgeKey
+ // kCVPixelBufferPoolAllocationThresholdKey
+ const void* pool_keys[] = {kCVPixelBufferPoolMinimumBufferCountKey};
+ const int min_buffer_count = 10;
+ UniqueCFNumberRef cf_min_buffer_count(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &min_buffer_count));
+ if (cf_min_buffer_count == nullptr) {
+ fprintf(stderr, "CFNumberCreate failed.\n");
+ return libgav1::kStatusUnknownError;
+ }
+ const void* pool_values[] = {cf_min_buffer_count.get()};
+ UniqueCFDictionaryRef pool_attributes(CFDictionaryCreate(
+ nullptr, pool_keys, pool_values, 1, &kCFTypeDictionaryKeyCallBacks,
+ &kCFTypeDictionaryValueCallBacks));
+ if (pool_attributes == nullptr) {
+ fprintf(stderr, "CFDictionaryCreate failed.\n");
+ return libgav1::kStatusUnknownError;
+ }
+
+ // The pixelBufferAttributes argument to CVPixelBufferPoolCreate() cannot be
+ // null and must contain the pixel format, width, and height, otherwise
+ // CVPixelBufferPoolCreate() fails with kCVReturnInvalidPixelBufferAttributes
+ // (-6682).
+
+ // I420: kCVPixelFormatType_420YpCbCr8Planar (video range).
+ const int pixel_format = (image_format == libgav1::kImageFormatYuv420)
+ ? kCVPixelFormatType_420YpCbCr8PlanarFullRange
+ : kCVPixelFormatType_OneComponent8;
+ UniqueCFNumberRef cf_pixel_format(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &pixel_format));
+ UniqueCFNumberRef cf_width(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &width));
+ UniqueCFNumberRef cf_height(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &height));
+ UniqueCFNumberRef cf_left_border(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &left_border));
+ UniqueCFNumberRef cf_right_border(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &right_border));
+ UniqueCFNumberRef cf_top_border(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &top_border));
+ UniqueCFNumberRef cf_bottom_border(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &bottom_border));
+ UniqueCFNumberRef cf_stride_alignment(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &stride_alignment));
+
+ const void* buffer_keys[] = {
+ kCVPixelBufferPixelFormatTypeKey,
+ kCVPixelBufferWidthKey,
+ kCVPixelBufferHeightKey,
+ kCVPixelBufferExtendedPixelsLeftKey,
+ kCVPixelBufferExtendedPixelsRightKey,
+ kCVPixelBufferExtendedPixelsTopKey,
+ kCVPixelBufferExtendedPixelsBottomKey,
+ kCVPixelBufferBytesPerRowAlignmentKey,
+ };
+ const void* buffer_values[] = {
+ cf_pixel_format.get(), cf_width.get(),
+ cf_height.get(), cf_left_border.get(),
+ cf_right_border.get(), cf_top_border.get(),
+ cf_bottom_border.get(), cf_stride_alignment.get(),
+ };
+ UniqueCFDictionaryRef buffer_attributes(CFDictionaryCreate(
+ kCFAllocatorDefault, buffer_keys, buffer_values, 8,
+ &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks));
+ if (buffer_attributes == nullptr) {
+ fprintf(stderr, "CFDictionaryCreate of buffer_attributes failed.\n");
+ return libgav1::kStatusUnknownError;
+ }
+ CVPixelBufferPoolRef cv_pool;
+ CVReturn ret = CVPixelBufferPoolCreate(
+ /*allocator=*/nullptr, pool_attributes.get(), buffer_attributes.get(),
+ &cv_pool);
+ if (ret != kCVReturnSuccess) {
+ fprintf(stderr, "CVPixelBufferPoolCreate failed: %d.\n",
+ static_cast<int>(ret));
+ return libgav1::kStatusOutOfMemory;
+ }
+ CVPixelBufferPoolRelease(pool_);
+ pool_ = cv_pool;
+ return libgav1::kStatusOk;
+}
+
+libgav1::StatusCode Gav1DecodeCVPixelBufferPool::GetCVPixelBuffer(
+ int bitdepth, libgav1::ImageFormat image_format, int /*width*/,
+ int /*height*/, int /*left_border*/, int /*right_border*/,
+ int /*top_border*/, int /*bottom_border*/, int /*stride_alignment*/,
+ libgav1::FrameBuffer* frame_buffer) {
+ static_cast<void>(bitdepth);
+ assert(bitdepth == 8 && (image_format == libgav1::kImageFormatYuv420 ||
+ image_format == libgav1::kImageFormatMonochrome400));
+ const bool is_monochrome =
+ (image_format == libgav1::kImageFormatMonochrome400);
+
+ // The dictionary must have kCVPixelBufferPoolAllocationThresholdKey,
+ // otherwise CVPixelBufferPoolCreatePixelBufferWithAuxAttributes() fails with
+ // kCVReturnWouldExceedAllocationThreshold (-6689).
+ UniqueCFNumberRef cf_num_buffers(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &num_buffers_));
+
+ const void* buffer_keys[] = {
+ kCVPixelBufferPoolAllocationThresholdKey,
+ };
+ const void* buffer_values[] = {
+ cf_num_buffers.get(),
+ };
+ UniqueCFDictionaryRef aux_attributes(CFDictionaryCreate(
+ kCFAllocatorDefault, buffer_keys, buffer_values, 1,
+ &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks));
+ if (aux_attributes == nullptr) {
+ fprintf(stderr, "CFDictionaryCreate of aux_attributes failed.\n");
+ return libgav1::kStatusUnknownError;
+ }
+
+ CVPixelBufferRef pixel_buffer;
+ CVReturn ret = CVPixelBufferPoolCreatePixelBufferWithAuxAttributes(
+ /*allocator=*/nullptr, pool_, aux_attributes.get(), &pixel_buffer);
+ if (ret != kCVReturnSuccess) {
+ fprintf(stderr,
+ "CVPixelBufferPoolCreatePixelBufferWithAuxAttributes failed: %d.\n",
+ static_cast<int>(ret));
+ return libgav1::kStatusOutOfMemory;
+ }
+
+ ret = CVPixelBufferLockBaseAddress(pixel_buffer, /*lockFlags=*/0);
+ if (ret != kCVReturnSuccess) {
+ fprintf(stderr, "CVPixelBufferLockBaseAddress failed: %d.\n",
+ static_cast<int>(ret));
+ CFRelease(pixel_buffer);
+ return libgav1::kStatusUnknownError;
+ }
+
+ // If the pixel format type is kCVPixelFormatType_OneComponent8, the pixel
+ // buffer is nonplanar (CVPixelBufferIsPlanar returns false and
+ // CVPixelBufferGetPlaneCount returns 0), but
+ // CVPixelBufferGetBytesPerRowOfPlane and CVPixelBufferGetBaseAddressOfPlane
+ // still work for plane index 0, even though the documentation says they
+ // return NULL for nonplanar pixel buffers.
+ frame_buffer->stride[0] =
+ static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 0));
+ frame_buffer->plane[0] = static_cast<uint8_t*>(
+ CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 0));
+ if (is_monochrome) {
+ frame_buffer->stride[1] = 0;
+ frame_buffer->stride[2] = 0;
+ frame_buffer->plane[1] = nullptr;
+ frame_buffer->plane[2] = nullptr;
+ } else {
+ frame_buffer->stride[1] =
+ static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 1));
+ frame_buffer->stride[2] =
+ static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 2));
+ frame_buffer->plane[1] = static_cast<uint8_t*>(
+ CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 1));
+ frame_buffer->plane[2] = static_cast<uint8_t*>(
+ CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 2));
+ }
+ frame_buffer->private_data = pixel_buffer;
+
+ return libgav1::kStatusOk;
+}
+
+void Gav1DecodeCVPixelBufferPool::ReleaseCVPixelBuffer(
+ void* buffer_private_data) {
+ auto const pixel_buffer = static_cast<CVPixelBufferRef>(buffer_private_data);
+ CVReturn ret =
+ CVPixelBufferUnlockBaseAddress(pixel_buffer, /*unlockFlags=*/0);
+ if (ret != kCVReturnSuccess) {
+ fprintf(stderr, "%s:%d: CVPixelBufferUnlockBaseAddress failed: %d.\n",
+ __FILE__, __LINE__, static_cast<int>(ret));
+ abort();
+ }
+ CFRelease(pixel_buffer);
+}
diff --git a/examples/gav1_decode_cv_pixel_buffer_pool.h b/examples/gav1_decode_cv_pixel_buffer_pool.h
new file mode 100644
index 0000000..7aee324
--- /dev/null
+++ b/examples/gav1_decode_cv_pixel_buffer_pool.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
+#define LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
+
+#include <CoreVideo/CoreVideo.h>
+
+#include <cstddef>
+#include <memory>
+
+#include "gav1/frame_buffer.h"
+
+extern "C" libgav1::StatusCode Gav1DecodeOnCVPixelBufferSizeChanged(
+ void* callback_private_data, int bitdepth,
+ libgav1::ImageFormat image_format, int width, int height, int left_border,
+ int right_border, int top_border, int bottom_border, int stride_alignment);
+
+extern "C" libgav1::StatusCode Gav1DecodeGetCVPixelBuffer(
+ void* callback_private_data, int bitdepth,
+ libgav1::ImageFormat image_format, int width, int height, int left_border,
+ int right_border, int top_border, int bottom_border, int stride_alignment,
+ libgav1::FrameBuffer* frame_buffer);
+
+extern "C" void Gav1DecodeReleaseCVPixelBuffer(void* callback_private_data,
+ void* buffer_private_data);
+
+class Gav1DecodeCVPixelBufferPool {
+ public:
+ static std::unique_ptr<Gav1DecodeCVPixelBufferPool> Create(
+ size_t num_buffers);
+
+ // Not copyable or movable.
+ Gav1DecodeCVPixelBufferPool(const Gav1DecodeCVPixelBufferPool&) = delete;
+ Gav1DecodeCVPixelBufferPool& operator=(const Gav1DecodeCVPixelBufferPool&) =
+ delete;
+
+ ~Gav1DecodeCVPixelBufferPool();
+
+ libgav1::StatusCode OnCVPixelBufferSizeChanged(
+ int bitdepth, libgav1::ImageFormat image_format, int width, int height,
+ int left_border, int right_border, int top_border, int bottom_border,
+ int stride_alignment);
+
+ libgav1::StatusCode GetCVPixelBuffer(int bitdepth,
+ libgav1::ImageFormat image_format,
+ int width, int height, int left_border,
+ int right_border, int top_border,
+ int bottom_border, int stride_alignment,
+ libgav1::FrameBuffer* frame_buffer);
+ void ReleaseCVPixelBuffer(void* buffer_private_data);
+
+ private:
+ Gav1DecodeCVPixelBufferPool(size_t num_buffers);
+
+ CVPixelBufferPoolRef pool_ = nullptr;
+ const int num_buffers_;
+};
+
+#endif // LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
diff --git a/examples/ivf_parser.cc b/examples/ivf_parser.cc
new file mode 100644
index 0000000..f8adb14
--- /dev/null
+++ b/examples/ivf_parser.cc
@@ -0,0 +1,96 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/ivf_parser.h"
+
+#include <cstdio>
+#include <cstring>
+
+#include "examples/file_reader_constants.h"
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+size_t ReadLittleEndian16(const uint8_t* const buffer) {
+ size_t value = buffer[1] << 8;
+ value |= buffer[0];
+ return value;
+}
+
+size_t ReadLittleEndian32(const uint8_t* const buffer) {
+ size_t value = buffer[3] << 24;
+ value |= buffer[2] << 16;
+ value |= buffer[1] << 8;
+ value |= buffer[0];
+ return value;
+}
+
+} // namespace
+
+bool ParseIvfFileHeader(const uint8_t* const header_buffer,
+ IvfFileHeader* const ivf_file_header) {
+ if (header_buffer == nullptr || ivf_file_header == nullptr) return false;
+
+ if (memcmp(kIvfSignature, header_buffer, 4) != 0) {
+ return false;
+ }
+
+ // Verify header version and length.
+ const size_t ivf_header_version = ReadLittleEndian16(&header_buffer[4]);
+ if (ivf_header_version != kIvfHeaderVersion) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Unexpected IVF version");
+ }
+
+ const size_t ivf_header_size = ReadLittleEndian16(&header_buffer[6]);
+ if (ivf_header_size != kIvfFileHeaderSize) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Invalid IVF file header size");
+ return false;
+ }
+
+ if (memcmp(kAv1FourCcLower, &header_buffer[8], 4) != 0 &&
+ memcmp(kAv1FourCcUpper, &header_buffer[8], 4) != 0) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Unsupported codec 4CC");
+ return false;
+ }
+
+ ivf_file_header->width = ReadLittleEndian16(&header_buffer[12]);
+ ivf_file_header->height = ReadLittleEndian16(&header_buffer[14]);
+ ivf_file_header->frame_rate_numerator =
+ ReadLittleEndian32(&header_buffer[16]);
+ ivf_file_header->frame_rate_denominator =
+ ReadLittleEndian32(&header_buffer[20]);
+
+ return true;
+}
+
+bool ParseIvfFrameHeader(const uint8_t* const header_buffer,
+ IvfFrameHeader* const ivf_frame_header) {
+ if (header_buffer == nullptr || ivf_frame_header == nullptr) return false;
+
+ ivf_frame_header->frame_size = ReadLittleEndian32(header_buffer);
+ if (ivf_frame_header->frame_size > kMaxTemporalUnitSize) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Temporal Unit size exceeds maximum");
+ return false;
+ }
+
+ ivf_frame_header->timestamp = ReadLittleEndian32(&header_buffer[4]);
+ const uint64_t timestamp_hi =
+ static_cast<uint64_t>(ReadLittleEndian32(&header_buffer[8])) << 32;
+ ivf_frame_header->timestamp |= timestamp_hi;
+
+ return true;
+}
+
+} // namespace libgav1
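To make the byte offsets used above concrete, this sketch hand-builds a file header and runs it through ParseIvfFileHeader(). The signature, FourCC, and 32-byte header size follow the usual IVF convention and are assumptions here; the authoritative constants live in examples/file_reader_constants.cc earlier in this patch.

    #include <cassert>
    #include <cstdint>

    #include "examples/ivf_parser.h"

    void IvfHeaderLayoutExample() {
      uint8_t header[32] = {};
      header[0] = 'D'; header[1] = 'K'; header[2] = 'I'; header[3] = 'F';
      // Bytes 4-5: version (0). Bytes 6-7: header size (32), little endian.
      header[6] = 32;
      header[8] = 'A'; header[9] = 'V'; header[10] = '0'; header[11] = '1';
      header[12] = 0x80; header[13] = 0x07;  // width  = 1920
      header[14] = 0x38; header[15] = 0x04;  // height = 1080
      header[16] = 30;                       // frame rate numerator = 30
      header[20] = 1;                        // frame rate denominator = 1
      libgav1::IvfFileHeader parsed;
      const bool ok = libgav1::ParseIvfFileHeader(header, &parsed);
      assert(ok && parsed.width == 1920 && parsed.height == 1080);
      static_cast<void>(ok);
    }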
diff --git a/examples/ivf_parser.h b/examples/ivf_parser.h
new file mode 100644
index 0000000..b6bbc59
--- /dev/null
+++ b/examples/ivf_parser.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_IVF_PARSER_H_
+#define LIBGAV1_EXAMPLES_IVF_PARSER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+namespace libgav1 {
+
+struct IvfFileHeader {
+ IvfFileHeader() = default;
+ IvfFileHeader(const IvfFileHeader& rhs) = default;
+ IvfFileHeader& operator=(const IvfFileHeader& rhs) = default;
+ IvfFileHeader(IvfFileHeader&& rhs) = default;
+ IvfFileHeader& operator=(IvfFileHeader&& rhs) = default;
+
+ size_t width = 0;
+ size_t height = 0;
+ size_t frame_rate_numerator = 0;
+ size_t frame_rate_denominator = 0;
+};
+
+struct IvfFrameHeader {
+ IvfFrameHeader() = default;
+ IvfFrameHeader(const IvfFrameHeader& rhs) = default;
+ IvfFrameHeader& operator=(const IvfFrameHeader& rhs) = default;
+ IvfFrameHeader(IvfFrameHeader&& rhs) = default;
+ IvfFrameHeader& operator=(IvfFrameHeader&& rhs) = default;
+
+ size_t frame_size = 0;
+ int64_t timestamp = 0;
+};
+
+bool ParseIvfFileHeader(const uint8_t* header_buffer,
+ IvfFileHeader* ivf_file_header);
+
+bool ParseIvfFrameHeader(const uint8_t* header_buffer,
+ IvfFrameHeader* ivf_frame_header);
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_IVF_PARSER_H_
diff --git a/examples/libgav1_examples.cmake b/examples/libgav1_examples.cmake
new file mode 100644
index 0000000..1f949f3
--- /dev/null
+++ b/examples/libgav1_examples.cmake
@@ -0,0 +1,63 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_)
+ return()
+endif() # LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_
+set(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_ 1)
+
+set(libgav1_file_reader_sources "${libgav1_examples}/file_reader.cc"
+ "${libgav1_examples}/file_reader.h"
+ "${libgav1_examples}/file_reader_constants.cc"
+ "${libgav1_examples}/file_reader_constants.h"
+ "${libgav1_examples}/file_reader_factory.cc"
+ "${libgav1_examples}/file_reader_factory.h"
+ "${libgav1_examples}/file_reader_interface.h"
+ "${libgav1_examples}/ivf_parser.cc"
+ "${libgav1_examples}/ivf_parser.h"
+ "${libgav1_examples}/logging.h")
+
+set(libgav1_file_writer_sources "${libgav1_examples}/file_writer.cc"
+ "${libgav1_examples}/file_writer.h"
+ "${libgav1_examples}/logging.h")
+
+set(libgav1_decode_sources "${libgav1_examples}/gav1_decode.cc")
+
+macro(libgav1_add_examples_targets)
+ libgav1_add_library(NAME libgav1_file_reader TYPE OBJECT SOURCES
+ ${libgav1_file_reader_sources} DEFINES ${libgav1_defines}
+ INCLUDES ${libgav1_include_paths})
+
+ libgav1_add_library(NAME libgav1_file_writer TYPE OBJECT SOURCES
+ ${libgav1_file_writer_sources} DEFINES ${libgav1_defines}
+ INCLUDES ${libgav1_include_paths})
+
+ libgav1_add_executable(NAME
+ gav1_decode
+ SOURCES
+ ${libgav1_decode_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_include_paths}
+ ${libgav1_gtest_include_paths}
+ OBJLIB_DEPS
+ libgav1_file_reader
+ libgav1_file_writer
+ LIB_DEPS
+ absl::strings
+ absl::str_format_internal
+ absl::time
+ ${libgav1_dependency})
+endmacro()
diff --git a/examples/logging.h b/examples/logging.h
new file mode 100644
index 0000000..c0bcad7
--- /dev/null
+++ b/examples/logging.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_LOGGING_H_
+#define LIBGAV1_EXAMPLES_LOGGING_H_
+
+#include <cstddef>
+#include <cstdio>
+
+namespace libgav1 {
+namespace examples {
+
+#if !defined(LIBGAV1_EXAMPLES_ENABLE_LOGGING)
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#define LIBGAV1_EXAMPLES_ENABLE_LOGGING 0
+#else
+#define LIBGAV1_EXAMPLES_ENABLE_LOGGING 1
+#endif
+#endif
+
+#if LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+// Compile-time function to get the 'base' file_name, that is, the part of
+// a file_name after the last '/' or '\' path separator. The search starts at
+// the end of the string; the second parameter is the length of the string.
+constexpr const char* Basename(const char* file_name, size_t offset) {
+ return (offset == 0 || file_name[offset - 1] == '/' ||
+ file_name[offset - 1] == '\\')
+ ? file_name + offset
+ : Basename(file_name, offset - 1);
+}
+
+#define LIBGAV1_EXAMPLES_LOG_ERROR(error_string) \
+ do { \
+ constexpr const char* libgav1_examples_basename = \
+ ::libgav1::examples::Basename(__FILE__, sizeof(__FILE__) - 1); \
+ fprintf(stderr, "%s:%d (%s): %s.\n", libgav1_examples_basename, __LINE__, \
+ __func__, error_string); \
+ } while (false)
+
+#else // !LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+#define LIBGAV1_EXAMPLES_LOG_ERROR(error_string) \
+ do { \
+ } while (false)
+
+#endif // LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+} // namespace examples
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_LOGGING_H_
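A small compile-time check (an annotation, not part of the header) showing that Basename() strips everything up to the last path separator, which is what lets the macro above embed only the base file name in each message.

    #include "examples/logging.h"

    #if LIBGAV1_EXAMPLES_ENABLE_LOGGING
    static_assert(
        *::libgav1::examples::Basename("a/b/logging.h",
                                       sizeof("a/b/logging.h") - 1) == 'l',
        "Basename should return the part after the last path separator");
    #endif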
diff --git a/src/buffer_pool.cc b/src/buffer_pool.cc
new file mode 100644
index 0000000..c1a5606
--- /dev/null
+++ b/src/buffer_pool.cc
@@ -0,0 +1,218 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/buffer_pool.h"
+
+#include <cassert>
+#include <cstring>
+
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+
+namespace {
+
+// Copies the feature_enabled, feature_data, segment_id_pre_skip, and
+// last_active_segment_id fields of Segmentation.
+void CopySegmentationParameters(const Segmentation& from, Segmentation* to) {
+ memcpy(to->feature_enabled, from.feature_enabled,
+ sizeof(to->feature_enabled));
+ memcpy(to->feature_data, from.feature_data, sizeof(to->feature_data));
+ to->segment_id_pre_skip = from.segment_id_pre_skip;
+ to->last_active_segment_id = from.last_active_segment_id;
+}
+
+} // namespace
+
+RefCountedBuffer::RefCountedBuffer() = default;
+
+RefCountedBuffer::~RefCountedBuffer() = default;
+
+bool RefCountedBuffer::Realloc(int bitdepth, bool is_monochrome, int width,
+ int height, int subsampling_x, int subsampling_y,
+ int left_border, int right_border,
+ int top_border, int bottom_border) {
+  // YuvBuffer::Realloc() may call the get frame buffer callback, which needs
+  // to be thread safe, so we ensure that only one Realloc() call happens at
+  // any given time.
+ std::lock_guard<std::mutex> lock(pool_->mutex_);
+ assert(!buffer_private_data_valid_);
+ if (!yuv_buffer_.Realloc(
+ bitdepth, is_monochrome, width, height, subsampling_x, subsampling_y,
+ left_border, right_border, top_border, bottom_border,
+ pool_->get_frame_buffer_, pool_->callback_private_data_,
+ &buffer_private_data_)) {
+ return false;
+ }
+ buffer_private_data_valid_ = true;
+ return true;
+}
+
+bool RefCountedBuffer::SetFrameDimensions(const ObuFrameHeader& frame_header) {
+ upscaled_width_ = frame_header.upscaled_width;
+ frame_width_ = frame_header.width;
+ frame_height_ = frame_header.height;
+ render_width_ = frame_header.render_width;
+ render_height_ = frame_header.render_height;
+ rows4x4_ = frame_header.rows4x4;
+ columns4x4_ = frame_header.columns4x4;
+ if (frame_header.refresh_frame_flags != 0 &&
+ !IsIntraFrame(frame_header.frame_type)) {
+ const int rows4x4_half = DivideBy2(rows4x4_);
+ const int columns4x4_half = DivideBy2(columns4x4_);
+ if (!reference_info_.Reset(rows4x4_half, columns4x4_half)) {
+ return false;
+ }
+ }
+ return segmentation_map_.Allocate(rows4x4_, columns4x4_);
+}
+
+void RefCountedBuffer::SetGlobalMotions(
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motions) {
+ for (int ref = kReferenceFrameLast; ref <= kReferenceFrameAlternate; ++ref) {
+ static_assert(sizeof(global_motion_[ref].params) ==
+ sizeof(global_motions[ref].params),
+ "");
+ memcpy(global_motion_[ref].params, global_motions[ref].params,
+ sizeof(global_motion_[ref].params));
+ }
+}
+
+void RefCountedBuffer::SetFrameContext(const SymbolDecoderContext& context) {
+ frame_context_ = context;
+ frame_context_.ResetIntraFrameYModeCdf();
+ frame_context_.ResetCounters();
+}
+
+void RefCountedBuffer::GetSegmentationParameters(
+ Segmentation* segmentation) const {
+ CopySegmentationParameters(/*from=*/segmentation_, /*to=*/segmentation);
+}
+
+void RefCountedBuffer::SetSegmentationParameters(
+ const Segmentation& segmentation) {
+ CopySegmentationParameters(/*from=*/segmentation, /*to=*/&segmentation_);
+}
+
+void RefCountedBuffer::SetBufferPool(BufferPool* pool) { pool_ = pool; }
+
+void RefCountedBuffer::ReturnToBufferPool(RefCountedBuffer* ptr) {
+ ptr->pool_->ReturnUnusedBuffer(ptr);
+}
+
+BufferPool::BufferPool(
+ FrameBufferSizeChangedCallback on_frame_buffer_size_changed,
+ GetFrameBufferCallback get_frame_buffer,
+ ReleaseFrameBufferCallback release_frame_buffer,
+ void* callback_private_data) {
+ if (get_frame_buffer != nullptr) {
+ // on_frame_buffer_size_changed may be null.
+ assert(release_frame_buffer != nullptr);
+ on_frame_buffer_size_changed_ = on_frame_buffer_size_changed;
+ get_frame_buffer_ = get_frame_buffer;
+ release_frame_buffer_ = release_frame_buffer;
+ callback_private_data_ = callback_private_data;
+ } else {
+ on_frame_buffer_size_changed_ = OnInternalFrameBufferSizeChanged;
+ get_frame_buffer_ = GetInternalFrameBuffer;
+ release_frame_buffer_ = ReleaseInternalFrameBuffer;
+ callback_private_data_ = &internal_frame_buffers_;
+ }
+}
+
+BufferPool::~BufferPool() {
+ for (const auto* buffer : buffers_) {
+ if (buffer->in_use_) {
+ assert(false && "RefCountedBuffer still in use at destruction time.");
+ LIBGAV1_DLOG(ERROR, "RefCountedBuffer still in use at destruction time.");
+ }
+ delete buffer;
+ }
+}
+
+bool BufferPool::OnFrameBufferSizeChanged(int bitdepth,
+ Libgav1ImageFormat image_format,
+ int width, int height,
+ int left_border, int right_border,
+ int top_border, int bottom_border) {
+ if (on_frame_buffer_size_changed_ == nullptr) return true;
+ return on_frame_buffer_size_changed_(callback_private_data_, bitdepth,
+ image_format, width, height, left_border,
+ right_border, top_border, bottom_border,
+ /*stride_alignment=*/16) == kStatusOk;
+}
+
+RefCountedBufferPtr BufferPool::GetFreeBuffer() {
+ // In frame parallel mode, the GetFreeBuffer() calls from ObuParser all happen
+ // from the same thread serially, but the GetFreeBuffer() call in
+ // DecoderImpl::ApplyFilmGrain can happen from multiple threads at the same
+ // time. So this function has to be thread safe.
+ // TODO(b/142583029): Investigate if the GetFreeBuffer() call in
+ // DecoderImpl::ApplyFilmGrain() call can be serialized so that this function
+ // need not be thread safe.
+ std::unique_lock<std::mutex> lock(mutex_);
+ for (auto buffer : buffers_) {
+ if (!buffer->in_use_) {
+ buffer->in_use_ = true;
+ buffer->progress_row_ = -1;
+ buffer->frame_state_ = kFrameStateUnknown;
+ lock.unlock();
+ return RefCountedBufferPtr(buffer, RefCountedBuffer::ReturnToBufferPool);
+ }
+ }
+ lock.unlock();
+ auto* const buffer = new (std::nothrow) RefCountedBuffer();
+ if (buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate a new reference counted buffer.");
+ return RefCountedBufferPtr();
+ }
+ buffer->SetBufferPool(this);
+ buffer->in_use_ = true;
+ buffer->progress_row_ = -1;
+ buffer->frame_state_ = kFrameStateUnknown;
+ lock.lock();
+ const bool ok = buffers_.push_back(buffer);
+ lock.unlock();
+ if (!ok) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Failed to push the new reference counted buffer into the vector.");
+ delete buffer;
+ return RefCountedBufferPtr();
+ }
+ return RefCountedBufferPtr(buffer, RefCountedBuffer::ReturnToBufferPool);
+}
+
+void BufferPool::Abort() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ for (auto buffer : buffers_) {
+ if (buffer->in_use_) {
+ buffer->Abort();
+ }
+ }
+}
+
+void BufferPool::ReturnUnusedBuffer(RefCountedBuffer* buffer) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ assert(buffer->in_use_);
+ buffer->in_use_ = false;
+ if (buffer->buffer_private_data_valid_) {
+ release_frame_buffer_(callback_private_data_, buffer->buffer_private_data_);
+ buffer->buffer_private_data_valid_ = false;
+ }
+}
+
+} // namespace libgav1
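The two-argument RefCountedBufferPtr construction above installs RefCountedBuffer::ReturnToBufferPool as the deleter. Assuming RefCountedBufferPtr is the shared_ptr alias declared in buffer_pool.h (outside the lines shown here), a caller's side of that ownership model looks roughly like this sketch:

    #include "src/buffer_pool.h"

    void UseOneBuffer(libgav1::BufferPool* pool) {
      libgav1::RefCountedBufferPtr frame = pool->GetFreeBuffer();
      if (frame == nullptr) return;  // allocation failure was already logged
      // ... decode into frame->buffer() ...
      // When the last RefCountedBufferPtr copy goes out of scope, the deleter
      // returns the buffer to the pool (ReturnUnusedBuffer) instead of deleting
      // it, so the allocation is reused by later GetFreeBuffer() calls.
    }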
diff --git a/src/buffer_pool.h b/src/buffer_pool.h
new file mode 100644
index 0000000..f35a633
--- /dev/null
+++ b/src/buffer_pool.h
@@ -0,0 +1,399 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_BUFFER_POOL_H_
+#define LIBGAV1_SRC_BUFFER_POOL_H_
+
+#include <array>
+#include <cassert>
+#include <climits>
+#include <condition_variable> // NOLINT (unapproved c++11 header)
+#include <cstdint>
+#include <cstring>
+#include <mutex> // NOLINT (unapproved c++11 header)
+
+#include "src/dsp/common.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/frame_buffer.h"
+#include "src/internal_frame_buffer_list.h"
+#include "src/symbol_decoder_context.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/reference_info.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/types.h"
+#include "src/utils/vector.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+class BufferPool;
+
+enum FrameState : uint8_t {
+ kFrameStateUnknown,
+ kFrameStateStarted,
+ kFrameStateParsed,
+ kFrameStateDecoded
+};
+
+// A reference-counted frame buffer. Clients should access it via
+// RefCountedBufferPtr, which manages reference counting transparently.
+class RefCountedBuffer {
+ public:
+ // Not copyable or movable.
+ RefCountedBuffer(const RefCountedBuffer&) = delete;
+ RefCountedBuffer& operator=(const RefCountedBuffer&) = delete;
+
+  // Allocates the YUV buffer. Returns true on success and false on failure.
+  // This function ensures the thread safety of the |get_frame_buffer_| call,
+  // i.e., only one |get_frame_buffer_| call will happen at a given time.
+ // TODO(b/142583029): In frame parallel mode, we can require the callbacks to
+ // be thread safe so that we can remove the thread safety of this function and
+ // applications can have fine grained locks.
+ //
+ // * |width| and |height| are the image dimensions in pixels.
+ // * |subsampling_x| and |subsampling_y| (either 0 or 1) specify the
+ // subsampling of the width and height of the chroma planes, respectively.
+ // * |left_border|, |right_border|, |top_border|, and |bottom_border| are
+ // the sizes (in pixels) of the borders on the left, right, top, and
+ // bottom sides, respectively.
+ //
+ // NOTE: The strides are a multiple of 16. Since the first row in each plane
+ // is 16-byte aligned, subsequent rows are also 16-byte aligned.
+ bool Realloc(int bitdepth, bool is_monochrome, int width, int height,
+ int subsampling_x, int subsampling_y, int left_border,
+ int right_border, int top_border, int bottom_border);
+
+ YuvBuffer* buffer() { return &yuv_buffer_; }
+
+ // Returns the buffer private data set by the get frame buffer callback when
+ // it allocated the YUV buffer.
+ void* buffer_private_data() const {
+ assert(buffer_private_data_valid_);
+ return buffer_private_data_;
+ }
+
+ // NOTE: In the current frame, this is the frame_type syntax element in the
+ // frame header. In a reference frame, this implements the RefFrameType array
+ // in the spec.
+ FrameType frame_type() const { return frame_type_; }
+ void set_frame_type(FrameType frame_type) { frame_type_ = frame_type; }
+
+ // The sample position for subsampled streams. This is the
+ // chroma_sample_position syntax element in the sequence header.
+ //
+ // NOTE: The decoder does not use chroma_sample_position, but it needs to be
+ // passed on to the client in DecoderBuffer.
+ ChromaSamplePosition chroma_sample_position() const {
+ return chroma_sample_position_;
+ }
+ void set_chroma_sample_position(ChromaSamplePosition chroma_sample_position) {
+ chroma_sample_position_ = chroma_sample_position;
+ }
+
+  // Whether the frame can be used as a show_existing_frame in the future.
+ bool showable_frame() const { return showable_frame_; }
+ void set_showable_frame(bool value) { showable_frame_ = value; }
+
+ // Sets upscaled_width_, frame_width_, frame_height_, render_width_,
+ // render_height_, rows4x4_ and columns4x4_ from the corresponding fields
+ // in frame_header. Allocates reference_info_.motion_field_reference_frame,
+ // reference_info_.motion_field_mv_, and segmentation_map_. Returns true on
+ // success, false on failure.
+ bool SetFrameDimensions(const ObuFrameHeader& frame_header);
+
+ int32_t upscaled_width() const { return upscaled_width_; }
+ int32_t frame_width() const { return frame_width_; }
+ int32_t frame_height() const { return frame_height_; }
+ // RenderWidth() and RenderHeight() return the render size, which is a hint
+ // to the application about the desired display size.
+ int32_t render_width() const { return render_width_; }
+ int32_t render_height() const { return render_height_; }
+ int32_t rows4x4() const { return rows4x4_; }
+ int32_t columns4x4() const { return columns4x4_; }
+
+ int spatial_id() const { return spatial_id_; }
+ void set_spatial_id(int value) { spatial_id_ = value; }
+ int temporal_id() const { return temporal_id_; }
+ void set_temporal_id(int value) { temporal_id_ = value; }
+
+ SegmentationMap* segmentation_map() { return &segmentation_map_; }
+ const SegmentationMap* segmentation_map() const { return &segmentation_map_; }
+
+ // Only the |params| field of each GlobalMotion struct should be used.
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>& GlobalMotions()
+ const {
+ return global_motion_;
+ }
+ // Saves the GlobalMotion array. Only the |params| field of each GlobalMotion
+ // struct is saved.
+ void SetGlobalMotions(
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motions);
+
+ // Returns the saved CDF tables.
+ const SymbolDecoderContext& FrameContext() const { return frame_context_; }
+ // Saves the CDF tables. The intra_frame_y_mode_cdf table is reset to the
+ // default. The last entry in each table, representing the symbol count for
+ // that context, is set to 0.
+ void SetFrameContext(const SymbolDecoderContext& context);
+
+ const std::array<int8_t, kNumReferenceFrameTypes>& loop_filter_ref_deltas()
+ const {
+ return loop_filter_ref_deltas_;
+ }
+ const std::array<int8_t, kLoopFilterMaxModeDeltas>& loop_filter_mode_deltas()
+ const {
+ return loop_filter_mode_deltas_;
+ }
+ // Saves the ref_deltas and mode_deltas arrays in loop_filter.
+ void SetLoopFilterDeltas(const LoopFilter& loop_filter) {
+ loop_filter_ref_deltas_ = loop_filter.ref_deltas;
+ loop_filter_mode_deltas_ = loop_filter.mode_deltas;
+ }
+
+ // Copies the saved values of the following fields to the Segmentation
+ // struct: feature_enabled, feature_data, segment_id_pre_skip, and
+ // last_active_segment_id. The other fields are left unchanged.
+ void GetSegmentationParameters(Segmentation* segmentation) const;
+ // Saves the feature_enabled, feature_data, segment_id_pre_skip, and
+ // last_active_segment_id fields of the Segmentation struct.
+ void SetSegmentationParameters(const Segmentation& segmentation);
+
+ const FilmGrainParams& film_grain_params() const {
+ return film_grain_params_;
+ }
+ void set_film_grain_params(const FilmGrainParams& params) {
+ film_grain_params_ = params;
+ }
+
+ const ReferenceInfo* reference_info() const { return &reference_info_; }
+ ReferenceInfo* reference_info() { return &reference_info_; }
+
+ // This will wake up the WaitUntil*() functions and make them return false.
+ void Abort() {
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ abort_ = true;
+ }
+ parsed_condvar_.notify_all();
+ decoded_condvar_.notify_all();
+ progress_row_condvar_.notify_all();
+ }
+
+ void SetFrameState(FrameState frame_state) {
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ frame_state_ = frame_state;
+ }
+ if (frame_state == kFrameStateParsed) {
+ parsed_condvar_.notify_all();
+ } else if (frame_state == kFrameStateDecoded) {
+ decoded_condvar_.notify_all();
+ progress_row_condvar_.notify_all();
+ }
+ }
+
+ // Sets the progress of this frame to |progress_row| and notifies any threads
+ // that may be waiting on rows <= |progress_row|.
+ void SetProgress(int progress_row) {
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (progress_row_ >= progress_row) return;
+ progress_row_ = progress_row;
+ }
+ progress_row_condvar_.notify_all();
+ }
+
+ void MarkFrameAsStarted() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (frame_state_ != kFrameStateUnknown) return;
+ frame_state_ = kFrameStateStarted;
+ }
+
+ // All the WaitUntil* functions will return true if the desired wait state was
+ // reached successfully. If the return value is false, then the caller must
+ // assume that the wait was not successful and try to stop whatever they are
+ // doing as early as possible.
+
+ // Waits until the frame has been parsed.
+ bool WaitUntilParsed() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ while (frame_state_ < kFrameStateParsed && !abort_) {
+ parsed_condvar_.wait(lock);
+ }
+ return !abort_;
+ }
+
+ // Waits until the |progress_row| has been decoded (as indicated either by
+ // |progress_row_| or |frame_state_|). |progress_row_cache| must not be
+ // nullptr and will be populated with the value of |progress_row_| after the
+ // wait.
+ //
+ // Typical usage of |progress_row_cache| is as follows:
+ // * Initialize |*progress_row_cache| to INT_MIN.
+ // * Call WaitUntil only if |*progress_row_cache| < |progress_row|.
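+ //
+ // A minimal caller sketch of that pattern (illustrative only;
+ // |reference_frame| and |required_row| are hypothetical names, not members
+ // of this class):
+ //   int cached_row = INT_MIN;
+ //   if (cached_row < required_row &&
+ //       !reference_frame->WaitUntil(required_row, &cached_row)) {
+ //     return false;  // Aborted; stop decoding as early as possible.
+ //   }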
+ bool WaitUntil(int progress_row, int* progress_row_cache) {
+ // If |progress_row| is negative, it means that the wait is on the top
+ // border to be available. The top border will be available when row 0 has
+ // been decoded. So we can simply wait on row 0 instead.
+ progress_row = std::max(progress_row, 0);
+ std::unique_lock<std::mutex> lock(mutex_);
+ while (progress_row_ < progress_row && frame_state_ != kFrameStateDecoded &&
+ !abort_) {
+ progress_row_condvar_.wait(lock);
+ }
+ // Once |frame_state_| reaches kFrameStateDecoded, |progress_row_| may no
+ // longer be updated. So we set |*progress_row_cache| to INT_MAX in that
+ // case.
+ *progress_row_cache =
+ (frame_state_ != kFrameStateDecoded) ? progress_row_ : INT_MAX;
+ return !abort_;
+ }
+
+ // Waits until the entire frame has been decoded.
+ bool WaitUntilDecoded() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ while (frame_state_ != kFrameStateDecoded && !abort_) {
+ decoded_condvar_.wait(lock);
+ }
+ return !abort_;
+ }
+
+ private:
+ friend class BufferPool;
+
+ // Methods for BufferPool:
+ RefCountedBuffer();
+ ~RefCountedBuffer();
+ void SetBufferPool(BufferPool* pool);
+ static void ReturnToBufferPool(RefCountedBuffer* ptr);
+
+ BufferPool* pool_ = nullptr;
+ bool buffer_private_data_valid_ = false;
+ void* buffer_private_data_ = nullptr;
+ YuvBuffer yuv_buffer_;
+ bool in_use_ = false; // Only used by BufferPool.
+
+ std::mutex mutex_;
+ FrameState frame_state_ LIBGAV1_GUARDED_BY(mutex_) = kFrameStateUnknown;
+ int progress_row_ LIBGAV1_GUARDED_BY(mutex_) = -1;
+ // Signaled when progress_row_ is updated or when frame_state_ is set to
+ // kFrameStateDecoded.
+ std::condition_variable progress_row_condvar_;
+ // Signaled when the frame state is set to kFrameStateParsed.
+ std::condition_variable parsed_condvar_;
+ // Signaled when the frame state is set to kFrameStateDecoded.
+ std::condition_variable decoded_condvar_;
+ bool abort_ LIBGAV1_GUARDED_BY(mutex_) = false;
+
+ FrameType frame_type_ = kFrameKey;
+ ChromaSamplePosition chroma_sample_position_ = kChromaSamplePositionUnknown;
+ bool showable_frame_ = false;
+
+ int32_t upscaled_width_ = 0;
+ int32_t frame_width_ = 0;
+ int32_t frame_height_ = 0;
+ int32_t render_width_ = 0;
+ int32_t render_height_ = 0;
+ int32_t columns4x4_ = 0;
+ int32_t rows4x4_ = 0;
+ int spatial_id_ = 0;
+ int temporal_id_ = 0;
+
+ // segmentation_map_ contains a rows4x4_ by columns4x4_ 2D array.
+ SegmentationMap segmentation_map_;
+
+ // Only the |params| field of each GlobalMotion struct is used.
+ // global_motion_[0] (for kReferenceFrameIntra) is not used.
+ std::array<GlobalMotion, kNumReferenceFrameTypes> global_motion_ = {};
+ SymbolDecoderContext frame_context_;
+ std::array<int8_t, kNumReferenceFrameTypes> loop_filter_ref_deltas_;
+ std::array<int8_t, kLoopFilterMaxModeDeltas> loop_filter_mode_deltas_;
+ // Only the feature_enabled, feature_data, segment_id_pre_skip, and
+ // last_active_segment_id fields of the Segmentation struct are used.
+ //
+ // Note: The spec only requires that we save feature_enabled and
+ // feature_data. Since segment_id_pre_skip and last_active_segment_id depend
+ // on feature_enabled only, we also save their values as an optimization.
+ Segmentation segmentation_ = {};
+ FilmGrainParams film_grain_params_ = {};
+ ReferenceInfo reference_info_;
+};
+
+// RefCountedBufferPtr contains a reference to a RefCountedBuffer.
+//
+// Note: For simplicity, RefCountedBufferPtr is implemented as a
+// std::shared_ptr<RefCountedBuffer>. This requires a heap allocation of the
+// control block for std::shared_ptr. To avoid that heap allocation, we can
+// add a |ref_count_| field to RefCountedBuffer and implement a custom
+// RefCountedBufferPtr class.
+using RefCountedBufferPtr = std::shared_ptr<RefCountedBuffer>;
+
+// BufferPool maintains a pool of RefCountedBuffers.
+class BufferPool {
+ public:
+ BufferPool(FrameBufferSizeChangedCallback on_frame_buffer_size_changed,
+ GetFrameBufferCallback get_frame_buffer,
+ ReleaseFrameBufferCallback release_frame_buffer,
+ void* callback_private_data);
+
+ // Not copyable or movable.
+ BufferPool(const BufferPool&) = delete;
+ BufferPool& operator=(const BufferPool&) = delete;
+
+ ~BufferPool();
+
+ LIBGAV1_MUST_USE_RESULT bool OnFrameBufferSizeChanged(
+ int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+ int left_border, int right_border, int top_border, int bottom_border);
+
+ // Finds a free buffer in the buffer pool and returns a reference to the free
+ // buffer. If there is no free buffer, returns a null pointer. This function
+ // is thread safe.
+ RefCountedBufferPtr GetFreeBuffer();
+
+ // Aborts all the buffers that are in use.
+ void Abort();
+
+ private:
+ friend class RefCountedBuffer;
+
+ // Returns an unused buffer to the buffer pool. Called by RefCountedBuffer
+ // only. This function is thread safe.
+ void ReturnUnusedBuffer(RefCountedBuffer* buffer);
+
+ // Used to make the following functions thread safe: GetFreeBuffer(),
+ // ReturnUnusedBuffer(), RefCountedBuffer::Realloc().
+ std::mutex mutex_;
+
+ // Storing a RefCountedBuffer object in a Vector is complicated because of the
+ // copy/move semantics. So the simplest way around that is to store a list of
+ // pointers in the vector.
+ Vector<RefCountedBuffer*> buffers_ LIBGAV1_GUARDED_BY(mutex_);
+ InternalFrameBufferList internal_frame_buffers_;
+
+ // Frame buffer callbacks.
+ FrameBufferSizeChangedCallback on_frame_buffer_size_changed_;
+ GetFrameBufferCallback get_frame_buffer_;
+ ReleaseFrameBufferCallback release_frame_buffer_;
+ // Private data associated with the frame buffer callbacks.
+ void* callback_private_data_;
+};
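+
+// An illustrative usage sketch (an example under assumptions, not additional
+// API; the frame buffer callbacks are whatever the application supplies):
+//   BufferPool pool(on_frame_buffer_size_changed, get_frame_buffer,
+//                   release_frame_buffer, callback_private_data);
+//   RefCountedBufferPtr frame = pool.GetFreeBuffer();
+//   if (frame == nullptr) { /* No free buffer; try again later. */ }
+//   // The buffer is handed back to the pool once the last RefCountedBufferPtr
+//   // referencing it is destroyed.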
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_BUFFER_POOL_H_
diff --git a/src/decoder.cc b/src/decoder.cc
new file mode 100644
index 0000000..b9e43e0
--- /dev/null
+++ b/src/decoder.cc
@@ -0,0 +1,119 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/decoder.h"
+
+#include <memory>
+#include <new>
+
+#include "src/decoder_impl.h"
+
+extern "C" {
+
+Libgav1StatusCode Libgav1DecoderCreate(const Libgav1DecoderSettings* settings,
+ Libgav1Decoder** decoder_out) {
+ std::unique_ptr<libgav1::Decoder> cxx_decoder(new (std::nothrow)
+ libgav1::Decoder());
+ if (cxx_decoder == nullptr) return kLibgav1StatusOutOfMemory;
+
+ libgav1::DecoderSettings cxx_settings;
+ cxx_settings.threads = settings->threads;
+ cxx_settings.frame_parallel = settings->frame_parallel != 0;
+ cxx_settings.blocking_dequeue = settings->blocking_dequeue != 0;
+ cxx_settings.on_frame_buffer_size_changed =
+ settings->on_frame_buffer_size_changed;
+ cxx_settings.get_frame_buffer = settings->get_frame_buffer;
+ cxx_settings.release_frame_buffer = settings->release_frame_buffer;
+ cxx_settings.release_input_buffer = settings->release_input_buffer;
+ cxx_settings.callback_private_data = settings->callback_private_data;
+ cxx_settings.output_all_layers = settings->output_all_layers != 0;
+ cxx_settings.operating_point = settings->operating_point;
+ cxx_settings.post_filter_mask = settings->post_filter_mask;
+
+ const Libgav1StatusCode status = cxx_decoder->Init(&cxx_settings);
+ if (status == kLibgav1StatusOk) {
+ *decoder_out = reinterpret_cast<Libgav1Decoder*>(cxx_decoder.release());
+ }
+ return status;
+}
+
+void Libgav1DecoderDestroy(Libgav1Decoder* decoder) {
+ auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+ delete cxx_decoder;
+}
+
+Libgav1StatusCode Libgav1DecoderEnqueueFrame(Libgav1Decoder* decoder,
+ const uint8_t* data, size_t size,
+ int64_t user_private_data,
+ void* buffer_private_data) {
+ auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+ return cxx_decoder->EnqueueFrame(data, size, user_private_data,
+ buffer_private_data);
+}
+
+Libgav1StatusCode Libgav1DecoderDequeueFrame(
+ Libgav1Decoder* decoder, const Libgav1DecoderBuffer** out_ptr) {
+ auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+ return cxx_decoder->DequeueFrame(out_ptr);
+}
+
+Libgav1StatusCode Libgav1DecoderSignalEOS(Libgav1Decoder* decoder) {
+ auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+ return cxx_decoder->SignalEOS();
+}
+
+int Libgav1DecoderGetMaxBitdepth() {
+ return libgav1::Decoder::GetMaxBitdepth();
+}
+
+} // extern "C"
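+
+// An illustrative usage sketch of the C API above (a hedged example, not part
+// of the implementation; |settings|, |data| and |size| are assumed to be
+// prepared by the caller):
+//   Libgav1Decoder* decoder = nullptr;
+//   if (Libgav1DecoderCreate(&settings, &decoder) != kLibgav1StatusOk) return;
+//   Libgav1DecoderEnqueueFrame(decoder, data, size, /*user_private_data=*/0,
+//                              /*buffer_private_data=*/nullptr);
+//   const Libgav1DecoderBuffer* buffer = nullptr;
+//   Libgav1DecoderDequeueFrame(decoder, &buffer);
+//   Libgav1DecoderDestroy(decoder);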
+
+namespace libgav1 {
+
+Decoder::Decoder() = default;
+
+Decoder::~Decoder() = default;
+
+StatusCode Decoder::Init(const DecoderSettings* const settings) {
+ if (impl_ != nullptr) return kStatusAlready;
+ if (settings != nullptr) settings_ = *settings;
+ return DecoderImpl::Create(&settings_, &impl_);
+}
+
+StatusCode Decoder::EnqueueFrame(const uint8_t* data, const size_t size,
+ int64_t user_private_data,
+ void* buffer_private_data) {
+ if (impl_ == nullptr) return kStatusNotInitialized;
+ return impl_->EnqueueFrame(data, size, user_private_data,
+ buffer_private_data);
+}
+
+StatusCode Decoder::DequeueFrame(const DecoderBuffer** out_ptr) {
+ if (impl_ == nullptr) return kStatusNotInitialized;
+ return impl_->DequeueFrame(out_ptr);
+}
+
+StatusCode Decoder::SignalEOS() {
+ if (impl_ == nullptr) return kStatusNotInitialized;
+ // In non-frame-parallel mode, we have to release all the references. This
+ // simply means replacing the |impl_| with a new instance so that all the
+ // existing references are released and the state is cleared.
+ impl_ = nullptr;
+ return DecoderImpl::Create(&settings_, &impl_);
+}
+
+// static.
+int Decoder::GetMaxBitdepth() { return DecoderImpl::GetMaxBitdepth(); }
+
+} // namespace libgav1
diff --git a/src/decoder_impl.cc b/src/decoder_impl.cc
new file mode 100644
index 0000000..751671d
--- /dev/null
+++ b/src/decoder_impl.cc
@@ -0,0 +1,1661 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/decoder_impl.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <cstring>
+#include <iterator>
+#include <new>
+#include <utility>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/film_grain.h"
+#include "src/frame_buffer_utils.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/loop_restoration_info.h"
+#include "src/obu_parser.h"
+#include "src/post_filter.h"
+#include "src/prediction_mask.h"
+#include "src/threading_strategy.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/parameter_tree.h"
+#include "src/utils/raw_bit_reader.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/threadpool.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kMaxBlockWidth4x4 = 32;
+constexpr int kMaxBlockHeight4x4 = 32;
+
+// Computes the bottom border size in pixels. If CDEF, loop restoration or
+// SuperRes is enabled, adds extra border pixels to facilitate those steps to
+// happen nearly in-place (a few extra rows instead of an entire frame buffer).
+// The logic in this function should match the corresponding logic for
+// |vertical_shift| in the PostFilter constructor.
+int GetBottomBorderPixels(const bool do_cdef, const bool do_restoration,
+ const bool do_superres, const int subsampling_y) {
+ int extra_border = 0;
+ if (do_cdef) extra_border += kCdefBorder;
+ if (do_restoration) extra_border += kRestorationVerticalBorder;
+ if (do_superres) extra_border += kSuperResVerticalBorder;
+ // Double the number of extra bottom border pixels if the bottom border will
+ // be subsampled.
+ extra_border <<= subsampling_y;
+ return Align(kBorderPixels + extra_border, 2); // Must be a multiple of 2.
+}
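+
+// A worked instance of the computation above (illustration only): with CDEF,
+// loop restoration and SuperRes all enabled and subsampling_y == 1, the
+// function returns
+//   Align(kBorderPixels + ((kCdefBorder + kRestorationVerticalBorder +
+//                           kSuperResVerticalBorder) << 1), 2).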
+
+// Sets |frame_scratch_buffer->tile_decoding_failed| to true (while holding on
+// to |frame_scratch_buffer->superblock_row_mutex|) and notifies the first
+// |count| condition variables in
+// |frame_scratch_buffer->superblock_row_progress_condvar|.
+void SetFailureAndNotifyAll(FrameScratchBuffer* const frame_scratch_buffer,
+ int count) {
+ {
+ std::lock_guard<std::mutex> lock(
+ frame_scratch_buffer->superblock_row_mutex);
+ frame_scratch_buffer->tile_decoding_failed = true;
+ }
+ std::condition_variable* const condvars =
+ frame_scratch_buffer->superblock_row_progress_condvar.get();
+ for (int i = 0; i < count; ++i) {
+ condvars[i].notify_one();
+ }
+}
+
+// Helper class that releases the frame scratch buffer in the destructor.
+class FrameScratchBufferReleaser {
+ public:
+ FrameScratchBufferReleaser(
+ FrameScratchBufferPool* frame_scratch_buffer_pool,
+ std::unique_ptr<FrameScratchBuffer>* frame_scratch_buffer)
+ : frame_scratch_buffer_pool_(frame_scratch_buffer_pool),
+ frame_scratch_buffer_(frame_scratch_buffer) {}
+ ~FrameScratchBufferReleaser() {
+ frame_scratch_buffer_pool_->Release(std::move(*frame_scratch_buffer_));
+ }
+
+ private:
+ FrameScratchBufferPool* const frame_scratch_buffer_pool_;
+ std::unique_ptr<FrameScratchBuffer>* const frame_scratch_buffer_;
+};
+
+// Sets the |frame|'s segmentation map for two cases. The third case is handled
+// in Tile::DecodeBlock().
+void SetSegmentationMap(const ObuFrameHeader& frame_header,
+ const SegmentationMap* prev_segment_ids,
+ RefCountedBuffer* const frame) {
+ if (!frame_header.segmentation.enabled) {
+ // All segment_id's are 0.
+ frame->segmentation_map()->Clear();
+ } else if (!frame_header.segmentation.update_map) {
+ // Copy from prev_segment_ids.
+ if (prev_segment_ids == nullptr) {
+ // Treat a null prev_segment_ids pointer as if it pointed to a
+ // segmentation map containing all 0s.
+ frame->segmentation_map()->Clear();
+ } else {
+ frame->segmentation_map()->CopyFrom(*prev_segment_ids);
+ }
+ }
+}
+
+StatusCode DecodeTilesNonFrameParallel(
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const Vector<std::unique_ptr<Tile>>& tiles,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter) {
+ // Decode in superblock row order.
+ const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+ std::unique_ptr<TileScratchBuffer> tile_scratch_buffer =
+ frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+ if (tile_scratch_buffer == nullptr) return kStatusOutOfMemory;
+ for (int row4x4 = 0; row4x4 < frame_header.rows4x4;
+ row4x4 += block_width4x4) {
+ for (const auto& tile_ptr : tiles) {
+ if (!tile_ptr->ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+ row4x4, tile_scratch_buffer.get())) {
+ return kStatusUnknownError;
+ }
+ }
+ post_filter->ApplyFilteringForOneSuperBlockRow(
+ row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+ /*do_deblock=*/true);
+ }
+ frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+ std::move(tile_scratch_buffer));
+ return kStatusOk;
+}
+
+StatusCode DecodeTilesThreadedNonFrameParallel(
+ const Vector<std::unique_ptr<Tile>>& tiles,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter,
+ BlockingCounterWithStatus* const pending_tiles) {
+ ThreadingStrategy& threading_strategy =
+ frame_scratch_buffer->threading_strategy;
+ const int num_workers = threading_strategy.tile_thread_count();
+ BlockingCounterWithStatus pending_workers(num_workers);
+ std::atomic<int> tile_counter(0);
+ const int tile_count = static_cast<int>(tiles.size());
+ bool tile_decoding_failed = false;
+ // Submit tile decoding jobs to the thread pool.
+ for (int i = 0; i < num_workers; ++i) {
+ threading_strategy.tile_thread_pool()->Schedule([&tiles, tile_count,
+ &tile_counter,
+ &pending_workers,
+ &pending_tiles]() {
+ bool failed = false;
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (!failed) {
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->ParseAndDecode()) {
+ LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+ failed = true;
+ }
+ } else {
+ pending_tiles->Decrement(false);
+ }
+ }
+ pending_workers.Decrement(!failed);
+ });
+ }
+ // Have the current thread partake in tile decoding.
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (!tile_decoding_failed) {
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->ParseAndDecode()) {
+ LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+ tile_decoding_failed = true;
+ }
+ } else {
+ pending_tiles->Decrement(false);
+ }
+ }
+ // Wait until all the workers are done. This ensures that all the tiles have
+ // been parsed.
+ tile_decoding_failed |= !pending_workers.Wait();
+ // Wait until all the tiles have been decoded.
+ tile_decoding_failed |= !pending_tiles->Wait();
+ if (tile_decoding_failed) return kStatusUnknownError;
+ assert(threading_strategy.post_filter_thread_pool() != nullptr);
+ post_filter->ApplyFilteringThreaded();
+ return kStatusOk;
+}
+
+StatusCode DecodeTilesFrameParallel(
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const Vector<std::unique_ptr<Tile>>& tiles,
+ const SymbolDecoderContext& saved_symbol_decoder_context,
+ const SegmentationMap* const prev_segment_ids,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter, RefCountedBuffer* const current_frame) {
+ // Parse the frame.
+ for (const auto& tile : tiles) {
+ if (!tile->Parse()) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse tile number: %d\n", tile->number());
+ return kStatusUnknownError;
+ }
+ }
+ if (frame_header.enable_frame_end_update_cdf) {
+ frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
+ }
+ current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+ SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
+ // Mark frame as parsed.
+ current_frame->SetFrameState(kFrameStateParsed);
+ std::unique_ptr<TileScratchBuffer> tile_scratch_buffer =
+ frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+ if (tile_scratch_buffer == nullptr) {
+ return kStatusOutOfMemory;
+ }
+ const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+ // Decode in superblock row order (inter prediction in the Tile class will
+ // block until the required superblocks in the reference frame are decoded).
+ for (int row4x4 = 0; row4x4 < frame_header.rows4x4;
+ row4x4 += block_width4x4) {
+ for (const auto& tile_ptr : tiles) {
+ if (!tile_ptr->ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+ row4x4, tile_scratch_buffer.get())) {
+ LIBGAV1_DLOG(ERROR, "Failed to decode tile number: %d\n",
+ tile_ptr->number());
+ return kStatusUnknownError;
+ }
+ }
+ const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow(
+ row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+ /*do_deblock=*/true);
+ if (progress_row >= 0) {
+ current_frame->SetProgress(progress_row);
+ }
+ }
+ // Mark frame as decoded (we no longer care about row-level progress since the
+ // entire frame has been decoded).
+ current_frame->SetFrameState(kFrameStateDecoded);
+ frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+ std::move(tile_scratch_buffer));
+ return kStatusOk;
+}
+
+// Helper function used by DecodeTilesThreadedFrameParallel. Applies the
+// deblocking filter for tile boundaries for the superblock row at |row4x4|.
+void ApplyDeblockingFilterForTileBoundaries(
+ PostFilter* const post_filter, const std::unique_ptr<Tile>* tile_row_base,
+ const ObuFrameHeader& frame_header, int row4x4, int block_width4x4,
+ int tile_columns, bool decode_entire_tiles_in_worker_threads) {
+ // Apply vertical deblock filtering for the first 64 columns of each tile.
+ for (int tile_column = 0; tile_column < tile_columns; ++tile_column) {
+ const Tile& tile = *tile_row_base[tile_column];
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeVertical, row4x4, tile.column4x4_start(),
+ tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+ }
+ if (decode_entire_tiles_in_worker_threads &&
+ row4x4 == tile_row_base[0]->row4x4_start()) {
+ // This is the first superblock row of a tile row. In this case, apply
+ // horizontal deblock filtering for the entire superblock row.
+ post_filter->ApplyDeblockFilter(kLoopFilterTypeHorizontal, row4x4, 0,
+ frame_header.columns4x4, block_width4x4);
+ } else {
+ // Apply horizontal deblock filtering for the first 64 columns of the
+ // first tile.
+ const Tile& first_tile = *tile_row_base[0];
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4, first_tile.column4x4_start(),
+ first_tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+ // Apply horizontal deblock filtering for the last 64 columns of the
+ // previous tile and the first 64 columns of the current tile.
+ for (int tile_column = 1; tile_column < tile_columns; ++tile_column) {
+ const Tile& tile = *tile_row_base[tile_column];
+ // If the previous tile has more than 64 columns, then include those
+ // for the horizontal deblock.
+ const Tile& previous_tile = *tile_row_base[tile_column - 1];
+ const int column4x4_start =
+ tile.column4x4_start() -
+ ((tile.column4x4_start() - kNum4x4InLoopFilterUnit !=
+ previous_tile.column4x4_start())
+ ? kNum4x4InLoopFilterUnit
+ : 0);
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4, column4x4_start,
+ tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+ }
+ // Apply horizontal deblock filtering for the last 64 columns of the
+ // last tile.
+ const Tile& last_tile = *tile_row_base[tile_columns - 1];
+ // Identify the last column4x4 value and do horizontal filtering for
+ // that column4x4. The value of last column4x4 is the nearest multiple
+ // of 16 that is before tile.column4x4_end().
+ const int column4x4_start = (last_tile.column4x4_end() - 1) & ~15;
+ // If column4x4_start is the same as tile.column4x4_start() then it
+ // means that the last tile has <= 64 columns. So there is nothing left
+ // to deblock (since it was already deblocked in the loop above).
+ if (column4x4_start != last_tile.column4x4_start()) {
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4, column4x4_start,
+ last_tile.column4x4_end(), block_width4x4);
+ }
+ }
+}
+
+// Helper function used by DecodeTilesThreadedFrameParallel. Decodes the
+// superblock row starting at |row4x4| for tile at index |tile_index| in the
+// list of tiles |tiles|. If the decoding is successful, then it does the
+// following:
+// * Schedule the next superblock row in the current tile column for decoding
+// (the next superblock row may be in a different tile than the current
+// one).
+// * If an entire superblock row of the frame has been decoded, it notifies
+// the waiters (if there are any).
+void DecodeSuperBlockRowInTile(
+ const Vector<std::unique_ptr<Tile>>& tiles, size_t tile_index, int row4x4,
+ const int superblock_size4x4, const int tile_columns,
+ const int superblock_rows, FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter, BlockingCounter* const pending_jobs) {
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+ if (scratch_buffer == nullptr) {
+ SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+ return;
+ }
+ Tile& tile = *tiles[tile_index];
+ const bool ok = tile.ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+ row4x4, scratch_buffer.get());
+ frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+ std::move(scratch_buffer));
+ if (!ok) {
+ SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+ return;
+ }
+ if (post_filter->DoDeblock()) {
+ // Apply vertical deblock filtering for all the columns in this tile except
+ // for the first 64 columns.
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeVertical, row4x4,
+ tile.column4x4_start() + kNum4x4InLoopFilterUnit, tile.column4x4_end(),
+ superblock_size4x4);
+ // Apply horizontal deblock filtering for all the columns in this tile
+ // except for the first and the last 64 columns.
+ // Note about the last tile of each row: For the last tile, column4x4_end
+ // may not be a multiple of 16. In that case it is still okay to simply
+ // subtract 16 since ApplyDeblockFilter() will only do the filters in
+ // increments of 64 columns (or 32 columns for chroma with subsampling).
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4,
+ tile.column4x4_start() + kNum4x4InLoopFilterUnit,
+ tile.column4x4_end() - kNum4x4InLoopFilterUnit, superblock_size4x4);
+ }
+ const int superblock_size4x4_log2 = FloorLog2(superblock_size4x4);
+ const int index = row4x4 >> superblock_size4x4_log2;
+ int* const superblock_row_progress =
+ frame_scratch_buffer->superblock_row_progress.get();
+ std::condition_variable* const superblock_row_progress_condvar =
+ frame_scratch_buffer->superblock_row_progress_condvar.get();
+ bool notify;
+ {
+ std::lock_guard<std::mutex> lock(
+ frame_scratch_buffer->superblock_row_mutex);
+ notify = ++superblock_row_progress[index] == tile_columns;
+ }
+ if (notify) {
+ // We are done decoding this superblock row. Notify the post filtering
+ // thread.
+ superblock_row_progress_condvar[index].notify_one();
+ }
+ // Schedule the next superblock row (if one exists).
+ ThreadPool& thread_pool =
+ *frame_scratch_buffer->threading_strategy.thread_pool();
+ const int next_row4x4 = row4x4 + superblock_size4x4;
+ if (!tile.IsRow4x4Inside(next_row4x4)) {
+ tile_index += tile_columns;
+ }
+ if (tile_index >= tiles.size()) return;
+ pending_jobs->IncrementBy(1);
+ thread_pool.Schedule([&tiles, tile_index, next_row4x4, superblock_size4x4,
+ tile_columns, superblock_rows, frame_scratch_buffer,
+ post_filter, pending_jobs]() {
+ DecodeSuperBlockRowInTile(tiles, tile_index, next_row4x4,
+ superblock_size4x4, tile_columns, superblock_rows,
+ frame_scratch_buffer, post_filter, pending_jobs);
+ pending_jobs->Decrement();
+ });
+}
+
+StatusCode DecodeTilesThreadedFrameParallel(
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const Vector<std::unique_ptr<Tile>>& tiles,
+ const SymbolDecoderContext& saved_symbol_decoder_context,
+ const SegmentationMap* const prev_segment_ids,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter, RefCountedBuffer* const current_frame) {
+ // Parse the frame.
+ ThreadPool& thread_pool =
+ *frame_scratch_buffer->threading_strategy.thread_pool();
+ std::atomic<int> tile_counter(0);
+ const int tile_count = static_cast<int>(tiles.size());
+ const int num_workers = thread_pool.num_threads();
+ BlockingCounterWithStatus parse_workers(num_workers);
+ // Submit tile parsing jobs to the thread pool.
+ for (int i = 0; i < num_workers; ++i) {
+ thread_pool.Schedule([&tiles, tile_count, &tile_counter, &parse_workers]() {
+ bool failed = false;
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (!failed) {
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->Parse()) {
+ LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number());
+ failed = true;
+ }
+ }
+ }
+ parse_workers.Decrement(!failed);
+ });
+ }
+
+ // Have the current thread participate in parsing.
+ bool failed = false;
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (!failed) {
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->Parse()) {
+ LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number());
+ failed = true;
+ }
+ }
+ }
+
+ // Wait until all the parse workers are done. This ensures that all the tiles
+ // have been parsed.
+ if (!parse_workers.Wait() || failed) {
+ return kStatusUnknownError;
+ }
+ if (frame_header.enable_frame_end_update_cdf) {
+ frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
+ }
+ current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+ SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
+ current_frame->SetFrameState(kFrameStateParsed);
+
+ // Decode the frame.
+ const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+ const int block_width4x4_log2 =
+ sequence_header.use_128x128_superblock ? 5 : 4;
+ const int superblock_rows =
+ (frame_header.rows4x4 + block_width4x4 - 1) >> block_width4x4_log2;
+ if (!frame_scratch_buffer->superblock_row_progress.Resize(superblock_rows) ||
+ !frame_scratch_buffer->superblock_row_progress_condvar.Resize(
+ superblock_rows)) {
+ return kStatusOutOfMemory;
+ }
+ int* const superblock_row_progress =
+ frame_scratch_buffer->superblock_row_progress.get();
+ memset(superblock_row_progress, 0,
+ superblock_rows * sizeof(superblock_row_progress[0]));
+ frame_scratch_buffer->tile_decoding_failed = false;
+ const int tile_columns = frame_header.tile_info.tile_columns;
+ const bool decode_entire_tiles_in_worker_threads =
+ num_workers >= tile_columns;
+ BlockingCounter pending_jobs(
+ decode_entire_tiles_in_worker_threads ? num_workers : tile_columns);
+ if (decode_entire_tiles_in_worker_threads) {
+ // Submit tile decoding jobs to the thread pool.
+ tile_counter = 0;
+ for (int i = 0; i < num_workers; ++i) {
+ thread_pool.Schedule([&tiles, tile_count, &tile_counter, &pending_jobs,
+ frame_scratch_buffer, superblock_rows]() {
+ bool failed = false;
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (failed) continue;
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->Decode(
+ &frame_scratch_buffer->superblock_row_mutex,
+ frame_scratch_buffer->superblock_row_progress.get(),
+ frame_scratch_buffer->superblock_row_progress_condvar
+ .get())) {
+ LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+ failed = true;
+ SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+ }
+ }
+ pending_jobs.Decrement();
+ });
+ }
+ } else {
+ // Schedule the jobs for first tile row.
+ for (int tile_index = 0; tile_index < tile_columns; ++tile_index) {
+ thread_pool.Schedule([&tiles, tile_index, block_width4x4, tile_columns,
+ superblock_rows, frame_scratch_buffer, post_filter,
+ &pending_jobs]() {
+ DecodeSuperBlockRowInTile(
+ tiles, tile_index, 0, block_width4x4, tile_columns, superblock_rows,
+ frame_scratch_buffer, post_filter, &pending_jobs);
+ pending_jobs.Decrement();
+ });
+ }
+ }
+
+ // Current thread will do the post filters.
+ std::condition_variable* const superblock_row_progress_condvar =
+ frame_scratch_buffer->superblock_row_progress_condvar.get();
+ const std::unique_ptr<Tile>* tile_row_base = &tiles[0];
+ for (int row4x4 = 0, index = 0; row4x4 < frame_header.rows4x4;
+ row4x4 += block_width4x4, ++index) {
+ if (!tile_row_base[0]->IsRow4x4Inside(row4x4)) {
+ tile_row_base += tile_columns;
+ }
+ {
+ std::unique_lock<std::mutex> lock(
+ frame_scratch_buffer->superblock_row_mutex);
+ while (superblock_row_progress[index] != tile_columns &&
+ !frame_scratch_buffer->tile_decoding_failed) {
+ superblock_row_progress_condvar[index].wait(lock);
+ }
+ if (frame_scratch_buffer->tile_decoding_failed) break;
+ }
+ if (post_filter->DoDeblock()) {
+ // Apply deblocking filter for the tile boundaries of this superblock row.
+ // The deblocking filter for the internal blocks will be applied in the
+ // tile worker threads. In this thread, we will only have to apply
+ // deblocking filter for the tile boundaries.
+ ApplyDeblockingFilterForTileBoundaries(
+ post_filter, tile_row_base, frame_header, row4x4, block_width4x4,
+ tile_columns, decode_entire_tiles_in_worker_threads);
+ }
+ // Apply all the post filters other than deblocking.
+ const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow(
+ row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+ /*do_deblock=*/false);
+ if (progress_row >= 0) {
+ current_frame->SetProgress(progress_row);
+ }
+ }
+ // Wait until all the pending jobs are done. This ensures that all the tiles
+ // have been decoded and wrapped up.
+ pending_jobs.Wait();
+ {
+ std::lock_guard<std::mutex> lock(
+ frame_scratch_buffer->superblock_row_mutex);
+ if (frame_scratch_buffer->tile_decoding_failed) {
+ return kStatusUnknownError;
+ }
+ }
+
+ current_frame->SetFrameState(kFrameStateDecoded);
+ return kStatusOk;
+}
+
+} // namespace
+
+// static
+StatusCode DecoderImpl::Create(const DecoderSettings* settings,
+ std::unique_ptr<DecoderImpl>* output) {
+ if (settings->threads <= 0) {
+ LIBGAV1_DLOG(ERROR, "Invalid settings->threads: %d.", settings->threads);
+ return kStatusInvalidArgument;
+ }
+ if (settings->frame_parallel) {
+ if (settings->release_input_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR,
+ "release_input_buffer callback must not be null when "
+ "frame_parallel is true.");
+ return kStatusInvalidArgument;
+ }
+ }
+ std::unique_ptr<DecoderImpl> impl(new (std::nothrow) DecoderImpl(settings));
+ if (impl == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate DecoderImpl.");
+ return kStatusOutOfMemory;
+ }
+ const StatusCode status = impl->Init();
+ if (status != kStatusOk) return status;
+ *output = std::move(impl);
+ return kStatusOk;
+}
+
+DecoderImpl::DecoderImpl(const DecoderSettings* settings)
+ : buffer_pool_(settings->on_frame_buffer_size_changed,
+ settings->get_frame_buffer, settings->release_frame_buffer,
+ settings->callback_private_data),
+ settings_(*settings) {
+ dsp::DspInit();
+}
+
+DecoderImpl::~DecoderImpl() {
+ // Clean up and wait until all the threads have stopped. We just have to pass
+ // in a dummy status that is not kStatusOk or kStatusTryAgain to trigger the
+ // path that clears all the threads and structs.
+ SignalFailure(kStatusUnknownError);
+ // Release any other frame buffer references that we may be holding on to.
+ ReleaseOutputFrame();
+ output_frame_queue_.Clear();
+ for (auto& reference_frame : state_.reference_frame) {
+ reference_frame = nullptr;
+ }
+}
+
+StatusCode DecoderImpl::Init() {
+ if (!GenerateWedgeMask(&wedge_masks_)) {
+ LIBGAV1_DLOG(ERROR, "GenerateWedgeMask() failed.");
+ return kStatusOutOfMemory;
+ }
+ if (!output_frame_queue_.Init(kMaxLayers)) {
+ LIBGAV1_DLOG(ERROR, "output_frame_queue_.Init() failed.");
+ return kStatusOutOfMemory;
+ }
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::InitializeFrameThreadPoolAndTemporalUnitQueue(
+ const uint8_t* data, size_t size) {
+ is_frame_parallel_ = false;
+ if (settings_.frame_parallel) {
+ DecoderState state;
+ std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
+ data, size, settings_.operating_point, &buffer_pool_, &state));
+ if (obu == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+ return kStatusOutOfMemory;
+ }
+ RefCountedBufferPtr current_frame;
+ const StatusCode status = obu->ParseOneFrame(&current_frame);
+ if (status != kStatusOk) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
+ return status;
+ }
+ current_frame = nullptr;
+ // We assume that the first frame that was parsed will contain the frame
+ // header. This assumption is usually true in practice. So we will simply
+ // not use frame parallel mode if this is not the case.
+ if (settings_.threads > 1 &&
+ !InitializeThreadPoolsForFrameParallel(
+ settings_.threads, obu->frame_header().tile_info.tile_count,
+ obu->frame_header().tile_info.tile_columns, &frame_thread_pool_,
+ &frame_scratch_buffer_pool_)) {
+ return kStatusOutOfMemory;
+ }
+ }
+ const int max_allowed_frames =
+ (frame_thread_pool_ != nullptr) ? frame_thread_pool_->num_threads() : 1;
+ assert(max_allowed_frames > 0);
+ if (!temporal_units_.Init(max_allowed_frames)) {
+ LIBGAV1_DLOG(ERROR, "temporal_units_.Init() failed.");
+ return kStatusOutOfMemory;
+ }
+ is_frame_parallel_ = frame_thread_pool_ != nullptr;
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::EnqueueFrame(const uint8_t* data, size_t size,
+ int64_t user_private_data,
+ void* buffer_private_data) {
+ if (data == nullptr || size == 0) return kStatusInvalidArgument;
+ if (HasFailure()) return kStatusUnknownError;
+ if (!seen_first_frame_) {
+ seen_first_frame_ = true;
+ const StatusCode status =
+ InitializeFrameThreadPoolAndTemporalUnitQueue(data, size);
+ if (status != kStatusOk) {
+ return SignalFailure(status);
+ }
+ }
+ if (temporal_units_.Full()) {
+ return kStatusTryAgain;
+ }
+ if (is_frame_parallel_) {
+ return ParseAndSchedule(data, size, user_private_data, buffer_private_data);
+ }
+ TemporalUnit temporal_unit(data, size, user_private_data,
+ buffer_private_data);
+ temporal_units_.Push(std::move(temporal_unit));
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::SignalFailure(StatusCode status) {
+ if (status == kStatusOk || status == kStatusTryAgain) return status;
+ // Set the |failure_status_| first so that any pending jobs in
+ // |frame_thread_pool_| will exit right away when the thread pool is being
+ // released below.
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ failure_status_ = status;
+ }
+ // Make sure all waiting threads exit.
+ buffer_pool_.Abort();
+ frame_thread_pool_ = nullptr;
+ while (!temporal_units_.Empty()) {
+ if (settings_.release_input_buffer != nullptr) {
+ settings_.release_input_buffer(
+ settings_.callback_private_data,
+ temporal_units_.Front().buffer_private_data);
+ }
+ temporal_units_.Pop();
+ }
+ return status;
+}
+
+// DequeueFrame() follows this policy to avoid holding unnecessary frame
+// buffer references in output_frame_: output_frame_ must be null whenever
+// DequeueFrame() does not output a frame.
+StatusCode DecoderImpl::DequeueFrame(const DecoderBuffer** out_ptr) {
+ if (out_ptr == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Invalid argument: out_ptr == nullptr.");
+ return kStatusInvalidArgument;
+ }
+ // We assume a call to DequeueFrame() indicates that the caller is no longer
+ // using the previous output frame, so we can release it.
+ ReleaseOutputFrame();
+ if (temporal_units_.Empty()) {
+ // No input frames to decode.
+ *out_ptr = nullptr;
+ return kStatusNothingToDequeue;
+ }
+ TemporalUnit& temporal_unit = temporal_units_.Front();
+ if (!is_frame_parallel_) {
+ // If |output_frame_queue_| is not empty, then return the first frame from
+ // that queue.
+ if (!output_frame_queue_.Empty()) {
+ RefCountedBufferPtr frame = std::move(output_frame_queue_.Front());
+ output_frame_queue_.Pop();
+ buffer_.user_private_data = temporal_unit.user_private_data;
+ if (output_frame_queue_.Empty()) {
+ temporal_units_.Pop();
+ }
+ const StatusCode status = CopyFrameToOutputBuffer(frame);
+ if (status != kStatusOk) {
+ return status;
+ }
+ *out_ptr = &buffer_;
+ return kStatusOk;
+ }
+ // Decode the next available temporal unit and return.
+ const StatusCode status = DecodeTemporalUnit(temporal_unit, out_ptr);
+ if (status != kStatusOk) {
+ // In case of failure, discard all the output frames that we may be
+ // holding references to.
+ output_frame_queue_.Clear();
+ }
+ if (settings_.release_input_buffer != nullptr) {
+ settings_.release_input_buffer(settings_.callback_private_data,
+ temporal_unit.buffer_private_data);
+ }
+ if (output_frame_queue_.Empty()) {
+ temporal_units_.Pop();
+ }
+ return status;
+ }
+ {
+ std::unique_lock<std::mutex> lock(mutex_);
+ if (settings_.blocking_dequeue) {
+ while (!temporal_unit.decoded && failure_status_ == kStatusOk) {
+ decoded_condvar_.wait(lock);
+ }
+ } else {
+ if (!temporal_unit.decoded && failure_status_ == kStatusOk) {
+ return kStatusTryAgain;
+ }
+ }
+ if (failure_status_ != kStatusOk) {
+ const StatusCode failure_status = failure_status_;
+ lock.unlock();
+ return SignalFailure(failure_status);
+ }
+ }
+ if (settings_.release_input_buffer != nullptr &&
+ !temporal_unit.released_input_buffer) {
+ temporal_unit.released_input_buffer = true;
+ settings_.release_input_buffer(settings_.callback_private_data,
+ temporal_unit.buffer_private_data);
+ }
+ if (temporal_unit.status != kStatusOk) {
+ temporal_units_.Pop();
+ return SignalFailure(temporal_unit.status);
+ }
+ if (!temporal_unit.has_displayable_frame) {
+ *out_ptr = nullptr;
+ temporal_units_.Pop();
+ return kStatusOk;
+ }
+ assert(temporal_unit.output_layer_count > 0);
+ StatusCode status = CopyFrameToOutputBuffer(
+ temporal_unit.output_layers[temporal_unit.output_layer_count - 1].frame);
+ temporal_unit.output_layers[temporal_unit.output_layer_count - 1].frame =
+ nullptr;
+ if (status != kStatusOk) {
+ temporal_units_.Pop();
+ return SignalFailure(status);
+ }
+ buffer_.user_private_data = temporal_unit.user_private_data;
+ *out_ptr = &buffer_;
+ if (--temporal_unit.output_layer_count == 0) {
+ temporal_units_.Pop();
+ }
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::ParseAndSchedule(const uint8_t* data, size_t size,
+ int64_t user_private_data,
+ void* buffer_private_data) {
+ TemporalUnit temporal_unit(data, size, user_private_data,
+ buffer_private_data);
+ std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
+ temporal_unit.data, temporal_unit.size, settings_.operating_point,
+ &buffer_pool_, &state_));
+ if (obu == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+ return kStatusOutOfMemory;
+ }
+ if (has_sequence_header_) {
+ obu->set_sequence_header(sequence_header_);
+ }
+ StatusCode status;
+ int position_in_temporal_unit = 0;
+ while (obu->HasData()) {
+ RefCountedBufferPtr current_frame;
+ status = obu->ParseOneFrame(&current_frame);
+ if (status != kStatusOk) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
+ return status;
+ }
+ if (!MaybeInitializeQuantizerMatrix(obu->frame_header())) {
+ LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed.");
+ return kStatusOutOfMemory;
+ }
+ if (IsNewSequenceHeader(*obu)) {
+ const ObuSequenceHeader& sequence_header = obu->sequence_header();
+ const Libgav1ImageFormat image_format =
+ ComposeImageFormat(sequence_header.color_config.is_monochrome,
+ sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y);
+ const int max_bottom_border = GetBottomBorderPixels(
+ /*do_cdef=*/true, /*do_restoration=*/true,
+ /*do_superres=*/true, sequence_header.color_config.subsampling_y);
+ // TODO(vigneshv): This may not be the right place to call this callback
+ // for the frame parallel case. Investigate and fix it.
+ if (!buffer_pool_.OnFrameBufferSizeChanged(
+ sequence_header.color_config.bitdepth, image_format,
+ sequence_header.max_frame_width, sequence_header.max_frame_height,
+ kBorderPixels, kBorderPixels, kBorderPixels, max_bottom_border)) {
+ LIBGAV1_DLOG(ERROR, "buffer_pool_.OnFrameBufferSizeChanged failed.");
+ return kStatusUnknownError;
+ }
+ }
+ // This can happen when there are multiple spatial/temporal layers and if
+ // all the layers are outside the current operating point.
+ if (current_frame == nullptr) {
+ continue;
+ }
+ // Note that we cannot set EncodedFrame.temporal_unit here. It will be set
+ // in the code below after |temporal_unit| is std::move'd into the
+ // |temporal_units_| queue.
+ if (!temporal_unit.frames.emplace_back(obu.get(), state_, current_frame,
+ position_in_temporal_unit++)) {
+ LIBGAV1_DLOG(ERROR, "temporal_unit.frames.emplace_back failed.");
+ return kStatusOutOfMemory;
+ }
+ state_.UpdateReferenceFrames(current_frame,
+ obu->frame_header().refresh_frame_flags);
+ }
+ // This function cannot fail after this point. So it is okay to move the
+ // |temporal_unit| into |temporal_units_| queue.
+ temporal_units_.Push(std::move(temporal_unit));
+ if (temporal_units_.Back().frames.empty()) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ temporal_units_.Back().has_displayable_frame = false;
+ temporal_units_.Back().decoded = true;
+ return kStatusOk;
+ }
+ for (auto& frame : temporal_units_.Back().frames) {
+ EncodedFrame* const encoded_frame = &frame;
+ encoded_frame->temporal_unit = &temporal_units_.Back();
+ frame_thread_pool_->Schedule([this, encoded_frame]() {
+ if (HasFailure()) return;
+ const StatusCode status = DecodeFrame(encoded_frame);
+ encoded_frame->state = {};
+ encoded_frame->frame = nullptr;
+ TemporalUnit& temporal_unit = *encoded_frame->temporal_unit;
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (failure_status_ != kStatusOk) return;
+ // temporal_unit's status defaults to kStatusOk. So we need to set it only
+ // on error. If |failure_status_| is not kStatusOk at this point, it means
+ // that there has already been a failure. So we don't care about this
+ // subsequent failure. We will simply return the error code of the first
+ // failure.
+ if (status != kStatusOk) {
+ temporal_unit.status = status;
+ if (failure_status_ == kStatusOk) {
+ failure_status_ = status;
+ }
+ }
+ temporal_unit.decoded =
+ ++temporal_unit.decoded_count == temporal_unit.frames.size();
+ if (temporal_unit.decoded && settings_.output_all_layers &&
+ temporal_unit.output_layer_count > 1) {
+ std::sort(
+ temporal_unit.output_layers,
+ temporal_unit.output_layers + temporal_unit.output_layer_count);
+ }
+ if (temporal_unit.decoded || failure_status_ != kStatusOk) {
+ decoded_condvar_.notify_one();
+ }
+ });
+ }
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::DecodeFrame(EncodedFrame* const encoded_frame) {
+ const ObuSequenceHeader& sequence_header = encoded_frame->sequence_header;
+ const ObuFrameHeader& frame_header = encoded_frame->frame_header;
+ RefCountedBufferPtr current_frame = std::move(encoded_frame->frame);
+
+ std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+ frame_scratch_buffer_pool_.Get();
+ if (frame_scratch_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer.");
+ return kStatusOutOfMemory;
+ }
+ // |frame_scratch_buffer| will be released when this local variable goes out
+ // of scope, i.e., on any return path in this function.
+ FrameScratchBufferReleaser frame_scratch_buffer_releaser(
+ &frame_scratch_buffer_pool_, &frame_scratch_buffer);
+
+ StatusCode status;
+ if (!frame_header.show_existing_frame) {
+ if (encoded_frame->tile_buffers.empty()) {
+ // This means that the last call to ParseOneFrame() did not actually
+ // have any tile groups. This could happen in rare cases (for example,
+ // if there is a Metadata OBU after the TileGroup OBU). We currently do
+ // not have a reason to handle those cases, so we simply continue.
+ return kStatusOk;
+ }
+ status = DecodeTiles(sequence_header, frame_header,
+ encoded_frame->tile_buffers, encoded_frame->state,
+ frame_scratch_buffer.get(), current_frame.get());
+ if (status != kStatusOk) {
+ return status;
+ }
+ } else {
+ if (!current_frame->WaitUntilDecoded()) {
+ return kStatusUnknownError;
+ }
+ }
+ if (!frame_header.show_frame && !frame_header.show_existing_frame) {
+ // This frame is not displayable. Not an error.
+ return kStatusOk;
+ }
+ RefCountedBufferPtr film_grain_frame;
+ status = ApplyFilmGrain(
+ sequence_header, frame_header, current_frame, &film_grain_frame,
+ frame_scratch_buffer->threading_strategy.thread_pool());
+ if (status != kStatusOk) {
+ return status;
+ }
+
+ TemporalUnit& temporal_unit = *encoded_frame->temporal_unit;
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (temporal_unit.has_displayable_frame && !settings_.output_all_layers) {
+ assert(temporal_unit.output_frame_position >= 0);
+ // A displayable frame was already found in this temporal unit. This can
+ // happen if there are multiple spatial/temporal layers. Since
+ // |settings_.output_all_layers| is false, we will output only the last
+ // displayable frame.
+ if (temporal_unit.output_frame_position >
+ encoded_frame->position_in_temporal_unit) {
+ return kStatusOk;
+ }
+ // Replace any output frame that we may have seen before with the current
+ // frame.
+ assert(temporal_unit.output_layer_count == 1);
+ --temporal_unit.output_layer_count;
+ }
+ temporal_unit.has_displayable_frame = true;
+ temporal_unit.output_layers[temporal_unit.output_layer_count].frame =
+ std::move(film_grain_frame);
+ temporal_unit.output_layers[temporal_unit.output_layer_count]
+ .position_in_temporal_unit = encoded_frame->position_in_temporal_unit;
+ ++temporal_unit.output_layer_count;
+ temporal_unit.output_frame_position =
+ encoded_frame->position_in_temporal_unit;
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::DecodeTemporalUnit(const TemporalUnit& temporal_unit,
+ const DecoderBuffer** out_ptr) {
+ std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
+ temporal_unit.data, temporal_unit.size, settings_.operating_point,
+ &buffer_pool_, &state_));
+ if (obu == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+ return kStatusOutOfMemory;
+ }
+ if (has_sequence_header_) {
+ obu->set_sequence_header(sequence_header_);
+ }
+ StatusCode status;
+ std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+ frame_scratch_buffer_pool_.Get();
+ if (frame_scratch_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer.");
+ return kStatusOutOfMemory;
+ }
+ // |frame_scratch_buffer| will be released when this local variable goes out
+ // of scope, i.e., on any return path in this function.
+ FrameScratchBufferReleaser frame_scratch_buffer_releaser(
+ &frame_scratch_buffer_pool_, &frame_scratch_buffer);
+
+ while (obu->HasData()) {
+ RefCountedBufferPtr current_frame;
+ status = obu->ParseOneFrame(&current_frame);
+ if (status != kStatusOk) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
+ return status;
+ }
+ if (!MaybeInitializeQuantizerMatrix(obu->frame_header())) {
+ LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed.");
+ return kStatusOutOfMemory;
+ }
+ if (IsNewSequenceHeader(*obu)) {
+ const ObuSequenceHeader& sequence_header = obu->sequence_header();
+ const Libgav1ImageFormat image_format =
+ ComposeImageFormat(sequence_header.color_config.is_monochrome,
+ sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y);
+ const int max_bottom_border = GetBottomBorderPixels(
+ /*do_cdef=*/true, /*do_restoration=*/true,
+ /*do_superres=*/true, sequence_header.color_config.subsampling_y);
+ if (!buffer_pool_.OnFrameBufferSizeChanged(
+ sequence_header.color_config.bitdepth, image_format,
+ sequence_header.max_frame_width, sequence_header.max_frame_height,
+ kBorderPixels, kBorderPixels, kBorderPixels, max_bottom_border)) {
+ LIBGAV1_DLOG(ERROR, "buffer_pool_.OnFrameBufferSizeChanged failed.");
+ return kStatusUnknownError;
+ }
+ }
+ if (!obu->frame_header().show_existing_frame) {
+ if (obu->tile_buffers().empty()) {
+ // This means that the last call to ParseOneFrame() did not actually
+ // have any tile groups. This could happen in rare cases (for example,
+ // if there is a Metadata OBU after the TileGroup OBU). We currently do
+ // not have a reason to handle those cases, so we simply continue.
+ continue;
+ }
+ status = DecodeTiles(obu->sequence_header(), obu->frame_header(),
+ obu->tile_buffers(), state_,
+ frame_scratch_buffer.get(), current_frame.get());
+ if (status != kStatusOk) {
+ return status;
+ }
+ }
+ state_.UpdateReferenceFrames(current_frame,
+ obu->frame_header().refresh_frame_flags);
+ if (obu->frame_header().show_frame ||
+ obu->frame_header().show_existing_frame) {
+ if (!output_frame_queue_.Empty() && !settings_.output_all_layers) {
+ // There is more than one displayable frame in the current operating
+ // point and |settings_.output_all_layers| is false. In this case, we
+ // simply return the last displayable frame as the output frame and
+ // ignore the rest.
+ assert(output_frame_queue_.Size() == 1);
+ output_frame_queue_.Pop();
+ }
+ RefCountedBufferPtr film_grain_frame;
+ status = ApplyFilmGrain(
+ obu->sequence_header(), obu->frame_header(), current_frame,
+ &film_grain_frame,
+ frame_scratch_buffer->threading_strategy.film_grain_thread_pool());
+ if (status != kStatusOk) return status;
+ output_frame_queue_.Push(std::move(film_grain_frame));
+ }
+ }
+ if (output_frame_queue_.Empty()) {
+ // No displayable frame in the temporal unit. Not an error.
+ *out_ptr = nullptr;
+ return kStatusOk;
+ }
+ status = CopyFrameToOutputBuffer(output_frame_queue_.Front());
+ output_frame_queue_.Pop();
+ if (status != kStatusOk) {
+ return status;
+ }
+ buffer_.user_private_data = temporal_unit.user_private_data;
+ *out_ptr = &buffer_;
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::CopyFrameToOutputBuffer(
+ const RefCountedBufferPtr& frame) {
+ YuvBuffer* yuv_buffer = frame->buffer();
+
+ buffer_.chroma_sample_position = frame->chroma_sample_position();
+
+ if (yuv_buffer->is_monochrome()) {
+ buffer_.image_format = kImageFormatMonochrome400;
+ } else {
+ if (yuv_buffer->subsampling_x() == 0 && yuv_buffer->subsampling_y() == 0) {
+ buffer_.image_format = kImageFormatYuv444;
+ } else if (yuv_buffer->subsampling_x() == 1 &&
+ yuv_buffer->subsampling_y() == 0) {
+ buffer_.image_format = kImageFormatYuv422;
+ } else if (yuv_buffer->subsampling_x() == 1 &&
+ yuv_buffer->subsampling_y() == 1) {
+ buffer_.image_format = kImageFormatYuv420;
+ } else {
+ LIBGAV1_DLOG(ERROR,
+ "Invalid chroma subsampling values: cannot determine buffer "
+ "image format.");
+ return kStatusInvalidArgument;
+ }
+ }
+ buffer_.color_range = sequence_header_.color_config.color_range;
+ buffer_.color_primary = sequence_header_.color_config.color_primary;
+ buffer_.transfer_characteristics =
+ sequence_header_.color_config.transfer_characteristics;
+ buffer_.matrix_coefficients =
+ sequence_header_.color_config.matrix_coefficients;
+
+ buffer_.bitdepth = yuv_buffer->bitdepth();
+ const int num_planes =
+ yuv_buffer->is_monochrome() ? kMaxPlanesMonochrome : kMaxPlanes;
+ int plane = kPlaneY;
+ for (; plane < num_planes; ++plane) {
+ buffer_.stride[plane] = yuv_buffer->stride(plane);
+ buffer_.plane[plane] = yuv_buffer->data(plane);
+ buffer_.displayed_width[plane] = yuv_buffer->width(plane);
+ buffer_.displayed_height[plane] = yuv_buffer->height(plane);
+ }
+ for (; plane < kMaxPlanes; ++plane) {
+ buffer_.stride[plane] = 0;
+ buffer_.plane[plane] = nullptr;
+ buffer_.displayed_width[plane] = 0;
+ buffer_.displayed_height[plane] = 0;
+ }
+ buffer_.spatial_id = frame->spatial_id();
+ buffer_.temporal_id = frame->temporal_id();
+ buffer_.buffer_private_data = frame->buffer_private_data();
+ output_frame_ = frame;
+ return kStatusOk;
+}
+
+void DecoderImpl::ReleaseOutputFrame() {
+ for (auto& plane : buffer_.plane) {
+ plane = nullptr;
+ }
+ output_frame_ = nullptr;
+}
+
+StatusCode DecoderImpl::DecodeTiles(
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header, const Vector<TileBuffer>& tile_buffers,
+ const DecoderState& state, FrameScratchBuffer* const frame_scratch_buffer,
+ RefCountedBuffer* const current_frame) {
+ frame_scratch_buffer->tile_scratch_buffer_pool.Reset(
+ sequence_header.color_config.bitdepth);
+ if (!frame_scratch_buffer->loop_restoration_info.Reset(
+ &frame_header.loop_restoration, frame_header.upscaled_width,
+ frame_header.height, sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y,
+ sequence_header.color_config.is_monochrome)) {
+ LIBGAV1_DLOG(ERROR,
+ "Failed to allocate memory for loop restoration info units.");
+ return kStatusOutOfMemory;
+ }
+ ThreadingStrategy& threading_strategy =
+ frame_scratch_buffer->threading_strategy;
+ if (!is_frame_parallel_ &&
+ !threading_strategy.Reset(frame_header, settings_.threads)) {
+ return kStatusOutOfMemory;
+ }
+ const bool do_cdef =
+ PostFilter::DoCdef(frame_header, settings_.post_filter_mask);
+ const int num_planes = sequence_header.color_config.is_monochrome
+ ? kMaxPlanesMonochrome
+ : kMaxPlanes;
+ const bool do_restoration = PostFilter::DoRestoration(
+ frame_header.loop_restoration, settings_.post_filter_mask, num_planes);
+ const bool do_superres =
+ PostFilter::DoSuperRes(frame_header, settings_.post_filter_mask);
+ // Use kBorderPixels for the left, right, and top borders. Only the bottom
+ // border may need to be bigger. Cdef border is needed only if we apply Cdef
+ // without multithreading.
+ const int bottom_border = GetBottomBorderPixels(
+ do_cdef && threading_strategy.post_filter_thread_pool() == nullptr,
+ do_restoration, do_superres, sequence_header.color_config.subsampling_y);
+ current_frame->set_chroma_sample_position(
+ sequence_header.color_config.chroma_sample_position);
+ if (!current_frame->Realloc(sequence_header.color_config.bitdepth,
+ sequence_header.color_config.is_monochrome,
+ frame_header.upscaled_width, frame_header.height,
+ sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y,
+ /*left_border=*/kBorderPixels,
+ /*right_border=*/kBorderPixels,
+ /*top_border=*/kBorderPixels, bottom_border)) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate memory for the decoder buffer.");
+ return kStatusOutOfMemory;
+ }
+ if (sequence_header.enable_cdef) {
+ if (!frame_scratch_buffer->cdef_index.Reset(
+ DivideBy16(frame_header.rows4x4 + kMaxBlockHeight4x4),
+ DivideBy16(frame_header.columns4x4 + kMaxBlockWidth4x4),
+ /*zero_initialize=*/false)) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate memory for cdef index.");
+ return kStatusOutOfMemory;
+ }
+ }
+ if (!frame_scratch_buffer->inter_transform_sizes.Reset(
+ frame_header.rows4x4 + kMaxBlockHeight4x4,
+ frame_header.columns4x4 + kMaxBlockWidth4x4,
+ /*zero_initialize=*/false)) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate memory for inter_transform_sizes.");
+ return kStatusOutOfMemory;
+ }
+ if (frame_header.use_ref_frame_mvs) {
+ if (!frame_scratch_buffer->motion_field.mv.Reset(
+ DivideBy2(frame_header.rows4x4), DivideBy2(frame_header.columns4x4),
+ /*zero_initialize=*/false) ||
+ !frame_scratch_buffer->motion_field.reference_offset.Reset(
+ DivideBy2(frame_header.rows4x4), DivideBy2(frame_header.columns4x4),
+ /*zero_initialize=*/false)) {
+ LIBGAV1_DLOG(ERROR,
+ "Failed to allocate memory for temporal motion vectors.");
+ return kStatusOutOfMemory;
+ }
+
+    // For each motion vector, only mv[0] needs to be initialized to
+    // kInvalidMvValue; mv[1] does not need to be initialized and can be set
+    // to an arbitrary value. For simplicity, mv[1] is set to 0.
+    // The following initialization of contiguous memory is very fast. It is
+    // not recommended to make the initialization multi-threaded unless the
+    // memory initialized by each thread is itself contiguous.
+ MotionVector invalid_mv;
+ invalid_mv.mv[0] = kInvalidMvValue;
+ invalid_mv.mv[1] = 0;
+ MotionVector* const motion_field_mv =
+ &frame_scratch_buffer->motion_field.mv[0][0];
+ std::fill(motion_field_mv,
+ motion_field_mv + frame_scratch_buffer->motion_field.mv.size(),
+ invalid_mv);
+ }
+
+ // The addition of kMaxBlockHeight4x4 and kMaxBlockWidth4x4 is necessary so
+ // that the block parameters cache can be filled in for the last row/column
+ // without having to check for boundary conditions.
+ if (!frame_scratch_buffer->block_parameters_holder.Reset(
+ frame_header.rows4x4 + kMaxBlockHeight4x4,
+ frame_header.columns4x4 + kMaxBlockWidth4x4,
+ sequence_header.use_128x128_superblock)) {
+ return kStatusOutOfMemory;
+ }
+ const dsp::Dsp* const dsp =
+ dsp::GetDspTable(sequence_header.color_config.bitdepth);
+ if (dsp == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to get the dsp table for bitdepth %d.",
+ sequence_header.color_config.bitdepth);
+ return kStatusInternalError;
+ }
+
+ const int tile_count = frame_header.tile_info.tile_count;
+ assert(tile_count >= 1);
+ Vector<std::unique_ptr<Tile>> tiles;
+ if (!tiles.reserve(tile_count)) {
+ LIBGAV1_DLOG(ERROR, "tiles.reserve(%d) failed.\n", tile_count);
+ return kStatusOutOfMemory;
+ }
+
+ if (threading_strategy.row_thread_pool(0) != nullptr || is_frame_parallel_) {
+ if (frame_scratch_buffer->residual_buffer_pool == nullptr) {
+ frame_scratch_buffer->residual_buffer_pool.reset(
+ new (std::nothrow) ResidualBufferPool(
+ sequence_header.use_128x128_superblock,
+ sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y,
+ sequence_header.color_config.bitdepth == 8 ? sizeof(int16_t)
+ : sizeof(int32_t)));
+ if (frame_scratch_buffer->residual_buffer_pool == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate residual buffer.\n");
+ return kStatusOutOfMemory;
+ }
+ } else {
+ frame_scratch_buffer->residual_buffer_pool->Reset(
+ sequence_header.use_128x128_superblock,
+ sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y,
+ sequence_header.color_config.bitdepth == 8 ? sizeof(int16_t)
+ : sizeof(int32_t));
+ }
+ }
+
+ if (threading_strategy.post_filter_thread_pool() != nullptr && do_cdef) {
+ // We need to store 4 rows per 64x64 unit.
+ const int num_units =
+ MultiplyBy4(RightShiftWithCeiling(frame_header.rows4x4, 4));
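+    // Illustrative arithmetic only: for a 1920x1080 frame, rows4x4 is 270, so
+    // num_units is 4 * Ceil(270 / 16) = 4 * 17 = 68 rows.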
+    // subsampling_y is set to zero irrespective of the actual frame's
+    // subsampling since we need to store exactly |num_units| rows of the CDEF
+    // border pixels.
+ if (!frame_scratch_buffer->cdef_border.Realloc(
+ sequence_header.color_config.bitdepth,
+ sequence_header.color_config.is_monochrome,
+ MultiplyBy4(frame_header.columns4x4), num_units,
+ sequence_header.color_config.subsampling_x,
+ /*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels,
+ kBorderPixels, nullptr, nullptr, nullptr)) {
+ return kStatusOutOfMemory;
+ }
+ }
+
+ if (do_restoration &&
+ (do_cdef || threading_strategy.post_filter_thread_pool() != nullptr)) {
+ // We need to store 4 rows per 64x64 unit.
+ const int num_units =
+ MultiplyBy4(RightShiftWithCeiling(frame_header.rows4x4, 4));
+ // subsampling_y is set to zero irrespective of the actual frame's
+ // subsampling since we need to store exactly |num_units| rows of the loop
+ // restoration border pixels.
+ if (!frame_scratch_buffer->loop_restoration_border.Realloc(
+ sequence_header.color_config.bitdepth,
+ sequence_header.color_config.is_monochrome,
+ frame_header.upscaled_width, num_units,
+ sequence_header.color_config.subsampling_x,
+ /*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels,
+ kBorderPixels, nullptr, nullptr, nullptr)) {
+ return kStatusOutOfMemory;
+ }
+ }
+
+ if (do_superres) {
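+    // Rough sizing sketch (assumes kSuperResFilterTaps == 8): for an 8-bit
+    // frame with upscaled_width == 1920, the Y plane Resize() below requests
+    // 8 * Align(1920, 16) * 1 = 15360 bytes of coefficient storage.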
+ const int pixel_size = sequence_header.color_config.bitdepth == 8
+ ? sizeof(uint8_t)
+ : sizeof(uint16_t);
+ if (!frame_scratch_buffer->superres_coefficients[kPlaneTypeY].Resize(
+ kSuperResFilterTaps * Align(frame_header.upscaled_width, 16) *
+ pixel_size)) {
+ LIBGAV1_DLOG(ERROR,
+ "Failed to Resize superres_coefficients[kPlaneTypeY].");
+ return kStatusOutOfMemory;
+ }
+ if (!sequence_header.color_config.is_monochrome &&
+ sequence_header.color_config.subsampling_x != 0 &&
+ !frame_scratch_buffer->superres_coefficients[kPlaneTypeUV].Resize(
+ kSuperResFilterTaps *
+ Align(SubsampledValue(frame_header.upscaled_width, 1), 16) *
+ pixel_size)) {
+ LIBGAV1_DLOG(ERROR,
+ "Failed to Resize superres_coefficients[kPlaneTypeUV].");
+ return kStatusOutOfMemory;
+ }
+ }
+
+ if (do_superres && threading_strategy.post_filter_thread_pool() != nullptr) {
+ const int num_threads =
+ threading_strategy.post_filter_thread_pool()->num_threads() + 1;
+ // subsampling_y is set to zero irrespective of the actual frame's
+ // subsampling since we need to store exactly |num_threads| rows of the
+ // down-scaled pixels.
+ // Left and right borders are for line extension. They are doubled for the Y
+ // plane to make sure the U and V planes have enough space after possible
+ // subsampling.
+ if (!frame_scratch_buffer->superres_line_buffer.Realloc(
+ sequence_header.color_config.bitdepth,
+ sequence_header.color_config.is_monochrome,
+ MultiplyBy4(frame_header.columns4x4), num_threads,
+ sequence_header.color_config.subsampling_x,
+ /*subsampling_y=*/0, 2 * kSuperResHorizontalBorder,
+ 2 * (kSuperResHorizontalBorder + kSuperResHorizontalPadding), 0, 0,
+ nullptr, nullptr, nullptr)) {
+ LIBGAV1_DLOG(ERROR, "Failed to resize superres line buffer.\n");
+ return kStatusOutOfMemory;
+ }
+ }
+
+ PostFilter post_filter(frame_header, sequence_header, frame_scratch_buffer,
+ current_frame->buffer(), dsp,
+ settings_.post_filter_mask);
+
+ if (is_frame_parallel_ && !IsIntraFrame(frame_header.frame_type)) {
+ // We can parse the current frame if all the reference frames have been
+ // parsed.
+ for (const int index : frame_header.reference_frame_index) {
+ if (!state.reference_frame[index]->WaitUntilParsed()) {
+ return kStatusUnknownError;
+ }
+ }
+ }
+
+ // If prev_segment_ids is a null pointer, it is treated as if it pointed to
+ // a segmentation map containing all 0s.
+ const SegmentationMap* prev_segment_ids = nullptr;
+ if (frame_header.primary_reference_frame == kPrimaryReferenceNone) {
+ frame_scratch_buffer->symbol_decoder_context.Initialize(
+ frame_header.quantizer.base_index);
+ } else {
+ const int index =
+ frame_header
+ .reference_frame_index[frame_header.primary_reference_frame];
+ assert(index != -1);
+ const RefCountedBuffer* prev_frame = state.reference_frame[index].get();
+ frame_scratch_buffer->symbol_decoder_context = prev_frame->FrameContext();
+ if (frame_header.segmentation.enabled &&
+ prev_frame->columns4x4() == frame_header.columns4x4 &&
+ prev_frame->rows4x4() == frame_header.rows4x4) {
+ prev_segment_ids = prev_frame->segmentation_map();
+ }
+ }
+
+ // The Tile class must make use of a separate buffer to store the unfiltered
+ // pixels for the intra prediction of the next superblock row. This is done
+  // only when one of the following conditions is true:
+ // * is_frame_parallel_ is true.
+ // * settings_.threads == 1.
+ // In the non-frame-parallel multi-threaded case, we do not run the post
+ // filters in the decode loop. So this buffer need not be used.
+ const bool use_intra_prediction_buffer =
+ is_frame_parallel_ || settings_.threads == 1;
+ if (use_intra_prediction_buffer) {
+ if (!frame_scratch_buffer->intra_prediction_buffers.Resize(
+ frame_header.tile_info.tile_rows)) {
+ LIBGAV1_DLOG(ERROR, "Failed to Resize intra_prediction_buffers.");
+ return kStatusOutOfMemory;
+ }
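+    // Worked example (illustrative): for an 8-bit 1920-wide 4:2:0 frame,
+    // columns4x4 is 480, so each per-tile-row buffer allocated below is
+    // 4 * 480 = 1920 bytes for the Y plane and 1920 >> 1 = 960 bytes for each
+    // chroma plane.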
+ IntraPredictionBuffer* const intra_prediction_buffers =
+ frame_scratch_buffer->intra_prediction_buffers.get();
+ for (int plane = kPlaneY; plane < num_planes; ++plane) {
+ const int subsampling =
+ (plane == kPlaneY) ? 0 : sequence_header.color_config.subsampling_x;
+ const size_t intra_prediction_buffer_size =
+ ((MultiplyBy4(frame_header.columns4x4) >> subsampling) *
+ (sequence_header.color_config.bitdepth == 8 ? sizeof(uint8_t)
+ : sizeof(uint16_t)));
+ for (int tile_row = 0; tile_row < frame_header.tile_info.tile_rows;
+ ++tile_row) {
+ if (!intra_prediction_buffers[tile_row][plane].Resize(
+ intra_prediction_buffer_size)) {
+ LIBGAV1_DLOG(ERROR,
+ "Failed to allocate intra prediction buffer for tile "
+ "row %d plane %d.\n",
+ tile_row, plane);
+ return kStatusOutOfMemory;
+ }
+ }
+ }
+ }
+
+ SymbolDecoderContext saved_symbol_decoder_context;
+ BlockingCounterWithStatus pending_tiles(tile_count);
+ for (int tile_number = 0; tile_number < tile_count; ++tile_number) {
+ std::unique_ptr<Tile> tile = Tile::Create(
+ tile_number, tile_buffers[tile_number].data,
+ tile_buffers[tile_number].size, sequence_header, frame_header,
+ current_frame, state, frame_scratch_buffer, wedge_masks_,
+ quantizer_matrix_, &saved_symbol_decoder_context, prev_segment_ids,
+ &post_filter, dsp, threading_strategy.row_thread_pool(tile_number),
+ &pending_tiles, is_frame_parallel_, use_intra_prediction_buffer);
+ if (tile == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to create tile.");
+ return kStatusOutOfMemory;
+ }
+ tiles.push_back_unchecked(std::move(tile));
+ }
+ assert(tiles.size() == static_cast<size_t>(tile_count));
+ if (is_frame_parallel_) {
+ if (frame_scratch_buffer->threading_strategy.thread_pool() == nullptr) {
+ return DecodeTilesFrameParallel(
+ sequence_header, frame_header, tiles, saved_symbol_decoder_context,
+ prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame);
+ }
+ return DecodeTilesThreadedFrameParallel(
+ sequence_header, frame_header, tiles, saved_symbol_decoder_context,
+ prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame);
+ }
+ StatusCode status;
+ if (settings_.threads == 1) {
+ status = DecodeTilesNonFrameParallel(sequence_header, frame_header, tiles,
+ frame_scratch_buffer, &post_filter);
+ } else {
+ status = DecodeTilesThreadedNonFrameParallel(tiles, frame_scratch_buffer,
+ &post_filter, &pending_tiles);
+ }
+ if (status != kStatusOk) return status;
+ if (frame_header.enable_frame_end_update_cdf) {
+ frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
+ }
+ current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+ SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::ApplyFilmGrain(
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const RefCountedBufferPtr& displayable_frame,
+ RefCountedBufferPtr* film_grain_frame, ThreadPool* thread_pool) {
+ if (!sequence_header.film_grain_params_present ||
+ !displayable_frame->film_grain_params().apply_grain ||
+ (settings_.post_filter_mask & 0x10) == 0) {
+ *film_grain_frame = displayable_frame;
+ return kStatusOk;
+ }
+ if (!frame_header.show_existing_frame &&
+ frame_header.refresh_frame_flags == 0) {
+ // If show_existing_frame is true, then the current frame is a previously
+ // saved reference frame. If refresh_frame_flags is nonzero, then the
+ // state_.UpdateReferenceFrames() call above has saved the current frame as
+ // a reference frame. Therefore, if both of these conditions are false, then
+ // the current frame is not saved as a reference frame. displayable_frame
+ // should hold the only reference to the current frame.
+ assert(displayable_frame.use_count() == 1);
+ // Add film grain noise in place.
+ *film_grain_frame = displayable_frame;
+ } else {
+ *film_grain_frame = buffer_pool_.GetFreeBuffer();
+ if (*film_grain_frame == nullptr) {
+ LIBGAV1_DLOG(ERROR,
+ "Could not get film_grain_frame from the buffer pool.");
+ return kStatusResourceExhausted;
+ }
+ if (!(*film_grain_frame)
+ ->Realloc(displayable_frame->buffer()->bitdepth(),
+ displayable_frame->buffer()->is_monochrome(),
+ displayable_frame->upscaled_width(),
+ displayable_frame->frame_height(),
+ displayable_frame->buffer()->subsampling_x(),
+ displayable_frame->buffer()->subsampling_y(),
+ kBorderPixelsFilmGrain, kBorderPixelsFilmGrain,
+ kBorderPixelsFilmGrain, kBorderPixelsFilmGrain)) {
+ LIBGAV1_DLOG(ERROR, "film_grain_frame->Realloc() failed.");
+ return kStatusOutOfMemory;
+ }
+ (*film_grain_frame)
+ ->set_chroma_sample_position(
+ displayable_frame->chroma_sample_position());
+ (*film_grain_frame)->set_spatial_id(displayable_frame->spatial_id());
+ (*film_grain_frame)->set_temporal_id(displayable_frame->temporal_id());
+ }
+ const bool color_matrix_is_identity =
+ sequence_header.color_config.matrix_coefficients ==
+ kMatrixCoefficientsIdentity;
+ assert(displayable_frame->buffer()->stride(kPlaneU) ==
+ displayable_frame->buffer()->stride(kPlaneV));
+ const int input_stride_uv = displayable_frame->buffer()->stride(kPlaneU);
+ assert((*film_grain_frame)->buffer()->stride(kPlaneU) ==
+ (*film_grain_frame)->buffer()->stride(kPlaneV));
+ const int output_stride_uv = (*film_grain_frame)->buffer()->stride(kPlaneU);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (displayable_frame->buffer()->bitdepth() > 8) {
+ FilmGrain<10> film_grain(displayable_frame->film_grain_params(),
+ displayable_frame->buffer()->is_monochrome(),
+ color_matrix_is_identity,
+ displayable_frame->buffer()->subsampling_x(),
+ displayable_frame->buffer()->subsampling_y(),
+ displayable_frame->upscaled_width(),
+ displayable_frame->frame_height(), thread_pool);
+ if (!film_grain.AddNoise(
+ displayable_frame->buffer()->data(kPlaneY),
+ displayable_frame->buffer()->stride(kPlaneY),
+ displayable_frame->buffer()->data(kPlaneU),
+ displayable_frame->buffer()->data(kPlaneV), input_stride_uv,
+ (*film_grain_frame)->buffer()->data(kPlaneY),
+ (*film_grain_frame)->buffer()->stride(kPlaneY),
+ (*film_grain_frame)->buffer()->data(kPlaneU),
+ (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) {
+ LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed.");
+ return kStatusOutOfMemory;
+ }
+ return kStatusOk;
+ }
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ FilmGrain<8> film_grain(displayable_frame->film_grain_params(),
+ displayable_frame->buffer()->is_monochrome(),
+ color_matrix_is_identity,
+ displayable_frame->buffer()->subsampling_x(),
+ displayable_frame->buffer()->subsampling_y(),
+ displayable_frame->upscaled_width(),
+ displayable_frame->frame_height(), thread_pool);
+ if (!film_grain.AddNoise(
+ displayable_frame->buffer()->data(kPlaneY),
+ displayable_frame->buffer()->stride(kPlaneY),
+ displayable_frame->buffer()->data(kPlaneU),
+ displayable_frame->buffer()->data(kPlaneV), input_stride_uv,
+ (*film_grain_frame)->buffer()->data(kPlaneY),
+ (*film_grain_frame)->buffer()->stride(kPlaneY),
+ (*film_grain_frame)->buffer()->data(kPlaneU),
+ (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) {
+ LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed.");
+ return kStatusOutOfMemory;
+ }
+ return kStatusOk;
+}
+
+bool DecoderImpl::IsNewSequenceHeader(const ObuParser& obu) {
+ if (std::find_if(obu.obu_headers().begin(), obu.obu_headers().end(),
+ [](const ObuHeader& obu_header) {
+ return obu_header.type == kObuSequenceHeader;
+ }) == obu.obu_headers().end()) {
+ return false;
+ }
+ const ObuSequenceHeader sequence_header = obu.sequence_header();
+ const bool sequence_header_changed =
+ !has_sequence_header_ ||
+ sequence_header_.color_config.bitdepth !=
+ sequence_header.color_config.bitdepth ||
+ sequence_header_.color_config.is_monochrome !=
+ sequence_header.color_config.is_monochrome ||
+ sequence_header_.color_config.subsampling_x !=
+ sequence_header.color_config.subsampling_x ||
+ sequence_header_.color_config.subsampling_y !=
+ sequence_header.color_config.subsampling_y ||
+ sequence_header_.max_frame_width != sequence_header.max_frame_width ||
+ sequence_header_.max_frame_height != sequence_header.max_frame_height;
+ sequence_header_ = sequence_header;
+ has_sequence_header_ = true;
+ return sequence_header_changed;
+}
+
+bool DecoderImpl::MaybeInitializeQuantizerMatrix(
+ const ObuFrameHeader& frame_header) {
+ if (quantizer_matrix_initialized_ || !frame_header.quantizer.use_matrix) {
+ return true;
+ }
+ if (!InitializeQuantizerMatrix(&quantizer_matrix_)) {
+ return false;
+ }
+ quantizer_matrix_initialized_ = true;
+ return true;
+}
+
+} // namespace libgav1
diff --git a/src/decoder_impl.h b/src/decoder_impl.h
new file mode 100644
index 0000000..721b666
--- /dev/null
+++ b/src/decoder_impl.h
@@ -0,0 +1,266 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DECODER_IMPL_H_
+#define LIBGAV1_SRC_DECODER_IMPL_H_
+
+#include <array>
+#include <condition_variable> // NOLINT (unapproved c++11 header)
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex> // NOLINT (unapproved c++11 header)
+
+#include "src/buffer_pool.h"
+#include "src/decoder_state.h"
+#include "src/dsp/constants.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/decoder_settings.h"
+#include "src/gav1/status_code.h"
+#include "src/obu_parser.h"
+#include "src/quantizer.h"
+#include "src/residual_buffer_pool.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/queue.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+struct TemporalUnit;
+
+struct EncodedFrame {
+ EncodedFrame(ObuParser* const obu, const DecoderState& state,
+ const RefCountedBufferPtr& frame, int position_in_temporal_unit)
+ : sequence_header(obu->sequence_header()),
+ frame_header(obu->frame_header()),
+ state(state),
+ temporal_unit(nullptr),
+ frame(frame),
+ position_in_temporal_unit(position_in_temporal_unit) {
+ obu->MoveTileBuffers(&tile_buffers);
+ frame->MarkFrameAsStarted();
+ }
+
+ const ObuSequenceHeader sequence_header;
+ const ObuFrameHeader frame_header;
+ Vector<TileBuffer> tile_buffers;
+ DecoderState state;
+ TemporalUnit* temporal_unit;
+ RefCountedBufferPtr frame;
+ const int position_in_temporal_unit;
+};
+
+struct TemporalUnit : public Allocable {
+ // The default constructor is invoked by the Queue<TemporalUnit>::Init()
+ // method. Queue<> does not use the default-constructed elements, so it is
+ // safe for the default constructor to not initialize the members.
+ TemporalUnit() = default;
+ TemporalUnit(const uint8_t* data, size_t size, int64_t user_private_data,
+ void* buffer_private_data)
+ : data(data),
+ size(size),
+ user_private_data(user_private_data),
+ buffer_private_data(buffer_private_data),
+ decoded(false),
+ status(kStatusOk),
+ has_displayable_frame(false),
+ output_frame_position(-1),
+ decoded_count(0),
+ output_layer_count(0),
+ released_input_buffer(false) {}
+
+ const uint8_t* data;
+ size_t size;
+ int64_t user_private_data;
+ void* buffer_private_data;
+
+ // The following members are used only in frame parallel mode.
+ bool decoded;
+ StatusCode status;
+ bool has_displayable_frame;
+ int output_frame_position;
+
+ Vector<EncodedFrame> frames;
+ size_t decoded_count;
+
+ // The struct (and the counter) is used to support output of multiple layers
+ // within a single temporal unit. The decoding process will store the output
+ // frames in |output_layers| in the order they are finished decoding. At the
+ // end of the decoding process, this array will be sorted in reverse order of
+ // |position_in_temporal_unit|. DequeueFrame() will then return the frames in
+ // reverse order (so that the entire process can run with a single counter
+ // variable).
+ struct OutputLayer {
+ // Used by std::sort to sort |output_layers| in reverse order of
+ // |position_in_temporal_unit|.
+ bool operator<(const OutputLayer& rhs) const {
+ return position_in_temporal_unit > rhs.position_in_temporal_unit;
+ }
+
+ RefCountedBufferPtr frame;
+ int position_in_temporal_unit = 0;
+ } output_layers[kMaxLayers];
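+  // Ordering sketch (illustrative): sorting this array with std::sort uses
+  // the operator< above, leaving the entry with the largest
+  // |position_in_temporal_unit| at index 0, so frames can then be handed out
+  // from the back of the array with a single decrementing counter.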
+ // Number of entries in |output_layers|.
+ int output_layer_count;
+ // Flag to ensure that we release the input buffer only once if there are
+ // multiple output layers.
+ bool released_input_buffer;
+};
+
+class DecoderImpl : public Allocable {
+ public:
+ // The constructor saves a const reference to |*settings|. Therefore
+ // |*settings| must outlive the DecoderImpl object. On success, |*output|
+ // contains a pointer to the newly-created DecoderImpl object. On failure,
+ // |*output| is not modified.
+ static StatusCode Create(const DecoderSettings* settings,
+ std::unique_ptr<DecoderImpl>* output);
+ ~DecoderImpl();
+ StatusCode EnqueueFrame(const uint8_t* data, size_t size,
+ int64_t user_private_data, void* buffer_private_data);
+ StatusCode DequeueFrame(const DecoderBuffer** out_ptr);
+ static constexpr int GetMaxBitdepth() {
+ static_assert(LIBGAV1_MAX_BITDEPTH == 8 || LIBGAV1_MAX_BITDEPTH == 10,
+ "LIBGAV1_MAX_BITDEPTH must be 8 or 10.");
+ return LIBGAV1_MAX_BITDEPTH;
+ }
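+
+  // A minimal usage sketch of the public API above (assumes a DecoderSettings
+  // object |settings| that outlives the decoder):
+  //   std::unique_ptr<DecoderImpl> impl;
+  //   if (DecoderImpl::Create(&settings, &impl) == kStatusOk) {
+  //     // |impl| is ready for EnqueueFrame()/DequeueFrame().
+  //   }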
+
+ private:
+ explicit DecoderImpl(const DecoderSettings* settings);
+ StatusCode Init();
+ // Called when the first frame is enqueued. It does the OBU parsing for one
+ // temporal unit to retrieve the tile configuration and sets up the frame
+ // threading if frame parallel mode is allowed. It also initializes the
+ // |temporal_units_| queue based on the number of frame threads.
+ //
+ // The following are the limitations of the current implementation:
+ // * It assumes that all frames in the video have the same tile
+ // configuration. The frame parallel threading model will not be updated
+ // based on tile configuration changes mid-stream.
+  // * The above assumption holds even when there is a new coded video
+  //   sequence (i.e., a new sequence header).
+ StatusCode InitializeFrameThreadPoolAndTemporalUnitQueue(const uint8_t* data,
+ size_t size);
+ // Used only in frame parallel mode. Signals failure and waits until the
+ // worker threads are aborted if |status| is a failure status. If |status| is
+ // equal to kStatusOk or kStatusTryAgain, this function does not do anything.
+ // Always returns the input parameter |status| as the return value.
+ //
+ // This function is called only from the application thread (from
+ // EnqueueFrame() and DequeueFrame()).
+ StatusCode SignalFailure(StatusCode status);
+
+ void ReleaseOutputFrame();
+
+ // Decodes all the frames contained in the given temporal unit. Used only in
+ // non frame parallel mode.
+ StatusCode DecodeTemporalUnit(const TemporalUnit& temporal_unit,
+ const DecoderBuffer** out_ptr);
+ // Used only in frame parallel mode. Does the OBU parsing for |data| and
+ // schedules the individual frames for decoding in the |frame_thread_pool_|.
+ StatusCode ParseAndSchedule(const uint8_t* data, size_t size,
+ int64_t user_private_data,
+ void* buffer_private_data);
+ // Decodes the |encoded_frame| and updates the
+ // |encoded_frame->temporal_unit|'s parameters if the decoded frame is a
+ // displayable frame. Used only in frame parallel mode.
+ StatusCode DecodeFrame(EncodedFrame* encoded_frame);
+
+ // Populates |buffer_| with values from |frame|. Adds a reference to |frame|
+ // in |output_frame_|.
+ StatusCode CopyFrameToOutputBuffer(const RefCountedBufferPtr& frame);
+ StatusCode DecodeTiles(const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const Vector<TileBuffer>& tile_buffers,
+ const DecoderState& state,
+ FrameScratchBuffer* frame_scratch_buffer,
+ RefCountedBuffer* current_frame);
+  // Applies film grain synthesis to |displayable_frame| and stores the frame
+  // with film grain applied in |film_grain_frame|. Returns kStatusOk on
+  // success.
+ StatusCode ApplyFilmGrain(const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const RefCountedBufferPtr& displayable_frame,
+ RefCountedBufferPtr* film_grain_frame,
+ ThreadPool* thread_pool);
+
+ bool IsNewSequenceHeader(const ObuParser& obu);
+
+ bool HasFailure() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ return failure_status_ != kStatusOk;
+ }
+
+ // Initializes the |quantizer_matrix_| if necessary and sets
+ // |quantizer_matrix_initialized_| to true.
+ bool MaybeInitializeQuantizerMatrix(const ObuFrameHeader& frame_header);
+
+ // Elements in this queue cannot be moved with std::move since the
+ // |EncodedFrame.temporal_unit| stores a pointer to elements in this queue.
+ Queue<TemporalUnit> temporal_units_;
+ DecoderState state_;
+
+ DecoderBuffer buffer_ = {};
+ // |output_frame_| holds a reference to the output frame on behalf of
+ // |buffer_|.
+ RefCountedBufferPtr output_frame_;
+
+ // Queue of output frames that are to be returned in the DequeueFrame() calls.
+ // If |settings_.output_all_layers| is false, this queue will never contain
+ // more than 1 element. This queue is used only when |is_frame_parallel_| is
+ // false.
+ Queue<RefCountedBufferPtr> output_frame_queue_;
+
+ BufferPool buffer_pool_;
+ WedgeMaskArray wedge_masks_;
+ QuantizerMatrix quantizer_matrix_;
+ bool quantizer_matrix_initialized_ = false;
+ FrameScratchBufferPool frame_scratch_buffer_pool_;
+
+ // Used to synchronize the accesses into |temporal_units_| in order to update
+ // the "decoded" state of an temporal unit.
+ std::mutex mutex_;
+ std::condition_variable decoded_condvar_;
+ bool is_frame_parallel_;
+ std::unique_ptr<ThreadPool> frame_thread_pool_;
+
+ // In frame parallel mode, there are two primary points of failure:
+ // 1) ParseAndSchedule()
+ // 2) DecodeTiles()
+ // Both of these functions have to respond to the other one failing by
+ // aborting whatever they are doing. This variable is used to accomplish that.
+ // If |failure_status_| is not kStatusOk, then the two functions will try to
+ // abort as early as they can.
+  StatusCode failure_status_ LIBGAV1_GUARDED_BY(mutex_) = kStatusOk;
+
+ ObuSequenceHeader sequence_header_ = {};
+  // If true, |sequence_header_| is valid.
+ bool has_sequence_header_ = false;
+
+ const DecoderSettings& settings_;
+ bool seen_first_frame_ = false;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DECODER_IMPL_H_
diff --git a/src/decoder_settings.cc b/src/decoder_settings.cc
new file mode 100644
index 0000000..9399073
--- /dev/null
+++ b/src/decoder_settings.cc
@@ -0,0 +1,33 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/decoder_settings.h"
+
+extern "C" {
+
+void Libgav1DecoderSettingsInitDefault(Libgav1DecoderSettings* settings) {
+ settings->threads = 1;
+ settings->frame_parallel = 0; // false
+ settings->blocking_dequeue = 0; // false
+ settings->on_frame_buffer_size_changed = nullptr;
+ settings->get_frame_buffer = nullptr;
+ settings->release_frame_buffer = nullptr;
+ settings->release_input_buffer = nullptr;
+ settings->callback_private_data = nullptr;
+ settings->output_all_layers = 0; // false
+ settings->operating_point = 0;
+ settings->post_filter_mask = 0x1f;
+}
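+
+// Caller-side usage sketch (the non-default value below is illustrative):
+//   Libgav1DecoderSettings settings;
+//   Libgav1DecoderSettingsInitDefault(&settings);
+//   settings.threads = 4;  // Override any defaults after initialization.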
+
+} // extern "C"
diff --git a/src/decoder_state.h b/src/decoder_state.h
new file mode 100644
index 0000000..897c99f
--- /dev/null
+++ b/src/decoder_state.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DECODER_STATE_H_
+#define LIBGAV1_SRC_DECODER_STATE_H_
+
+#include <array>
+#include <cstdint>
+
+#include "src/buffer_pool.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+struct DecoderState {
+ // Section 7.20. Updates frames in the reference_frame array with
+ // |current_frame|, based on the |refresh_frame_flags| bitmask.
+ void UpdateReferenceFrames(const RefCountedBufferPtr& current_frame,
+ int refresh_frame_flags) {
+ for (int ref_index = 0, mask = refresh_frame_flags; mask != 0;
+ ++ref_index, mask >>= 1) {
+ if ((mask & 1) != 0) {
+ reference_valid[ref_index] = true;
+ reference_frame_id[ref_index] = current_frame_id;
+ reference_frame[ref_index] = current_frame;
+ reference_order_hint[ref_index] = order_hint;
+ }
+ }
+ }
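+  // For example (illustrative value): refresh_frame_flags == 0x05 has bits 0
+  // and 2 set, so slots 0 and 2 are updated with |current_frame| and the
+  // remaining slots are left untouched.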
+
+ // Clears all the reference frames.
+ void ClearReferenceFrames() {
+ reference_valid = {};
+ reference_frame_id = {};
+ reference_order_hint = {};
+ for (int ref_index = 0; ref_index < kNumReferenceFrameTypes; ++ref_index) {
+ reference_frame[ref_index] = nullptr;
+ }
+ }
+
+ // reference_valid and reference_frame_id are used only if
+ // sequence_header_.frame_id_numbers_present is true.
+ // The reference_valid array is indexed by a reference picture slot number.
+ // A value (boolean) in the array signifies whether the corresponding
+ // reference picture slot is valid for use as a reference picture.
+ std::array<bool, kNumReferenceFrameTypes> reference_valid = {};
+ std::array<uint16_t, kNumReferenceFrameTypes> reference_frame_id = {};
+ // A valid value of current_frame_id is an unsigned integer of at most 16
+ // bits. -1 indicates current_frame_id is not initialized.
+ int current_frame_id = -1;
+ // The RefOrderHint array variable in the spec.
+ std::array<uint8_t, kNumReferenceFrameTypes> reference_order_hint = {};
+ // The OrderHint variable in the spec. Its value comes from either the
+ // order_hint syntax element in the uncompressed header (if
+ // show_existing_frame is false) or RefOrderHint[ frame_to_show_map_idx ]
+ // (if show_existing_frame is true and frame_type is KEY_FRAME). See Section
+ // 5.9.2 and Section 7.4.
+ //
+ // NOTE: When show_existing_frame is false, it is often more convenient to
+ // just use the order_hint field of the frame header as OrderHint. So this
+ // field is mainly used to update the reference_order_hint array in
+ // UpdateReferenceFrames().
+ uint8_t order_hint = 0;
+ // reference_frame_sign_bias[i] (a boolean) specifies the intended direction
+ // of the motion vector in time for each reference frame.
+ // * |false| indicates that the reference frame is a forwards reference (i.e.
+ // the reference frame is expected to be output before the current frame);
+ // * |true| indicates that the reference frame is a backwards reference.
+ // Note: reference_frame_sign_bias[0] (for kReferenceFrameIntra) is not used.
+ std::array<bool, kNumReferenceFrameTypes> reference_frame_sign_bias = {};
+ std::array<RefCountedBufferPtr, kNumReferenceFrameTypes> reference_frame;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DECODER_STATE_H_
diff --git a/src/dsp/arm/average_blend_neon.cc b/src/dsp/arm/average_blend_neon.cc
new file mode 100644
index 0000000..834e8b4
--- /dev/null
+++ b/src/dsp/arm/average_blend_neon.cc
@@ -0,0 +1,146 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/average_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kInterPostRoundBit =
+ kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
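+
+// A small worked note (assuming the 8bpp round constants are 11 and 7, giving
+// kInterPostRoundBit == 4): averaging the two compound predictions also needs
+// a divide by 2, which is why the narrowing shifts below use
+// kInterPostRoundBit + 1.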
+
+inline uint8x8_t AverageBlend8Row(const int16_t* prediction_0,
+ const int16_t* prediction_1) {
+ const int16x8_t pred0 = vld1q_s16(prediction_0);
+ const int16x8_t pred1 = vld1q_s16(prediction_1);
+ const int16x8_t res = vaddq_s16(pred0, pred1);
+ return vqrshrun_n_s16(res, kInterPostRoundBit + 1);
+}
+
+inline void AverageBlendLargeRow(const int16_t* prediction_0,
+ const int16_t* prediction_1, const int width,
+ uint8_t* dest) {
+ int x = width;
+ do {
+ const int16x8_t pred_00 = vld1q_s16(prediction_0);
+ const int16x8_t pred_01 = vld1q_s16(prediction_1);
+ prediction_0 += 8;
+ prediction_1 += 8;
+ const int16x8_t res0 = vaddq_s16(pred_00, pred_01);
+ const uint8x8_t res_out0 = vqrshrun_n_s16(res0, kInterPostRoundBit + 1);
+ const int16x8_t pred_10 = vld1q_s16(prediction_0);
+ const int16x8_t pred_11 = vld1q_s16(prediction_1);
+ prediction_0 += 8;
+ prediction_1 += 8;
+ const int16x8_t res1 = vaddq_s16(pred_10, pred_11);
+ const uint8x8_t res_out1 = vqrshrun_n_s16(res1, kInterPostRoundBit + 1);
+ vst1q_u8(dest, vcombine_u8(res_out0, res_out1));
+ dest += 16;
+ x -= 16;
+ } while (x != 0);
+}
+
+void AverageBlend_NEON(const void* prediction_0, const void* prediction_1,
+ const int width, const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y = height;
+
+ if (width == 4) {
+ do {
+ const uint8x8_t result = AverageBlend8Row(pred_0, pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+
+ StoreLo4(dst, result);
+ dst += dest_stride;
+ StoreHi4(dst, result);
+ dst += dest_stride;
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ if (width == 8) {
+ do {
+ vst1_u8(dst, AverageBlend8Row(pred_0, pred_1));
+ dst += dest_stride;
+ pred_0 += 8;
+ pred_1 += 8;
+
+ vst1_u8(dst, AverageBlend8Row(pred_0, pred_1));
+ dst += dest_stride;
+ pred_0 += 8;
+ pred_1 += 8;
+
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ do {
+ AverageBlendLargeRow(pred_0, pred_1, width, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ AverageBlendLargeRow(pred_0, pred_1, width, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ y -= 2;
+ } while (y != 0);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->average_blend = AverageBlend_NEON;
+}
+
+} // namespace
+
+void AverageBlendInit_NEON() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void AverageBlendInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/average_blend_neon.h b/src/dsp/arm/average_blend_neon.h
new file mode 100644
index 0000000..d13bcd6
--- /dev/null
+++ b/src/dsp/arm/average_blend_neon.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::average_blend. This function is not thread-safe.
+void AverageBlendInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_
diff --git a/src/dsp/arm/cdef_neon.cc b/src/dsp/arm/cdef_neon.cc
new file mode 100644
index 0000000..4d0e76f
--- /dev/null
+++ b/src/dsp/arm/cdef_neon.cc
@@ -0,0 +1,697 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// ----------------------------------------------------------------------------
+// Refer to CdefDirection_C().
+//
+// int32_t partial[8][15] = {};
+// for (int i = 0; i < 8; ++i) {
+// for (int j = 0; j < 8; ++j) {
+// const int x = 1;
+// partial[0][i + j] += x;
+// partial[1][i + j / 2] += x;
+// partial[2][i] += x;
+// partial[3][3 + i - j / 2] += x;
+// partial[4][7 + i - j] += x;
+// partial[5][3 - i / 2 + j] += x;
+// partial[6][j] += x;
+// partial[7][i / 2 + j] += x;
+// }
+// }
+//
+// Using the code above, generate the position count for partial[8][15].
+//
+// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+//
+// The SIMD code shifts the input horizontally, then adds vertically to get the
+// correct partial value for the given position.
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// partial[0][i + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 00 10 11 12 13 14 15 16 17 00 00 00 00 00 00
+// 00 00 20 21 22 23 24 25 26 27 00 00 00 00 00
+// 00 00 00 30 31 32 33 34 35 36 37 00 00 00 00
+// 00 00 00 00 40 41 42 43 44 45 46 47 00 00 00
+// 00 00 00 00 00 50 51 52 53 54 55 56 57 00 00
+// 00 00 00 00 00 00 60 61 62 63 64 65 66 67 00
+// 00 00 00 00 00 00 00 70 71 72 73 74 75 76 77
+//
+// partial[4] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(uint8x8_t* v_src,
+ uint16x8_t* partial_lo,
+ uint16x8_t* partial_hi) {
+ const uint8x8_t v_zero = vdup_n_u8(0);
+ // 00 01 02 03 04 05 06 07
+ // 00 10 11 12 13 14 15 16
+ *partial_lo = vaddl_u8(v_src[0], vext_u8(v_zero, v_src[1], 7));
+
+ // 00 00 20 21 22 23 24 25
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[2], 6));
+ // 17 00 00 00 00 00 00 00
+ // 26 27 00 00 00 00 00 00
+ *partial_hi =
+ vaddl_u8(vext_u8(v_src[1], v_zero, 7), vext_u8(v_src[2], v_zero, 6));
+
+ // 00 00 00 30 31 32 33 34
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[3], 5));
+ // 35 36 37 00 00 00 00 00
+ *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[3], v_zero, 5));
+
+ // 00 00 00 00 40 41 42 43
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[4], 4));
+ // 44 45 46 47 00 00 00 00
+ *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[4], v_zero, 4));
+
+ // 00 00 00 00 00 50 51 52
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[5], 3));
+ // 53 54 55 56 57 00 00 00
+ *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[5], v_zero, 3));
+
+ // 00 00 00 00 00 00 60 61
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[6], 2));
+ // 62 63 64 65 66 67 00 00
+ *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[6], v_zero, 2));
+
+ // 00 00 00 00 00 00 00 70
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[7], 1));
+ // 71 72 73 74 75 76 77 00
+ *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[7], v_zero, 1));
+}
+
+// ----------------------------------------------------------------------------
+// partial[1][i + j / 2] += x;
+//
+// A0 = src[0] + src[1], A1 = src[2] + src[3], ...
+//
+// A0 A1 A2 A3 00 00 00 00 00 00 00 00 00 00 00
+// 00 B0 B1 B2 B3 00 00 00 00 00 00 00 00 00 00
+// 00 00 C0 C1 C2 C3 00 00 00 00 00 00 00 00 00
+// 00 00 00 D0 D1 D2 D3 00 00 00 00 00 00 00 00
+// 00 00 00 00 E0 E1 E2 E3 00 00 00 00 00 00 00
+// 00 00 00 00 00 F0 F1 F2 F3 00 00 00 00 00 00
+// 00 00 00 00 00 00 G0 G1 G2 G3 00 00 00 00 00
+// 00 00 00 00 00 00 00 H0 H1 H2 H3 00 00 00 00
+//
+// partial[3] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(uint8x8_t* v_src,
+ uint16x8_t* partial_lo,
+ uint16x8_t* partial_hi) {
+ uint8x16_t v_d1_temp[8];
+ const uint8x8_t v_zero = vdup_n_u8(0);
+ const uint8x16_t v_zero_16 = vdupq_n_u8(0);
+
+ for (int i = 0; i < 8; ++i) {
+ v_d1_temp[i] = vcombine_u8(v_src[i], v_zero);
+ }
+
+ *partial_lo = *partial_hi = vdupq_n_u16(0);
+ // A0 A1 A2 A3 00 00 00 00
+ *partial_lo = vpadalq_u8(*partial_lo, v_d1_temp[0]);
+
+ // 00 B0 B1 B2 B3 00 00 00
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[1], 14));
+
+ // 00 00 C0 C1 C2 C3 00 00
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[2], 12));
+ // 00 00 00 D0 D1 D2 D3 00
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[3], 10));
+ // 00 00 00 00 E0 E1 E2 E3
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[4], 8));
+
+ // 00 00 00 00 00 F0 F1 F2
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[5], 6));
+ // F3 00 00 00 00 00 00 00
+ *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[5], v_zero_16, 6));
+
+ // 00 00 00 00 00 00 G0 G1
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[6], 4));
+ // G2 G3 00 00 00 00 00 00
+ *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[6], v_zero_16, 4));
+
+ // 00 00 00 00 00 00 00 H0
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[7], 2));
+ // H1 H2 H3 00 00 00 00 00
+ *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[7], v_zero_16, 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[7][i / 2 + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00
+// 00 20 21 22 23 24 25 26 27 00 00 00 00 00 00
+// 00 30 31 32 33 34 35 36 37 00 00 00 00 00 00
+// 00 00 40 41 42 43 44 45 46 47 00 00 00 00 00
+// 00 00 50 51 52 53 54 55 56 57 00 00 00 00 00
+// 00 00 00 60 61 62 63 64 65 66 67 00 00 00 00
+// 00 00 00 70 71 72 73 74 75 76 77 00 00 00 00
+//
+// partial[5] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(uint8x8_t* v_src,
+ uint16x8_t* partial_lo,
+ uint16x8_t* partial_hi) {
+ const uint16x8_t v_zero = vdupq_n_u16(0);
+ uint16x8_t v_pair_add[4];
+ // Add vertical source pairs.
+ v_pair_add[0] = vaddl_u8(v_src[0], v_src[1]);
+ v_pair_add[1] = vaddl_u8(v_src[2], v_src[3]);
+ v_pair_add[2] = vaddl_u8(v_src[4], v_src[5]);
+ v_pair_add[3] = vaddl_u8(v_src[6], v_src[7]);
+
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ *partial_lo = v_pair_add[0];
+ // 00 00 00 00 00 00 00 00
+ // 00 00 00 00 00 00 00 00
+ *partial_hi = vdupq_n_u16(0);
+
+ // 00 20 21 22 23 24 25 26
+ // 00 30 31 32 33 34 35 36
+ *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[1], 7));
+ // 27 00 00 00 00 00 00 00
+ // 37 00 00 00 00 00 00 00
+ *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[1], v_zero, 7));
+
+ // 00 00 40 41 42 43 44 45
+ // 00 00 50 51 52 53 54 55
+ *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[2], 6));
+ // 46 47 00 00 00 00 00 00
+ // 56 57 00 00 00 00 00 00
+ *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[2], v_zero, 6));
+
+ // 00 00 00 60 61 62 63 64
+ // 00 00 00 70 71 72 73 74
+ *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[3], 5));
+ // 65 66 67 00 00 00 00 00
+ // 75 76 77 00 00 00 00 00
+ *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[3], v_zero, 5));
+}
+
+LIBGAV1_ALWAYS_INLINE void AddPartial(const void* const source,
+ ptrdiff_t stride, uint16x8_t* partial_lo,
+ uint16x8_t* partial_hi) {
+ const auto* src = static_cast<const uint8_t*>(source);
+
+ // 8x8 input
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ uint8x8_t v_src[8];
+ for (int i = 0; i < 8; ++i) {
+ v_src[i] = vld1_u8(src);
+ src += stride;
+ }
+
+ // partial for direction 2
+ // --------------------------------------------------------------------------
+ // partial[2][i] += x;
+ // 00 10 20 30 40 50 60 70 00 00 00 00 00 00 00 00
+  // 01 11 21 31 41 51 61 71 00 00 00 00 00 00 00 00
+  // 02 12 22 32 42 52 62 72 00 00 00 00 00 00 00 00
+ // 03 13 23 33 43 53 63 73 00 00 00 00 00 00 00 00
+ // 04 14 24 34 44 54 64 74 00 00 00 00 00 00 00 00
+ // 05 15 25 35 45 55 65 75 00 00 00 00 00 00 00 00
+ // 06 16 26 36 46 56 66 76 00 00 00 00 00 00 00 00
+ // 07 17 27 37 47 57 67 77 00 00 00 00 00 00 00 00
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), partial_lo[2], 0);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[1]), partial_lo[2], 1);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[2]), partial_lo[2], 2);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[3]), partial_lo[2], 3);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[4]), partial_lo[2], 4);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[5]), partial_lo[2], 5);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[6]), partial_lo[2], 6);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[7]), partial_lo[2], 7);
+
+ // partial for direction 6
+ // --------------------------------------------------------------------------
+ // partial[6][j] += x;
+ // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 00
+ // 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00 00
+ // 20 21 22 23 24 25 26 27 00 00 00 00 00 00 00 00
+ // 30 31 32 33 34 35 36 37 00 00 00 00 00 00 00 00
+ // 40 41 42 43 44 45 46 47 00 00 00 00 00 00 00 00
+ // 50 51 52 53 54 55 56 57 00 00 00 00 00 00 00 00
+ // 60 61 62 63 64 65 66 67 00 00 00 00 00 00 00 00
+ // 70 71 72 73 74 75 76 77 00 00 00 00 00 00 00 00
+ const uint8x8_t v_zero = vdup_n_u8(0);
+ partial_lo[6] = vaddl_u8(v_zero, v_src[0]);
+ for (int i = 1; i < 8; ++i) {
+ partial_lo[6] = vaddw_u8(partial_lo[6], v_src[i]);
+ }
+
+ // partial for direction 0
+ AddPartial_D0_D4(v_src, &partial_lo[0], &partial_hi[0]);
+
+ // partial for direction 1
+ AddPartial_D1_D3(v_src, &partial_lo[1], &partial_hi[1]);
+
+ // partial for direction 7
+ AddPartial_D5_D7(v_src, &partial_lo[7], &partial_hi[7]);
+
+ uint8x8_t v_src_reverse[8];
+ for (int i = 0; i < 8; ++i) {
+ v_src_reverse[i] = vrev64_u8(v_src[i]);
+ }
+
+ // partial for direction 4
+ AddPartial_D0_D4(v_src_reverse, &partial_lo[4], &partial_hi[4]);
+
+ // partial for direction 3
+ AddPartial_D1_D3(v_src_reverse, &partial_lo[3], &partial_hi[3]);
+
+ // partial for direction 5
+ AddPartial_D5_D7(v_src_reverse, &partial_lo[5], &partial_hi[5]);
+}
+
+uint32x4_t Square(uint16x4_t a) { return vmull_u16(a, a); }
+
+uint32x4_t SquareAccumulate(uint32x4_t a, uint16x4_t b) {
+ return vmlal_u16(a, b, b);
+}
+
+// |cost[0]| and |cost[4]| square the input, sum each element with the
+// corresponding element from the other end of the vector, and multiply by the
+// matching |kCdefDivisionTable[]| element:
+// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+// kCdefDivisionTable[i + 1];
+// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
+// Because everything is being summed into a single value the distributive
+// property allows us to mirror the division table and accumulate once.
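+// Sketch of that algebra: summing
+//   (Square(p[i]) + Square(p[14 - i])) * kCdefDivisionTable[i + 1]
+// over i equals summing Square(p[k]) * d[k] over all k, where p[] is the
+// partial row and d[] is the division table mirrored about its center, so
+// each squared partial is multiplied and accumulated exactly once.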
+uint32_t Cost0Or4(const uint16x8_t a, const uint16x8_t b,
+ const uint32x4_t division_table[4]) {
+ uint32x4_t c = vmulq_u32(Square(vget_low_u16(a)), division_table[0]);
+ c = vmlaq_u32(c, Square(vget_high_u16(a)), division_table[1]);
+ c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[2]);
+ c = vmlaq_u32(c, Square(vget_high_u16(b)), division_table[3]);
+ return SumVector(c);
+}
+
+// |cost[2]| and |cost[6]| square the input and accumulate:
+// cost[2] += Square(partial[2][i])
+uint32_t SquareAccumulate(const uint16x8_t a) {
+ uint32x4_t c = Square(vget_low_u16(a));
+ c = SquareAccumulate(c, vget_high_u16(a));
+ c = vmulq_n_u32(c, kCdefDivisionTable[7]);
+ return SumVector(c);
+}
+
+uint32_t CostOdd(const uint16x8_t a, const uint16x8_t b, const uint32x4_t mask,
+ const uint32x4_t division_table[2]) {
+ // Remove elements 0-2.
+ uint32x4_t c = vandq_u32(mask, Square(vget_low_u16(a)));
+ c = vaddq_u32(c, Square(vget_high_u16(a)));
+ c = vmulq_n_u32(c, kCdefDivisionTable[7]);
+
+ c = vmlaq_u32(c, Square(vget_low_u16(a)), division_table[0]);
+ c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[1]);
+ return SumVector(c);
+}
+
+void CdefDirection_NEON(const void* const source, ptrdiff_t stride,
+ uint8_t* const direction, int* const variance) {
+ assert(direction != nullptr);
+ assert(variance != nullptr);
+ const auto* src = static_cast<const uint8_t*>(source);
+ uint32_t cost[8];
+ uint16x8_t partial_lo[8], partial_hi[8];
+
+ AddPartial(src, stride, partial_lo, partial_hi);
+
+ cost[2] = SquareAccumulate(partial_lo[2]);
+ cost[6] = SquareAccumulate(partial_lo[6]);
+
+ const uint32x4_t division_table[4] = {
+ vld1q_u32(kCdefDivisionTable), vld1q_u32(kCdefDivisionTable + 4),
+ vld1q_u32(kCdefDivisionTable + 8), vld1q_u32(kCdefDivisionTable + 12)};
+
+ cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table);
+ cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table);
+
+ const uint32x4_t division_table_odd[2] = {
+ vld1q_u32(kCdefDivisionTableOdd), vld1q_u32(kCdefDivisionTableOdd + 4)};
+
+ const uint32x4_t element_3_mask = {0, 0, 0, static_cast<uint32_t>(-1)};
+
+ cost[1] =
+ CostOdd(partial_lo[1], partial_hi[1], element_3_mask, division_table_odd);
+ cost[3] =
+ CostOdd(partial_lo[3], partial_hi[3], element_3_mask, division_table_odd);
+ cost[5] =
+ CostOdd(partial_lo[5], partial_hi[5], element_3_mask, division_table_odd);
+ cost[7] =
+ CostOdd(partial_lo[7], partial_hi[7], element_3_mask, division_table_odd);
+
+ uint32_t best_cost = 0;
+ *direction = 0;
+ for (int i = 0; i < 8; ++i) {
+ if (cost[i] > best_cost) {
+ best_cost = cost[i];
+ *direction = i;
+ }
+ }
+ *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+
+// -------------------------------------------------------------------------
+// CdefFilter
+
+// Load 4 vectors based on the given |direction|.
+void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
+ uint16x8_t* output, const int direction) {
+  // Each |direction| describes a different set of source values. Expand this
+  // set by negating each offset pair (loading both src + offset and
+  // src - offset). For |direction| == 0 this gives a diagonal line from top
+  // right to bottom left. The first value is y, the second is x. Negative y
+  // values move up.
+ // a b c d
+ // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
+ // c
+ // a
+ // 0
+ // b
+ // d
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = vld1q_u16(src + y_0 * stride + x_0);
+ output[1] = vld1q_u16(src - y_0 * stride - x_0);
+ output[2] = vld1q_u16(src + y_1 * stride + x_1);
+ output[3] = vld1q_u16(src - y_1 * stride - x_1);
+}
+
+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
+// do 2 rows at a time.
+void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
+ uint16x8_t* output, const int direction) {
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = vcombine_u16(vld1_u16(src + y_0 * stride + x_0),
+ vld1_u16(src + y_0 * stride + stride + x_0));
+ output[1] = vcombine_u16(vld1_u16(src - y_0 * stride - x_0),
+ vld1_u16(src - y_0 * stride + stride - x_0));
+ output[2] = vcombine_u16(vld1_u16(src + y_1 * stride + x_1),
+ vld1_u16(src + y_1 * stride + stride + x_1));
+ output[3] = vcombine_u16(vld1_u16(src - y_1 * stride - x_1),
+ vld1_u16(src - y_1 * stride + stride - x_1));
+}
+
+int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference,
+ const uint16x8_t threshold, const int16x8_t damping) {
+  // If reference > pixel, the difference will be negative, so convert the
+  // comparison result to a mask of 0 or -1.
+ const uint16x8_t sign = vcgtq_u16(reference, pixel);
+ const uint16x8_t abs_diff = vabdq_u16(pixel, reference);
+ const uint16x8_t shifted_diff = vshlq_u16(abs_diff, damping);
+  // For bitdepth == 8, the threshold range is [0, 15] and the damping range is
+  // [3, 6]. If pixel == kCdefLargeValue (0x4000), shifted_diff will always be
+  // larger than threshold, so the saturating subtraction below returns 0 when
+  // pixel == kCdefLargeValue.
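+  // Illustrative example (not from the original source): if |pixel| ==
+  // kCdefLargeValue and |reference| holds an ordinary 8-bit value, abs_diff is
+  // at least 0x4000 - 0xff = 0x3f01. Even with the largest damping shift of 6,
+  // shifted_diff >= 0xfc, which exceeds any threshold in [0, 15], so the
+  // clamped difference is 0 and this tap contributes nothing.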
+ static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
+ const uint16x8_t thresh_minus_shifted_diff =
+ vqsubq_u16(threshold, shifted_diff);
+ const uint16x8_t clamp_abs_diff =
+ vminq_u16(thresh_minus_shifted_diff, abs_diff);
+ // Restore the sign.
+ return vreinterpretq_s16_u16(
+ vsubq_u16(veorq_u16(clamp_abs_diff, sign), sign));
+}
+
+template <int width, bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_NEON(const uint16_t* src, const ptrdiff_t src_stride,
+ const int height, const int primary_strength,
+ const int secondary_strength, const int damping,
+ const int direction, void* dest,
+ const ptrdiff_t dst_stride) {
+ static_assert(width == 8 || width == 4, "");
+ static_assert(enable_primary || enable_secondary, "");
+ constexpr bool clipping_required = enable_primary && enable_secondary;
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint16x8_t cdef_large_value_mask =
+ vdupq_n_u16(static_cast<uint16_t>(~kCdefLargeValue));
+ const uint16x8_t primary_threshold = vdupq_n_u16(primary_strength);
+ const uint16x8_t secondary_threshold = vdupq_n_u16(secondary_strength);
+
+ int16x8_t primary_damping_shift, secondary_damping_shift;
+
+ // FloorLog2() requires input to be > 0.
+ // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+ if (enable_primary) {
+ // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
+ // for UV filtering.
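+    // For example (illustrative only), UV filtering with damping == 2 and
+    // primary_strength == 15 gives 2 - FloorLog2(15) = 2 - 3 = -1, which
+    // std::max() clamps to 0 below.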
+ primary_damping_shift =
+ vdupq_n_s16(-std::max(0, damping - FloorLog2(primary_strength)));
+ }
+ if (enable_secondary) {
+ // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+ // necessary.
+ assert(damping - FloorLog2(secondary_strength) >= 0);
+ secondary_damping_shift =
+ vdupq_n_s16(-(damping - FloorLog2(secondary_strength)));
+ }
+
+ const int primary_tap_0 = kCdefPrimaryTaps[primary_strength & 1][0];
+ const int primary_tap_1 = kCdefPrimaryTaps[primary_strength & 1][1];
+
+ int y = height;
+ do {
+ uint16x8_t pixel;
+ if (width == 8) {
+ pixel = vld1q_u16(src);
+ } else {
+ pixel = vcombine_u16(vld1_u16(src), vld1_u16(src + src_stride));
+ }
+
+ uint16x8_t min = pixel;
+ uint16x8_t max = pixel;
+ int16x8_t sum;
+
+ if (enable_primary) {
+ // Primary |direction|.
+ uint16x8_t primary_val[4];
+ if (width == 8) {
+ LoadDirection(src, src_stride, primary_val, direction);
+ } else {
+ LoadDirection4(src, src_stride, primary_val, direction);
+ }
+
+ if (clipping_required) {
+ min = vminq_u16(min, primary_val[0]);
+ min = vminq_u16(min, primary_val[1]);
+ min = vminq_u16(min, primary_val[2]);
+ min = vminq_u16(min, primary_val[3]);
+
+        // The source is 16 bits; however, only the lower 8 bits matter. The
+        // upper 8 bits contain the "large" flag. After the final primary max
+        // has been calculated, zero out the upper 8 bits and use the result to
+        // find the "16 bit" max.
+ const uint8x16_t max_p01 =
+ vmaxq_u8(vreinterpretq_u8_u16(primary_val[0]),
+ vreinterpretq_u8_u16(primary_val[1]));
+ const uint8x16_t max_p23 =
+ vmaxq_u8(vreinterpretq_u8_u16(primary_val[2]),
+ vreinterpretq_u8_u16(primary_val[3]));
+ const uint16x8_t max_p =
+ vreinterpretq_u16_u8(vmaxq_u8(max_p01, max_p23));
+ max = vmaxq_u16(max, vandq_u16(max_p, cdef_large_value_mask));
+ }
+
+ sum = Constrain(primary_val[0], pixel, primary_threshold,
+ primary_damping_shift);
+ sum = vmulq_n_s16(sum, primary_tap_0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(primary_val[1], pixel, primary_threshold,
+ primary_damping_shift),
+ primary_tap_0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(primary_val[2], pixel, primary_threshold,
+ primary_damping_shift),
+ primary_tap_1);
+ sum = vmlaq_n_s16(sum,
+ Constrain(primary_val[3], pixel, primary_threshold,
+ primary_damping_shift),
+ primary_tap_1);
+ } else {
+ sum = vdupq_n_s16(0);
+ }
+
+ if (enable_secondary) {
+ // Secondary |direction| values (+/- 2). Clamp |direction|.
+ uint16x8_t secondary_val[8];
+ if (width == 8) {
+ LoadDirection(src, src_stride, secondary_val, direction + 2);
+ LoadDirection(src, src_stride, secondary_val + 4, direction - 2);
+ } else {
+ LoadDirection4(src, src_stride, secondary_val, direction + 2);
+ LoadDirection4(src, src_stride, secondary_val + 4, direction - 2);
+ }
+
+ if (clipping_required) {
+ min = vminq_u16(min, secondary_val[0]);
+ min = vminq_u16(min, secondary_val[1]);
+ min = vminq_u16(min, secondary_val[2]);
+ min = vminq_u16(min, secondary_val[3]);
+ min = vminq_u16(min, secondary_val[4]);
+ min = vminq_u16(min, secondary_val[5]);
+ min = vminq_u16(min, secondary_val[6]);
+ min = vminq_u16(min, secondary_val[7]);
+
+ const uint8x16_t max_s01 =
+ vmaxq_u8(vreinterpretq_u8_u16(secondary_val[0]),
+ vreinterpretq_u8_u16(secondary_val[1]));
+ const uint8x16_t max_s23 =
+ vmaxq_u8(vreinterpretq_u8_u16(secondary_val[2]),
+ vreinterpretq_u8_u16(secondary_val[3]));
+ const uint8x16_t max_s45 =
+ vmaxq_u8(vreinterpretq_u8_u16(secondary_val[4]),
+ vreinterpretq_u8_u16(secondary_val[5]));
+ const uint8x16_t max_s67 =
+ vmaxq_u8(vreinterpretq_u8_u16(secondary_val[6]),
+ vreinterpretq_u8_u16(secondary_val[7]));
+ const uint16x8_t max_s = vreinterpretq_u16_u8(
+ vmaxq_u8(vmaxq_u8(max_s01, max_s23), vmaxq_u8(max_s45, max_s67)));
+ max = vmaxq_u16(max, vandq_u16(max_s, cdef_large_value_mask));
+ }
+
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[0], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[1], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[2], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap1);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[3], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap1);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[4], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[5], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[6], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap1);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[7], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap1);
+ }
+    // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)
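+    // vshrq_n_s16(sum, 15) is 0 when sum >= 0 and -1 otherwise, i.e.
+    // -(sum < 0), and vrsraq_n_s16() adds the rounded, right-shifted sum
+    // ((adjusted_sum + 8) >> 4) to |pixel|.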
+ const int16x8_t sum_lt_0 = vshrq_n_s16(sum, 15);
+ sum = vaddq_s16(sum, sum_lt_0);
+ int16x8_t result = vrsraq_n_s16(vreinterpretq_s16_u16(pixel), sum, 4);
+ if (clipping_required) {
+ result = vminq_s16(result, vreinterpretq_s16_u16(max));
+ result = vmaxq_s16(result, vreinterpretq_s16_u16(min));
+ }
+
+ const uint8x8_t dst_pixel = vqmovun_s16(result);
+ if (width == 8) {
+ src += src_stride;
+ vst1_u8(dst, dst_pixel);
+ dst += dst_stride;
+ --y;
+ } else {
+ src += src_stride << 1;
+ StoreLo4(dst, dst_pixel);
+ dst += dst_stride;
+ StoreHi4(dst, dst_pixel);
+ dst += dst_stride;
+ y -= 2;
+ }
+ } while (y != 0);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->cdef_direction = CdefDirection_NEON;
+ dsp->cdef_filters[0][0] = CdefFilter_NEON<4>;
+ dsp->cdef_filters[0][1] =
+ CdefFilter_NEON<4, /*enable_primary=*/true, /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] = CdefFilter_NEON<4, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_NEON<8>;
+ dsp->cdef_filters[1][1] =
+ CdefFilter_NEON<8, /*enable_primary=*/true, /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] = CdefFilter_NEON<8, /*enable_primary=*/false>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void CdefInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void CdefInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/cdef_neon.h b/src/dsp/arm/cdef_neon.h
new file mode 100644
index 0000000..53d5f86
--- /dev/null
+++ b/src/dsp/arm/cdef_neon.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
+void CdefInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
diff --git a/src/dsp/arm/common_neon.h b/src/dsp/arm/common_neon.h
new file mode 100644
index 0000000..dcb7567
--- /dev/null
+++ b/src/dsp/arm/common_neon.h
@@ -0,0 +1,777 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
+
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cstdint>
+#include <cstring>
+
+#if 0
+#include <cstdio>
+
+#include "absl/strings/str_cat.h"
+
+constexpr bool kEnablePrintRegs = true;
+
+union DebugRegister {
+ int8_t i8[8];
+ int16_t i16[4];
+ int32_t i32[2];
+ uint8_t u8[8];
+ uint16_t u16[4];
+ uint32_t u32[2];
+};
+
+union DebugRegisterQ {
+ int8_t i8[16];
+ int16_t i16[8];
+ int32_t i32[4];
+ uint8_t u8[16];
+ uint16_t u16[8];
+ uint32_t u32[4];
+};
+
+// Quite useful helper for debugging. Left here for convenience.
+inline void PrintVect(const DebugRegister r, const char* const name, int size) {
+ int n;
+ if (kEnablePrintRegs) {
+ fprintf(stderr, "%s\t: ", name);
+ if (size == 8) {
+ for (n = 0; n < 8; ++n) fprintf(stderr, "%.2x ", r.u8[n]);
+ } else if (size == 16) {
+ for (n = 0; n < 4; ++n) fprintf(stderr, "%.4x ", r.u16[n]);
+ } else if (size == 32) {
+ for (n = 0; n < 2; ++n) fprintf(stderr, "%.8x ", r.u32[n]);
+ }
+ fprintf(stderr, "\n");
+ }
+}
+
+// Debugging macro for 128-bit types.
+inline void PrintVectQ(const DebugRegisterQ r, const char* const name,
+ int size) {
+ int n;
+ if (kEnablePrintRegs) {
+ fprintf(stderr, "%s\t: ", name);
+ if (size == 8) {
+ for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", r.u8[n]);
+ } else if (size == 16) {
+ for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", r.u16[n]);
+ } else if (size == 32) {
+ for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", r.u32[n]);
+ }
+ fprintf(stderr, "\n");
+ }
+}
+
+inline void PrintReg(const int32x4x2_t val, const std::string& name) {
+ DebugRegisterQ r;
+ vst1q_u32(r.u32, val.val[0]);
+  const std::string name0 = absl::StrCat(name, ".val[0]");
+ PrintVectQ(r, name0.c_str(), 32);
+ vst1q_u32(r.u32, val.val[1]);
+  const std::string name1 = absl::StrCat(name, ".val[1]");
+ PrintVectQ(r, name1.c_str(), 32);
+}
+
+inline void PrintReg(const uint32x4_t val, const char* name) {
+ DebugRegisterQ r;
+ vst1q_u32(r.u32, val);
+ PrintVectQ(r, name, 32);
+}
+
+inline void PrintReg(const uint32x2_t val, const char* name) {
+ DebugRegister r;
+ vst1_u32(r.u32, val);
+ PrintVect(r, name, 32);
+}
+
+inline void PrintReg(const uint16x8_t val, const char* name) {
+ DebugRegisterQ r;
+ vst1q_u16(r.u16, val);
+ PrintVectQ(r, name, 16);
+}
+
+inline void PrintReg(const uint16x4_t val, const char* name) {
+ DebugRegister r;
+ vst1_u16(r.u16, val);
+ PrintVect(r, name, 16);
+}
+
+inline void PrintReg(const uint8x16_t val, const char* name) {
+ DebugRegisterQ r;
+ vst1q_u8(r.u8, val);
+ PrintVectQ(r, name, 8);
+}
+
+inline void PrintReg(const uint8x8_t val, const char* name) {
+ DebugRegister r;
+ vst1_u8(r.u8, val);
+ PrintVect(r, name, 8);
+}
+
+inline void PrintReg(const int32x4_t val, const char* name) {
+ DebugRegisterQ r;
+ vst1q_s32(r.i32, val);
+ PrintVectQ(r, name, 32);
+}
+
+inline void PrintReg(const int32x2_t val, const char* name) {
+ DebugRegister r;
+ vst1_s32(r.i32, val);
+ PrintVect(r, name, 32);
+}
+
+inline void PrintReg(const int16x8_t val, const char* name) {
+ DebugRegisterQ r;
+ vst1q_s16(r.i16, val);
+ PrintVectQ(r, name, 16);
+}
+
+inline void PrintReg(const int16x4_t val, const char* name) {
+ DebugRegister r;
+ vst1_s16(r.i16, val);
+ PrintVect(r, name, 16);
+}
+
+inline void PrintReg(const int8x16_t val, const char* name) {
+ DebugRegisterQ r;
+ vst1q_s8(r.i8, val);
+ PrintVectQ(r, name, 8);
+}
+
+inline void PrintReg(const int8x8_t val, const char* name) {
+ DebugRegister r;
+ vst1_s8(r.i8, val);
+ PrintVect(r, name, 8);
+}
+
+// Print an individual (non-vector) value in decimal format.
+inline void PrintReg(const int x, const char* name) {
+ if (kEnablePrintRegs) {
+ printf("%s: %d\n", name, x);
+ }
+}
+
+// Print an individual (non-vector) value in hexadecimal format.
+inline void PrintHex(const int x, const char* name) {
+ if (kEnablePrintRegs) {
+ printf("%s: %x\n", name, x);
+ }
+}
+
+#define PR(x) PrintReg(x, #x)
+#define PD(x) PrintReg(x, #x)
+#define PX(x) PrintHex(x, #x)
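+
+// Example usage (illustrative only): PR(sum) prints the contents of the
+// vector |sum| labeled with its own name.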
+
+#endif // 0
+
+namespace libgav1 {
+namespace dsp {
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+// Load 2 uint8_t values into lanes 0 and 1. Zeros the register before loading
+// the values. Use caution when using this in loops because it will re-zero the
+// register before loading on every iteration.
+inline uint8x8_t Load2(const void* const buf) {
+ const uint16x4_t zero = vdup_n_u16(0);
+ uint16_t temp;
+ memcpy(&temp, buf, 2);
+ return vreinterpret_u8_u16(vld1_lane_u16(&temp, zero, 0));
+}
+
+// Load 2 uint8_t values into lanes |lane| * 2 and |lane| * 2 + 1.
+template <int lane>
+inline uint8x8_t Load2(const void* const buf, uint8x8_t val) {
+ uint16_t temp;
+ memcpy(&temp, buf, 2);
+ return vreinterpret_u8_u16(
+ vld1_lane_u16(&temp, vreinterpret_u16_u8(val), lane));
+}
+
+// Load 4 uint8_t values into the low half of a uint8x8_t register. Zeros the
+// register before loading the values. Use caution when using this in loops
+// because it will re-zero the register before loading on every iteration.
+inline uint8x8_t Load4(const void* const buf) {
+ const uint32x2_t zero = vdup_n_u32(0);
+ uint32_t temp;
+ memcpy(&temp, buf, 4);
+ return vreinterpret_u8_u32(vld1_lane_u32(&temp, zero, 0));
+}
+
+// Load 4 uint8_t values into 4 lanes starting with |lane| * 4.
+template <int lane>
+inline uint8x8_t Load4(const void* const buf, uint8x8_t val) {
+ uint32_t temp;
+ memcpy(&temp, buf, 4);
+ return vreinterpret_u8_u32(
+ vld1_lane_u32(&temp, vreinterpret_u32_u8(val), lane));
+}
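+
+// Illustrative usage sketch (not part of the original header): two 4-byte rows
+// can be packed into a single uint8x8_t by chaining the lane loads:
+//   uint8x8_t rows = Load4(src);          // lanes 0-3
+//   rows = Load4<1>(src + stride, rows);  // lanes 4-7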
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+// Propagate type information to the compiler. Without this the compiler may
+// assume the required alignment of the type (4 bytes in the case of uint32_t)
+// and add alignment hints to the memory access.
+template <typename T>
+inline void ValueToMem(void* const buf, T val) {
+ memcpy(buf, &val, sizeof(val));
+}
+
+// Store 4 int8_t values from the low half of an int8x8_t register.
+inline void StoreLo4(void* const buf, const int8x8_t val) {
+ ValueToMem<int32_t>(buf, vget_lane_s32(vreinterpret_s32_s8(val), 0));
+}
+
+// Store 4 uint8_t values from the low half of a uint8x8_t register.
+inline void StoreLo4(void* const buf, const uint8x8_t val) {
+ ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u8(val), 0));
+}
+
+// Store 4 uint8_t values from the high half of a uint8x8_t register.
+inline void StoreHi4(void* const buf, const uint8x8_t val) {
+ ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u8(val), 1));
+}
+
+// Store 2 uint8_t values from |lane| * 2 and |lane| * 2 + 1 of a uint8x8_t
+// register.
+template <int lane>
+inline void Store2(void* const buf, const uint8x8_t val) {
+ ValueToMem<uint16_t>(buf, vget_lane_u16(vreinterpret_u16_u8(val), lane));
+}
+
+// Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x8_t
+// register.
+template <int lane>
+inline void Store2(void* const buf, const uint16x8_t val) {
+ ValueToMem<uint32_t>(buf, vgetq_lane_u32(vreinterpretq_u32_u16(val), lane));
+}
+
+// Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x4_t
+// register.
+template <int lane>
+inline void Store2(uint16_t* const buf, const uint16x4_t val) {
+ ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u16(val), lane));
+}
+
+//------------------------------------------------------------------------------
+// Bit manipulation.
+
+// vshXX_n_XX() requires an immediate.
+template <int shift>
+inline uint8x8_t LeftShift(const uint8x8_t vector) {
+ return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vector), shift));
+}
+
+template <int shift>
+inline uint8x8_t RightShift(const uint8x8_t vector) {
+ return vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(vector), shift));
+}
+
+template <int shift>
+inline int8x8_t RightShift(const int8x8_t vector) {
+ return vreinterpret_s8_u64(vshr_n_u64(vreinterpret_u64_s8(vector), shift));
+}
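+
+// Note (descriptive): the shift amount is specified in bits across the whole
+// 64-bit register, so e.g. RightShift<2 * 8>(v) discards the two lowest-indexed
+// bytes and shifts the remaining bytes down, zero-filling the upper lanes.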
+
+// Shim vqtbl1_u8 for armv7.
+inline uint8x8_t VQTbl1U8(const uint8x16_t a, const uint8x8_t index) {
+#if defined(__aarch64__)
+ return vqtbl1_u8(a, index);
+#else
+ const uint8x8x2_t b = {vget_low_u8(a), vget_high_u8(a)};
+ return vtbl2_u8(b, index);
+#endif
+}
+
+// Shim vqtbl1_s8 for armv7.
+inline int8x8_t VQTbl1S8(const int8x16_t a, const uint8x8_t index) {
+#if defined(__aarch64__)
+ return vqtbl1_s8(a, index);
+#else
+ const int8x8x2_t b = {vget_low_s8(a), vget_high_s8(a)};
+ return vtbl2_s8(b, vreinterpret_s8_u8(index));
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Interleave.
+
+// vzipN is exclusive to A64.
+inline uint8x8_t InterleaveLow8(const uint8x8_t a, const uint8x8_t b) {
+#if defined(__aarch64__)
+ return vzip1_u8(a, b);
+#else
+ // Discard |.val[1]|
+ return vzip_u8(a, b).val[0];
+#endif
+}
+
+inline uint8x8_t InterleaveLow32(const uint8x8_t a, const uint8x8_t b) {
+#if defined(__aarch64__)
+ return vreinterpret_u8_u32(
+ vzip1_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)));
+#else
+ // Discard |.val[1]|
+ return vreinterpret_u8_u32(
+ vzip_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)).val[0]);
+#endif
+}
+
+inline int8x8_t InterleaveLow32(const int8x8_t a, const int8x8_t b) {
+#if defined(__aarch64__)
+ return vreinterpret_s8_u32(
+ vzip1_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)));
+#else
+ // Discard |.val[1]|
+ return vreinterpret_s8_u32(
+ vzip_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)).val[0]);
+#endif
+}
+
+inline uint8x8_t InterleaveHigh32(const uint8x8_t a, const uint8x8_t b) {
+#if defined(__aarch64__)
+ return vreinterpret_u8_u32(
+ vzip2_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)));
+#else
+ // Discard |.val[0]|
+ return vreinterpret_u8_u32(
+ vzip_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)).val[1]);
+#endif
+}
+
+inline int8x8_t InterleaveHigh32(const int8x8_t a, const int8x8_t b) {
+#if defined(__aarch64__)
+ return vreinterpret_s8_u32(
+ vzip2_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)));
+#else
+ // Discard |.val[0]|
+ return vreinterpret_s8_u32(
+ vzip_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)).val[1]);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Sum.
+
+inline uint16_t SumVector(const uint8x8_t a) {
+#if defined(__aarch64__)
+ return vaddlv_u8(a);
+#else
+ const uint16x4_t c = vpaddl_u8(a);
+ const uint32x2_t d = vpaddl_u16(c);
+ const uint64x1_t e = vpaddl_u32(d);
+ return static_cast<uint16_t>(vget_lane_u64(e, 0));
+#endif // defined(__aarch64__)
+}
+
+inline uint32_t SumVector(const uint32x4_t a) {
+#if defined(__aarch64__)
+ return vaddvq_u32(a);
+#else
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
+ return static_cast<uint32_t>(vget_lane_u64(c, 0));
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Transpose.
+
+// Transpose 32 bit elements such that:
+// a: 00 01
+// b: 02 03
+// returns
+// val[0]: 00 02
+// val[1]: 01 03
+inline uint8x8x2_t Interleave32(const uint8x8_t a, const uint8x8_t b) {
+ const uint32x2_t a_32 = vreinterpret_u32_u8(a);
+ const uint32x2_t b_32 = vreinterpret_u32_u8(b);
+ const uint32x2x2_t c = vtrn_u32(a_32, b_32);
+ const uint8x8x2_t d = {vreinterpret_u8_u32(c.val[0]),
+ vreinterpret_u8_u32(c.val[1])};
+ return d;
+}
+
+// Swap high and low 32 bit elements.
+inline uint8x8_t Transpose32(const uint8x8_t a) {
+ const uint32x2_t b = vrev64_u32(vreinterpret_u32_u8(a));
+ return vreinterpret_u8_u32(b);
+}
+
+// Implement vtrnq_s64().
+// Input:
+// a0: 00 01 02 03 04 05 06 07
+// a1: 16 17 18 19 20 21 22 23
+// Output:
+// b0.val[0]: 00 01 02 03 16 17 18 19
+// b0.val[1]: 04 05 06 07 20 21 22 23
+inline int16x8x2_t VtrnqS64(int32x4_t a0, int32x4_t a1) {
+ int16x8x2_t b0;
+ b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
+ vreinterpret_s16_s32(vget_low_s32(a1)));
+ b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
+ vreinterpret_s16_s32(vget_high_s32(a1)));
+ return b0;
+}
+
+inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) {
+ uint16x8x2_t b0;
+ b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
+ vreinterpret_u16_u32(vget_low_u32(a1)));
+ b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
+ vreinterpret_u16_u32(vget_high_u32(a1)));
+ return b0;
+}
+
+// Input:
+// a: 00 01 02 03 10 11 12 13
+// b: 20 21 22 23 30 31 32 33
+// Output:
+// Note that columns [1] and [2] are transposed.
+// a: 00 10 20 30 02 12 22 32
+// b: 01 11 21 31 03 13 23 33
+inline void Transpose4x4(uint8x8_t* a, uint8x8_t* b) {
+ const uint16x4x2_t c =
+ vtrn_u16(vreinterpret_u16_u8(*a), vreinterpret_u16_u8(*b));
+ const uint32x2x2_t d =
+ vtrn_u32(vreinterpret_u32_u16(c.val[0]), vreinterpret_u32_u16(c.val[1]));
+ const uint8x8x2_t e =
+ vtrn_u8(vreinterpret_u8_u32(d.val[0]), vreinterpret_u8_u32(d.val[1]));
+ *a = e.val[0];
+ *b = e.val[1];
+}
+
+// Reversible if the x4 values are packed next to each other.
+// x4 input / x8 output:
+// a0: 00 01 02 03 40 41 42 43
+// a1: 10 11 12 13 50 51 52 53
+// a2: 20 21 22 23 60 61 62 63
+// a3: 30 31 32 33 70 71 72 73
+// x8 input / x4 output:
+// a0: 00 10 20 30 40 50 60 70
+// a1: 01 11 21 31 41 51 61 71
+// a2: 02 12 22 32 42 52 62 72
+// a3: 03 13 23 33 43 53 63 73
+inline void Transpose8x4(uint8x8_t* a0, uint8x8_t* a1, uint8x8_t* a2,
+ uint8x8_t* a3) {
+ const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
+ const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);
+
+ const uint16x4x2_t c0 =
+ vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
+ const uint16x4x2_t c1 =
+ vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));
+
+ *a0 = vreinterpret_u8_u16(c0.val[0]);
+ *a1 = vreinterpret_u8_u16(c1.val[0]);
+ *a2 = vreinterpret_u8_u16(c0.val[1]);
+ *a3 = vreinterpret_u8_u16(c1.val[1]);
+}
+
+// Input:
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// a[4]: 40 41 42 43 44 45 46 47
+// a[5]: 50 51 52 53 54 55 56 57
+// a[6]: 60 61 62 63 64 65 66 67
+// a[7]: 70 71 72 73 74 75 76 77
+
+// Output:
+// a[0]: 00 10 20 30 40 50 60 70
+// a[1]: 01 11 21 31 41 51 61 71
+// a[2]: 02 12 22 32 42 52 62 72
+// a[3]: 03 13 23 33 43 53 63 73
+// a[4]: 04 14 24 34 44 54 64 74
+// a[5]: 05 15 25 35 45 55 65 75
+// a[6]: 06 16 26 36 46 56 66 76
+// a[7]: 07 17 27 37 47 57 67 77
+inline void Transpose8x8(int8x8_t a[8]) {
+ // Swap 8 bit elements. Goes from:
+ // a[0]: 00 01 02 03 04 05 06 07
+ // a[1]: 10 11 12 13 14 15 16 17
+ // a[2]: 20 21 22 23 24 25 26 27
+ // a[3]: 30 31 32 33 34 35 36 37
+ // a[4]: 40 41 42 43 44 45 46 47
+ // a[5]: 50 51 52 53 54 55 56 57
+ // a[6]: 60 61 62 63 64 65 66 67
+ // a[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
+ // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
+ // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
+ // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77
+ const int8x16x2_t b0 =
+ vtrnq_s8(vcombine_s8(a[0], a[4]), vcombine_s8(a[1], a[5]));
+ const int8x16x2_t b1 =
+ vtrnq_s8(vcombine_s8(a[2], a[6]), vcombine_s8(a[3], a[7]));
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
+ // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
+ // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
+ // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77
+ const int16x8x2_t c0 = vtrnq_s16(vreinterpretq_s16_s8(b0.val[0]),
+ vreinterpretq_s16_s8(b1.val[0]));
+ const int16x8x2_t c1 = vtrnq_s16(vreinterpretq_s16_s8(b0.val[1]),
+ vreinterpretq_s16_s8(b1.val[1]));
+
+ // Unzip 32 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ const int32x4x2_t d0 = vuzpq_s32(vreinterpretq_s32_s16(c0.val[0]),
+ vreinterpretq_s32_s16(c1.val[0]));
+ const int32x4x2_t d1 = vuzpq_s32(vreinterpretq_s32_s16(c0.val[1]),
+ vreinterpretq_s32_s16(c1.val[1]));
+
+ a[0] = vreinterpret_s8_s32(vget_low_s32(d0.val[0]));
+ a[1] = vreinterpret_s8_s32(vget_high_s32(d0.val[0]));
+ a[2] = vreinterpret_s8_s32(vget_low_s32(d1.val[0]));
+ a[3] = vreinterpret_s8_s32(vget_high_s32(d1.val[0]));
+ a[4] = vreinterpret_s8_s32(vget_low_s32(d0.val[1]));
+ a[5] = vreinterpret_s8_s32(vget_high_s32(d0.val[1]));
+ a[6] = vreinterpret_s8_s32(vget_low_s32(d1.val[1]));
+ a[7] = vreinterpret_s8_s32(vget_high_s32(d1.val[1]));
+}
+
+// Unsigned.
+inline void Transpose8x8(uint8x8_t a[8]) {
+ const uint8x16x2_t b0 =
+ vtrnq_u8(vcombine_u8(a[0], a[4]), vcombine_u8(a[1], a[5]));
+ const uint8x16x2_t b1 =
+ vtrnq_u8(vcombine_u8(a[2], a[6]), vcombine_u8(a[3], a[7]));
+
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+
+ const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c1.val[0]));
+ const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c1.val[1]));
+
+ a[0] = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
+ a[1] = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
+ a[2] = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
+ a[3] = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
+ a[4] = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
+ a[5] = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+ a[6] = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
+ a[7] = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
+}
+
+inline void Transpose8x8(uint8x8_t in[8], uint8x16_t out[4]) {
+ const uint8x16x2_t a0 =
+ vtrnq_u8(vcombine_u8(in[0], in[4]), vcombine_u8(in[1], in[5]));
+ const uint8x16x2_t a1 =
+ vtrnq_u8(vcombine_u8(in[2], in[6]), vcombine_u8(in[3], in[7]));
+
+ const uint16x8x2_t b0 = vtrnq_u16(vreinterpretq_u16_u8(a0.val[0]),
+ vreinterpretq_u16_u8(a1.val[0]));
+ const uint16x8x2_t b1 = vtrnq_u16(vreinterpretq_u16_u8(a0.val[1]),
+ vreinterpretq_u16_u8(a1.val[1]));
+
+ const uint32x4x2_t c0 = vuzpq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vuzpq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+
+ out[0] = vreinterpretq_u8_u32(c0.val[0]);
+ out[1] = vreinterpretq_u8_u32(c1.val[0]);
+ out[2] = vreinterpretq_u8_u32(c0.val[1]);
+ out[3] = vreinterpretq_u8_u32(c1.val[1]);
+}
+
+// Input:
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// a[4]: 40 41 42 43 44 45 46 47
+// a[5]: 50 51 52 53 54 55 56 57
+// a[6]: 60 61 62 63 64 65 66 67
+// a[7]: 70 71 72 73 74 75 76 77
+
+// Output:
+// a[0]: 00 10 20 30 40 50 60 70
+// a[1]: 01 11 21 31 41 51 61 71
+// a[2]: 02 12 22 32 42 52 62 72
+// a[3]: 03 13 23 33 43 53 63 73
+// a[4]: 04 14 24 34 44 54 64 74
+// a[5]: 05 15 25 35 45 55 65 75
+// a[6]: 06 16 26 36 46 56 66 76
+// a[7]: 07 17 27 37 47 57 67 77
+inline void Transpose8x8(int16x8_t a[8]) {
+ const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
+ const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
+ const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
+ const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ const int16x8x2_t d0 = VtrnqS64(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = VtrnqS64(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = VtrnqS64(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = VtrnqS64(c1.val[1], c3.val[1]);
+
+ a[0] = d0.val[0];
+ a[1] = d1.val[0];
+ a[2] = d2.val[0];
+ a[3] = d3.val[0];
+ a[4] = d0.val[1];
+ a[5] = d1.val[1];
+ a[6] = d2.val[1];
+ a[7] = d3.val[1];
+}
+
+// Unsigned.
+inline void Transpose8x8(uint16x8_t a[8]) {
+ const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
+ const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
+ const uint16x8x2_t b2 = vtrnq_u16(a[4], a[5]);
+ const uint16x8x2_t b3 = vtrnq_u16(a[6], a[7]);
+
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+ const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
+ vreinterpretq_u32_u16(b3.val[0]));
+ const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
+ vreinterpretq_u32_u16(b3.val[1]));
+
+ const uint16x8x2_t d0 = VtrnqU64(c0.val[0], c2.val[0]);
+ const uint16x8x2_t d1 = VtrnqU64(c1.val[0], c3.val[0]);
+ const uint16x8x2_t d2 = VtrnqU64(c0.val[1], c2.val[1]);
+ const uint16x8x2_t d3 = VtrnqU64(c1.val[1], c3.val[1]);
+
+ a[0] = d0.val[0];
+ a[1] = d1.val[0];
+ a[2] = d2.val[0];
+ a[3] = d3.val[0];
+ a[4] = d0.val[1];
+ a[5] = d1.val[1];
+ a[6] = d2.val[1];
+ a[7] = d3.val[1];
+}
+
+// Input:
+// a[0]: 00 01 02 03 04 05 06 07 80 81 82 83 84 85 86 87
+// a[1]: 10 11 12 13 14 15 16 17 90 91 92 93 94 95 96 97
+// a[2]: 20 21 22 23 24 25 26 27 a0 a1 a2 a3 a4 a5 a6 a7
+// a[3]: 30 31 32 33 34 35 36 37 b0 b1 b2 b3 b4 b5 b6 b7
+// a[4]: 40 41 42 43 44 45 46 47 c0 c1 c2 c3 c4 c5 c6 c7
+// a[5]: 50 51 52 53 54 55 56 57 d0 d1 d2 d3 d4 d5 d6 d7
+// a[6]: 60 61 62 63 64 65 66 67 e0 e1 e2 e3 e4 e5 e6 e7
+// a[7]: 70 71 72 73 74 75 76 77 f0 f1 f2 f3 f4 f5 f6 f7
+
+// Output:
+// a[0]: 00 10 20 30 40 50 60 70 80 90 a0 b0 c0 d0 e0 f0
+// a[1]: 01 11 21 31 41 51 61 71 81 91 a1 b1 c1 d1 e1 f1
+// a[2]: 02 12 22 32 42 52 62 72 82 92 a2 b2 c2 d2 e2 f2
+// a[3]: 03 13 23 33 43 53 63 73 83 93 a3 b3 c3 d3 e3 f3
+// a[4]: 04 14 24 34 44 54 64 74 84 94 a4 b4 c4 d4 e4 f4
+// a[5]: 05 15 25 35 45 55 65 75 85 95 a5 b5 c5 d5 e5 f5
+// a[6]: 06 16 26 36 46 56 66 76 86 96 a6 b6 c6 d6 e6 f6
+// a[7]: 07 17 27 37 47 57 67 77 87 97 a7 b7 c7 d7 e7 f7
+inline void Transpose8x16(uint8x16_t a[8]) {
+ // b0.val[0]: 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96
+ // b0.val[1]: 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97
+ // b1.val[0]: 20 30 22 32 24 34 26 36 a0 b0 a2 b2 a4 b4 a6 b6
+ // b1.val[1]: 21 31 23 33 25 35 27 37 a1 b1 a3 b3 a5 b5 a7 b7
+ // b2.val[0]: 40 50 42 52 44 54 46 56 c0 d0 c2 d2 c4 d4 c6 d6
+ // b2.val[1]: 41 51 43 53 45 55 47 57 c1 d1 c3 d3 c5 d5 c7 d7
+ // b3.val[0]: 60 70 62 72 64 74 66 76 e0 f0 e2 f2 e4 f4 e6 f6
+ // b3.val[1]: 61 71 63 73 65 75 67 77 e1 f1 e3 f3 e5 f5 e7 f7
+ const uint8x16x2_t b0 = vtrnq_u8(a[0], a[1]);
+ const uint8x16x2_t b1 = vtrnq_u8(a[2], a[3]);
+ const uint8x16x2_t b2 = vtrnq_u8(a[4], a[5]);
+ const uint8x16x2_t b3 = vtrnq_u8(a[6], a[7]);
+
+ // c0.val[0]: 00 10 20 30 04 14 24 34 80 90 a0 b0 84 94 a4 b4
+ // c0.val[1]: 02 12 22 32 06 16 26 36 82 92 a2 b2 86 96 a6 b6
+ // c1.val[0]: 01 11 21 31 05 15 25 35 81 91 a1 b1 85 95 a5 b5
+ // c1.val[1]: 03 13 23 33 07 17 27 37 83 93 a3 b3 87 97 a7 b7
+ // c2.val[0]: 40 50 60 70 44 54 64 74 c0 d0 e0 f0 c4 d4 e4 f4
+ // c2.val[1]: 42 52 62 72 46 56 66 76 c2 d2 e2 f2 c6 d6 e6 f6
+ // c3.val[0]: 41 51 61 71 45 55 65 75 c1 d1 e1 f1 c5 d5 e5 f5
+ // c3.val[1]: 43 53 63 73 47 57 67 77 c3 d3 e3 f3 c7 d7 e7 f7
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+ const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
+ vreinterpretq_u16_u8(b3.val[0]));
+ const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
+ vreinterpretq_u16_u8(b3.val[1]));
+
+ // d0.val[0]: 00 10 20 30 40 50 60 70 80 90 a0 b0 c0 d0 e0 f0
+ // d0.val[1]: 04 14 24 34 44 54 64 74 84 94 a4 b4 c4 d4 e4 f4
+ // d1.val[0]: 01 11 21 31 41 51 61 71 81 91 a1 b1 c1 d1 e1 f1
+ // d1.val[1]: 05 15 25 35 45 55 65 75 85 95 a5 b5 c5 d5 e5 f5
+ // d2.val[0]: 02 12 22 32 42 52 62 72 82 92 a2 b2 c2 d2 e2 f2
+ // d2.val[1]: 06 16 26 36 46 56 66 76 86 96 a6 b6 c6 d6 e6 f6
+ // d3.val[0]: 03 13 23 33 43 53 63 73 83 93 a3 b3 c3 d3 e3 f3
+ // d3.val[1]: 07 17 27 37 47 57 67 77 87 97 a7 b7 c7 d7 e7 f7
+ const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c2.val[0]));
+ const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
+ vreinterpretq_u32_u16(c3.val[0]));
+ const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c2.val[1]));
+ const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
+ vreinterpretq_u32_u16(c3.val[1]));
+
+ a[0] = vreinterpretq_u8_u32(d0.val[0]);
+ a[1] = vreinterpretq_u8_u32(d1.val[0]);
+ a[2] = vreinterpretq_u8_u32(d2.val[0]);
+ a[3] = vreinterpretq_u8_u32(d3.val[0]);
+ a[4] = vreinterpretq_u8_u32(d0.val[1]);
+ a[5] = vreinterpretq_u8_u32(d1.val[1]);
+ a[6] = vreinterpretq_u8_u32(d2.val[1]);
+ a[7] = vreinterpretq_u8_u32(d3.val[1]);
+}
+
+inline int16x8_t ZeroExtend(const uint8x8_t in) {
+ return vreinterpretq_s16_u16(vmovl_u8(in));
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_ENABLE_NEON
+#endif // LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
diff --git a/src/dsp/arm/convolve_neon.cc b/src/dsp/arm/convolve_neon.cc
new file mode 100644
index 0000000..fd9b912
--- /dev/null
+++ b/src/dsp/arm/convolve_neon.cc
@@ -0,0 +1,3105 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/convolve.inc"
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
+// sum from exceeding the range of int16_t.
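+// Note (descriptive): the accumulation is done in uint16_t lanes, so a
+// vmlsl_u8() step may wrap; because the true signed result fits in int16_t,
+// reinterpreting the final value as int16_t recovers it (e.g. a true value of
+// -5 wraps to 0xfffb, which reads back as -5).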
+template <int filter_index, bool negative_outside_taps = false>
+int16x8_t SumOnePassTaps(const uint8x8_t* const src,
+ const uint8x8_t* const taps) {
+ uint16x8_t sum;
+ if (filter_index == 0) {
+ // 6 taps. + - + + - +
+ sum = vmull_u8(src[0], taps[0]);
+ // Unsigned overflow will result in a valid int16_t value.
+ sum = vmlsl_u8(sum, src[1], taps[1]);
+ sum = vmlal_u8(sum, src[2], taps[2]);
+ sum = vmlal_u8(sum, src[3], taps[3]);
+ sum = vmlsl_u8(sum, src[4], taps[4]);
+ sum = vmlal_u8(sum, src[5], taps[5]);
+ } else if (filter_index == 1 && negative_outside_taps) {
+ // 6 taps. - + + + + -
+ // Set a base we can subtract from.
+ sum = vmull_u8(src[1], taps[1]);
+ sum = vmlsl_u8(sum, src[0], taps[0]);
+ sum = vmlal_u8(sum, src[2], taps[2]);
+ sum = vmlal_u8(sum, src[3], taps[3]);
+ sum = vmlal_u8(sum, src[4], taps[4]);
+ sum = vmlsl_u8(sum, src[5], taps[5]);
+ } else if (filter_index == 1) {
+ // 6 taps. All are positive.
+ sum = vmull_u8(src[0], taps[0]);
+ sum = vmlal_u8(sum, src[1], taps[1]);
+ sum = vmlal_u8(sum, src[2], taps[2]);
+ sum = vmlal_u8(sum, src[3], taps[3]);
+ sum = vmlal_u8(sum, src[4], taps[4]);
+ sum = vmlal_u8(sum, src[5], taps[5]);
+ } else if (filter_index == 2) {
+ // 8 taps. - + - + + - + -
+ sum = vmull_u8(src[1], taps[1]);
+ sum = vmlsl_u8(sum, src[0], taps[0]);
+ sum = vmlsl_u8(sum, src[2], taps[2]);
+ sum = vmlal_u8(sum, src[3], taps[3]);
+ sum = vmlal_u8(sum, src[4], taps[4]);
+ sum = vmlsl_u8(sum, src[5], taps[5]);
+ sum = vmlal_u8(sum, src[6], taps[6]);
+ sum = vmlsl_u8(sum, src[7], taps[7]);
+ } else if (filter_index == 3) {
+ // 2 taps. All are positive.
+ sum = vmull_u8(src[0], taps[0]);
+ sum = vmlal_u8(sum, src[1], taps[1]);
+ } else if (filter_index == 4) {
+ // 4 taps. - + + -
+ sum = vmull_u8(src[1], taps[1]);
+ sum = vmlsl_u8(sum, src[0], taps[0]);
+ sum = vmlal_u8(sum, src[2], taps[2]);
+ sum = vmlsl_u8(sum, src[3], taps[3]);
+ } else if (filter_index == 5) {
+ // 4 taps. All are positive.
+ sum = vmull_u8(src[0], taps[0]);
+ sum = vmlal_u8(sum, src[1], taps[1]);
+ sum = vmlal_u8(sum, src[2], taps[2]);
+ sum = vmlal_u8(sum, src[3], taps[3]);
+ }
+ return vreinterpretq_s16_u16(sum);
+}
+
+template <int filter_index, bool negative_outside_taps>
+int16x8_t SumHorizontalTaps(const uint8_t* const src,
+ const uint8x8_t* const v_tap) {
+ uint8x8_t v_src[8];
+ const uint8x16_t src_long = vld1q_u8(src);
+ int16x8_t sum;
+
+ if (filter_index < 2) {
+ v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 1);
+ } else if (filter_index == 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+ v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap);
+ } else if (filter_index == 3) {
+ v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 3);
+ } else if (filter_index > 3) {
+ v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 2);
+ }
+ return sum;
+}
+
+template <int filter_index, bool negative_outside_taps>
+uint8x8_t SimpleHorizontalTaps(const uint8_t* const src,
+ const uint8x8_t* const v_tap) {
+ int16x8_t sum =
+ SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
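+  // Worked example (assuming kInterRoundBitsHorizontal == 3 and
+  // kFilterBits == 7): the two-pass form (((x + 2) >> 2) + 8) >> 4 equals
+  // (x + 2 + 32) >> 6. vqrshrun_n_s16(sum, 6) supplies the "+ 32" and ">> 6",
+  // so only the "+ 2" (|first_shift_rounding_bit|) is added explicitly here.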
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+ return vqrshrun_n_s16(sum, kFilterBits - 1);
+}
+
+template <int filter_index, bool negative_outside_taps>
+uint16x8_t HorizontalTaps8To16(const uint8_t* const src,
+ const uint8x8_t* const v_tap) {
+ const int16x8_t sum =
+ SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
+
+ return vreinterpretq_u16_s16(
+ vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+}
+
+template <int filter_index>
+int16x8_t SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const uint8x8_t* const v_tap) {
+ uint16x8_t sum;
+ const uint8x8_t input0 = vld1_u8(src);
+ src += src_stride;
+ const uint8x8_t input1 = vld1_u8(src);
+ uint8x8x2_t input = vzip_u8(input0, input1);
+
+ if (filter_index == 3) {
+ // tap signs : + +
+ sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
+ sum = vmlal_u8(sum, input.val[1], v_tap[4]);
+ } else if (filter_index == 4) {
+ // tap signs : - + + -
+ sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
+ sum = vmlsl_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]);
+ sum = vmlal_u8(sum, input.val[1], v_tap[4]);
+ sum = vmlsl_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
+ } else {
+ // tap signs : + + + +
+ sum = vmull_u8(RightShift<4 * 8>(input.val[0]), v_tap[2]);
+ sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
+ sum = vmlal_u8(sum, input.val[1], v_tap[4]);
+ sum = vmlal_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
+ }
+
+ return vreinterpretq_s16_u16(sum);
+}
+
+template <int filter_index>
+uint8x8_t SimpleHorizontalTaps2x2(const uint8_t* src,
+ const ptrdiff_t src_stride,
+ const uint8x8_t* const v_tap) {
+ int16x8_t sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+ return vqrshrun_n_s16(sum, kFilterBits - 1);
+}
+
+template <int filter_index>
+uint16x8_t HorizontalTaps8To16_2x2(const uint8_t* src,
+ const ptrdiff_t src_stride,
+ const uint8x8_t* const v_tap) {
+ const int16x8_t sum =
+ SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ return vreinterpretq_u16_s16(
+ vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+}
+
+template <int num_taps, int step, int filter_index,
+ bool negative_outside_taps = true, bool is_2d = false,
+ bool is_compound = false>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int width, const int height,
+ const uint8x8_t* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+
+ // 4 tap filters are never used when width > 4.
+ if (num_taps != 4 && width > 4) {
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ if (is_2d || is_compound) {
+ const uint16x8_t v_sum =
+ HorizontalTaps8To16<filter_index, negative_outside_taps>(&src[x],
+ v_tap);
+ vst1q_u16(&dest16[x], v_sum);
+ } else {
+ const uint8x8_t result =
+ SimpleHorizontalTaps<filter_index, negative_outside_taps>(&src[x],
+ v_tap);
+ vst1_u8(&dest8[x], result);
+ }
+ x += step;
+ } while (x < width);
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (++y < height);
+ return;
+ }
+
+  // The horizontal pass only needs to account for |num_taps| == 2 and 4 when
+ // |width| <= 4.
+ assert(width <= 4);
+ assert(num_taps <= 4);
+ if (num_taps <= 4) {
+ if (width == 4) {
+ int y = 0;
+ do {
+ if (is_2d || is_compound) {
+ const uint16x8_t v_sum =
+ HorizontalTaps8To16<filter_index, negative_outside_taps>(src,
+ v_tap);
+ vst1_u16(dest16, vget_low_u16(v_sum));
+ } else {
+ const uint8x8_t result =
+ SimpleHorizontalTaps<filter_index, negative_outside_taps>(src,
+ v_tap);
+ StoreLo4(&dest8[0], result);
+ }
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (++y < height);
+ return;
+ }
+
+ if (!is_compound) {
+ int y = 0;
+ do {
+ if (is_2d) {
+ const uint16x8_t sum =
+ HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+ dest16[0] = vgetq_lane_u16(sum, 0);
+ dest16[1] = vgetq_lane_u16(sum, 2);
+ dest16 += pred_stride;
+ dest16[0] = vgetq_lane_u16(sum, 1);
+ dest16[1] = vgetq_lane_u16(sum, 3);
+ dest16 += pred_stride;
+ } else {
+ const uint8x8_t sum =
+ SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ dest8[0] = vget_lane_u8(sum, 0);
+ dest8[1] = vget_lane_u8(sum, 2);
+ dest8 += pred_stride;
+
+ dest8[0] = vget_lane_u8(sum, 1);
+ dest8[1] = vget_lane_u8(sum, 3);
+ dest8 += pred_stride;
+ }
+
+ src += src_stride << 1;
+ y += 2;
+ } while (y < height - 1);
+
+ // The 2d filters have an odd |height| because the horizontal pass
+ // generates context for the vertical pass.
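+      // (Note: |height| here is the intermediate height, presumably the output
+      // height plus the vertical tap count minus one; an even output height
+      // plus an odd value is odd.)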
+ if (is_2d) {
+ assert(height % 2 == 1);
+ uint16x8_t sum;
+ const uint8x8_t input = vld1_u8(src);
+ if (filter_index == 3) { // |num_taps| == 2
+ sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
+ sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
+ } else if (filter_index == 4) {
+ sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
+ sum = vmlsl_u8(sum, RightShift<2 * 8>(input), v_tap[2]);
+ sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
+ sum = vmlsl_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
+ } else {
+ assert(filter_index == 5);
+ sum = vmull_u8(RightShift<2 * 8>(input), v_tap[2]);
+ sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
+ sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
+ sum = vmlal_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
+ }
+ // |sum| contains an int16_t value.
+ sum = vreinterpretq_u16_s16(vrshrq_n_s16(
+ vreinterpretq_s16_u16(sum), kInterRoundBitsHorizontal - 1));
+ Store2<0>(dest16, sum);
+ }
+ }
+ }
+}
+
+// Process 16 bit inputs and output 32 bits.
+template <int num_taps, bool is_compound>
+inline int16x4_t Sum2DVerticalTaps4(const int16x4_t* const src,
+ const int16x8_t taps) {
+ const int16x4_t taps_lo = vget_low_s16(taps);
+ const int16x4_t taps_hi = vget_high_s16(taps);
+ int32x4_t sum;
+ if (num_taps == 8) {
+ sum = vmull_lane_s16(src[0], taps_lo, 0);
+ sum = vmlal_lane_s16(sum, src[1], taps_lo, 1);
+ sum = vmlal_lane_s16(sum, src[2], taps_lo, 2);
+ sum = vmlal_lane_s16(sum, src[3], taps_lo, 3);
+ sum = vmlal_lane_s16(sum, src[4], taps_hi, 0);
+ sum = vmlal_lane_s16(sum, src[5], taps_hi, 1);
+ sum = vmlal_lane_s16(sum, src[6], taps_hi, 2);
+ sum = vmlal_lane_s16(sum, src[7], taps_hi, 3);
+ } else if (num_taps == 6) {
+ sum = vmull_lane_s16(src[0], taps_lo, 1);
+ sum = vmlal_lane_s16(sum, src[1], taps_lo, 2);
+ sum = vmlal_lane_s16(sum, src[2], taps_lo, 3);
+ sum = vmlal_lane_s16(sum, src[3], taps_hi, 0);
+ sum = vmlal_lane_s16(sum, src[4], taps_hi, 1);
+ sum = vmlal_lane_s16(sum, src[5], taps_hi, 2);
+ } else if (num_taps == 4) {
+ sum = vmull_lane_s16(src[0], taps_lo, 2);
+ sum = vmlal_lane_s16(sum, src[1], taps_lo, 3);
+ sum = vmlal_lane_s16(sum, src[2], taps_hi, 0);
+ sum = vmlal_lane_s16(sum, src[3], taps_hi, 1);
+ } else if (num_taps == 2) {
+ sum = vmull_lane_s16(src[0], taps_lo, 3);
+ sum = vmlal_lane_s16(sum, src[1], taps_hi, 0);
+ }
+
+ if (is_compound) {
+ return vqrshrn_n_s32(sum, kInterRoundBitsCompoundVertical - 1);
+ }
+
+ return vqrshrn_n_s32(sum, kInterRoundBitsVertical - 1);
+}
+
+template <int num_taps, bool is_compound>
+int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src,
+ const int16x8_t taps) {
+ const int16x4_t taps_lo = vget_low_s16(taps);
+ const int16x4_t taps_hi = vget_high_s16(taps);
+ int32x4_t sum_lo, sum_hi;
+ if (num_taps == 8) {
+ sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 0);
+ sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_lo, 3);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_lo, 3);
+
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[6]), taps_hi, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[6]), taps_hi, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[7]), taps_hi, 3);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[7]), taps_hi, 3);
+ } else if (num_taps == 6) {
+ sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 1);
+ sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 3);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 3);
+
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 2);
+ } else if (num_taps == 4) {
+ sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 2);
+ sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 3);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 3);
+
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_hi, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_hi, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 1);
+ } else if (num_taps == 2) {
+ sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 3);
+ sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 3);
+
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_hi, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_hi, 0);
+ }
+
+ if (is_compound) {
+ return vcombine_s16(
+ vqrshrn_n_s32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ vqrshrn_n_s32(sum_hi, kInterRoundBitsCompoundVertical - 1));
+ }
+
+ return vcombine_s16(vqrshrn_n_s32(sum_lo, kInterRoundBitsVertical - 1),
+ vqrshrn_n_s32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const int16x8_t taps) {
+ assert(width >= 8);
+ constexpr int next_row = num_taps - 1;
+ // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
+ const ptrdiff_t src_stride = width;
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ int x = 0;
+ do {
+ int16x8_t srcs[8];
+ const uint16_t* src_x = src + x;
+ srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+ srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+ srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+ srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+ }
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+
+ const int16x8_t sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ vst1q_u16(dst16 + x + y * dst_stride, vreinterpretq_u16_s16(sum));
+ } else {
+ vst1_u8(dst8 + x + y * dst_stride, vqmovun_s16(sum));
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (++y < height);
+ x += 8;
+ } while (x < width);
+}
+
+// Take advantage of |src_stride| == |width| to process two rows at a time.
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical4xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const int16x8_t taps) {
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ int16x8_t srcs[9];
+ srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ if (num_taps >= 4) {
+ srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[1] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[2]));
+ if (num_taps >= 6) {
+ srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[3] = vcombine_s16(vget_high_s16(srcs[2]), vget_low_s16(srcs[4]));
+ if (num_taps == 8) {
+ srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[5] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[6]));
+ }
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[num_taps - 1] = vcombine_s16(vget_high_s16(srcs[num_taps - 2]),
+ vget_low_s16(srcs[num_taps]));
+
+ const int16x8_t sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = vreinterpretq_u16_s16(sum);
+ vst1q_u16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const uint8x8_t results = vqmovun_s16(sum);
+
+ StoreLo4(dst8, results);
+ dst8 += dst_stride;
+ StoreHi4(dst8, results);
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ if (num_taps == 8) {
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ }
+ }
+ }
+ y += 2;
+ } while (y < height);
+}
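+// In the 4xH case above each 8-lane vector holds two adjacent 4-wide rows
+// (the intermediate stride is 4), so e.g. srcs[1] is formed from the high
+// half of srcs[0] and the low half of srcs[2] rather than from a new load.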
+
+// Take advantage of |src_stride| == |width| to process four rows at a time.
+template <int num_taps>
+void Filter2DVertical2xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const int16x8_t taps) {
+ constexpr int next_row = (num_taps < 6) ? 4 : 8;
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ int16x8_t srcs[9];
+ srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ if (num_taps >= 6) {
+ srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+ if (num_taps == 8) {
+ srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+ srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ if (num_taps == 2) {
+ srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+ } else if (num_taps == 4) {
+ srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+ srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+ srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+ } else if (num_taps == 6) {
+ srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+ srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+ srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
+ } else if (num_taps == 8) {
+ srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
+ srcs[6] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[8]));
+ srcs[7] = vextq_s16(srcs[4], srcs[8], 6);
+ }
+
+ const int16x8_t sum =
+ SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
+ const uint8x8_t results = vqmovun_s16(sum);
+
+ Store2<0>(dst8, results);
+ dst8 += dst_stride;
+ Store2<1>(dst8, results);
+ // When |height| <= 4 only the 2 and 4 tap variants are used, so this early
+ // return is only needed for them; the 6 and 8 tap variants imply
+ // |height| > 4.
+ if (num_taps <= 4 && height == 2) return;
+ dst8 += dst_stride;
+ Store2<2>(dst8, results);
+ dst8 += dst_stride;
+ Store2<3>(dst8, results);
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ if (num_taps == 6) {
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ } else if (num_taps == 8) {
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ }
+
+ y += 4;
+ } while (y < height);
+}
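+// In the 2xH case above each 8-lane vector holds four adjacent 2-wide rows,
+// so advancing the window by one, two or three rows is done by extracting
+// lanes (vextq_s16 / vcombine_s16) instead of separate loads.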
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+ const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+ const ptrdiff_t dst_stride, const int width, const int height,
+ const int filter_id, const int filter_index) {
+ // Duplicate the absolute value for each tap. Negative taps are corrected
+ // by using the vmlsl_u8 instruction. Positive taps use vmlal_u8.
+ uint8x8_t v_tap[kSubPixelTaps];
+ assert(filter_id != 0);
+
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ v_tap[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
+ }
+
+ if (filter_index == 2) { // 8 tap.
+ FilterHorizontal<8, 8, 2, true, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 1) { // 6 tap.
+ // Check if outside taps are positive.
+ if ((filter_id == 1) | (filter_id == 15)) {
+ FilterHorizontal<6, 8, 1, false, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else {
+ FilterHorizontal<6, 8, 1, true, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ }
+ } else if (filter_index == 0) { // 6 tap.
+ FilterHorizontal<6, 8, 0, true, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 4) { // 4 tap.
+ FilterHorizontal<4, 8, 4, true, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 5) { // 4 tap.
+ FilterHorizontal<4, 8, 5, true, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else { // 2 tap.
+ FilterHorizontal<2, 8, 3, true, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ }
+}
+
+void Convolve2D_NEON(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+
+ // The output of the horizontal filter is guaranteed to fit in 16 bits.
+ uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+ const int intermediate_height = height + vertical_taps - 1;
+
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+
+ DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
+ width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+
+ // Vertical filter.
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ const int16x8_t taps = vmovl_s8(
+ vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
+
+ if (vertical_taps == 8) {
+ if (width == 2) {
+ Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (vertical_taps == 6) {
+ if (width == 2) {
+ Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (vertical_taps == 4) {
+ if (width == 2) {
+ Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else { // |vertical_taps| == 2
+ if (width == 2) {
+ Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ }
+}
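+// The 2D path above is two passes: the horizontal filter writes a 16-bit
+// intermediate buffer with stride |width| and |height| + vertical_taps - 1
+// rows, and the vertical filter then reads it back through the
+// width-specialized kernels.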
+
+// There are many opportunities for overreading in scaled convolve, because the
+// range of starting points for filter windows is anywhere from 0 to 16 for 8
+// destination pixels, and the window sizes range from 2 to 8. To accommodate
+// this range concisely, we use |grade_x| to mean the most steps in src that can
+// be traversed in a single |step_x| increment, i.e. 1 or 2. When grade_x is 2,
+// we are guaranteed to exceed 8 whole steps in src for every 8 |step_x|
+// increments. The first load covers the initial elements of src_x, while the
+// final load covers the taps.
+template <int grade_x>
+inline uint8x8x3_t LoadSrcVals(const uint8_t* src_x) {
+ uint8x8x3_t ret;
+ const uint8x16_t src_val = vld1q_u8(src_x);
+ ret.val[0] = vget_low_u8(src_val);
+ ret.val[1] = vget_high_u8(src_val);
+ if (grade_x > 1) {
+ ret.val[2] = vld1_u8(src_x + 16);
+ }
+ return ret;
+}
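+// For example, with an 8 tap filter and step_x == 1536 (1.5 pixels per
+// output), the last of 8 output lanes starts 7 * 1536 >> 10 = 10 pixels in,
+// so its final tap reads byte 10 + 7 = 17 and the extra 8-byte load
+// (grade_x == 2) is needed; a single 16-byte load suffices only while
+// step_x * 7 stays below (16 - num_taps) << kScaleSubPixelBits.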
+
+// Pre-transpose the 2 tap filters in |kAbsHalfSubPixelFilters|[3]
+inline uint8x16_t GetPositive2TapFilter(const int tap_index) {
+ assert(tap_index < 2);
+ alignas(
+ 16) static constexpr uint8_t kAbsHalfSubPixel2TapFilterColumns[2][16] = {
+ {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
+ {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
+
+ return vld1q_u8(kAbsHalfSubPixel2TapFilterColumns[tap_index]);
+}
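+// For example, filter_id 5 selects lane 5 of each column: {44, 20}, the two
+// nonzero taps of kAbsHalfSubPixelFilters[3][5], i.e. half of the bilinear
+// taps {88, 40}.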
+
+template <int grade_x>
+inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
+ const ptrdiff_t src_stride,
+ const int width, const int subpixel_x,
+ const int step_x,
+ const int intermediate_height,
+ int16_t* intermediate) {
+ // Account for the 0-taps that precede the 2 nonzero taps.
+ const int kernel_offset = 3;
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ const uint8x16_t filter_taps0 = GetPositive2TapFilter(0);
+ const uint8x16_t filter_taps1 = GetPositive2TapFilter(1);
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+
+ int p = subpixel_x;
+ if (width <= 4) {
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask);
+ // This is a special case. The 2-tap filter has no negative taps, so we
+ // can use unsigned values.
+ // For each x, a lane of tapsK has
+ // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+ // on x.
+ const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
+ VQTbl1U8(filter_taps1, filter_indices)};
+ int y = 0;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x16_t src_vals = vld1q_u8(src_x);
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+
+ // For each x, a lane of srcK contains src_x[k].
+ const uint8x8_t src[2] = {
+ VQTbl1U8(src_vals, src_indices),
+ VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
+
+ vst1q_s16(intermediate,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate += kIntermediateStride;
+ } while (++y < intermediate_height);
+ return;
+ }
+
+ // |width| >= 8
+ int x = 0;
+ do {
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ int16_t* intermediate_x = intermediate + x;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+ filter_index_mask);
+ // This is a special case. The 2-tap filter has no negative taps, so we
+ // can use unsigned values.
+ // For each x, a lane of tapsK has
+ // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+ // on x.
+ const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
+ VQTbl1U8(filter_taps1, filter_indices)};
+ int y = 0;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+
+ // For each x, a lane of srcK contains src_x[k].
+ const uint8x8_t src[2] = {
+ vtbl3_u8(src_vals, src_indices),
+ vtbl3_u8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
+
+ vst1q_s16(intermediate_x,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate_x += kIntermediateStride;
+ } while (++y < intermediate_height);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
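+// For example, with step_x == 1280 and a zero starting fraction, the eight
+// lanes get subpel offsets {0, 1280, 2560, ...}; shifting by
+// kScaleSubPixelBits (10) gives source indices {0, 1, 2, 3, 5, 6, 7, 8} for
+// the table lookups, and (offset >> 6) & 15 gives filter_ids
+// {0, 4, 8, 12, 0, 4, 8, 12}.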
+
+// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[5].
+inline uint8x16_t GetPositive4TapFilter(const int tap_index) {
+ assert(tap_index < 4);
+ alignas(
+ 16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
+ {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
+ {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+ {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+ {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
+
+ return vld1q_u8(kSubPixel4TapPositiveFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width <= 4.
+void ConvolveKernelHorizontalPositive4Tap(
+ const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
+ const int step_x, const int intermediate_height, int16_t* intermediate) {
+ const int kernel_offset = 2;
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const uint8x16_t filter_taps0 = GetPositive4TapFilter(0);
+ const uint8x16_t filter_taps1 = GetPositive4TapFilter(1);
+ const uint8x16_t filter_taps2 = GetPositive4TapFilter(2);
+ const uint8x16_t filter_taps3 = GetPositive4TapFilter(3);
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+ const int p = subpixel_x;
+ // The first filter (filter_id 0) is special: a single full-strength center
+ // tap (128 before halving, 64 here).
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t filter_indices = vand_u8(
+ vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), filter_index_mask);
+ // Note that filter_id depends on x.
+ // For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
+ const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
+ VQTbl1U8(filter_taps1, filter_indices),
+ VQTbl1U8(filter_taps2, filter_indices),
+ VQTbl1U8(filter_taps3, filter_indices)};
+
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+ int y = 0;
+ do {
+ // Load a pool of samples to select from using stepped index vectors.
+ const uint8x16_t src_vals = vld1q_u8(src_x);
+
+ // For each x, src[k] contains src_x[k] for each tap position k.
+ // Whereas taps come from different arrays, src pixels are drawn from the
+ // same contiguous line.
+ const uint8x8_t src[4] = {
+ VQTbl1U8(src_vals, src_indices),
+ VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1))),
+ VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(2))),
+ VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(3)))};
+
+ vst1q_s16(intermediate,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/5>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+
+ src_x += src_stride;
+ intermediate += kIntermediateStride;
+ } while (++y < intermediate_height);
+}
+
+// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4].
+inline uint8x16_t GetSigned4TapFilter(const int tap_index) {
+ assert(tap_index < 4);
+ alignas(16) static constexpr uint8_t
+ kAbsHalfSubPixel4TapSignedFilterColumns[4][16] = {
+ {0, 2, 4, 5, 6, 6, 7, 6, 6, 5, 5, 5, 4, 3, 2, 1},
+ {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+ {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+ {0, 1, 2, 3, 4, 5, 5, 5, 6, 6, 7, 6, 6, 5, 4, 2}};
+
+ return vld1q_u8(kAbsHalfSubPixel4TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width <= 4.
+inline void ConvolveKernelHorizontalSigned4Tap(
+ const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
+ const int step_x, const int intermediate_height, int16_t* intermediate) {
+ const int kernel_offset = 2;
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const uint8x16_t filter_taps0 = GetSigned4TapFilter(0);
+ const uint8x16_t filter_taps1 = GetSigned4TapFilter(1);
+ const uint8x16_t filter_taps2 = GetSigned4TapFilter(2);
+ const uint8x16_t filter_taps3 = GetSigned4TapFilter(3);
+ const uint16x4_t index_steps = vmul_n_u16(vcreate_u16(0x0003000200010000),
+ static_cast<uint16_t>(step_x));
+
+ const int p = subpixel_x;
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x4_t p_fraction = vdup_n_u16(p & 1023);
+ const uint16x4_t subpel_index_offsets = vadd_u16(index_steps, p_fraction);
+ const uint8x8_t filter_index_offsets = vshrn_n_u16(
+ vcombine_u16(subpel_index_offsets, vdup_n_u16(0)), kFilterIndexShift);
+ const uint8x8_t filter_indices =
+ vand_u8(filter_index_offsets, filter_index_mask);
+ // Note that filter_id depends on x.
+ // For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
+ const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
+ VQTbl1U8(filter_taps1, filter_indices),
+ VQTbl1U8(filter_taps2, filter_indices),
+ VQTbl1U8(filter_taps3, filter_indices)};
+
+ const uint8x8_t src_indices_base =
+ vshr_n_u8(filter_index_offsets, kScaleSubPixelBits - kFilterIndexShift);
+
+ const uint8x8_t src_indices[4] = {src_indices_base,
+ vadd_u8(src_indices_base, vdup_n_u8(1)),
+ vadd_u8(src_indices_base, vdup_n_u8(2)),
+ vadd_u8(src_indices_base, vdup_n_u8(3))};
+
+ int y = 0;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x16_t src_vals = vld1q_u8(src_x);
+
+ // For each x, src[k] contains src_x[k] for each tap position k.
+ // Whereas taps come from different arrays, src pixels are drawn from the
+ // same contiguous line.
+ const uint8x8_t src[4] = {
+ VQTbl1U8(src_vals, src_indices[0]), VQTbl1U8(src_vals, src_indices[1]),
+ VQTbl1U8(src_vals, src_indices[2]), VQTbl1U8(src_vals, src_indices[3])};
+
+ vst1q_s16(intermediate,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/4>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate += kIntermediateStride;
+ } while (++y < intermediate_height);
+}
+
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
+inline uint8x16_t GetSigned6TapFilter(const int tap_index) {
+ assert(tap_index < 6);
+ alignas(16) static constexpr uint8_t
+ kAbsHalfSubPixel6TapSignedFilterColumns[6][16] = {
+ {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
+ {0, 3, 5, 6, 7, 7, 8, 7, 7, 6, 6, 6, 5, 4, 2, 1},
+ {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+ {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+ {0, 1, 2, 4, 5, 6, 6, 6, 7, 7, 8, 7, 7, 6, 5, 3},
+ {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+
+ return vld1q_u8(kAbsHalfSubPixel6TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned6Tap(
+ const uint8_t* src, const ptrdiff_t src_stride, const int width,
+ const int subpixel_x, const int step_x, const int intermediate_height,
+ int16_t* intermediate) {
+ const int kernel_offset = 1;
+ const uint8x8_t one = vdup_n_u8(1);
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ uint8x16_t filter_taps[6];
+ for (int i = 0; i < 6; ++i) {
+ filter_taps[i] = GetSigned6TapFilter(i);
+ }
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+ int x = 0;
+ int p = subpixel_x;
+ do {
+ // Avoid over-reading outside the reference boundaries. The stepped-index
+ // load window can cover up to 24 source bytes.
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ int16_t* intermediate_x = intermediate + x;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+ uint8x8_t src_lookup[6];
+ src_lookup[0] = src_indices;
+ for (int i = 1; i < 6; ++i) {
+ src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+ }
+
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+ filter_index_mask);
+ // For each x, a lane of taps[k] has
+ // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+ // on x.
+ uint8x8_t taps[6];
+ for (int i = 0; i < 6; ++i) {
+ taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
+ }
+ int y = 0;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+ const uint8x8_t src[6] = {
+ vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
+ vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
+ vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5])};
+
+ vst1q_s16(intermediate_x,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/0>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate_x += kIntermediateStride;
+ } while (++y < intermediate_height);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[1]. This filter
+// has mixed positive and negative outer taps which are handled in
+// GetMixed6TapFilter().
+inline uint8x16_t GetPositive6TapFilter(const int tap_index) {
+ assert(tap_index < 4);
+ alignas(16) static constexpr uint8_t
+ kAbsHalfSubPixel6TapPositiveFilterColumns[4][16] = {
+ {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
+ {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+ {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+ {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14}};
+
+ return vld1q_u8(kAbsHalfSubPixel6TapPositiveFilterColumns[tap_index]);
+}
+
+inline int8x16_t GetMixed6TapFilter(const int tap_index) {
+ assert(tap_index < 2);
+ alignas(
+ 16) static constexpr int8_t kHalfSubPixel6TapMixedFilterColumns[2][16] = {
+ {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
+ {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
+
+ return vld1q_s8(kHalfSubPixel6TapMixedFilterColumns[tap_index]);
+}
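+// Together these columns reconstruct the halved 6 tap filters of filter
+// index 1, with the signs of the outer taps kept in the mixed columns: e.g.
+// for filter_id 1 the mixed columns give outer taps {1, 0} and the positive
+// columns give inner taps {14, 31, 17, 1}, i.e. {1, 14, 31, 17, 1, 0}, which
+// sums to 64.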
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalMixed6Tap(
+ const uint8_t* src, const ptrdiff_t src_stride, const int width,
+ const int subpixel_x, const int step_x, const int intermediate_height,
+ int16_t* intermediate) {
+ const int kernel_offset = 1;
+ const uint8x8_t one = vdup_n_u8(1);
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ uint8x8_t taps[4];
+ int16x8_t mixed_taps[2];
+ uint8x16_t positive_filter_taps[4];
+ for (int i = 0; i < 4; ++i) {
+ positive_filter_taps[i] = GetPositive6TapFilter(i);
+ }
+ int8x16_t mixed_filter_taps[2];
+ mixed_filter_taps[0] = GetMixed6TapFilter(0);
+ mixed_filter_taps[1] = GetMixed6TapFilter(1);
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+ int x = 0;
+ int p = subpixel_x;
+ do {
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ int16_t* intermediate_x = intermediate + x;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+ uint8x8_t src_lookup[6];
+ src_lookup[0] = src_indices;
+ for (int i = 1; i < 6; ++i) {
+ src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+ }
+
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+ filter_index_mask);
+ // For each x, a lane of taps[k] has
+ // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+ // on x.
+ for (int i = 0; i < 4; ++i) {
+ taps[i] = VQTbl1U8(positive_filter_taps[i], filter_indices);
+ }
+ mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices));
+ mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices));
+
+ int y = 0;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+ int16x8_t sum_mixed = vmulq_s16(
+ mixed_taps[0], ZeroExtend(vtbl3_u8(src_vals, src_lookup[0])));
+ sum_mixed = vmlaq_s16(sum_mixed, mixed_taps[1],
+ ZeroExtend(vtbl3_u8(src_vals, src_lookup[5])));
+ uint16x8_t sum = vreinterpretq_u16_s16(sum_mixed);
+ sum = vmlal_u8(sum, taps[0], vtbl3_u8(src_vals, src_lookup[1]));
+ sum = vmlal_u8(sum, taps[1], vtbl3_u8(src_vals, src_lookup[2]));
+ sum = vmlal_u8(sum, taps[2], vtbl3_u8(src_vals, src_lookup[3]));
+ sum = vmlal_u8(sum, taps[3], vtbl3_u8(src_vals, src_lookup[4]));
+
+ vst1q_s16(intermediate_x, vrshrq_n_s16(vreinterpretq_s16_u16(sum),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate_x += kIntermediateStride;
+ } while (++y < intermediate_height);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+// Pre-transpose the 8 tap filters in |kAbsHalfSubPixelFilters|[2].
+inline uint8x16_t GetSigned8TapFilter(const int tap_index) {
+ assert(tap_index < 8);
+ alignas(16) static constexpr uint8_t
+ kAbsHalfSubPixel8TapSignedFilterColumns[8][16] = {
+ {0, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0},
+ {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
+ {0, 3, 6, 9, 11, 11, 12, 12, 12, 11, 10, 9, 7, 5, 3, 1},
+ {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
+ {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
+ {0, 1, 3, 5, 7, 9, 10, 11, 12, 12, 12, 11, 11, 9, 6, 3},
+ {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
+ {0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1}};
+
+ return vld1q_u8(kAbsHalfSubPixel8TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned8Tap(
+ const uint8_t* src, const ptrdiff_t src_stride, const int width,
+ const int subpixel_x, const int step_x, const int intermediate_height,
+ int16_t* intermediate) {
+ const uint8x8_t one = vdup_n_u8(1);
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ uint8x8_t taps[8];
+ uint8x16_t filter_taps[8];
+ for (int i = 0; i < 8; ++i) {
+ filter_taps[i] = GetSigned8TapFilter(i);
+ }
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+ int x = 0;
+ int p = subpixel_x;
+ do {
+ const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
+ int16_t* intermediate_x = intermediate + x;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+ uint8x8_t src_lookup[8];
+ src_lookup[0] = src_indices;
+ for (int i = 1; i < 8; ++i) {
+ src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+ }
+
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+ filter_index_mask);
+ // For each x, a lane of taps[k] has
+ // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+ // on x.
+ for (int i = 0; i < 8; ++i) {
+ taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
+ }
+
+ int y = 0;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+ const uint8x8_t src[8] = {
+ vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
+ vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
+ vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5]),
+ vtbl3_u8(src_vals, src_lookup[6]), vtbl3_u8(src_vals, src_lookup[7])};
+
+ vst1q_s16(intermediate_x,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/2>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate_x += kIntermediateStride;
+ } while (++y < intermediate_height);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+// This function handles blocks of width 2 or 4.
+template <int num_taps, int grade_y, int width, bool is_compound>
+void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
+ const int filter_index, const int step_y,
+ const int height, void* dest,
+ const ptrdiff_t dest_stride) {
+ constexpr ptrdiff_t src_stride = kIntermediateStride;
+ const int16_t* src_y = src;
+ // |dest| is 16-bit in compound mode, Pixel otherwise.
+ uint16_t* dest16_y = static_cast<uint16_t*>(dest);
+ uint8_t* dest_y = static_cast<uint8_t*>(dest);
+ int16x4_t s[num_taps + grade_y];
+
+ int p = subpixel_y & 1023;
+ int prev_p = p;
+ int y = 0;
+ do { // y < height
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = vld1_s16(src_y + i * src_stride);
+ }
+ int filter_id = (p >> 6) & kSubPixelMask;
+ int16x8_t filter =
+ vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+ int16x4_t sums = Sum2DVerticalTaps4<num_taps, is_compound>(s, filter);
+ if (is_compound) {
+ assert(width != 2);
+ const uint16x4_t result = vreinterpret_u16_s16(sums);
+ vst1_u16(dest16_y, result);
+ } else {
+ const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
+ if (width == 2) {
+ Store2<0>(dest_y, result);
+ } else {
+ StoreLo4(dest_y, result);
+ }
+ }
+ p += step_y;
+ const int p_diff =
+ (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+ prev_p = p;
+ // Here we load extra source in case it is needed. If |p_diff| == 0, these
+ // values will be unused, but it's faster to load than to branch.
+ s[num_taps] = vld1_s16(src_y + num_taps * src_stride);
+ if (grade_y > 1) {
+ s[num_taps + 1] = vld1_s16(src_y + (num_taps + 1) * src_stride);
+ }
+ dest16_y += dest_stride;
+ dest_y += dest_stride;
+
+ filter_id = (p >> 6) & kSubPixelMask;
+ filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+ sums = Sum2DVerticalTaps4<num_taps, is_compound>(&s[p_diff], filter);
+ if (is_compound) {
+ assert(width != 2);
+ const uint16x4_t result = vreinterpret_u16_s16(sums);
+ vst1_u16(dest16_y, result);
+ } else {
+ const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
+ if (width == 2) {
+ Store2<0>(dest_y, result);
+ } else {
+ StoreLo4(dest_y, result);
+ }
+ }
+ p += step_y;
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+ prev_p = p;
+ dest16_y += dest_stride;
+ dest_y += dest_stride;
+
+ y += 2;
+ } while (y < height);
+}
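+// For example, with step_y == 1536 and a zero starting fraction, successive
+// output rows read source rows 0, 1, 3, 4, 6, ... with filter_ids
+// (p >> 6) & 15 = 0, 8, 0, 8, ...; the second row of each pair reuses the
+// vectors already in |s|, offset by |p_diff|.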
+
+template <int num_taps, int grade_y, bool is_compound>
+inline void ConvolveVerticalScale(const int16_t* src, const int width,
+ const int subpixel_y, const int filter_index,
+ const int step_y, const int height,
+ void* dest, const ptrdiff_t dest_stride) {
+ constexpr ptrdiff_t src_stride = kIntermediateStride;
+ // A possible improvement is to use arithmetic to decide how many times to
+ // apply filters to the same source rows before checking whether to load new
+ // ones.
+ // However, this will only improve performance with very small step sizes.
+ int16x8_t s[num_taps + grade_y];
+ // |dest| is 16-bit in compound mode, Pixel otherwise.
+ uint16_t* dest16_y;
+ uint8_t* dest_y;
+
+ int x = 0;
+ do { // x < width
+ const int16_t* src_x = src + x;
+ const int16_t* src_y = src_x;
+ dest16_y = static_cast<uint16_t*>(dest) + x;
+ dest_y = static_cast<uint8_t*>(dest) + x;
+ int p = subpixel_y & 1023;
+ int prev_p = p;
+ int y = 0;
+ do { // y < height
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = vld1q_s16(src_y + i * src_stride);
+ }
+ int filter_id = (p >> 6) & kSubPixelMask;
+ int16x8_t filter =
+ vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+ int16x8_t sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(s, filter);
+ if (is_compound) {
+ vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
+ } else {
+ vst1_u8(dest_y, vqmovun_s16(sum));
+ }
+ p += step_y;
+ const int p_diff =
+ (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+ // |grade_y| > 1 always means p_diff > 0, so load vectors that may be
+ // needed. Otherwise, we only need to load one vector because |p_diff|
+ // can't exceed 1.
+ s[num_taps] = vld1q_s16(src_y + num_taps * src_stride);
+ if (grade_y > 1) {
+ s[num_taps + 1] = vld1q_s16(src_y + (num_taps + 1) * src_stride);
+ }
+ dest16_y += dest_stride;
+ dest_y += dest_stride;
+
+ filter_id = (p >> 6) & kSubPixelMask;
+ filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+ sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(&s[p_diff], filter);
+ if (is_compound) {
+ vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
+ } else {
+ vst1_u8(dest_y, vqmovun_s16(sum));
+ }
+ p += step_y;
+ src_y = src_x + (p >> kScaleSubPixelBits) * src_stride;
+ prev_p = p;
+ dest16_y += dest_stride;
+ dest_y += dest_stride;
+
+ y += 2;
+ } while (y < height);
+ x += 8;
+ } while (x < width);
+}
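+// |grade_y| mirrors |grade_x| for the vertical pass: step_y <= 1024 advances
+// at most one source row per output row (grade_y == 1), while larger steps
+// may advance two (grade_y == 2), matching the dispatch in
+// ConvolveScale2D_NEON below.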
+
+template <bool is_compound>
+void ConvolveScale2D_NEON(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index, const int subpixel_x,
+ const int subpixel_y, const int step_x,
+ const int step_y, const int width, const int height,
+ void* prediction, const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ assert(step_x <= 2048);
+ const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
+ const int intermediate_height =
+ (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+ kScaleSubPixelBits) +
+ num_vert_taps;
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+ (2 * kMaxSuperBlockSizeInPixels + 8)];
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [3, 5].
+ // Similarly for height.
+ int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ int16_t* intermediate = intermediate_result;
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference);
+ const int vert_kernel_offset = (8 - num_vert_taps) / 2;
+ src += vert_kernel_offset * src_stride;
+
+ // Derive the maximum value of |step_x| at which all source values fit in one
+ // 16-byte load. Final index is src_x + |num_taps| - 1 < 16
+ // step_x*7 is the final base subpel index for the shuffle mask for filter
+ // inputs in each iteration on large blocks. When step_x is large, we need a
+ // larger structure and use a larger table lookup in order to gather all
+ // filter inputs.
+ // |num_taps| - 1 is the shuffle index of the final filter input.
+ const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+ const int kernel_start_ceiling = 16 - num_horiz_taps;
+ // This truncated quotient |grade_x_threshold| selects |step_x| such that:
+ // (step_x * 7) >> kScaleSubPixelBits < single load limit
+ const int grade_x_threshold =
+ (kernel_start_ceiling << kScaleSubPixelBits) / 7;
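+ // For example, an 8 tap filter gives kernel_start_ceiling = 8 and
+ // grade_x_threshold = (8 << 10) / 7 = 1170, while the 2 tap filter gives
+ // (14 << 10) / 7 = 2048; combined with the assert above, the 2 tap kernels
+ // below therefore always take the grade_x == 1 path.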
+ switch (filter_index) {
+ case 0:
+ if (step_x > grade_x_threshold) {
+ ConvolveKernelHorizontalSigned6Tap<2>(
+ src, src_stride, width, subpixel_x, step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveKernelHorizontalSigned6Tap<1>(
+ src, src_stride, width, subpixel_x, step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 1:
+ if (step_x > grade_x_threshold) {
+ ConvolveKernelHorizontalMixed6Tap<2>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+
+ } else {
+ ConvolveKernelHorizontalMixed6Tap<1>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 2:
+ if (step_x > grade_x_threshold) {
+ ConvolveKernelHorizontalSigned8Tap<2>(
+ src, src_stride, width, subpixel_x, step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveKernelHorizontalSigned8Tap<1>(
+ src, src_stride, width, subpixel_x, step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 3:
+ if (step_x > grade_x_threshold) {
+ ConvolveKernelHorizontal2Tap<2>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveKernelHorizontal2Tap<1>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 4:
+ assert(width <= 4);
+ ConvolveKernelHorizontalSigned4Tap(src, src_stride, subpixel_x, step_x,
+ intermediate_height, intermediate);
+ break;
+ default:
+ assert(filter_index == 5);
+ ConvolveKernelHorizontalPositive4Tap(src, src_stride, subpixel_x, step_x,
+ intermediate_height, intermediate);
+ }
+ // Vertical filter.
+ filter_index = GetFilterIndex(vertical_filter_index, height);
+ intermediate = intermediate_result;
+
+ switch (filter_index) {
+ case 0:
+ case 1:
+ if (step_y <= 1024) {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<6, 1, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<6, 1, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<6, 1, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ } else {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<6, 2, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<6, 2, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<6, 2, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ }
+ break;
+ case 2:
+ if (step_y <= 1024) {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<8, 1, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<8, 1, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<8, 1, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ } else {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<8, 2, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<8, 2, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<8, 2, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ }
+ break;
+ case 3:
+ if (step_y <= 1024) {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<2, 1, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<2, 1, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<2, 1, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ } else {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<2, 2, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<2, 2, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<2, 2, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ }
+ break;
+ case 4:
+ default:
+ assert(filter_index == 4 || filter_index == 5);
+ assert(height <= 4);
+ if (step_y <= 1024) {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<4, 1, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<4, 1, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<4, 1, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ } else {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<4, 2, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<4, 2, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<4, 2, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ }
+ }
+}
+
+void ConvolveHorizontal_NEON(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int /*vertical_filter_index*/,
+ const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ // Set |src| to the outermost tap.
+ const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+ horizontal_filter_id, filter_index);
+}
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// Vertical calculations.
+uint16x8_t Compound1DShift(const int16x8_t sum) {
+ return vreinterpretq_u16_s16(
+ vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+}
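+// For example, a flat region of value 100: full-precision taps sum to 128, so
+// 12800 >> kInterRoundBitsHorizontal (3) = 1600; with the halved taps the sum
+// is 6400 and the (kInterRoundBitsHorizontal - 1) shift gives 6400 >> 2 =
+// 1600, the same compound value.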
+
+template <int filter_index, bool is_compound = false,
+ bool negative_outside_taps = false>
+void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int width, const int height,
+ const uint8x8_t* const taps) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ const int next_row = num_taps - 1;
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+ assert(width >= 8);
+
+ int x = 0;
+ do {
+ const uint8_t* src_x = src + x;
+ uint8x8_t srcs[8];
+ srcs[0] = vld1_u8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = vld1_u8(src_x);
+ src_x += src_stride;
+ srcs[2] = vld1_u8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = vld1_u8(src_x);
+ src_x += src_stride;
+ srcs[4] = vld1_u8(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = vld1_u8(src_x);
+ src_x += src_stride;
+ srcs[6] = vld1_u8(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[next_row] = vld1_u8(src_x);
+ src_x += src_stride;
+
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = Compound1DShift(sums);
+ vst1q_u16(dst16 + x + y * dst_stride, results);
+ } else {
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+ vst1_u8(dst8 + x + y * dst_stride, results);
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (++y < height);
+ x += 8;
+ } while (x < width);
+}
+
+template <int filter_index, bool is_compound = false,
+ bool negative_outside_taps = false>
+void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int height, const uint8x8_t* const taps) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ uint8x8_t srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = vdup_n_u8(0);
+
+ srcs[0] = Load4(src);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ srcs[0] = Load4<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[2] = Load4<0>(src, srcs[2]);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = Compound1DShift(sums);
+
+ vst1q_u16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ StoreLo4(dst8, results);
+ dst8 += dst_stride;
+ StoreHi4(dst8, results);
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ y += 2;
+ } while (y < height);
+ } else if (num_taps == 4) {
+ srcs[4] = vdup_n_u8(0);
+
+ srcs[0] = Load4(src);
+ src += src_stride;
+ srcs[0] = Load4<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[2] = Load4(src);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+
+ int y = 0;
+ do {
+ srcs[2] = Load4<1>(src, srcs[2]);
+ src += src_stride;
+ srcs[4] = Load4<0>(src, srcs[4]);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = Compound1DShift(sums);
+
+ vst1q_u16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ StoreLo4(dst8, results);
+ dst8 += dst_stride;
+ StoreHi4(dst8, results);
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ y += 2;
+ } while (y < height);
+ } else if (num_taps == 6) {
+ srcs[6] = vdup_n_u8(0);
+
+ srcs[0] = Load4(src);
+ src += src_stride;
+ srcs[0] = Load4<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[2] = Load4(src);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+ srcs[2] = Load4<1>(src, srcs[2]);
+ src += src_stride;
+ srcs[4] = Load4(src);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+
+ int y = 0;
+ do {
+ srcs[4] = Load4<1>(src, srcs[4]);
+ src += src_stride;
+ srcs[6] = Load4<0>(src, srcs[6]);
+ src += src_stride;
+ srcs[5] = vext_u8(srcs[4], srcs[6], 4);
+
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = Compound1DShift(sums);
+
+ vst1q_u16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ StoreLo4(dst8, results);
+ dst8 += dst_stride;
+ StoreHi4(dst8, results);
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ y += 2;
+ } while (y < height);
+ } else if (num_taps == 8) {
+ srcs[8] = vdup_n_u8(0);
+
+ srcs[0] = Load4(src);
+ src += src_stride;
+ srcs[0] = Load4<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[2] = Load4(src);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+ srcs[2] = Load4<1>(src, srcs[2]);
+ src += src_stride;
+ srcs[4] = Load4(src);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+ srcs[4] = Load4<1>(src, srcs[4]);
+ src += src_stride;
+ srcs[6] = Load4(src);
+ src += src_stride;
+ srcs[5] = vext_u8(srcs[4], srcs[6], 4);
+
+ int y = 0;
+ do {
+ srcs[6] = Load4<1>(src, srcs[6]);
+ src += src_stride;
+ srcs[8] = Load4<0>(src, srcs[8]);
+ src += src_stride;
+ srcs[7] = vext_u8(srcs[6], srcs[8], 4);
+
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = Compound1DShift(sums);
+
+ vst1q_u16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ StoreLo4(dst8, results);
+ dst8 += dst_stride;
+ StoreHi4(dst8, results);
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ y += 2;
+ } while (y < height);
+ }
+}
+
+template <int filter_index, bool negative_outside_taps = false>
+void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int height, const uint8x8_t* const taps) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ uint8x8_t srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = vdup_n_u8(0);
+
+ srcs[0] = Load2(src);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ srcs[2] = Load2<0>(src, srcs[2]);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[2], 2);
+
+ // This uses srcs[0]..srcs[1].
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ Store2<0>(dst8, results);
+ dst8 += dst_stride;
+ Store2<1>(dst8, results);
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2<2>(dst8, results);
+ dst8 += dst_stride;
+ Store2<3>(dst8, results);
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[2];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 4) {
+ srcs[4] = vdup_n_u8(0);
+
+ srcs[0] = Load2(src);
+ src += src_stride;
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ srcs[4] = Load2<0>(src, srcs[4]);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+
+ // This uses srcs[0]..srcs[3].
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ Store2<0>(dst8, results);
+ dst8 += dst_stride;
+ Store2<1>(dst8, results);
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2<2>(dst8, results);
+ dst8 += dst_stride;
+ Store2<3>(dst8, results);
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 6) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = vdup_n_u8(0);
+
+ srcs[0] = Load2(src);
+ src += src_stride;
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ srcs[4] = Load2(src);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+
+ int y = 0;
+ do {
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+ srcs[5] = vext_u8(srcs[4], srcs[8], 2);
+
+ // This uses srcs[0]..srcs[5].
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ Store2<0>(dst8, results);
+ dst8 += dst_stride;
+ Store2<1>(dst8, results);
+ dst8 += dst_stride;
+ Store2<2>(dst8, results);
+ dst8 += dst_stride;
+ Store2<3>(dst8, results);
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 8) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = vdup_n_u8(0);
+
+ srcs[0] = Load2(src);
+ src += src_stride;
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ srcs[4] = Load2(src);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+
+ int y = 0;
+ do {
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+ srcs[5] = vext_u8(srcs[4], srcs[8], 2);
+ srcs[8] = Load2<1>(src, srcs[8]);
+ src += src_stride;
+ srcs[6] = vext_u8(srcs[4], srcs[8], 4);
+ srcs[8] = Load2<2>(src, srcs[8]);
+ src += src_stride;
+ srcs[7] = vext_u8(srcs[4], srcs[8], 6);
+
+ // This uses srcs[0]..srcs[7].
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ Store2<0>(dst8, results);
+ dst8 += dst_stride;
+ Store2<1>(dst8, results);
+ dst8 += dst_stride;
+ Store2<2>(dst8, results);
+ dst8 += dst_stride;
+ Store2<3>(dst8, results);
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ y += 4;
+ } while (y < height);
+ }
+}
+
+// This function is a simplified version of Convolve2D_C.
+// It is called in single prediction mode, where only vertical filtering is
+// required.
+// The output is the single prediction of the block, clipped to valid pixel
+// range.
+void ConvolveVertical_NEON(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int vertical_filter_index,
+ const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ uint8x8_t taps[8];
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ taps[k] =
+ vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
+ }
+
+ if (filter_index == 0) { // 6 tap.
+ if (width == 2) {
+ FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height,
+ taps + 1);
+ } else if (width == 4) {
+ FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height,
+ taps + 1);
+ } else {
+ FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+ taps + 1);
+ }
+ } else if ((filter_index == 1) & ((vertical_filter_id == 1) |
+ (vertical_filter_id == 15))) { // 5 tap.
+ if (width == 2) {
+ FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height,
+ taps + 1);
+ } else if (width == 4) {
+ FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height,
+ taps + 1);
+ } else {
+ FilterVertical<1>(src, src_stride, dest, dest_stride, width, height,
+ taps + 1);
+ }
+ } else if ((filter_index == 1) &
+ ((vertical_filter_id == 7) | (vertical_filter_id == 8) |
+ (vertical_filter_id == 9))) { // 6 tap with weird negative taps.
+ if (width == 2) {
+ FilterVertical2xH<1,
+ /*negative_outside_taps=*/true>(
+ src, src_stride, dest, dest_stride, height, taps + 1);
+ } else if (width == 4) {
+ FilterVertical4xH<1, /*is_compound=*/false,
+ /*negative_outside_taps=*/true>(
+ src, src_stride, dest, dest_stride, height, taps + 1);
+ } else {
+ FilterVertical<1, /*is_compound=*/false, /*negative_outside_taps=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps + 1);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ if (width == 2) {
+ FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ if (width == 2) {
+ FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height,
+ taps + 3);
+ } else if (width == 4) {
+ FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height,
+ taps + 3);
+ } else {
+ FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+ taps + 3);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ // Outside taps are negative.
+ if (width == 2) {
+ FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height,
+ taps + 2);
+ } else if (width == 4) {
+ FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height,
+ taps + 2);
+ } else {
+ FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
+ taps + 2);
+ }
+ } else {
+ // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
+ // below map to 4 tap filters.
+ assert(filter_index == 5 ||
+ (filter_index == 1 &&
+ (vertical_filter_id == 2 || vertical_filter_id == 3 ||
+ vertical_filter_id == 4 || vertical_filter_id == 5 ||
+ vertical_filter_id == 6 || vertical_filter_id == 10 ||
+ vertical_filter_id == 11 || vertical_filter_id == 12 ||
+ vertical_filter_id == 13 || vertical_filter_id == 14)));
+ // According to GetNumTapsInFilter() this has 6 taps but here we are
+ // treating it as though it has 4.
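+    // |src| was offset above assuming 6 taps (two rows of context above the
+    // block); a 4 tap filter only needs one row, so advance |src| to
+    // compensate.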
+ if (filter_index == 1) src += src_stride;
+ if (width == 2) {
+ FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height,
+ taps + 2);
+ } else if (width == 4) {
+ FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height,
+ taps + 2);
+ } else {
+ FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
+ taps + 2);
+ }
+ }
+}
+
+void ConvolveCompoundCopy_NEON(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ const ptrdiff_t src_stride = reference_stride;
+ auto* dest = static_cast<uint16_t*>(prediction);
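+  // The compound path stores predictions at intermediate precision, so the
+  // 8-bit source pixels are widened and left-shifted by |final_shift|.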
+ constexpr int final_shift =
+ kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
+
+ if (width >= 16) {
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const uint8x16_t v_src = vld1q_u8(&src[x]);
+ const uint16x8_t v_dest_lo =
+ vshll_n_u8(vget_low_u8(v_src), final_shift);
+ const uint16x8_t v_dest_hi =
+ vshll_n_u8(vget_high_u8(v_src), final_shift);
+ vst1q_u16(&dest[x], v_dest_lo);
+ x += 8;
+ vst1q_u16(&dest[x], v_dest_hi);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ dest += width;
+ } while (++y < height);
+ } else if (width == 8) {
+ int y = 0;
+ do {
+ const uint8x8_t v_src = vld1_u8(&src[0]);
+ const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
+ vst1q_u16(&dest[0], v_dest);
+ src += src_stride;
+ dest += width;
+ } while (++y < height);
+ } else { /* width == 4 */
+ uint8x8_t v_src = vdup_n_u8(0);
+
+ int y = 0;
+ do {
+ v_src = Load4<0>(&src[0], v_src);
+ src += src_stride;
+ v_src = Load4<1>(&src[0], v_src);
+ src += src_stride;
+ const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
+ vst1q_u16(&dest[0], v_dest);
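+      // |v_dest| holds two rows of 4 values, so advance |dest| by 8.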
+ dest += 4 << 1;
+ y += 2;
+ } while (y < height);
+ }
+}
+
+void ConvolveCompoundVertical_NEON(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int vertical_filter_index,
+ const int /*horizontal_filter_id*/, const int vertical_filter_id,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* dest = static_cast<uint16_t*>(prediction);
+ assert(vertical_filter_id != 0);
+
+ uint8x8_t taps[8];
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ taps[k] =
+ vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
+ }
+
+ if (filter_index == 0) { // 6 tap.
+ if (width == 4) {
+ FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 1);
+ } else {
+ FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 1);
+ }
+ } else if ((filter_index == 1) & ((vertical_filter_id == 1) |
+ (vertical_filter_id == 15))) { // 5 tap.
+ if (width == 4) {
+ FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 1);
+ } else {
+ FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 1);
+ }
+ } else if ((filter_index == 1) &
+ ((vertical_filter_id == 7) | (vertical_filter_id == 8) |
+ (vertical_filter_id == 9))) { // 6 tap with weird negative taps.
+ if (width == 4) {
+ FilterVertical4xH<1, /*is_compound=*/true,
+ /*negative_outside_taps=*/true>(src, src_stride, dest,
+ 4, height, taps + 1);
+ } else {
+ FilterVertical<1, /*is_compound=*/true, /*negative_outside_taps=*/true>(
+ src, src_stride, dest, width, width, height, taps + 1);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ if (width == 4) {
+ FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ if (width == 4) {
+ FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 3);
+ } else {
+ FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 3);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ if (width == 4) {
+ FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 2);
+ } else {
+ FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 2);
+ }
+ } else {
+    // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
+    // below map to 4 tap filters.
+ assert(filter_index == 5 ||
+ (filter_index == 1 &&
+ (vertical_filter_id == 2 || vertical_filter_id == 3 ||
+ vertical_filter_id == 4 || vertical_filter_id == 5 ||
+ vertical_filter_id == 6 || vertical_filter_id == 10 ||
+ vertical_filter_id == 11 || vertical_filter_id == 12 ||
+ vertical_filter_id == 13 || vertical_filter_id == 14)));
+ // According to GetNumTapsInFilter() this has 6 taps but here we are
+ // treating it as though it has 4.
+ if (filter_index == 1) src += src_stride;
+ if (width == 4) {
+ FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 2);
+ } else {
+ FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 2);
+ }
+ }
+}
+
+void ConvolveCompoundHorizontal_NEON(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int horizontal_filter_index, const int /*vertical_filter_index*/,
+ const int horizontal_filter_id, const int /*vertical_filter_id*/,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* dest = static_cast<uint16_t*>(prediction);
+
+ DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
+ src, reference_stride, dest, width, width, height, horizontal_filter_id,
+ filter_index);
+}
+
+void ConvolveCompound2D_NEON(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [4, 5].
+ // Similarly for height.
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
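+  // The horizontal pass must also produce the rows of context that the
+  // vertical pass reads above and below the block, hence the extra
+  // |vertical_taps| - 1 rows.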
+ const int intermediate_height = height + vertical_taps - 1;
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* const src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride -
+ kHorizontalOffset;
+
+ DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+ src, src_stride, intermediate_result, width, width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+
+ // Vertical filter.
+ auto* dest = static_cast<uint16_t*>(prediction);
+ assert(vertical_filter_id != 0);
+
+ const ptrdiff_t dest_stride = width;
+ const int16x8_t taps = vmovl_s8(
+ vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
+
+ if (vertical_taps == 8) {
+ if (width == 4) {
+ Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<8, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else if (vertical_taps == 6) {
+ if (width == 4) {
+ Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<6, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else if (vertical_taps == 4) {
+ if (width == 4) {
+ Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<4, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else { // |vertical_taps| == 2
+ if (width == 4) {
+ Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<2, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ }
+}
+
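+// Averages each pixel with its right neighbor using a rounding halving add,
+// i.e. (src[x] + src[x + 1] + 1) >> 1, the half-pel horizontal average used
+// by intra block copy.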
+inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
+ const uint8x16_t left = vld1q_u8(src);
+ const uint8x16_t right = vld1q_u8(src + 1);
+ vst1q_u8(dst, vrhaddq_u8(left, right));
+}
+
+template <int width>
+inline void IntraBlockCopyHorizontal(const uint8_t* src,
+ const ptrdiff_t src_stride,
+ const int height, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
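+  // Within a row, |src| and |dst| are advanced 16 bytes per vector after the
+  // first (width - 16 in total), so the remainder strides above step the
+  // pointers to the start of the next row.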
+
+ int y = 0;
+ do {
+ HalfAddHorizontal(src, dst);
+ if (width >= 32) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ if (width >= 64) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ if (width == 128) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ }
+ }
+ }
+ src += src_remainder_stride;
+ dst += dst_remainder_stride;
+ } while (++y < height);
+}
+
+void ConvolveIntraBlockCopyHorizontal_NEON(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+    const int width, const int height, void* const prediction,
+    const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width == 128) {
+ IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 8) {
+ int y = 0;
+ do {
+ const uint8x8_t left = vld1_u8(src);
+ const uint8x8_t right = vld1_u8(src + 1);
+ vst1_u8(dest, vrhadd_u8(left, right));
+
+ src += reference_stride;
+ dest += pred_stride;
+ } while (++y < height);
+ } else if (width == 4) {
+ uint8x8_t left = vdup_n_u8(0);
+ uint8x8_t right = vdup_n_u8(0);
+ int y = 0;
+ do {
+ left = Load4<0>(src, left);
+ right = Load4<0>(src + 1, right);
+ src += reference_stride;
+ left = Load4<1>(src, left);
+ right = Load4<1>(src + 1, right);
+ src += reference_stride;
+
+ const uint8x8_t result = vrhadd_u8(left, right);
+
+ StoreLo4(dest, result);
+ dest += pred_stride;
+ StoreHi4(dest, result);
+ dest += pred_stride;
+ y += 2;
+ } while (y < height);
+ } else {
+ assert(width == 2);
+ uint8x8_t left = vdup_n_u8(0);
+ uint8x8_t right = vdup_n_u8(0);
+ int y = 0;
+ do {
+ left = Load2<0>(src, left);
+ right = Load2<0>(src + 1, right);
+ src += reference_stride;
+ left = Load2<1>(src, left);
+ right = Load2<1>(src + 1, right);
+ src += reference_stride;
+
+ const uint8x8_t result = vrhadd_u8(left, right);
+
+ Store2<0>(dest, result);
+ dest += pred_stride;
+ Store2<1>(dest, result);
+ dest += pred_stride;
+ y += 2;
+ } while (y < height);
+ }
+}
+
+template <int width>
+inline void IntraBlockCopyVertical(const uint8_t* src,
+ const ptrdiff_t src_stride, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+ uint8x16_t row[8], below[8];
+
+ row[0] = vld1q_u8(src);
+ if (width >= 32) {
+ src += 16;
+ row[1] = vld1q_u8(src);
+ if (width >= 64) {
+ src += 16;
+ row[2] = vld1q_u8(src);
+ src += 16;
+ row[3] = vld1q_u8(src);
+ if (width == 128) {
+ src += 16;
+ row[4] = vld1q_u8(src);
+ src += 16;
+ row[5] = vld1q_u8(src);
+ src += 16;
+ row[6] = vld1q_u8(src);
+ src += 16;
+ row[7] = vld1q_u8(src);
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ int y = 0;
+ do {
+ below[0] = vld1q_u8(src);
+ if (width >= 32) {
+ src += 16;
+ below[1] = vld1q_u8(src);
+ if (width >= 64) {
+ src += 16;
+ below[2] = vld1q_u8(src);
+ src += 16;
+ below[3] = vld1q_u8(src);
+ if (width == 128) {
+ src += 16;
+ below[4] = vld1q_u8(src);
+ src += 16;
+ below[5] = vld1q_u8(src);
+ src += 16;
+ below[6] = vld1q_u8(src);
+ src += 16;
+ below[7] = vld1q_u8(src);
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ vst1q_u8(dst, vrhaddq_u8(row[0], below[0]));
+ row[0] = below[0];
+ if (width >= 32) {
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[1], below[1]));
+ row[1] = below[1];
+ if (width >= 64) {
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[2], below[2]));
+ row[2] = below[2];
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[3], below[3]));
+ row[3] = below[3];
+        if (width == 128) {
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[4], below[4]));
+ row[4] = below[4];
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[5], below[5]));
+ row[5] = below[5];
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[6], below[6]));
+ row[6] = below[6];
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[7], below[7]));
+ row[7] = below[7];
+ }
+ }
+ }
+ dst += dst_remainder_stride;
+ } while (++y < height);
+}
+
+void ConvolveIntraBlockCopyVertical_NEON(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+ const int width, const int height, void* const prediction,
+ const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width == 128) {
+ IntraBlockCopyVertical<128>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopyVertical<64>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopyVertical<32>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopyVertical<16>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 8) {
+ uint8x8_t row, below;
+ row = vld1_u8(src);
+ src += reference_stride;
+
+ int y = 0;
+ do {
+ below = vld1_u8(src);
+ src += reference_stride;
+
+ vst1_u8(dest, vrhadd_u8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (++y < height);
+ } else if (width == 4) {
+ uint8x8_t row = Load4(src);
+ uint8x8_t below = vdup_n_u8(0);
+ src += reference_stride;
+
+ int y = 0;
+ do {
+ below = Load4<0>(src, below);
+ src += reference_stride;
+
+ StoreLo4(dest, vrhadd_u8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (++y < height);
+ } else {
+ assert(width == 2);
+ uint8x8_t row = Load2(src);
+ uint8x8_t below = vdup_n_u8(0);
+ src += reference_stride;
+
+ int y = 0;
+ do {
+ below = Load2<0>(src, below);
+ src += reference_stride;
+
+ Store2<0>(dest, vrhadd_u8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (++y < height);
+ }
+}
+
+template <int width>
+inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
+ const int height, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
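+  // Each |row| vector holds the horizontal pair sums src[x] + src[x + 1] of
+  // the current row; adding the next row's pair sums and rounding with
+  // vrshrn_n_u16(..., 2) yields the 2D average (tl + tr + bl + br + 2) >> 2.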
+ uint16x8_t row[16];
+ row[0] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ if (width >= 16) {
+ src += 8;
+ row[1] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ if (width >= 32) {
+ src += 8;
+ row[2] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[3] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ if (width >= 64) {
+ src += 8;
+ row[4] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[5] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[6] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[7] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ if (width == 128) {
+ src += 8;
+ row[8] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[9] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[10] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[11] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[12] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[13] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[14] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[15] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ }
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ int y = 0;
+ do {
+ const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2));
+ row[0] = below_0;
+ if (width >= 16) {
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_1 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[1], below_1), 2));
+ row[1] = below_1;
+ if (width >= 32) {
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_2 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[2], below_2), 2));
+ row[2] = below_2;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_3 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[3], below_3), 2));
+ row[3] = below_3;
+ if (width >= 64) {
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_4 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[4], below_4), 2));
+ row[4] = below_4;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_5 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[5], below_5), 2));
+ row[5] = below_5;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_6 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[6], below_6), 2));
+ row[6] = below_6;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_7 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[7], below_7), 2));
+ row[7] = below_7;
+ if (width == 128) {
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_8 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[8], below_8), 2));
+ row[8] = below_8;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_9 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[9], below_9), 2));
+ row[9] = below_9;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_10 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[10], below_10), 2));
+ row[10] = below_10;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_11 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[11], below_11), 2));
+ row[11] = below_11;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_12 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[12], below_12), 2));
+ row[12] = below_12;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_13 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[13], below_13), 2));
+ row[13] = below_13;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_14 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[14], below_14), 2));
+ row[14] = below_14;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_15 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[15], below_15), 2));
+ row[15] = below_15;
+ }
+ }
+ }
+ }
+ src += src_remainder_stride;
+ dst += dst_remainder_stride;
+ } while (++y < height);
+}
+
+void ConvolveIntraBlockCopy2D_NEON(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+ const int width, const int height, void* const prediction,
+ const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+  // Note: the filter reads one row beyond |height|. Because this function is
+  // only used for the u/v planes of intra block copy, such access is
+  // guaranteed to be within the prediction block.
+
+ if (width == 128) {
+ IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 8) {
+ IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 4) {
+ uint8x8_t left = Load4(src);
+ uint8x8_t right = Load4(src + 1);
+ src += reference_stride;
+
+ uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
+
+ int y = 0;
+ do {
+ left = Load4<0>(src, left);
+ right = Load4<0>(src + 1, right);
+ src += reference_stride;
+ left = Load4<1>(src, left);
+ right = Load4<1>(src + 1, right);
+ src += reference_stride;
+
+ const uint16x8_t below = vaddl_u8(left, right);
+
+ const uint8x8_t result = vrshrn_n_u16(
+ vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2);
+ StoreLo4(dest, result);
+ dest += pred_stride;
+ StoreHi4(dest, result);
+ dest += pred_stride;
+
+ row = vget_high_u16(below);
+ y += 2;
+ } while (y < height);
+ } else {
+ uint8x8_t left = Load2(src);
+ uint8x8_t right = Load2(src + 1);
+ src += reference_stride;
+
+ uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
+
+ int y = 0;
+ do {
+ left = Load2<0>(src, left);
+ right = Load2<0>(src + 1, right);
+ src += reference_stride;
+ left = Load2<2>(src, left);
+ right = Load2<2>(src + 1, right);
+ src += reference_stride;
+
+ const uint16x8_t below = vaddl_u8(left, right);
+
+ const uint8x8_t result = vrshrn_n_u16(
+ vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2);
+ Store2<0>(dest, result);
+ dest += pred_stride;
+ Store2<2>(dest, result);
+ dest += pred_stride;
+
+ row = vget_high_u16(below);
+ y += 2;
+ } while (y < height);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
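+  // The first index selects intra block copy, the second compound prediction,
+  // and the last two the presence of vertical and horizontal filtering.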
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_NEON;
+ dsp->convolve[0][0][1][0] = ConvolveVertical_NEON;
+ dsp->convolve[0][0][1][1] = Convolve2D_NEON;
+
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_NEON;
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_NEON;
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_NEON;
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_NEON;
+
+ dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_NEON;
+ dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_NEON;
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_NEON;
+
+ dsp->convolve_scale[0] = ConvolveScale2D_NEON<false>;
+ dsp->convolve_scale[1] = ConvolveScale2D_NEON<true>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void ConvolveInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/convolve_neon.h b/src/dsp/arm/convolve_neon.h
new file mode 100644
index 0000000..948ef4d
--- /dev/null
+++ b/src/dsp/arm/convolve_neon.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve. This function is not thread-safe.
+void ConvolveInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundCopy LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompound2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_ConvolveScale2D LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
diff --git a/src/dsp/arm/distance_weighted_blend_neon.cc b/src/dsp/arm/distance_weighted_blend_neon.cc
new file mode 100644
index 0000000..04952ab
--- /dev/null
+++ b/src/dsp/arm/distance_weighted_blend_neon.cc
@@ -0,0 +1,203 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/distance_weighted_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kInterPostRoundBit = 4;
+
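+// Computes weights[0] * pred0 + weights[1] * pred1, narrowed with rounding by
+// kInterPostRoundBit + 4 and saturated to int16_t. The extra 4 removes the
+// scale of the distance weights, which sum to 16.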
+inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0,
+ const int16x8_t pred1,
+ const int16x4_t weights[2]) {
+ // TODO(https://issuetracker.google.com/issues/150325685): Investigate range.
+ const int32x4_t wpred0_lo = vmull_s16(weights[0], vget_low_s16(pred0));
+ const int32x4_t wpred0_hi = vmull_s16(weights[0], vget_high_s16(pred0));
+ const int32x4_t blended_lo =
+ vmlal_s16(wpred0_lo, weights[1], vget_low_s16(pred1));
+ const int32x4_t blended_hi =
+ vmlal_s16(wpred0_hi, weights[1], vget_high_s16(pred1));
+
+ return vcombine_s16(vqrshrn_n_s32(blended_lo, kInterPostRoundBit + 4),
+ vqrshrn_n_s32(blended_hi, kInterPostRoundBit + 4));
+}
+
+template <int width, int height>
+inline void DistanceWeightedBlendSmall_NEON(const int16_t* prediction_0,
+ const int16_t* prediction_1,
+ const int16x4_t weights[2],
+ void* const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
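+  // Each iteration handles 16 values: four rows at width 4, two rows at
+  // width 8.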
+ constexpr int step = 16 / width;
+
+ for (int y = 0; y < height; y += step) {
+ const int16x8_t src_00 = vld1q_s16(prediction_0);
+ const int16x8_t src_10 = vld1q_s16(prediction_1);
+ prediction_0 += 8;
+ prediction_1 += 8;
+ const int16x8_t res0 = ComputeWeightedAverage8(src_00, src_10, weights);
+
+ const int16x8_t src_01 = vld1q_s16(prediction_0);
+ const int16x8_t src_11 = vld1q_s16(prediction_1);
+ prediction_0 += 8;
+ prediction_1 += 8;
+ const int16x8_t res1 = ComputeWeightedAverage8(src_01, src_11, weights);
+
+ const uint8x8_t result0 = vqmovun_s16(res0);
+ const uint8x8_t result1 = vqmovun_s16(res1);
+ if (width == 4) {
+ StoreLo4(dst, result0);
+ dst += dest_stride;
+ StoreHi4(dst, result0);
+ dst += dest_stride;
+ StoreLo4(dst, result1);
+ dst += dest_stride;
+ StoreHi4(dst, result1);
+ dst += dest_stride;
+ } else {
+ assert(width == 8);
+ vst1_u8(dst, result0);
+ dst += dest_stride;
+ vst1_u8(dst, result1);
+ dst += dest_stride;
+ }
+ }
+}
+
+inline void DistanceWeightedBlendLarge_NEON(const int16_t* prediction_0,
+ const int16_t* prediction_1,
+ const int16x4_t weights[2],
+ const int width, const int height,
+ void* const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const int16x8_t src0_lo = vld1q_s16(prediction_0 + x);
+ const int16x8_t src1_lo = vld1q_s16(prediction_1 + x);
+ const int16x8_t res_lo =
+ ComputeWeightedAverage8(src0_lo, src1_lo, weights);
+
+ const int16x8_t src0_hi = vld1q_s16(prediction_0 + x + 8);
+ const int16x8_t src1_hi = vld1q_s16(prediction_1 + x + 8);
+ const int16x8_t res_hi =
+ ComputeWeightedAverage8(src0_hi, src1_hi, weights);
+
+ const uint8x16_t result =
+ vcombine_u8(vqmovun_s16(res_lo), vqmovun_s16(res_hi));
+ vst1q_u8(dst + x, result);
+ x += 16;
+ } while (x < width);
+ dst += dest_stride;
+ prediction_0 += width;
+ prediction_1 += width;
+ } while (--y != 0);
+}
+
+inline void DistanceWeightedBlend_NEON(const void* prediction_0,
+ const void* prediction_1,
+ const uint8_t weight_0,
+ const uint8_t weight_1, const int width,
+ const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int16x4_t weights[2] = {vdup_n_s16(weight_0), vdup_n_s16(weight_1)};
+ // TODO(johannkoenig): Investigate the branching. May be fine to call with a
+ // variable height.
+ if (width == 4) {
+ if (height == 4) {
+ DistanceWeightedBlendSmall_NEON<4, 4>(pred_0, pred_1, weights, dest,
+ dest_stride);
+ } else if (height == 8) {
+ DistanceWeightedBlendSmall_NEON<4, 8>(pred_0, pred_1, weights, dest,
+ dest_stride);
+ } else {
+ assert(height == 16);
+ DistanceWeightedBlendSmall_NEON<4, 16>(pred_0, pred_1, weights, dest,
+ dest_stride);
+ }
+ return;
+ }
+
+ if (width == 8) {
+ switch (height) {
+ case 4:
+ DistanceWeightedBlendSmall_NEON<8, 4>(pred_0, pred_1, weights, dest,
+ dest_stride);
+ return;
+ case 8:
+ DistanceWeightedBlendSmall_NEON<8, 8>(pred_0, pred_1, weights, dest,
+ dest_stride);
+ return;
+ case 16:
+ DistanceWeightedBlendSmall_NEON<8, 16>(pred_0, pred_1, weights, dest,
+ dest_stride);
+ return;
+ default:
+ assert(height == 32);
+ DistanceWeightedBlendSmall_NEON<8, 32>(pred_0, pred_1, weights, dest,
+ dest_stride);
+
+ return;
+ }
+ }
+
+ DistanceWeightedBlendLarge_NEON(pred_0, pred_1, weights, width, height, dest,
+ dest_stride);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->distance_weighted_blend = DistanceWeightedBlend_NEON;
+}
+
+} // namespace
+
+void DistanceWeightedBlendInit_NEON() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void DistanceWeightedBlendInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/distance_weighted_blend_neon.h b/src/dsp/arm/distance_weighted_blend_neon.h
new file mode 100644
index 0000000..4d8824c
--- /dev/null
+++ b/src/dsp/arm/distance_weighted_blend_neon.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::distance_weighted_blend. This function is not thread-safe.
+void DistanceWeightedBlendInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If NEON is enabled signal the NEON implementation should be used instead of
+// normal C.
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_
diff --git a/src/dsp/arm/film_grain_neon.cc b/src/dsp/arm/film_grain_neon.cc
new file mode 100644
index 0000000..2612466
--- /dev/null
+++ b/src/dsp/arm/film_grain_neon.cc
@@ -0,0 +1,1188 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/film_grain.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <new>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/arm/film_grain_neon.h"
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace film_grain {
+namespace {
+
+// These functions are overloaded for both possible element sizes (8-bit and
+// 16-bit) in order to simplify loading from and storing to the intermediate
+// value types from within a template function.
+inline int16x8_t GetSignedSource8(const int8_t* src) {
+ return vmovl_s8(vld1_s8(src));
+}
+
+inline int16x8_t GetSignedSource8(const uint8_t* src) {
+ return ZeroExtend(vld1_u8(src));
+}
+
+inline void StoreUnsigned8(uint8_t* dest, const uint16x8_t data) {
+ vst1_u8(dest, vmovn_u16(data));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+inline int16x8_t GetSignedSource8(const int16_t* src) { return vld1q_s16(src); }
+
+inline int16x8_t GetSignedSource8(const uint16_t* src) {
+ return vreinterpretq_s16_u16(vld1q_u16(src));
+}
+
+inline void StoreUnsigned8(uint16_t* dest, const uint16x8_t data) {
+ vst1q_u16(dest, data);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+// Each element in |sum| represents one destination value's running
+// autoregression formula. The fixed source values in |grain_lo| and |grain_hi|
+// allow for a sliding window in successive calls to this function.
+template <int position_offset>
+inline int32x4x2_t AccumulateWeightedGrain(const int16x8_t grain_lo,
+ const int16x8_t grain_hi,
+ int16_t coeff, int32x4x2_t sum) {
+ const int16x8_t grain = vextq_s16(grain_lo, grain_hi, position_offset);
+ sum.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(grain), coeff);
+ sum.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(grain), coeff);
+ return sum;
+}
+
+// Because the autoregressive filter requires the output of each pixel to
+// compute pixels that come after in the row, we have to finish the calculations
+// one at a time.
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegression(int8_t* grain_cursor, int32x4x2_t sum,
+ const int8_t* coeffs, int pos, int shift) {
+ int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
+
+ for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
+ result += grain_cursor[lane + delta_col] * coeffs[pos];
+ ++pos;
+ }
+ grain_cursor[lane] =
+ Clip3(grain_cursor[lane] + RightShiftWithRounding(result, shift),
+ GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegression(int16_t* grain_cursor, int32x4x2_t sum,
+ const int8_t* coeffs, int pos, int shift) {
+ int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
+
+ for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
+ result += grain_cursor[lane + delta_col] * coeffs[pos];
+ ++pos;
+ }
+ grain_cursor[lane] =
+ Clip3(grain_cursor[lane] + RightShiftWithRounding(result, shift),
+ GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+// Because the autoregressive filter requires the output of each pixel to
+// compute pixels that come after in the row, we have to finish the calculations
+// one at a time.
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegressionChroma(int8_t* u_grain_cursor,
+ int8_t* v_grain_cursor,
+ int32x4x2_t sum_u, int32x4x2_t sum_v,
+ const int8_t* coeffs_u,
+ const int8_t* coeffs_v, int pos,
+ int shift) {
+ WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+ u_grain_cursor, sum_u, coeffs_u, pos, shift);
+ WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+ v_grain_cursor, sum_v, coeffs_v, pos, shift);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegressionChroma(int16_t* u_grain_cursor,
+ int16_t* v_grain_cursor,
+ int32x4x2_t sum_u, int32x4x2_t sum_v,
+ const int8_t* coeffs_u,
+ const int8_t* coeffs_v, int pos,
+ int shift) {
+ WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+ u_grain_cursor, sum_u, coeffs_u, pos, shift);
+ WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+ v_grain_cursor, sum_v, coeffs_v, pos, shift);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+inline void SetZero(int32x4x2_t* v) {
+ v->val[0] = vdupq_n_s32(0);
+ v->val[1] = vdupq_n_s32(0);
+}
+
+// Computes subsampled luma for use with chroma, by averaging in the x direction
+// or y direction when applicable.
+int16x8_t GetSubsampledLuma(const int8_t* const luma, int subsampling_x,
+ int subsampling_y, ptrdiff_t stride) {
+ if (subsampling_y != 0) {
+ assert(subsampling_x != 0);
+ const int8x16_t src0 = vld1q_s8(luma);
+ const int8x16_t src1 = vld1q_s8(luma + stride);
+ const int16x8_t ret0 = vcombine_s16(vpaddl_s8(vget_low_s8(src0)),
+ vpaddl_s8(vget_high_s8(src0)));
+ const int16x8_t ret1 = vcombine_s16(vpaddl_s8(vget_low_s8(src1)),
+ vpaddl_s8(vget_high_s8(src1)));
+ return vrshrq_n_s16(vaddq_s16(ret0, ret1), 2);
+ }
+ if (subsampling_x != 0) {
+ const int8x16_t src = vld1q_s8(luma);
+ return vrshrq_n_s16(
+ vcombine_s16(vpaddl_s8(vget_low_s8(src)), vpaddl_s8(vget_high_s8(src))),
+ 1);
+ }
+ return vmovl_s8(vld1_s8(luma));
+}
+
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline uint16x8_t GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
+ if (subsampling_x != 0) {
+ const uint8x16_t src = vld1q_u8(luma);
+ return vrshrq_n_u16(vpaddlq_u8(src), 1);
+ }
+ return vmovl_u8(vld1_u8(luma));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+// Computes subsampled luma for use with chroma, by averaging in the x direction
+// or y direction when applicable.
+int16x8_t GetSubsampledLuma(const int16_t* const luma, int subsampling_x,
+ int subsampling_y, ptrdiff_t stride) {
+ if (subsampling_y != 0) {
+ assert(subsampling_x != 0);
+ int16x8_t src0_lo = vld1q_s16(luma);
+ int16x8_t src0_hi = vld1q_s16(luma + 8);
+ const int16x8_t src1_lo = vld1q_s16(luma + stride);
+ const int16x8_t src1_hi = vld1q_s16(luma + stride + 8);
+ const int16x8_t src0 =
+ vcombine_s16(vpadd_s16(vget_low_s16(src0_lo), vget_high_s16(src0_lo)),
+ vpadd_s16(vget_low_s16(src0_hi), vget_high_s16(src0_hi)));
+ const int16x8_t src1 =
+ vcombine_s16(vpadd_s16(vget_low_s16(src1_lo), vget_high_s16(src1_lo)),
+ vpadd_s16(vget_low_s16(src1_hi), vget_high_s16(src1_hi)));
+ return vrshrq_n_s16(vaddq_s16(src0, src1), 2);
+ }
+ if (subsampling_x != 0) {
+ const int16x8_t src_lo = vld1q_s16(luma);
+ const int16x8_t src_hi = vld1q_s16(luma + 8);
+ const int16x8_t ret =
+ vcombine_s16(vpadd_s16(vget_low_s16(src_lo), vget_high_s16(src_lo)),
+ vpadd_s16(vget_low_s16(src_hi), vget_high_s16(src_hi)));
+ return vrshrq_n_s16(ret, 1);
+ }
+ return vld1q_s16(luma);
+}
+
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline uint16x8_t GetAverageLuma(const uint16_t* const luma,
+ int subsampling_x) {
+ if (subsampling_x != 0) {
+ const uint16x8x2_t src = vld2q_u16(luma);
+ return vrhaddq_u16(src.val[0], src.val[1]);
+ }
+ return vld1q_u16(luma);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+template <int bitdepth, typename GrainType, int auto_regression_coeff_lag,
+ bool use_luma>
+void ApplyAutoRegressiveFilterToChromaGrains_NEON(const FilmGrainParams& params,
+ const void* luma_grain_buffer,
+ int subsampling_x,
+ int subsampling_y,
+ void* u_grain_buffer,
+ void* v_grain_buffer) {
+ static_assert(auto_regression_coeff_lag <= 3, "Invalid autoregression lag.");
+ const auto* luma_grain = static_cast<const GrainType*>(luma_grain_buffer);
+ auto* u_grain = static_cast<GrainType*>(u_grain_buffer);
+ auto* v_grain = static_cast<GrainType*>(v_grain_buffer);
+ const int auto_regression_shift = params.auto_regression_shift;
+ const int chroma_width =
+ (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+ const int chroma_height =
+ (subsampling_y == 0) ? kMaxChromaHeight : kMinChromaHeight;
+ // When |chroma_width| == 44, we write 8 at a time from x in [3, 34],
+ // leaving [35, 40] to write at the end.
+ const int chroma_width_remainder =
+ (chroma_width - 2 * kAutoRegressionBorder) & 7;
+
+ int y = kAutoRegressionBorder;
+ luma_grain += kLumaWidth * y;
+ u_grain += chroma_width * y;
+ v_grain += chroma_width * y;
+ do {
+    // Each row is computed 8 values at a time in the following loop. At the
+    // end of the loop, |chroma_width_remainder| values (4 or 6) remain to
+    // write. They are given a special reduced iteration at the end.
+ int x = kAutoRegressionBorder;
+ int luma_x = kAutoRegressionBorder;
+ do {
+ int pos = 0;
+ int32x4x2_t sum_u;
+ int32x4x2_t sum_v;
+ SetZero(&sum_u);
+ SetZero(&sum_v);
+
+ if (auto_regression_coeff_lag > 0) {
+ for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+ ++delta_row) {
+          // These loads may read into the next row, but they never occur on
+          // the final row of a grain block. Therefore, they will never exceed
+          // the block boundaries.
+ // Note: this could be slightly optimized to a single load in 8bpp,
+ // but requires making a special first iteration and accumulate
+ // function that takes an int8x16_t.
+ const int16x8_t u_grain_lo =
+ GetSignedSource8(u_grain + x + delta_row * chroma_width -
+ auto_regression_coeff_lag);
+ const int16x8_t u_grain_hi =
+ GetSignedSource8(u_grain + x + delta_row * chroma_width -
+ auto_regression_coeff_lag + 8);
+ const int16x8_t v_grain_lo =
+ GetSignedSource8(v_grain + x + delta_row * chroma_width -
+ auto_regression_coeff_lag);
+ const int16x8_t v_grain_hi =
+ GetSignedSource8(v_grain + x + delta_row * chroma_width -
+ auto_regression_coeff_lag + 8);
+#define ACCUMULATE_WEIGHTED_GRAIN(offset) \
+ sum_u = AccumulateWeightedGrain<offset>( \
+ u_grain_lo, u_grain_hi, params.auto_regression_coeff_u[pos], sum_u); \
+ sum_v = AccumulateWeightedGrain<offset>( \
+ v_grain_lo, v_grain_hi, params.auto_regression_coeff_v[pos++], sum_v)
+
+ ACCUMULATE_WEIGHTED_GRAIN(0);
+ ACCUMULATE_WEIGHTED_GRAIN(1);
+ ACCUMULATE_WEIGHTED_GRAIN(2);
+ // The horizontal |auto_regression_coeff_lag| loop is replaced with
+ // if-statements to give vextq_s16 an immediate param.
+ if (auto_regression_coeff_lag > 1) {
+ ACCUMULATE_WEIGHTED_GRAIN(3);
+ ACCUMULATE_WEIGHTED_GRAIN(4);
+ }
+ if (auto_regression_coeff_lag > 2) {
+ assert(auto_regression_coeff_lag == 3);
+ ACCUMULATE_WEIGHTED_GRAIN(5);
+ ACCUMULATE_WEIGHTED_GRAIN(6);
+ }
+ }
+ }
+
+ if (use_luma) {
+ const int16x8_t luma = GetSubsampledLuma(
+ luma_grain + luma_x, subsampling_x, subsampling_y, kLumaWidth);
+
+ // Luma samples get the final coefficient in the formula, but are best
+ // computed all at once before the final row.
+ const int coeff_u =
+ params.auto_regression_coeff_u[pos + auto_regression_coeff_lag];
+ const int coeff_v =
+ params.auto_regression_coeff_v[pos + auto_regression_coeff_lag];
+
+ sum_u.val[0] = vmlal_n_s16(sum_u.val[0], vget_low_s16(luma), coeff_u);
+ sum_u.val[1] = vmlal_n_s16(sum_u.val[1], vget_high_s16(luma), coeff_u);
+ sum_v.val[0] = vmlal_n_s16(sum_v.val[0], vget_low_s16(luma), coeff_v);
+ sum_v.val[1] = vmlal_n_s16(sum_v.val[1], vget_high_s16(luma), coeff_v);
+ }
+ // At this point in the filter, the source addresses and destination
+ // addresses overlap. Because this is an auto-regressive filter, the
+ // higher lanes cannot be computed without the results of the lower lanes.
+ // Each call to WriteFinalAutoRegression incorporates preceding values
+ // on the final row, and writes a single sample. This allows the next
+ // pixel's value to be computed in the next call.
+#define WRITE_AUTO_REGRESSION_RESULT(lane) \
+ WriteFinalAutoRegressionChroma<bitdepth, auto_regression_coeff_lag, lane>( \
+ u_grain + x, v_grain + x, sum_u, sum_v, params.auto_regression_coeff_u, \
+ params.auto_regression_coeff_v, pos, auto_regression_shift)
+
+ WRITE_AUTO_REGRESSION_RESULT(0);
+ WRITE_AUTO_REGRESSION_RESULT(1);
+ WRITE_AUTO_REGRESSION_RESULT(2);
+ WRITE_AUTO_REGRESSION_RESULT(3);
+ WRITE_AUTO_REGRESSION_RESULT(4);
+ WRITE_AUTO_REGRESSION_RESULT(5);
+ WRITE_AUTO_REGRESSION_RESULT(6);
+ WRITE_AUTO_REGRESSION_RESULT(7);
+
+ x += 8;
+ luma_x += 8 << subsampling_x;
+ } while (x < chroma_width - kAutoRegressionBorder - chroma_width_remainder);
+
+ // This is the "final iteration" of the above loop over width. We fill in
+ // the remainder of the width, which is less than 8.
+ int pos = 0;
+ int32x4x2_t sum_u;
+ int32x4x2_t sum_v;
+ SetZero(&sum_u);
+ SetZero(&sum_v);
+
+ for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+ ++delta_row) {
+      // These loads may read into the next row, but they never occur on the
+      // final row of a grain block. Therefore, they will never exceed the
+      // block boundaries.
+ const int16x8_t u_grain_lo = GetSignedSource8(
+ u_grain + x + delta_row * chroma_width - auto_regression_coeff_lag);
+ const int16x8_t u_grain_hi =
+ GetSignedSource8(u_grain + x + delta_row * chroma_width -
+ auto_regression_coeff_lag + 8);
+ const int16x8_t v_grain_lo = GetSignedSource8(
+ v_grain + x + delta_row * chroma_width - auto_regression_coeff_lag);
+ const int16x8_t v_grain_hi =
+ GetSignedSource8(v_grain + x + delta_row * chroma_width -
+ auto_regression_coeff_lag + 8);
+
+ ACCUMULATE_WEIGHTED_GRAIN(0);
+ ACCUMULATE_WEIGHTED_GRAIN(1);
+ ACCUMULATE_WEIGHTED_GRAIN(2);
+ // The horizontal |auto_regression_coeff_lag| loop is replaced with
+ // if-statements to give vextq_s16 an immediate param.
+ if (auto_regression_coeff_lag > 1) {
+ ACCUMULATE_WEIGHTED_GRAIN(3);
+ ACCUMULATE_WEIGHTED_GRAIN(4);
+ }
+ if (auto_regression_coeff_lag > 2) {
+ assert(auto_regression_coeff_lag == 3);
+ ACCUMULATE_WEIGHTED_GRAIN(5);
+ ACCUMULATE_WEIGHTED_GRAIN(6);
+ }
+ }
+
+ if (use_luma) {
+ const int16x8_t luma = GetSubsampledLuma(
+ luma_grain + luma_x, subsampling_x, subsampling_y, kLumaWidth);
+
+ // Luma samples get the final coefficient in the formula, but are best
+ // computed all at once before the final row.
+ const int coeff_u =
+ params.auto_regression_coeff_u[pos + auto_regression_coeff_lag];
+ const int coeff_v =
+ params.auto_regression_coeff_v[pos + auto_regression_coeff_lag];
+
+ sum_u.val[0] = vmlal_n_s16(sum_u.val[0], vget_low_s16(luma), coeff_u);
+ sum_u.val[1] = vmlal_n_s16(sum_u.val[1], vget_high_s16(luma), coeff_u);
+ sum_v.val[0] = vmlal_n_s16(sum_v.val[0], vget_low_s16(luma), coeff_v);
+ sum_v.val[1] = vmlal_n_s16(sum_v.val[1], vget_high_s16(luma), coeff_v);
+ }
+
+ WRITE_AUTO_REGRESSION_RESULT(0);
+ WRITE_AUTO_REGRESSION_RESULT(1);
+ WRITE_AUTO_REGRESSION_RESULT(2);
+ WRITE_AUTO_REGRESSION_RESULT(3);
+ if (chroma_width_remainder == 6) {
+ WRITE_AUTO_REGRESSION_RESULT(4);
+ WRITE_AUTO_REGRESSION_RESULT(5);
+ }
+
+ luma_grain += kLumaWidth << subsampling_y;
+ u_grain += chroma_width;
+ v_grain += chroma_width;
+ } while (++y < chroma_height);
+#undef ACCUMULATE_WEIGHTED_GRAIN
+#undef WRITE_AUTO_REGRESSION_RESULT
+}
+
+// Applies an auto-regressive filter to the white noise in luma_grain.
+template <int bitdepth, typename GrainType, int auto_regression_coeff_lag>
+void ApplyAutoRegressiveFilterToLumaGrain_NEON(const FilmGrainParams& params,
+ void* luma_grain_buffer) {
+ static_assert(auto_regression_coeff_lag > 0, "");
+ const int8_t* const auto_regression_coeff_y = params.auto_regression_coeff_y;
+ const uint8_t auto_regression_shift = params.auto_regression_shift;
+
+ int y = kAutoRegressionBorder;
+ auto* luma_grain =
+ static_cast<GrainType*>(luma_grain_buffer) + kLumaWidth * y;
+ do {
+ // Each row is computed 8 values at a time in the following loop. At the
+ // end of the loop, 4 values remain to write. They are given a special
+ // reduced iteration at the end.
+ int x = kAutoRegressionBorder;
+ do {
+ int pos = 0;
+ int32x4x2_t sum;
+ SetZero(&sum);
+ for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+ ++delta_row) {
+        // These loads may read into the next row, but they never occur on the
+        // final row of a grain block. Therefore, they will never exceed the
+        // block boundaries.
+ const int16x8_t src_grain_lo =
+ GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
+ auto_regression_coeff_lag);
+ const int16x8_t src_grain_hi =
+ GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
+ auto_regression_coeff_lag + 8);
+
+ // A pictorial representation of the auto-regressive filter for
+ // various values of params.auto_regression_coeff_lag. The letter 'O'
+ // represents the current sample. (The filter always operates on the
+ // current sample with filter coefficient 1.) The letters 'X'
+ // represent the neighboring samples that the filter operates on, below
+ // their corresponding "offset" number.
+ //
+ // params.auto_regression_coeff_lag == 3:
+ // 0 1 2 3 4 5 6
+ // X X X X X X X
+ // X X X X X X X
+ // X X X X X X X
+ // X X X O
+ // params.auto_regression_coeff_lag == 2:
+ // 0 1 2 3 4
+ // X X X X X
+ // X X X X X
+ // X X O
+ // params.auto_regression_coeff_lag == 1:
+ // 0 1 2
+ // X X X
+ // X O
+ // params.auto_regression_coeff_lag == 0:
+ // O
+ // The function relies on the caller to skip the call in the 0 lag
+ // case.
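+      // The rows of 'X' samples above 'O' are handled by the vector
+      // accumulation below. The 'X' samples on the same row as 'O' depend on
+      // previously filtered outputs, so WriteFinalAutoRegression folds them in
+      // one lane at a time.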
+
+#define ACCUMULATE_WEIGHTED_GRAIN(offset) \
+ sum = AccumulateWeightedGrain<offset>(src_grain_lo, src_grain_hi, \
+ auto_regression_coeff_y[pos++], sum)
+ ACCUMULATE_WEIGHTED_GRAIN(0);
+ ACCUMULATE_WEIGHTED_GRAIN(1);
+ ACCUMULATE_WEIGHTED_GRAIN(2);
+ // The horizontal |auto_regression_coeff_lag| loop is replaced with
+ // if-statements to give vextq_s16 an immediate param.
+ if (auto_regression_coeff_lag > 1) {
+ ACCUMULATE_WEIGHTED_GRAIN(3);
+ ACCUMULATE_WEIGHTED_GRAIN(4);
+ }
+ if (auto_regression_coeff_lag > 2) {
+ assert(auto_regression_coeff_lag == 3);
+ ACCUMULATE_WEIGHTED_GRAIN(5);
+ ACCUMULATE_WEIGHTED_GRAIN(6);
+ }
+ }
+ // At this point in the filter, the source addresses and destination
+ // addresses overlap. Because this is an auto-regressive filter, the
+ // higher lanes cannot be computed without the results of the lower lanes.
+ // Each call to WriteFinalAutoRegression incorporates preceding values
+ // on the final row, and writes a single sample. This allows the next
+ // pixel's value to be computed in the next call.
+#define WRITE_AUTO_REGRESSION_RESULT(lane) \
+ WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>( \
+ luma_grain + x, sum, auto_regression_coeff_y, pos, \
+ auto_regression_shift)
+
+ WRITE_AUTO_REGRESSION_RESULT(0);
+ WRITE_AUTO_REGRESSION_RESULT(1);
+ WRITE_AUTO_REGRESSION_RESULT(2);
+ WRITE_AUTO_REGRESSION_RESULT(3);
+ WRITE_AUTO_REGRESSION_RESULT(4);
+ WRITE_AUTO_REGRESSION_RESULT(5);
+ WRITE_AUTO_REGRESSION_RESULT(6);
+ WRITE_AUTO_REGRESSION_RESULT(7);
+ x += 8;
+ // Leave the final four pixels for the special iteration below.
+ } while (x < kLumaWidth - kAutoRegressionBorder - 4);
+
+ // Final 4 pixels in the row.
+ int pos = 0;
+ int32x4x2_t sum;
+ SetZero(&sum);
+ for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+ ++delta_row) {
+ const int16x8_t src_grain_lo = GetSignedSource8(
+ luma_grain + x + delta_row * kLumaWidth - auto_regression_coeff_lag);
+ const int16x8_t src_grain_hi =
+ GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
+ auto_regression_coeff_lag + 8);
+
+ ACCUMULATE_WEIGHTED_GRAIN(0);
+ ACCUMULATE_WEIGHTED_GRAIN(1);
+ ACCUMULATE_WEIGHTED_GRAIN(2);
+ // The horizontal |auto_regression_coeff_lag| loop is replaced with
+ // if-statements to give vextq_s16 an immediate param.
+ if (auto_regression_coeff_lag > 1) {
+ ACCUMULATE_WEIGHTED_GRAIN(3);
+ ACCUMULATE_WEIGHTED_GRAIN(4);
+ }
+ if (auto_regression_coeff_lag > 2) {
+ assert(auto_regression_coeff_lag == 3);
+ ACCUMULATE_WEIGHTED_GRAIN(5);
+ ACCUMULATE_WEIGHTED_GRAIN(6);
+ }
+ }
+    // delta_row == 0: fold in the current row's left neighbors and write the
+    // final 4 samples.
+ WRITE_AUTO_REGRESSION_RESULT(0);
+ WRITE_AUTO_REGRESSION_RESULT(1);
+ WRITE_AUTO_REGRESSION_RESULT(2);
+ WRITE_AUTO_REGRESSION_RESULT(3);
+ luma_grain += kLumaWidth;
+ } while (++y < kLumaHeight);
+
+#undef WRITE_AUTO_REGRESSION_RESULT
+#undef ACCUMULATE_WEIGHTED_GRAIN
+}
+
+void InitializeScalingLookupTable_NEON(
+ int num_points, const uint8_t point_value[], const uint8_t point_scaling[],
+ uint8_t scaling_lut[kScalingLookupTableSize]) {
+ if (num_points == 0) {
+ memset(scaling_lut, 0, sizeof(scaling_lut[0]) * kScalingLookupTableSize);
+ return;
+ }
+ static_assert(sizeof(scaling_lut[0]) == 1, "");
+ memset(scaling_lut, point_scaling[0], point_value[0]);
+ const uint32x4_t steps = vmovl_u16(vcreate_u16(0x0003000200010000));
+ const uint32x4_t offset = vdupq_n_u32(32768);
+ for (int i = 0; i < num_points - 1; ++i) {
+ const int delta_y = point_scaling[i + 1] - point_scaling[i];
+ const int delta_x = point_value[i + 1] - point_value[i];
+ const int delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
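+    // |delta| is the segment slope in 16.16 fixed point, rounded to nearest.
+    // As an illustration with hypothetical points, point_value {0, 64} and
+    // point_scaling {0, 40} give delta = 40 * 1024 = 40960, i.e. 0.625. The
+    // 32768 in |offset| rounds the later >> 16 to nearest.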
+ const int delta4 = delta << 2;
+ const uint8x8_t base_point = vdup_n_u8(point_scaling[i]);
+ uint32x4_t upscaled_points0 = vmlaq_n_u32(offset, steps, delta);
+ const uint32x4_t line_increment4 = vdupq_n_u32(delta4);
+ // Get the second set of 4 points by adding 4 steps to the first set.
+ uint32x4_t upscaled_points1 = vaddq_u32(upscaled_points0, line_increment4);
+ // We obtain the next set of 8 points by adding 8 steps to each of the
+ // current 8 points.
+ const uint32x4_t line_increment8 = vshlq_n_u32(line_increment4, 1);
+ int x = 0;
+ do {
+ const uint16x4_t interp_points0 = vshrn_n_u32(upscaled_points0, 16);
+ const uint16x4_t interp_points1 = vshrn_n_u32(upscaled_points1, 16);
+ const uint8x8_t interp_points =
+ vmovn_u16(vcombine_u16(interp_points0, interp_points1));
+      // The spec guarantees that the max value of |point_value[i]| + x is 255.
+      // Writing 8 bytes starting at the final table byte therefore requires 7
+      // bytes of padding past the end of the table.
+ vst1_u8(&scaling_lut[point_value[i] + x],
+ vadd_u8(interp_points, base_point));
+ upscaled_points0 = vaddq_u32(upscaled_points0, line_increment8);
+ upscaled_points1 = vaddq_u32(upscaled_points1, line_increment8);
+ x += 8;
+ } while (x < delta_x);
+ }
+ const uint8_t last_point_value = point_value[num_points - 1];
+ memset(&scaling_lut[last_point_value], point_scaling[num_points - 1],
+ kScalingLookupTableSize - last_point_value);
+}
+
+inline int16x8_t Clip3(const int16x8_t value, const int16x8_t low,
+ const int16x8_t high) {
+ const int16x8_t clipped_to_ceiling = vminq_s16(high, value);
+ return vmaxq_s16(low, clipped_to_ceiling);
+}
+
+template <int bitdepth, typename Pixel>
+inline int16x8_t GetScalingFactors(
+ const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* source) {
+ int16_t start_vals[8];
+ if (bitdepth == 8) {
+ start_vals[0] = scaling_lut[source[0]];
+ start_vals[1] = scaling_lut[source[1]];
+ start_vals[2] = scaling_lut[source[2]];
+ start_vals[3] = scaling_lut[source[3]];
+ start_vals[4] = scaling_lut[source[4]];
+ start_vals[5] = scaling_lut[source[5]];
+ start_vals[6] = scaling_lut[source[6]];
+ start_vals[7] = scaling_lut[source[7]];
+ return vld1q_s16(start_vals);
+ }
+ int16_t end_vals[8];
+ // TODO(petersonab): Precompute this into a larger table for direct lookups.
+ int index = source[0] >> 2;
+ start_vals[0] = scaling_lut[index];
+ end_vals[0] = scaling_lut[index + 1];
+ index = source[1] >> 2;
+ start_vals[1] = scaling_lut[index];
+ end_vals[1] = scaling_lut[index + 1];
+ index = source[2] >> 2;
+ start_vals[2] = scaling_lut[index];
+ end_vals[2] = scaling_lut[index + 1];
+ index = source[3] >> 2;
+ start_vals[3] = scaling_lut[index];
+ end_vals[3] = scaling_lut[index + 1];
+ index = source[4] >> 2;
+ start_vals[4] = scaling_lut[index];
+ end_vals[4] = scaling_lut[index + 1];
+ index = source[5] >> 2;
+ start_vals[5] = scaling_lut[index];
+ end_vals[5] = scaling_lut[index + 1];
+ index = source[6] >> 2;
+ start_vals[6] = scaling_lut[index];
+ end_vals[6] = scaling_lut[index + 1];
+ index = source[7] >> 2;
+ start_vals[7] = scaling_lut[index];
+ end_vals[7] = scaling_lut[index + 1];
+ const int16x8_t start = vld1q_s16(start_vals);
+ const int16x8_t end = vld1q_s16(end_vals);
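+  // The high bitdepth table has one entry per 4 source values, so interpolate
+  // linearly between adjacent entries using the low 2 bits of the source:
+  //   result = start + Round2((end - start) * (source & 3), 2)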
+ int16x8_t remainder = GetSignedSource8(source);
+ remainder = vandq_s16(remainder, vdupq_n_s16(3));
+ const int16x8_t delta = vmulq_s16(vsubq_s16(end, start), remainder);
+ return vaddq_s16(start, vrshrq_n_s16(delta, 2));
+}
+
+inline int16x8_t ScaleNoise(const int16x8_t noise, const int16x8_t scaling,
+ const int16x8_t scaling_shift_vect) {
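+  // |scaling_shift_vect| holds -scaling_shift, so the vrshlq_s16 below is a
+  // rounding right shift by scaling_shift.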
+ const int16x8_t upscaled_noise = vmulq_s16(noise, scaling);
+ return vrshlq_s16(upscaled_noise, scaling_shift_vect);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+inline int16x8_t ScaleNoise(const int16x8_t noise, const int16x8_t scaling,
+ const int32x4_t scaling_shift_vect) {
+ // TODO(petersonab): Try refactoring scaling lookup table to int16_t and
+ // upscaling by 7 bits to permit high half multiply. This would eliminate
+ // the intermediate 32x4 registers. Also write the averaged values directly
+ // into the table so it doesn't have to be done for every pixel in
+ // the frame.
+ const int32x4_t upscaled_noise_lo =
+ vmull_s16(vget_low_s16(noise), vget_low_s16(scaling));
+ const int32x4_t upscaled_noise_hi =
+ vmull_s16(vget_high_s16(noise), vget_high_s16(scaling));
+ const int16x4_t noise_lo =
+ vmovn_s32(vrshlq_s32(upscaled_noise_lo, scaling_shift_vect));
+ const int16x4_t noise_hi =
+ vmovn_s32(vrshlq_s32(upscaled_noise_hi, scaling_shift_vect));
+ return vcombine_s16(noise_lo, noise_hi);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageLuma_NEON(
+ const void* noise_image_ptr, int min_value, int max_luma, int scaling_shift,
+ int width, int height, int start_height,
+ const uint8_t scaling_lut_y[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y,
+ ptrdiff_t dest_stride_y) {
+ const auto* noise_image =
+ static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+ const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
+ source_stride_y /= sizeof(Pixel);
+ auto* out_y_row = static_cast<Pixel*>(dest_plane_y);
+ dest_stride_y /= sizeof(Pixel);
+ const int16x8_t floor = vdupq_n_s16(min_value);
+ const int16x8_t ceiling = vdupq_n_s16(max_luma);
+ // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
+ // for 16 bit signed integers. In higher bitdepths, however, we have to
+ // expand to 32 to protect the sign bit.
+ const int16x8_t scaling_shift_vect16 = vdupq_n_s16(-scaling_shift);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ const int32x4_t scaling_shift_vect32 = vdupq_n_s32(-scaling_shift);
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ // This operation on the unsigned input is safe in 8bpp because the vector
+ // is widened before it is reinterpreted.
+ const int16x8_t orig = GetSignedSource8(&in_y_row[x]);
+ const int16x8_t scaling =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
+ int16x8_t noise =
+ GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
+
+ if (bitdepth == 8) {
+ noise = ScaleNoise(noise, scaling, scaling_shift_vect16);
+ } else {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ noise = ScaleNoise(noise, scaling, scaling_shift_vect32);
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ }
+ const int16x8_t combined = vaddq_s16(orig, noise);
+      // In 8bpp, when params_.clip_to_restricted_range == false, we can replace
+      // clipping with vqmovun_s16, but it's not likely to be worth copying the
+      // function for just that case, since the gain would be very small.
+ StoreUnsigned8(&out_y_row[x],
+ vreinterpretq_u16_s16(Clip3(combined, floor, ceiling)));
+ x += 8;
+ } while (x < width);
+ in_y_row += source_stride_y;
+ out_y_row += dest_stride_y;
+ } while (++y < height);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+inline int16x8_t BlendChromaValsWithCfl(
+ const Pixel* average_luma_buffer,
+ const uint8_t scaling_lut[kScalingLookupTableSize],
+ const Pixel* chroma_cursor, const GrainType* noise_image_cursor,
+ const int16x8_t scaling_shift_vect16,
+ const int32x4_t scaling_shift_vect32) {
+ const int16x8_t scaling =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
+ const int16x8_t orig = GetSignedSource8(chroma_cursor);
+ int16x8_t noise = GetSignedSource8(noise_image_cursor);
+ if (bitdepth == 8) {
+ noise = ScaleNoise(noise, scaling, scaling_shift_vect16);
+ } else {
+ noise = ScaleNoise(noise, scaling, scaling_shift_vect32);
+ }
+ return vaddq_s16(orig, noise);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON(
+ const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, int scaling_shift,
+ const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* in_y_row,
+ ptrdiff_t source_stride_y, const Pixel* in_chroma_row,
+ ptrdiff_t source_stride_chroma, Pixel* out_chroma_row,
+ ptrdiff_t dest_stride) {
+ const int16x8_t floor = vdupq_n_s16(min_value);
+ const int16x8_t ceiling = vdupq_n_s16(max_chroma);
+ Pixel luma_buffer[16];
+ memset(luma_buffer, 0, sizeof(luma_buffer));
+ // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
+ // for 16 bit signed integers. In higher bitdepths, however, we have to
+ // expand to 32 to protect the sign bit.
+ const int16x8_t scaling_shift_vect16 = vdupq_n_s16(-scaling_shift);
+ const int32x4_t scaling_shift_vect32 = vdupq_n_s32(-scaling_shift);
+
+ const int chroma_height = (height + subsampling_y) >> subsampling_y;
+ const int chroma_width = (width + subsampling_x) >> subsampling_x;
+ const int safe_chroma_width = chroma_width & ~7;
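+  // |safe_chroma_width| is the widest multiple of 8 that stays within the
+  // valid chroma row; the remainder is handled below with the padded
+  // |luma_buffer|.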
+
+ // Writing to this buffer avoids the cost of doing 8 lane lookups in a row
+ // in GetScalingFactors.
+ Pixel average_luma_buffer[8];
+ assert(start_height % 2 == 0);
+ start_height >>= subsampling_y;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const int luma_x = x << subsampling_x;
+ // TODO(petersonab): Consider specializing by subsampling_x. In the 444
+ // case &in_y_row[x] can be passed to GetScalingFactors directly.
+ const uint16x8_t average_luma =
+ GetAverageLuma(&in_y_row[luma_x], subsampling_x);
+ StoreUnsigned8(average_luma_buffer, average_luma);
+
+ const int16x8_t blended =
+ BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+ average_luma_buffer, scaling_lut, &in_chroma_row[x],
+ &(noise_image[y + start_height][x]), scaling_shift_vect16,
+ scaling_shift_vect32);
+
+ // In 8bpp, when params_.clip_to_restricted_range == false, we can replace
+ // clipping with vqmovun_s16, but it's not likely to be worth copying the
+ // function for just that case.
+ StoreUnsigned8(&out_chroma_row[x],
+ vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+ x += 8;
+ } while (x < safe_chroma_width);
+
+ if (x < chroma_width) {
+ const int luma_x = x << subsampling_x;
+ const int valid_range = width - luma_x;
+ memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
+ luma_buffer[valid_range] = in_y_row[width - 1];
+ const uint16x8_t average_luma =
+ GetAverageLuma(luma_buffer, subsampling_x);
+ StoreUnsigned8(average_luma_buffer, average_luma);
+
+ const int16x8_t blended =
+ BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+ average_luma_buffer, scaling_lut, &in_chroma_row[x],
+ &(noise_image[y + start_height][x]), scaling_shift_vect16,
+ scaling_shift_vect32);
+ // In 8bpp, when params_.clip_to_restricted_range == false, we can replace
+ // clipping with vqmovun_s16, but it's not likely to be worth copying the
+ // function for just that case.
+ StoreUnsigned8(&out_chroma_row[x],
+ vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+ }
+
+ in_y_row += source_stride_y << subsampling_y;
+ in_chroma_row += source_stride_chroma;
+ out_chroma_row += dest_stride;
+ } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == true.
+// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChromaWithCfl_NEON(
+ Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+ int min_value, int max_chroma, int width, int height, int start_height,
+ int subsampling_x, int subsampling_y,
+ const uint8_t scaling_lut[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+ const auto* noise_image =
+ static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+ const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+ source_stride_y /= sizeof(Pixel);
+
+ const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+ source_stride_uv /= sizeof(Pixel);
+ auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+ dest_stride_uv /= sizeof(Pixel);
+ // Looping over one plane at a time is faster in higher resolutions, despite
+ // re-computing luma.
+ BlendChromaPlaneWithCfl_NEON<bitdepth, GrainType, Pixel>(
+ noise_image[plane], min_value, max_chroma, width, height, start_height,
+ subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y,
+ source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv);
+}
+
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
+inline int16x8_t BlendChromaValsNoCfl(
+ const uint8_t scaling_lut[kScalingLookupTableSize],
+ const uint8_t* chroma_cursor, const int8_t* noise_image_cursor,
+ const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect,
+ const int16x8_t& offset, int luma_multiplier, int chroma_multiplier) {
+ uint8_t merged_buffer[8];
+ const int16x8_t orig = GetSignedSource8(chroma_cursor);
+ const int16x8_t weighted_luma = vmulq_n_s16(average_luma, luma_multiplier);
+ const int16x8_t weighted_chroma = vmulq_n_s16(orig, chroma_multiplier);
+  // Maximum value of |combined| is 127*255 = 0x7E81.
+  const int16x8_t combined = vhaddq_s16(weighted_luma, weighted_chroma);
+  // Maximum value of |offset| is (255 << 5) = 0x1FE0.
+  // 0x7E81 + 0x1FE0 = 0x9E61, therefore another halving add is required.
+ const uint8x8_t merged = vqshrun_n_s16(vhaddq_s16(offset, combined), 4);
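+  // Net effect, up to truncation in the halving adds:
+  //   merged ~= clamp_to_u8(chroma_offset +
+  //                         ((luma_multiplier * average_luma +
+  //                           chroma_multiplier * chroma) >> 6))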
+ vst1_u8(merged_buffer, merged);
+ const int16x8_t scaling =
+ GetScalingFactors<8, uint8_t>(scaling_lut, merged_buffer);
+ int16x8_t noise = GetSignedSource8(noise_image_cursor);
+ noise = ScaleNoise(noise, scaling, scaling_shift_vect);
+ return vaddq_s16(orig, noise);
+}
+
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON(
+ const Array2D<int8_t>& noise_image, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, int scaling_shift, int chroma_offset,
+ int chroma_multiplier, int luma_multiplier,
+ const uint8_t scaling_lut[kScalingLookupTableSize], const uint8_t* in_y_row,
+ ptrdiff_t source_stride_y, const uint8_t* in_chroma_row,
+ ptrdiff_t source_stride_chroma, uint8_t* out_chroma_row,
+ ptrdiff_t dest_stride) {
+ const int16x8_t floor = vdupq_n_s16(min_value);
+ const int16x8_t ceiling = vdupq_n_s16(max_chroma);
+ // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
+ // for 16 bit signed integers. In higher bitdepths, however, we have to
+ // expand to 32 to protect the sign bit.
+ const int16x8_t scaling_shift_vect = vdupq_n_s16(-scaling_shift);
+
+ const int chroma_height = (height + subsampling_y) >> subsampling_y;
+ const int chroma_width = (width + subsampling_x) >> subsampling_x;
+ const int safe_chroma_width = chroma_width & ~7;
+ uint8_t luma_buffer[16];
+ const int16x8_t offset = vdupq_n_s16(chroma_offset << 5);
+
+ start_height >>= subsampling_y;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const int luma_x = x << subsampling_x;
+ const int16x8_t average_luma = vreinterpretq_s16_u16(
+ GetAverageLuma(&in_y_row[luma_x], subsampling_x));
+ const int16x8_t blended = BlendChromaValsNoCfl(
+ scaling_lut, &in_chroma_row[x], &(noise_image[y + start_height][x]),
+ average_luma, scaling_shift_vect, offset, luma_multiplier,
+ chroma_multiplier);
+ // In 8bpp, when params_.clip_to_restricted_range == false, we can
+ // replace clipping with vqmovun_s16, but the gain would be small.
+ StoreUnsigned8(&out_chroma_row[x],
+ vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+
+ x += 8;
+ } while (x < safe_chroma_width);
+
+ if (x < chroma_width) {
+ // Begin right edge iteration. Same as the normal iterations, but the
+ // |average_luma| computation requires a duplicated luma value at the
+ // end.
+ const int luma_x = x << subsampling_x;
+ const int valid_range = width - luma_x;
+ memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
+ luma_buffer[valid_range] = in_y_row[width - 1];
+
+ const int16x8_t average_luma =
+ vreinterpretq_s16_u16(GetAverageLuma(luma_buffer, subsampling_x));
+ const int16x8_t blended = BlendChromaValsNoCfl(
+ scaling_lut, &in_chroma_row[x], &(noise_image[y + start_height][x]),
+ average_luma, scaling_shift_vect, offset, luma_multiplier,
+ chroma_multiplier);
+ StoreUnsigned8(&out_chroma_row[x],
+ vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+ // End of right edge iteration.
+ }
+
+ in_y_row += source_stride_y << subsampling_y;
+ in_chroma_row += source_stride_chroma;
+ out_chroma_row += dest_stride;
+ } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == false.
+void BlendNoiseWithImageChroma8bpp_NEON(
+ Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+ int min_value, int max_chroma, int width, int height, int start_height,
+ int subsampling_x, int subsampling_y,
+ const uint8_t scaling_lut[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+ assert(plane == kPlaneU || plane == kPlaneV);
+ const auto* noise_image =
+ static_cast<const Array2D<int8_t>*>(noise_image_ptr);
+ const auto* in_y = static_cast<const uint8_t*>(source_plane_y);
+ const auto* in_uv = static_cast<const uint8_t*>(source_plane_uv);
+ auto* out_uv = static_cast<uint8_t*>(dest_plane_uv);
+
+ const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
+ const int luma_multiplier =
+ (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
+ const int multiplier =
+ (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
+ BlendChromaPlane8bpp_NEON(noise_image[plane], min_value, max_chroma, width,
+ height, start_height, subsampling_x, subsampling_y,
+ params.chroma_scaling, offset, multiplier,
+ luma_multiplier, scaling_lut, in_y, source_stride_y,
+ in_uv, source_stride_uv, out_uv, dest_stride_uv);
+}
+
+inline void WriteOverlapLine8bpp_NEON(const int8_t* noise_stripe_row,
+ const int8_t* noise_stripe_row_prev,
+ int plane_width,
+ const int8x8_t grain_coeff,
+ const int8x8_t old_coeff,
+ int8_t* noise_image_row) {
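+  // For each x this computes
+  //   noise_image_row[x] = saturate_to_int8(
+  //       Round2(grain_coeff * noise_stripe_row[x] +
+  //              old_coeff * noise_stripe_row_prev[x], 5))
+  // using widening multiplies and a saturating rounding narrow shift.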
+ int x = 0;
+ do {
+ // Note that these reads may exceed noise_stripe_row's width by up to 7
+ // bytes.
+ const int8x8_t source_grain = vld1_s8(noise_stripe_row + x);
+ const int8x8_t source_old = vld1_s8(noise_stripe_row_prev + x);
+ const int16x8_t weighted_grain = vmull_s8(grain_coeff, source_grain);
+ const int16x8_t grain = vmlal_s8(weighted_grain, old_coeff, source_old);
+ // Note that this write may exceed noise_image_row's width by up to 7 bytes.
+ vst1_s8(noise_image_row + x, vqrshrn_n_s16(grain, 5));
+ x += 8;
+ } while (x < plane_width);
+}
+
+void ConstructNoiseImageOverlap8bpp_NEON(const void* noise_stripes_buffer,
+ int width, int height,
+ int subsampling_x, int subsampling_y,
+ void* noise_image_buffer) {
+ const auto* noise_stripes =
+ static_cast<const Array2DView<int8_t>*>(noise_stripes_buffer);
+ auto* noise_image = static_cast<Array2D<int8_t>*>(noise_image_buffer);
+ const int plane_width = (width + subsampling_x) >> subsampling_x;
+ const int plane_height = (height + subsampling_y) >> subsampling_y;
+ const int stripe_height = 32 >> subsampling_y;
+ const int stripe_mask = stripe_height - 1;
+ int y = stripe_height;
+ int luma_num = 1;
+ if (subsampling_y == 0) {
+ const int8x8_t first_row_grain_coeff = vdup_n_s8(17);
+ const int8x8_t first_row_old_coeff = vdup_n_s8(27);
+ const int8x8_t second_row_grain_coeff = first_row_old_coeff;
+ const int8x8_t second_row_old_coeff = first_row_grain_coeff;
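+    // The {17, 27} pair here (and {22, 23} in the subsampled branch below)
+    // satisfies a^2 + b^2 ~= 2^10, so after the Round2(, 5) inside
+    // WriteOverlapLine8bpp_NEON the blend approximately preserves the grain
+    // variance, assuming the overlapping stripes are independent.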
+ for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
+ const int8_t* noise_stripe = (*noise_stripes)[luma_num];
+ const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ WriteOverlapLine8bpp_NEON(
+ noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
+ first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+
+ WriteOverlapLine8bpp_NEON(&noise_stripe[plane_width],
+ &noise_stripe_prev[(32 + 1) * plane_width],
+ plane_width, second_row_grain_coeff,
+ second_row_old_coeff, (*noise_image)[y + 1]);
+ }
+ // Either one partial stripe remains (remaining_height > 0),
+    // OR the image is less than one stripe high (remaining_height < 0),
+ // OR all stripes are completed (remaining_height == 0).
+ const int remaining_height = plane_height - y;
+ if (remaining_height <= 0) {
+ return;
+ }
+ const int8_t* noise_stripe = (*noise_stripes)[luma_num];
+ const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ WriteOverlapLine8bpp_NEON(
+ noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
+ first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+
+ if (remaining_height > 1) {
+ WriteOverlapLine8bpp_NEON(&noise_stripe[plane_width],
+ &noise_stripe_prev[(32 + 1) * plane_width],
+ plane_width, second_row_grain_coeff,
+ second_row_old_coeff, (*noise_image)[y + 1]);
+ }
+ } else { // subsampling_y == 1
+ const int8x8_t first_row_grain_coeff = vdup_n_s8(22);
+ const int8x8_t first_row_old_coeff = vdup_n_s8(23);
+ for (; y < plane_height; ++luma_num, y += stripe_height) {
+ const int8_t* noise_stripe = (*noise_stripes)[luma_num];
+ const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ WriteOverlapLine8bpp_NEON(
+ noise_stripe, &noise_stripe_prev[16 * plane_width], plane_width,
+ first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+ }
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+
+ // LumaAutoRegressionFunc
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<8, int8_t, 1>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<8, int8_t, 2>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<8, int8_t, 3>;
+
+ // ChromaAutoRegressionFunc[use_luma][auto_regression_coeff_lag]
+ // Chroma autoregression should never be called when lag is 0 and use_luma
+ // is false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 1, false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 2, false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 3, false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 0, true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 1, true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 2, true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 3, true>;
+
+ dsp->film_grain.construct_noise_image_overlap =
+ ConstructNoiseImageOverlap8bpp_NEON;
+
+ dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_NEON;
+
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_NEON<8, int8_t, uint8_t>;
+ dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_NEON;
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_NEON<8, int8_t, uint8_t>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+
+ // LumaAutoRegressionFunc
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<10, int16_t, 1>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<10, int16_t, 2>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<10, int16_t, 3>;
+
+  // ChromaAutoRegressionFunc[use_luma][auto_regression_coeff_lag]
+ // Chroma autoregression should never be called when lag is 0 and use_luma
+ // is false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 1, false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 2, false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 3, false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 0, true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 1, true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 2, true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 3, true>;
+
+ dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_NEON;
+
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_NEON<10, int16_t, uint16_t>;
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_NEON<10, int16_t, uint16_t>;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace film_grain
+
+void FilmGrainInit_NEON() {
+ film_grain::low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ film_grain::high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void FilmGrainInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/film_grain_neon.h b/src/dsp/arm/film_grain_neon.h
new file mode 100644
index 0000000..44b3d1d
--- /dev/null
+++ b/src/dsp/arm/film_grain_neon.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_FILM_GRAIN_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_FILM_GRAIN_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initialize members of Dsp::film_grain. This function is not thread-safe.
+void FilmGrainInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainAutoregressionLuma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainAutoregressionLuma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainAutoregressionChroma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainAutoregressionChroma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainConstructNoiseImageOverlap LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_FILM_GRAIN_NEON_H_
diff --git a/src/dsp/arm/intra_edge_neon.cc b/src/dsp/arm/intra_edge_neon.cc
new file mode 100644
index 0000000..00b186a
--- /dev/null
+++ b/src/dsp/arm/intra_edge_neon.cc
@@ -0,0 +1,301 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h" // RightShiftWithRounding()
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Simplified version of intra_edge.cc:kKernels[][]. Only |strength| 1 and 2 are
+// required.
+constexpr int kKernelsNEON[3][2] = {{4, 8}, {5, 6}};
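+// For reference, the full 5-tap kernels are {0, 4, 8, 4, 0}, {0, 5, 6, 5, 0}
+// and {2, 4, 4, 4, 2}; the |strength| 3 kernel is handled separately below
+// with shifts instead of a table lookup.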
+
+void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
+ assert(strength == 1 || strength == 2 || strength == 3);
+ const int kernel_index = strength - 1;
+ auto* const dst_buffer = static_cast<uint8_t*>(buffer);
+
+ // The first element is not written out (but it is input) so the number of
+ // elements written is |size| - 1.
+ if (size == 1) return;
+
+ // |strength| 1 and 2 use a 3 tap filter.
+ if (strength < 3) {
+    // The last value requires extending the buffer (duplicating
+    // |dst_buffer[size - 1]|). Calculate it here to avoid extra processing in
+    // NEON.
+ const uint8_t last_val = RightShiftWithRounding(
+ kKernelsNEON[kernel_index][0] * dst_buffer[size - 2] +
+ kKernelsNEON[kernel_index][1] * dst_buffer[size - 1] +
+ kKernelsNEON[kernel_index][0] * dst_buffer[size - 1],
+ 4);
+
+ const uint8x8_t krn1 = vdup_n_u8(kKernelsNEON[kernel_index][1]);
+
+ // The first value we need gets overwritten by the output from the
+ // previous iteration.
+ uint8x16_t src_0 = vld1q_u8(dst_buffer);
+ int i = 1;
+
+    // Process blocks until there are fewer than 16 values remaining.
+ for (; i < size - 15; i += 16) {
+ // Loading these at the end of the block with |src_0| will read past the
+ // end of |top_row_data[160]|, the source of |buffer|.
+ const uint8x16_t src_1 = vld1q_u8(dst_buffer + i);
+ const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1);
+ uint16x8_t sum_lo = vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_2));
+ sum_lo = vmulq_n_u16(sum_lo, kKernelsNEON[kernel_index][0]);
+ sum_lo = vmlal_u8(sum_lo, vget_low_u8(src_1), krn1);
+ uint16x8_t sum_hi = vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_2));
+ sum_hi = vmulq_n_u16(sum_hi, kKernelsNEON[kernel_index][0]);
+ sum_hi = vmlal_u8(sum_hi, vget_high_u8(src_1), krn1);
+
+ const uint8x16_t result =
+ vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
+
+ // Load the next row before overwriting. This loads an extra 15 values
+ // past |size| on the trailing iteration.
+ src_0 = vld1q_u8(dst_buffer + i + 15);
+
+ vst1q_u8(dst_buffer + i, result);
+ }
+
+    // The last output value |last_val| was already calculated, so if
+    // |remainder| == 1 we don't have to do anything.
+ const int remainder = (size - 1) & 0xf;
+ if (remainder > 1) {
+ uint8_t temp[16];
+ const uint8x16_t src_1 = vld1q_u8(dst_buffer + i);
+ const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1);
+
+ uint16x8_t sum_lo = vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_2));
+ sum_lo = vmulq_n_u16(sum_lo, kKernelsNEON[kernel_index][0]);
+ sum_lo = vmlal_u8(sum_lo, vget_low_u8(src_1), krn1);
+ uint16x8_t sum_hi = vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_2));
+ sum_hi = vmulq_n_u16(sum_hi, kKernelsNEON[kernel_index][0]);
+ sum_hi = vmlal_u8(sum_hi, vget_high_u8(src_1), krn1);
+
+ const uint8x16_t result =
+ vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
+
+ vst1q_u8(temp, result);
+ memcpy(dst_buffer + i, temp, remainder);
+ }
+
+ dst_buffer[size - 1] = last_val;
+ return;
+ }
+
+ assert(strength == 3);
+ // 5 tap filter. The first element requires duplicating |buffer[0]| and the
+ // last two elements require duplicating |buffer[size - 1]|.
+ uint8_t special_vals[3];
+ special_vals[0] = RightShiftWithRounding(
+ (dst_buffer[0] << 1) + (dst_buffer[0] << 2) + (dst_buffer[1] << 2) +
+ (dst_buffer[2] << 2) + (dst_buffer[3] << 1),
+ 4);
+ // Clamp index for very small |size| values.
+ const int first_index_min = std::max(size - 4, 0);
+ const int second_index_min = std::max(size - 3, 0);
+ const int third_index_min = std::max(size - 2, 0);
+ special_vals[1] = RightShiftWithRounding(
+ (dst_buffer[first_index_min] << 1) + (dst_buffer[second_index_min] << 2) +
+ (dst_buffer[third_index_min] << 2) + (dst_buffer[size - 1] << 2) +
+ (dst_buffer[size - 1] << 1),
+ 4);
+ special_vals[2] = RightShiftWithRounding(
+ (dst_buffer[second_index_min] << 1) + (dst_buffer[third_index_min] << 2) +
+      // (x << 2) + (x << 2) == x << 3
+ (dst_buffer[size - 1] << 3) + (dst_buffer[size - 1] << 1),
+ 4);
+
+ // The first two values we need get overwritten by the output from the
+ // previous iteration.
+ uint8x16_t src_0 = vld1q_u8(dst_buffer - 1);
+ uint8x16_t src_1 = vld1q_u8(dst_buffer);
+ int i = 1;
+
+ for (; i < size - 15; i += 16) {
+ // Loading these at the end of the block with |src_[01]| will read past
+ // the end of |top_row_data[160]|, the source of |buffer|.
+ const uint8x16_t src_2 = vld1q_u8(dst_buffer + i);
+ const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1);
+ const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2);
+
+ uint16x8_t sum_lo =
+ vshlq_n_u16(vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_4)), 1);
+ const uint16x8_t sum_123_lo = vaddw_u8(
+ vaddl_u8(vget_low_u8(src_1), vget_low_u8(src_2)), vget_low_u8(src_3));
+ sum_lo = vaddq_u16(sum_lo, vshlq_n_u16(sum_123_lo, 2));
+
+ uint16x8_t sum_hi =
+ vshlq_n_u16(vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_4)), 1);
+ const uint16x8_t sum_123_hi =
+ vaddw_u8(vaddl_u8(vget_high_u8(src_1), vget_high_u8(src_2)),
+ vget_high_u8(src_3));
+ sum_hi = vaddq_u16(sum_hi, vshlq_n_u16(sum_123_hi, 2));
+
+ const uint8x16_t result =
+ vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
+
+ src_0 = vld1q_u8(dst_buffer + i + 14);
+ src_1 = vld1q_u8(dst_buffer + i + 15);
+
+ vst1q_u8(dst_buffer + i, result);
+ }
+
+ const int remainder = (size - 1) & 0xf;
+  // As in the 3 tap case, but if only two values remain they have already
+  // been calculated.
+ if (remainder > 2) {
+ uint8_t temp[16];
+ const uint8x16_t src_2 = vld1q_u8(dst_buffer + i);
+ const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1);
+ const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2);
+
+ uint16x8_t sum_lo =
+ vshlq_n_u16(vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_4)), 1);
+ const uint16x8_t sum_123_lo = vaddw_u8(
+ vaddl_u8(vget_low_u8(src_1), vget_low_u8(src_2)), vget_low_u8(src_3));
+ sum_lo = vaddq_u16(sum_lo, vshlq_n_u16(sum_123_lo, 2));
+
+ uint16x8_t sum_hi =
+ vshlq_n_u16(vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_4)), 1);
+ const uint16x8_t sum_123_hi =
+ vaddw_u8(vaddl_u8(vget_high_u8(src_1), vget_high_u8(src_2)),
+ vget_high_u8(src_3));
+ sum_hi = vaddq_u16(sum_hi, vshlq_n_u16(sum_123_hi, 2));
+
+ const uint8x16_t result =
+ vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
+
+ vst1q_u8(temp, result);
+ memcpy(dst_buffer + i, temp, remainder);
+ }
+
+ dst_buffer[1] = special_vals[0];
+ // Avoid overwriting |dst_buffer[0]|.
+ if (size > 2) dst_buffer[size - 2] = special_vals[1];
+ dst_buffer[size - 1] = special_vals[2];
+}
+
+// (-|src0| + |src1| * 9 + |src2| * 9 - |src3|) >> 4
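+// As a sanity check, a constant edge value v gives (16 * v) >> 4 = v.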
+uint8x8_t Upsample(const uint8x8_t src0, const uint8x8_t src1,
+ const uint8x8_t src2, const uint8x8_t src3) {
+ const uint16x8_t middle = vmulq_n_u16(vaddl_u8(src1, src2), 9);
+ const uint16x8_t ends = vaddl_u8(src0, src3);
+ const int16x8_t sum =
+ vsubq_s16(vreinterpretq_s16_u16(middle), vreinterpretq_s16_u16(ends));
+ return vqrshrun_n_s16(sum, 4);
+}
+
+void IntraEdgeUpsampler_NEON(void* buffer, const int size) {
+ assert(size % 4 == 0 && size <= 16);
+ auto* const pixel_buffer = static_cast<uint8_t*>(buffer);
+  // This is OK because we don't read this value when |size| is 4 or 8, but if
+  // we write |pixel_buffer[size]| and then vld() it, that seems to introduce
+  // some latency.
+ pixel_buffer[-2] = pixel_buffer[-1];
+ if (size == 4) {
+ // This uses one load and two vtbl() which is better than 4x Load{Lo,Hi}4().
+ const uint8x8_t src = vld1_u8(pixel_buffer - 1);
+ // The outside values are negated so put those in the same vector.
+ const uint8x8_t src03 = vtbl1_u8(src, vcreate_u8(0x0404030202010000));
+ // Reverse |src1| and |src2| so we can use |src2| for the interleave at the
+ // end.
+ const uint8x8_t src21 = vtbl1_u8(src, vcreate_u8(0x0302010004030201));
+
+ const uint16x8_t middle = vmull_u8(src21, vdup_n_u8(9));
+ const int16x8_t half_sum = vsubq_s16(
+ vreinterpretq_s16_u16(middle), vreinterpretq_s16_u16(vmovl_u8(src03)));
+ const int16x4_t sum =
+ vadd_s16(vget_low_s16(half_sum), vget_high_s16(half_sum));
+ const uint8x8_t result = vqrshrun_n_s16(vcombine_s16(sum, sum), 4);
+
+ vst1_u8(pixel_buffer - 1, InterleaveLow8(result, src21));
+ return;
+ } else if (size == 8) {
+    // Likewise, one load + multiple vtbls seems preferable to multiple loads.
+ const uint8x16_t src = vld1q_u8(pixel_buffer - 1);
+ const uint8x8_t src0 = VQTbl1U8(src, vcreate_u8(0x0605040302010000));
+ const uint8x8_t src1 = vget_low_u8(src);
+ const uint8x8_t src2 = VQTbl1U8(src, vcreate_u8(0x0807060504030201));
+ const uint8x8_t src3 = VQTbl1U8(src, vcreate_u8(0x0808070605040302));
+
+ const uint8x8x2_t output = {Upsample(src0, src1, src2, src3), src2};
+ vst2_u8(pixel_buffer - 1, output);
+ return;
+ }
+ assert(size == 12 || size == 16);
+ // Extend the input borders to avoid branching later.
+ pixel_buffer[size] = pixel_buffer[size - 1];
+ const uint8x16_t src0 = vld1q_u8(pixel_buffer - 2);
+ const uint8x16_t src1 = vld1q_u8(pixel_buffer - 1);
+ const uint8x16_t src2 = vld1q_u8(pixel_buffer);
+ const uint8x16_t src3 = vld1q_u8(pixel_buffer + 1);
+
+ const uint8x8_t result_lo = Upsample(vget_low_u8(src0), vget_low_u8(src1),
+ vget_low_u8(src2), vget_low_u8(src3));
+
+ const uint8x8x2_t output_lo = {result_lo, vget_low_u8(src2)};
+ vst2_u8(pixel_buffer - 1, output_lo);
+
+ const uint8x8_t result_hi = Upsample(vget_high_u8(src0), vget_high_u8(src1),
+ vget_high_u8(src2), vget_high_u8(src3));
+
+ if (size == 12) {
+ vst1_u8(pixel_buffer + 15, InterleaveLow8(result_hi, vget_high_u8(src2)));
+ } else /* size == 16 */ {
+ const uint8x8x2_t output_hi = {result_hi, vget_high_u8(src2)};
+ vst2_u8(pixel_buffer + 15, output_hi);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->intra_edge_filter = IntraEdgeFilter_NEON;
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_NEON;
+}
+
+} // namespace
+
+void IntraEdgeInit_NEON() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraEdgeInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/intra_edge_neon.h b/src/dsp/arm/intra_edge_neon.h
new file mode 100644
index 0000000..d3bb243
--- /dev/null
+++ b/src/dsp/arm/intra_edge_neon.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_edge_filter and Dsp::intra_edge_upsampler. This
+// function is not thread-safe.
+void IntraEdgeInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_
diff --git a/src/dsp/arm/intrapred_cfl_neon.cc b/src/dsp/arm/intrapred_cfl_neon.cc
new file mode 100644
index 0000000..45fe33b
--- /dev/null
+++ b/src/dsp/arm/intrapred_cfl_neon.cc
@@ -0,0 +1,479 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+uint8x16_t Set2ValuesQ(const uint8_t* a) {
+ uint16_t combined_values = a[0] | a[1] << 8;
+ return vreinterpretq_u8_u16(vdupq_n_u16(combined_values));
+}
+
+uint32_t SumVector(uint32x2_t a) {
+#if defined(__aarch64__)
+ return vaddv_u32(a);
+#else
+ const uint64x1_t b = vpaddl_u32(a);
+ return vget_lane_u32(vreinterpret_u32_u64(b), 0);
+#endif // defined(__aarch64__)
+}
+
+uint32_t SumVector(uint32x4_t a) {
+#if defined(__aarch64__)
+ return vaddvq_u32(a);
+#else
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
+ return vget_lane_u32(vreinterpret_u32_u64(c), 0);
+#endif // defined(__aarch64__)
+}
+
+// Divide by the number of elements.
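+// For example, a 16x8 block uses RightShiftWithRounding(sum, 4 + 3), i.e. a
+// rounded divide by 128.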
+uint32_t Average(const uint32_t sum, const int width, const int height) {
+ return RightShiftWithRounding(sum, FloorLog2(width) + FloorLog2(height));
+}
+
+// Subtract |val| from every element in |a|.
+void BlockSubtract(const uint32_t val,
+ int16_t a[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int width, const int height) {
+ assert(val <= INT16_MAX);
+ const int16x8_t val_v = vdupq_n_s16(static_cast<int16_t>(val));
+
+ for (int y = 0; y < height; ++y) {
+ if (width == 4) {
+ const int16x4_t b = vld1_s16(a[y]);
+ vst1_s16(a[y], vsub_s16(b, vget_low_s16(val_v)));
+ } else if (width == 8) {
+ const int16x8_t b = vld1q_s16(a[y]);
+ vst1q_s16(a[y], vsubq_s16(b, val_v));
+ } else if (width == 16) {
+ const int16x8_t b = vld1q_s16(a[y]);
+ const int16x8_t c = vld1q_s16(a[y] + 8);
+ vst1q_s16(a[y], vsubq_s16(b, val_v));
+ vst1q_s16(a[y] + 8, vsubq_s16(c, val_v));
+ } else /* block_width == 32 */ {
+ const int16x8_t b = vld1q_s16(a[y]);
+ const int16x8_t c = vld1q_s16(a[y] + 8);
+ const int16x8_t d = vld1q_s16(a[y] + 16);
+ const int16x8_t e = vld1q_s16(a[y] + 24);
+ vst1q_s16(a[y], vsubq_s16(b, val_v));
+ vst1q_s16(a[y] + 8, vsubq_s16(c, val_v));
+ vst1q_s16(a[y] + 16, vsubq_s16(d, val_v));
+ vst1q_s16(a[y] + 24, vsubq_s16(e, val_v));
+ }
+ }
+}
+
+template <int block_width, int block_height>
+void CflSubsampler420_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, const ptrdiff_t stride) {
+ const auto* src = static_cast<const uint8_t*>(source);
+ uint32_t sum;
+ if (block_width == 4) {
+ assert(max_luma_width >= 8);
+ uint32x2_t running_sum = vdup_n_u32(0);
+
+ for (int y = 0; y < block_height; ++y) {
+ const uint8x8_t row0 = vld1_u8(src);
+ const uint8x8_t row1 = vld1_u8(src + stride);
+
+ uint16x4_t sum_row = vpadal_u8(vpaddl_u8(row0), row1);
+ sum_row = vshl_n_u16(sum_row, 1);
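+      // Doubling the 2x2 sum stores the subsampled luma scaled by 8 (<< 3),
+      // matching the << 3 used by the 444 subsampler.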
+ running_sum = vpadal_u16(running_sum, sum_row);
+ vst1_s16(luma[y], vreinterpret_s16_u16(sum_row));
+
+ if (y << 1 < max_luma_height - 2) {
+        // Once this threshold is reached, |src| stops advancing and the
+        // remaining rows repeat the last valid pair, so the loop could be
+        // simplified.
+ src += stride << 1;
+ }
+ }
+
+ sum = SumVector(running_sum);
+ } else if (block_width == 8) {
+ const uint8x16_t x_index = {0, 0, 2, 2, 4, 4, 6, 6,
+ 8, 8, 10, 10, 12, 12, 14, 14};
+ const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2);
+ const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
+
+ uint32x4_t running_sum = vdupq_n_u32(0);
+
+ for (int y = 0; y < block_height; ++y) {
+ const uint8x16_t x_max0 = Set2ValuesQ(src + max_luma_width - 2);
+ const uint8x16_t x_max1 = Set2ValuesQ(src + max_luma_width - 2 + stride);
+
+ uint8x16_t row0 = vld1q_u8(src);
+ row0 = vbslq_u8(x_mask, row0, x_max0);
+ uint8x16_t row1 = vld1q_u8(src + stride);
+ row1 = vbslq_u8(x_mask, row1, x_max1);
+
+ uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
+ sum_row = vshlq_n_u16(sum_row, 1);
+ running_sum = vpadalq_u16(running_sum, sum_row);
+ vst1q_s16(luma[y], vreinterpretq_s16_u16(sum_row));
+
+ if (y << 1 < max_luma_height - 2) {
+ src += stride << 1;
+ }
+ }
+
+ sum = SumVector(running_sum);
+ } else /* block_width >= 16 */ {
+ const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2);
+ uint32x4_t running_sum = vdupq_n_u32(0);
+
+ for (int y = 0; y < block_height; ++y) {
+ uint8x16_t x_index = {0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30};
+ const uint8x16_t x_max00 = vdupq_n_u8(src[max_luma_width - 2]);
+ const uint8x16_t x_max01 = vdupq_n_u8(src[max_luma_width - 2 + 1]);
+ const uint8x16_t x_max10 = vdupq_n_u8(src[stride + max_luma_width - 2]);
+ const uint8x16_t x_max11 =
+ vdupq_n_u8(src[stride + max_luma_width - 2 + 1]);
+ for (int x = 0; x < block_width; x += 16) {
+ const ptrdiff_t src_x_offset = x << 1;
+ const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
+ const uint8x16x2_t row0 = vld2q_u8(src + src_x_offset);
+ const uint8x16x2_t row1 = vld2q_u8(src + src_x_offset + stride);
+ const uint8x16_t row_masked_00 = vbslq_u8(x_mask, row0.val[0], x_max00);
+ const uint8x16_t row_masked_01 = vbslq_u8(x_mask, row0.val[1], x_max01);
+ const uint8x16_t row_masked_10 = vbslq_u8(x_mask, row1.val[0], x_max10);
+ const uint8x16_t row_masked_11 = vbslq_u8(x_mask, row1.val[1], x_max11);
+
+ uint16x8_t sum_row_lo =
+ vaddl_u8(vget_low_u8(row_masked_00), vget_low_u8(row_masked_01));
+ sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_10));
+ sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_11));
+ sum_row_lo = vshlq_n_u16(sum_row_lo, 1);
+ running_sum = vpadalq_u16(running_sum, sum_row_lo);
+ vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(sum_row_lo));
+
+ uint16x8_t sum_row_hi =
+ vaddl_u8(vget_high_u8(row_masked_00), vget_high_u8(row_masked_01));
+ sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_10));
+ sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_11));
+ sum_row_hi = vshlq_n_u16(sum_row_hi, 1);
+ running_sum = vpadalq_u16(running_sum, sum_row_hi);
+ vst1q_s16(luma[y] + x + 8, vreinterpretq_s16_u16(sum_row_hi));
+
+ x_index = vaddq_u8(x_index, vdupq_n_u8(32));
+ }
+ if (y << 1 < max_luma_height - 2) {
+ src += stride << 1;
+ }
+ }
+ sum = SumVector(running_sum);
+ }
+
+ const uint32_t average = Average(sum, block_width, block_height);
+ BlockSubtract(average, luma, block_width, block_height);
+}
+
+template <int block_width, int block_height>
+void CflSubsampler444_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, const ptrdiff_t stride) {
+ const auto* src = static_cast<const uint8_t*>(source);
+ uint32_t sum;
+ if (block_width == 4) {
+ assert(max_luma_width >= 4);
+ uint32x4_t running_sum = vdupq_n_u32(0);
+ uint8x8_t row = vdup_n_u8(0);
+
+ for (int y = 0; y < block_height; y += 2) {
+ row = Load4<0>(src, row);
+ row = Load4<1>(src + stride, row);
+ if (y < (max_luma_height - 1)) {
+ src += stride << 1;
+ }
+
+ const uint16x8_t row_shifted = vshll_n_u8(row, 3);
+ running_sum = vpadalq_u16(running_sum, row_shifted);
+ vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted)));
+ vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted)));
+ }
+
+ sum = SumVector(running_sum);
+ } else if (block_width == 8) {
+ const uint8x8_t x_index = {0, 1, 2, 3, 4, 5, 6, 7};
+ const uint8x8_t x_max_index = vdup_n_u8(max_luma_width - 1);
+ const uint8x8_t x_mask = vclt_u8(x_index, x_max_index);
+
+ uint32x4_t running_sum = vdupq_n_u32(0);
+
+ for (int y = 0; y < block_height; ++y) {
+ const uint8x8_t x_max = vdup_n_u8(src[max_luma_width - 1]);
+ const uint8x8_t row = vbsl_u8(x_mask, vld1_u8(src), x_max);
+
+ const uint16x8_t row_shifted = vshll_n_u8(row, 3);
+ running_sum = vpadalq_u16(running_sum, row_shifted);
+ vst1q_s16(luma[y], vreinterpretq_s16_u16(row_shifted));
+
+ if (y < max_luma_height - 1) {
+ src += stride;
+ }
+ }
+
+ sum = SumVector(running_sum);
+ } else /* block_width >= 16 */ {
+ const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 1);
+ uint32x4_t running_sum = vdupq_n_u32(0);
+
+ for (int y = 0; y < block_height; ++y) {
+ uint8x16_t x_index = {0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15};
+ const uint8x16_t x_max = vdupq_n_u8(src[max_luma_width - 1]);
+ for (int x = 0; x < block_width; x += 16) {
+ const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
+ const uint8x16_t row = vbslq_u8(x_mask, vld1q_u8(src + x), x_max);
+
+ const uint16x8_t row_shifted_low = vshll_n_u8(vget_low_u8(row), 3);
+ const uint16x8_t row_shifted_high = vshll_n_u8(vget_high_u8(row), 3);
+ running_sum = vpadalq_u16(running_sum, row_shifted_low);
+ running_sum = vpadalq_u16(running_sum, row_shifted_high);
+ vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(row_shifted_low));
+ vst1q_s16(luma[y] + x + 8, vreinterpretq_s16_u16(row_shifted_high));
+
+ x_index = vaddq_u8(x_index, vdupq_n_u8(16));
+ }
+ if (y < max_luma_height - 1) {
+ src += stride;
+ }
+ }
+ sum = SumVector(running_sum);
+ }
+
+ const uint32_t average = Average(sum, block_width, block_height);
+ BlockSubtract(average, luma, block_width, block_height);
+}
+
+// Saturate |dc + ((alpha * luma) >> 6)| to uint8_t.
+inline uint8x8_t Combine8(const int16x8_t luma, const int alpha,
+ const int16x8_t dc) {
+ const int16x8_t la = vmulq_n_s16(luma, alpha);
+ // Subtract the sign bit to round towards zero.
+ const int16x8_t sub_sign = vsraq_n_s16(la, la, 15);
+ // Shift and accumulate.
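+  // vrsraq_n_s16 computes dc + ((sub_sign + (1 << 5)) >> 6) per lane.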
+ const int16x8_t result = vrsraq_n_s16(dc, sub_sign, 6);
+ return vqmovun_s16(result);
+}
+
+// The exact range of luma/alpha is not important because the result is
+// saturated to uint8_t: even a saturated int16_t shifted right by 6 exceeds
+// the uint8_t range.
+template <int block_height>
+inline void CflIntraPredictor4xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
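+  // |dst[0]| is assumed to already contain the DC prediction for this block
+  // (the DC predictor runs before CfL), so it is broadcast as the base value.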
+ for (int y = 0; y < block_height; y += 2) {
+ const int16x4_t luma_row0 = vld1_s16(luma[y]);
+ const int16x4_t luma_row1 = vld1_s16(luma[y + 1]);
+ const uint8x8_t sum =
+ Combine8(vcombine_s16(luma_row0, luma_row1), alpha, dc);
+ StoreLo4(dst, sum);
+ dst += stride;
+ StoreHi4(dst, sum);
+ dst += stride;
+ }
+}
+
+template <int block_height>
+inline void CflIntraPredictor8xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row = vld1q_s16(luma[y]);
+ const uint8x8_t sum = Combine8(luma_row, alpha, dc);
+ vst1_u8(dst, sum);
+ dst += stride;
+ }
+}
+
+template <int block_height>
+inline void CflIntraPredictor16xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+ const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+ const uint8x8_t sum_0 = Combine8(luma_row_0, alpha, dc);
+ const uint8x8_t sum_1 = Combine8(luma_row_1, alpha, dc);
+ vst1_u8(dst, sum_0);
+ vst1_u8(dst + 8, sum_1);
+ dst += stride;
+ }
+}
+
+template <int block_height>
+inline void CflIntraPredictor32xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+ const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+ const int16x8_t luma_row_2 = vld1q_s16(luma[y] + 16);
+ const int16x8_t luma_row_3 = vld1q_s16(luma[y] + 24);
+ const uint8x8_t sum_0 = Combine8(luma_row_0, alpha, dc);
+ const uint8x8_t sum_1 = Combine8(luma_row_1, alpha, dc);
+ const uint8x8_t sum_2 = Combine8(luma_row_2, alpha, dc);
+ const uint8x8_t sum_3 = Combine8(luma_row_3, alpha, dc);
+ vst1_u8(dst, sum_0);
+ vst1_u8(dst + 8, sum_1);
+ vst1_u8(dst + 16, sum_2);
+ vst1_u8(dst + 24, sum_3);
+ dst += stride;
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler420_NEON<4, 4>;
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler420_NEON<4, 8>;
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler420_NEON<4, 16>;
+
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler420_NEON<8, 4>;
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler420_NEON<8, 8>;
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler420_NEON<8, 16>;
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler420_NEON<8, 32>;
+
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler420_NEON<16, 4>;
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler420_NEON<16, 8>;
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler420_NEON<16, 16>;
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler420_NEON<16, 32>;
+
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler420_NEON<32, 8>;
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler420_NEON<32, 16>;
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler420_NEON<32, 32>;
+
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler444_NEON<4, 4>;
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler444_NEON<4, 8>;
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler444_NEON<4, 16>;
+
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler444_NEON<8, 4>;
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler444_NEON<8, 8>;
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler444_NEON<8, 16>;
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler444_NEON<8, 32>;
+
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler444_NEON<16, 4>;
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler444_NEON<16, 8>;
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler444_NEON<16, 16>;
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler444_NEON<16, 32>;
+
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler444_NEON<32, 8>;
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler444_NEON<32, 16>;
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler444_NEON<32, 32>;
+
+ dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor4xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor4xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize4x16] = CflIntraPredictor4xN_NEON<16>;
+
+ dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor8xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor8xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize8x16] = CflIntraPredictor8xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize8x32] = CflIntraPredictor8xN_NEON<32>;
+
+ dsp->cfl_intra_predictors[kTransformSize16x4] = CflIntraPredictor16xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize16x8] = CflIntraPredictor16xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor16xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor16xN_NEON<32>;
+
+ dsp->cfl_intra_predictors[kTransformSize32x8] = CflIntraPredictor32xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor32xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor32xN_NEON<32>;
+ // Max Cfl predictor size is 32x32.
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void IntraPredCflInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredCflInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc
new file mode 100644
index 0000000..805ba81
--- /dev/null
+++ b/src/dsp/arm/intrapred_directional_neon.cc
@@ -0,0 +1,926 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm> // std::min
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring> // memset
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Blend two values using weights that sum to 32.
+inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
+ const uint8x8_t a_weight,
+ const uint8x8_t b_weight) {
+ const uint16x8_t a_product = vmull_u8(a, a_weight);
+ const uint16x8_t b_product = vmull_u8(b, b_weight);
+
+ return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5);
+}
+
+// For vertical operations both weights are derived from one constant value.
+inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
+ const uint8_t weight) {
+ return WeightedBlend(a, b, vdup_n_u8(32 - weight), vdup_n_u8(weight));
+}
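+
+// For reference, a scalar sketch of the blend above. This helper is an
+// illustrative assumption recorded for documentation and is not called
+// elsewhere: the two weights sum to 32 and vrshrn_n_u16() rounds before the
+// shift by 5.
+inline uint8_t WeightedBlendScalar(const uint8_t a, const uint8_t b,
+                                   const uint8_t weight) {
+  return static_cast<uint8_t>((a * (32 - weight) + b * weight + 16) >> 5);
+}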
+
+// Fill |left| and |right| with the appropriate values for a given |base_step|.
+inline void LoadStepwise(const uint8_t* const source, const uint8x8_t left_step,
+ const uint8x8_t right_step, uint8x8_t* left,
+ uint8x8_t* right) {
+ const uint8x16_t mixed = vld1q_u8(source);
+ *left = VQTbl1U8(mixed, left_step);
+ *right = VQTbl1U8(mixed, right_step);
+}
+
+// Handle signed step arguments by ignoring the sign. Negative values are
+// considered out of range and overwritten later.
+inline void LoadStepwise(const uint8_t* const source, const int8x8_t left_step,
+ const int8x8_t right_step, uint8x8_t* left,
+ uint8x8_t* right) {
+ LoadStepwise(source, vreinterpret_u8_s8(left_step),
+ vreinterpret_u8_s8(right_step), left, right);
+}
+
+// Process 4 or 8 |width| by any |height|.
+template <int width>
+inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride,
+ const int height, const uint8_t* const top,
+ const int xstep, const bool upsampled) {
+ assert(width == 4 || width == 8);
+
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+
+ const int max_base_x = (width + height - 1) << upsample_shift;
+ const int8x8_t max_base = vdup_n_s8(max_base_x);
+ const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
+
+ const int8x8_t all = vcreate_s8(0x0706050403020100);
+ const int8x8_t even = vcreate_s8(0x0e0c0a0806040200);
+ const int8x8_t base_step = upsampled ? even : all;
+ const int8x8_t right_step = vadd_s8(base_step, vdup_n_s8(1));
+
+ int top_x = xstep;
+ int y = 0;
+ do {
+ const int top_base_x = top_x >> scale_bits;
+
+ if (top_base_x >= max_base_x) {
+ for (int i = y; i < height; ++i) {
+ memset(dst, top[max_base_x], 4 /* width */);
+ dst += stride;
+ }
+ return;
+ }
+
+ const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+
+ // Zone2 uses negative values for xstep. Use signed values to compare
+ // |top_base_x| to |max_base_x|.
+ const int8x8_t base_v = vadd_s8(vdup_n_s8(top_base_x), base_step);
+
+ const uint8x8_t max_base_mask = vclt_s8(base_v, max_base);
+
+ // 4 wide subsamples the output. 8 wide subsamples the input.
+ if (width == 4) {
+ const uint8x8_t left_values = vld1_u8(top + top_base_x);
+ const uint8x8_t right_values = RightShift<8>(left_values);
+ const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+
+ // If |upsampled| is true then extract every other value for output.
+ const uint8x8_t value_stepped =
+ vtbl1_u8(value, vreinterpret_u8_s8(base_step));
+ const uint8x8_t masked_value =
+ vbsl_u8(max_base_mask, value_stepped, top_max_base);
+
+ StoreLo4(dst, masked_value);
+ } else /* width == 8 */ {
+ uint8x8_t left_values, right_values;
+ // WeightedBlend() steps up to Q registers. Downsample the input to avoid
+ // doing extra calculations.
+ LoadStepwise(top + top_base_x, base_step, right_step, &left_values,
+ &right_values);
+
+ const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+ const uint8x8_t masked_value =
+ vbsl_u8(max_base_mask, value, top_max_base);
+
+ vst1_u8(dst, masked_value);
+ }
+ dst += stride;
+ top_x += xstep;
+ } while (++y < height);
+}
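+
+// Illustrative scalar model of one output pixel from the loop above. This is
+// an assumption written down for documentation (the clamp to top[max_base_x]
+// is omitted) and is not called by the code in this file.
+inline uint8_t Zone1PixelScalar(const uint8_t* const top, const int top_x,
+                                const int x, const int upsample_shift) {
+  const int scale_bits = 6 - upsample_shift;
+  const int base = (top_x >> scale_bits) + (x << upsample_shift);
+  const int shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+  return static_cast<uint8_t>(
+      (top[base] * (32 - shift) + top[base + 1] * shift + 16) >> 5);
+}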
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride,
+ const int width, const int height,
+ const uint8_t* const top, const int xstep,
+ const bool upsampled) {
+ assert(width % 8 == 0);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+
+ const int max_base_x = (width + height - 1) << upsample_shift;
+ const int8x8_t max_base = vdup_n_s8(max_base_x);
+ const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
+
+ const int8x8_t all = vcreate_s8(0x0706050403020100);
+ const int8x8_t even = vcreate_s8(0x0e0c0a0806040200);
+ const int8x8_t base_step = upsampled ? even : all;
+ const int8x8_t right_step = vadd_s8(base_step, vdup_n_s8(1));
+ const int8x8_t block_step = vdup_n_s8(8 << upsample_shift);
+
+ int top_x = xstep;
+ int y = 0;
+ do {
+ const int top_base_x = top_x >> scale_bits;
+
+ if (top_base_x >= max_base_x) {
+ for (int i = y; i < height; ++i) {
+ memset(dst, top[max_base_x], 4 /* width */);
+ dst += stride;
+ }
+ return;
+ }
+
+ const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+
+ // Zone2 uses negative values for xstep. Use signed values to compare
+ // |top_base_x| to |max_base_x|.
+ int8x8_t base_v = vadd_s8(vdup_n_s8(top_base_x), base_step);
+
+ int x = 0;
+ do {
+ const uint8x8_t max_base_mask = vclt_s8(base_v, max_base);
+
+ // Extract the input values based on |upsampled| here to avoid doing twice
+ // as many calculations.
+ uint8x8_t left_values, right_values;
+ LoadStepwise(top + top_base_x + x, base_step, right_step, &left_values,
+ &right_values);
+
+ const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+ const uint8x8_t masked_value =
+ vbsl_u8(max_base_mask, value, top_max_base);
+
+ vst1_u8(dst + x, masked_value);
+
+ base_v = vadd_s8(base_v, block_step);
+ x += 8;
+ } while (x < width);
+ top_x += xstep;
+ dst += stride;
+ } while (++y < height);
+}
+
+void DirectionalIntraPredictorZone1_NEON(void* const dest,
+ const ptrdiff_t stride,
+ const void* const top_row,
+ const int width, const int height,
+ const int xstep,
+ const bool upsampled_top) {
+ const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ assert(xstep > 0);
+
+ const int upsample_shift = static_cast<int>(upsampled_top);
+
+ const uint8x8_t all = vcreate_u8(0x0706050403020100);
+
+ if (xstep == 64) {
+ assert(!upsampled_top);
+ const uint8_t* top_ptr = top + 1;
+ int y = 0;
+ do {
+ memcpy(dst, top_ptr, width);
+ memcpy(dst + stride, top_ptr + 1, width);
+ memcpy(dst + 2 * stride, top_ptr + 2, width);
+ memcpy(dst + 3 * stride, top_ptr + 3, width);
+ dst += 4 * stride;
+ top_ptr += 4;
+ y += 4;
+ } while (y < height);
+ } else if (width == 4) {
+ DirectionalZone1_WxH<4>(dst, stride, height, top, xstep, upsampled_top);
+ } else if (xstep > 51) {
+ // 7.11.2.10. Intra edge upsample selection process
+ // if ( d <= 0 || d >= 40 ) useUpsample = 0
+    // For |upsampled_top| the delta is measured from vertical, i.e.
+    // |prediction_angle - 90|. In |kDirectionalIntraPredictorDerivative[]|,
+    // angles less than 51 meet this criterion. The |xstep| value for angle 51
+    // happens to be 51 as well. Shallower angles have greater xstep values.
+ assert(!upsampled_top);
+ const int max_base_x = ((width + height) - 1);
+ const uint8x8_t max_base = vdup_n_u8(max_base_x);
+ const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
+ const uint8x8_t block_step = vdup_n_u8(8);
+
+ int top_x = xstep;
+ int y = 0;
+ do {
+ const int top_base_x = top_x >> 6;
+ const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+ uint8x8_t base_v = vadd_u8(vdup_n_u8(top_base_x), all);
+ int x = 0;
+ // Only calculate a block of 8 when at least one of the output values is
+ // within range. Otherwise it can read off the end of |top|.
+ const int must_calculate_width =
+ std::min(width, max_base_x - top_base_x + 7) & ~7;
+ for (; x < must_calculate_width; x += 8) {
+ const uint8x8_t max_base_mask = vclt_u8(base_v, max_base);
+
+        // Since these |xstep| values cannot be upsampled, the load is
+        // simplified.
+ const uint8x8_t left_values = vld1_u8(top + top_base_x + x);
+ const uint8x8_t right_values = vld1_u8(top + top_base_x + x + 1);
+ const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+ const uint8x8_t masked_value =
+ vbsl_u8(max_base_mask, value, top_max_base);
+
+ vst1_u8(dst + x, masked_value);
+ base_v = vadd_u8(base_v, block_step);
+ }
+ memset(dst + x, top[max_base_x], width - x);
+ dst += stride;
+ top_x += xstep;
+ } while (++y < height);
+ } else {
+ DirectionalZone1_WxH(dst, stride, width, height, top, xstep, upsampled_top);
+ }
+}
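+
+// Note on the xstep == 64 special case above: 64 >> 6 advances exactly one
+// |top| pixel per row and the interpolation shift is zero, so each row
+// reduces to a memcpy of the top row advanced by one pixel.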
+
+// Process 4 or 8 |width| by 4 or 8 |height|.
+template <int width>
+inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride,
+ const int height,
+ const uint8_t* const left_column,
+ const int base_left_y, const int ystep,
+ const int upsample_shift) {
+ assert(width == 4 || width == 8);
+ assert(height == 4 || height == 8);
+ const int scale_bits = 6 - upsample_shift;
+
+ // Zone3 never runs out of left_column values.
+ assert((width + height - 1) << upsample_shift > // max_base_y
+ ((ystep * width) >> scale_bits) +
+ (/* base_step */ 1 << upsample_shift) *
+ (height - 1)); // left_base_y
+
+ // Limited improvement for 8x8. ~20% faster for 64x64.
+ const uint8x8_t all = vcreate_u8(0x0706050403020100);
+ const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
+ const uint8x8_t base_step = upsample_shift ? even : all;
+ const uint8x8_t right_step = vadd_u8(base_step, vdup_n_u8(1));
+
+ uint8_t* dst = dest;
+ uint8x8_t left_v[8], right_v[8], value_v[8];
+ const uint8_t* const left = left_column;
+
+ const int index_0 = base_left_y;
+ LoadStepwise(left + (index_0 >> scale_bits), base_step, right_step,
+ &left_v[0], &right_v[0]);
+ value_v[0] = WeightedBlend(left_v[0], right_v[0],
+ ((index_0 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_1 = base_left_y + ystep;
+ LoadStepwise(left + (index_1 >> scale_bits), base_step, right_step,
+ &left_v[1], &right_v[1]);
+ value_v[1] = WeightedBlend(left_v[1], right_v[1],
+ ((index_1 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_2 = base_left_y + ystep * 2;
+ LoadStepwise(left + (index_2 >> scale_bits), base_step, right_step,
+ &left_v[2], &right_v[2]);
+ value_v[2] = WeightedBlend(left_v[2], right_v[2],
+ ((index_2 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_3 = base_left_y + ystep * 3;
+ LoadStepwise(left + (index_3 >> scale_bits), base_step, right_step,
+ &left_v[3], &right_v[3]);
+ value_v[3] = WeightedBlend(left_v[3], right_v[3],
+ ((index_3 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_4 = base_left_y + ystep * 4;
+ LoadStepwise(left + (index_4 >> scale_bits), base_step, right_step,
+ &left_v[4], &right_v[4]);
+ value_v[4] = WeightedBlend(left_v[4], right_v[4],
+ ((index_4 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_5 = base_left_y + ystep * 5;
+ LoadStepwise(left + (index_5 >> scale_bits), base_step, right_step,
+ &left_v[5], &right_v[5]);
+ value_v[5] = WeightedBlend(left_v[5], right_v[5],
+ ((index_5 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_6 = base_left_y + ystep * 6;
+ LoadStepwise(left + (index_6 >> scale_bits), base_step, right_step,
+ &left_v[6], &right_v[6]);
+ value_v[6] = WeightedBlend(left_v[6], right_v[6],
+ ((index_6 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_7 = base_left_y + ystep * 7;
+ LoadStepwise(left + (index_7 >> scale_bits), base_step, right_step,
+ &left_v[7], &right_v[7]);
+ value_v[7] = WeightedBlend(left_v[7], right_v[7],
+ ((index_7 << upsample_shift) & 0x3F) >> 1);
+
+ // 8x8 transpose.
+ const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(value_v[0], value_v[4]),
+ vcombine_u8(value_v[1], value_v[5]));
+ const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(value_v[2], value_v[6]),
+ vcombine_u8(value_v[3], value_v[7]));
+
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+
+ const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c1.val[0]));
+ const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c1.val[1]));
+
+ if (width == 4) {
+ StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[0])));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[0])));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[0])));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[0])));
+ if (height == 4) return;
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[1])));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[1])));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[1])));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[1])));
+ } else {
+ vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[0])));
+ dst += stride;
+ vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[0])));
+ dst += stride;
+ vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[0])));
+ dst += stride;
+ vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[0])));
+ if (height == 4) return;
+ dst += stride;
+ vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[1])));
+ dst += stride;
+ vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[1])));
+ dst += stride;
+ vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[1])));
+ dst += stride;
+ vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[1])));
+ }
+}
+
+// Because the source values "move backwards" as the row index increases, the
+// indices derived from ystep are generally negative. This is accommodated by
+// making sure the relative indices are within [-15, 0] when the function is
+// called, and sliding them into the inclusive range [0, 15], relative to a
+// lower base address.
+constexpr int kPositiveIndexOffset = 15;
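+
+// Worked example (values assumed for illustration): with ystep == 128 the
+// per-lane offsets are {0, -2, -4, ..., -14}; adding kPositiveIndexOffset
+// gives table indices {15, 13, ..., 1}, so a single 16-byte load starting 15
+// bytes below the current row position covers every lane.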
+
+// Process 4 or 8 |width| by any |height|.
+template <int width>
+inline void DirectionalZone2FromLeftCol_WxH(uint8_t* dst,
+ const ptrdiff_t stride,
+ const int height,
+ const uint8_t* const left_column,
+ const int16x8_t left_y,
+ const int upsample_shift) {
+ assert(width == 4 || width == 8);
+
+ // The shift argument must be a constant.
+ int16x8_t offset_y, shift_upsampled = left_y;
+ if (upsample_shift) {
+ offset_y = vshrq_n_s16(left_y, 5);
+ shift_upsampled = vshlq_n_s16(shift_upsampled, 1);
+ } else {
+ offset_y = vshrq_n_s16(left_y, 6);
+ }
+
+ // Select values to the left of the starting point.
+ // The 15th element (and 16th) will be all the way at the end, to the right.
+ // With a negative ystep everything else will be "left" of them.
+ // This supports cumulative steps up to 15. We could support up to 16 by doing
+ // separate loads for |left_values| and |right_values|. vtbl supports 2 Q
+ // registers as input which would allow for cumulative offsets of 32.
+ const int16x8_t sampler =
+ vaddq_s16(offset_y, vdupq_n_s16(kPositiveIndexOffset));
+ const uint8x8_t left_values = vqmovun_s16(sampler);
+ const uint8x8_t right_values = vadd_u8(left_values, vdup_n_u8(1));
+
+ const int16x8_t shift_masked = vandq_s16(shift_upsampled, vdupq_n_s16(0x3f));
+ const uint8x8_t shift_mul = vreinterpret_u8_s8(vshrn_n_s16(shift_masked, 1));
+ const uint8x8_t inv_shift_mul = vsub_u8(vdup_n_u8(32), shift_mul);
+
+ int y = 0;
+ do {
+ uint8x8_t src_left, src_right;
+ LoadStepwise(left_column - kPositiveIndexOffset + (y << upsample_shift),
+ left_values, right_values, &src_left, &src_right);
+ const uint8x8_t val =
+ WeightedBlend(src_left, src_right, inv_shift_mul, shift_mul);
+
+ if (width == 4) {
+ StoreLo4(dst, val);
+ } else {
+ vst1_u8(dst, val);
+ }
+ dst += stride;
+ } while (++y < height);
+}
+
+// Process 4 or 8 |width| by any |height|.
+template <int width>
+inline void DirectionalZone1Blend_WxH(uint8_t* dest, const ptrdiff_t stride,
+ const int height,
+ const uint8_t* const top_row,
+ int zone_bounds, int top_x,
+ const int xstep,
+ const int upsample_shift) {
+ assert(width == 4 || width == 8);
+
+ const int scale_bits_x = 6 - upsample_shift;
+
+ const uint8x8_t all = vcreate_u8(0x0706050403020100);
+ const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
+ const uint8x8_t base_step = upsample_shift ? even : all;
+ const uint8x8_t right_step = vadd_u8(base_step, vdup_n_u8(1));
+
+ int y = 0;
+ do {
+ const uint8_t* const src = top_row + (top_x >> scale_bits_x);
+ uint8x8_t left, right;
+ LoadStepwise(src, base_step, right_step, &left, &right);
+
+ const uint8_t shift = ((top_x << upsample_shift) & 0x3f) >> 1;
+ const uint8x8_t val = WeightedBlend(left, right, shift);
+
+ uint8x8_t dst_blend = vld1_u8(dest);
+ // |zone_bounds| values can be negative.
+ uint8x8_t blend =
+ vcge_s8(vreinterpret_s8_u8(all), vdup_n_s8((zone_bounds >> 6)));
+ uint8x8_t output = vbsl_u8(blend, val, dst_blend);
+
+ if (width == 4) {
+ StoreLo4(dest, output);
+ } else {
+ vst1_u8(dest, output);
+ }
+ dest += stride;
+ zone_bounds += xstep;
+ top_x -= xstep;
+ } while (++y < height);
+}
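+
+// Worked example of the blend mask above (illustrative numbers): when
+// zone_bounds >> 6 == 3, lanes 0..2 keep the left-column prediction already
+// in |dest| and lanes 3..7 are overwritten with the top-row blend. A negative
+// |zone_bounds| selects the top-row blend for every lane.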
+
+// The height at which a load of 16 bytes will not contain enough source pixels
+// from |left_column| to supply an accurate row when computing 8 pixels at a
+// time. The values are found by inspection. By coincidence, all angles that
+// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
+// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
+constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
+ 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
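+
+// For illustration: ystep values in [128, 191] index entry 2 (== 16), so for
+// those angles any rows at y >= 16 fall back to the slower left-column path
+// instead of the shuffle approach.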
+
+// 7.11.2.4 (8) 90 < angle < 180
+// The strategy for these functions (4xH and 8+xH) is to know how many blocks
+// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
+// then handle only blocks that take from |left_ptr|. Additionally, a fast
+// index-shuffle approach is used for pred values from |left_column| in sections
+// that permit it.
+inline void DirectionalZone2_4xH(uint8_t* dst, const ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const uint8_t* const left_column,
+ const int height, const int xstep,
+ const int ystep, const bool upsampled_top,
+ const bool upsampled_left) {
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+ // Helper vector.
+ const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  // Loop incrementers for moving by block (4xN). The vertical loop still steps
+  // by 8; if the height is only 4 it finishes in the first iteration.
+ const ptrdiff_t stride8 = stride << 3;
+ const int xstep8 = xstep << 3;
+
+ const int min_height = (height == 4) ? 4 : 8;
+
+ // All columns from |min_top_only_x| to the right will only need |top_row| to
+ // compute and can therefore call the Zone1 functions. This assumes |xstep| is
+ // at least 3.
+ assert(xstep >= 3);
+ const int min_top_only_x = std::min((height * xstep) >> 6, /* width */ 4);
+
+ // For steep angles, the source pixels from |left_column| may not fit in a
+ // 16-byte load for shuffling.
+ // TODO(petersonab): Find a more precise formula for this subject to x.
+ // TODO(johannkoenig): Revisit this for |width| == 4.
+ const int max_shuffle_height =
+ std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);
+
+  // Offsets the original zone bound value to simplify the test
+  // x < (y + 1) * xstep / 64 - 1.
+ int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+
+ // If the 64 scaling is regarded as a decimal point, the first value of the
+ // left_y vector omits the portion which is covered under the left_column
+ // offset. The following values need the full ystep as a relative offset.
+ int16x8_t left_y = vmulq_n_s16(zero_to_seven, -ystep);
+ left_y = vaddq_s16(left_y, vdupq_n_s16(-ystep_remainder));
+
+ // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
+ if (min_top_only_x > 0) {
+ // Round down to the nearest multiple of 8.
+ // TODO(johannkoenig): This never hits for Wx4 blocks but maybe it should.
+ const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7;
+ DirectionalZone1_WxH<4>(dst, stride, max_top_only_y, top_row, -xstep,
+ upsampled_top);
+
+ if (max_top_only_y == height) return;
+
+ int y = max_top_only_y;
+ dst += stride * y;
+ const int xstep_y = xstep * y;
+
+ // All rows from |min_left_only_y| down for this set of columns only need
+ // |left_column| to compute.
+ const int min_left_only_y = std::min((4 << 6) / xstep, height);
+ // At high angles such that min_left_only_y < 8, ystep is low and xstep is
+ // high. This means that max_shuffle_height is unbounded and xstep_bounds
+ // will overflow in 16 bits. This is prevented by stopping the first
+ // blending loop at min_left_only_y for such cases, which means we skip over
+ // the second blending loop as well.
+ const int left_shuffle_stop_y =
+ std::min(max_shuffle_height, min_left_only_y);
+ int xstep_bounds = xstep_bounds_base + xstep_y;
+ int top_x = -xstep - xstep_y;
+
+ // +8 increment is OK because if height is 4 this only goes once.
+ for (; y < left_shuffle_stop_y;
+ y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+ DirectionalZone2FromLeftCol_WxH<4>(
+ dst, stride, min_height,
+ left_column + ((y - left_base_increment) << upsample_left_shift),
+ left_y, upsample_left_shift);
+
+ DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row,
+ xstep_bounds, top_x, xstep,
+ upsample_top_shift);
+ }
+
+ // Pick up from the last y-value, using the slower but secure method for
+ // left prediction.
+ const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
+ for (; y < min_left_only_y;
+ y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+ DirectionalZone3_WxH<4>(
+ dst, stride, min_height,
+ left_column + ((y - left_base_increment) << upsample_left_shift),
+ base_left_y, -ystep, upsample_left_shift);
+
+ DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row,
+ xstep_bounds, top_x, xstep,
+ upsample_top_shift);
+ }
+ // Loop over y for left_only rows.
+ for (; y < height; y += 8, dst += stride8) {
+ DirectionalZone3_WxH<4>(
+ dst, stride, min_height,
+ left_column + ((y - left_base_increment) << upsample_left_shift),
+ base_left_y, -ystep, upsample_left_shift);
+ }
+ } else {
+ DirectionalZone1_WxH<4>(dst, stride, height, top_row, -xstep,
+ upsampled_top);
+ }
+}
+
+// Process a multiple of 8 |width|.
+inline void DirectionalZone2_8(uint8_t* const dst, const ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const uint8_t* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep,
+ const bool upsampled_top,
+ const bool upsampled_left) {
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+ // Helper vector.
+ const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  // Loop incrementers for moving by block (8x8). This function handles blocks
+  // with height 4 as well; those are computed in a single pass, so these
+  // variables go unused in that case.
+ const ptrdiff_t stride8 = stride << 3;
+ const int xstep8 = xstep << 3;
+ const int ystep8 = ystep << 3;
+
+ // Process Wx4 blocks.
+ const int min_height = (height == 4) ? 4 : 8;
+
+ // All columns from |min_top_only_x| to the right will only need |top_row| to
+ // compute and can therefore call the Zone1 functions. This assumes |xstep| is
+ // at least 3.
+ assert(xstep >= 3);
+ const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+ // For steep angles, the source pixels from |left_column| may not fit in a
+ // 16-byte load for shuffling.
+ // TODO(petersonab): Find a more precise formula for this subject to x.
+ const int max_shuffle_height =
+ std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);
+
+  // Offsets the original zone bound value to simplify the test
+  // x < (y + 1) * xstep / 64 - 1.
+ int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+
+ const int left_base_increment8 = ystep8 >> 6;
+ const int ystep_remainder8 = ystep8 & 0x3F;
+ const int16x8_t increment_left8 = vdupq_n_s16(ystep_remainder8);
+
+ // If the 64 scaling is regarded as a decimal point, the first value of the
+ // left_y vector omits the portion which is covered under the left_column
+  // offset. The following values need the full ystep as a relative offset.
+ int16x8_t left_y = vmulq_n_s16(zero_to_seven, -ystep);
+ left_y = vaddq_s16(left_y, vdupq_n_s16(-ystep_remainder));
+
+  // This loop treats each set of 8 columns in 3 stages with y-value boundaries.
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
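+  // Illustrative numbers (assumed for this example): with x == 0, xstep == 32
+  // and height == 32, max_top_only_y is 0 and min_left_only_y is 16, so rows
+  // 0..15 blend top and left sources and rows 16..31 come from |left_column|
+  // alone.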
+ int x = 0;
+ for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
+ xstep_bounds_base -= (8 << 6),
+ left_y = vsubq_s16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ uint8_t* dst_x = dst + x;
+
+ // Round down to the nearest multiple of 8.
+ const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
+ DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y,
+ top_row + (x << upsample_top_shift), -xstep,
+ upsampled_top);
+
+ if (max_top_only_y == height) continue;
+
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+
+ // All rows from |min_left_only_y| down for this set of columns only need
+ // |left_column| to compute.
+ const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
+ // At high angles such that min_left_only_y < 8, ystep is low and xstep is
+ // high. This means that max_shuffle_height is unbounded and xstep_bounds
+ // will overflow in 16 bits. This is prevented by stopping the first
+ // blending loop at min_left_only_y for such cases, which means we skip over
+ // the second blending loop as well.
+ const int left_shuffle_stop_y =
+ std::min(max_shuffle_height, min_left_only_y);
+ int xstep_bounds = xstep_bounds_base + xstep_y;
+ int top_x = -xstep - xstep_y;
+
+ for (; y < left_shuffle_stop_y;
+ y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+ DirectionalZone2FromLeftCol_WxH<8>(
+ dst_x, stride, min_height,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y,
+ upsample_left_shift);
+
+ DirectionalZone1Blend_WxH<8>(
+ dst_x, stride, min_height, top_row + (x << upsample_top_shift),
+ xstep_bounds, top_x, xstep, upsample_top_shift);
+ }
+
+ // Pick up from the last y-value, using the slower but secure method for
+ // left prediction.
+ const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
+ for (; y < min_left_only_y;
+ y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+ DirectionalZone3_WxH<8>(
+ dst_x, stride, min_height,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep, upsample_left_shift);
+
+ DirectionalZone1Blend_WxH<8>(
+ dst_x, stride, min_height, top_row + (x << upsample_top_shift),
+ xstep_bounds, top_x, xstep, upsample_top_shift);
+ }
+ // Loop over y for left_only rows.
+ for (; y < height; y += 8, dst_x += stride8) {
+ DirectionalZone3_WxH<8>(
+ dst_x, stride, min_height,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep, upsample_left_shift);
+ }
+ }
+ // TODO(johannkoenig): May be able to remove this branch.
+ if (x < width) {
+ DirectionalZone1_WxH(dst + x, stride, width - x, height,
+ top_row + (x << upsample_top_shift), -xstep,
+ upsampled_top);
+ }
+}
+
+void DirectionalIntraPredictorZone2_NEON(
+ void* const dest, const ptrdiff_t stride, const void* const top_row,
+ const void* const left_column, const int width, const int height,
+ const int xstep, const int ystep, const bool upsampled_top,
+ const bool upsampled_left) {
+ // Increasing the negative buffer for this function allows more rows to be
+ // processed at a time without branching in an inner loop to check the base.
+ uint8_t top_buffer[288];
+ uint8_t left_buffer[288];
+ memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
+ memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
+ const uint8_t* top_ptr = top_buffer + 144;
+ const uint8_t* left_ptr = left_buffer + 144;
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ if (width == 4) {
+ DirectionalZone2_4xH(dst, stride, top_ptr, left_ptr, height, xstep, ystep,
+ upsampled_top, upsampled_left);
+ } else {
+ DirectionalZone2_8(dst, stride, top_ptr, left_ptr, width, height, xstep,
+ ystep, upsampled_top, upsampled_left);
+ }
+}
+
+void DirectionalIntraPredictorZone3_NEON(void* const dest,
+ const ptrdiff_t stride,
+ const void* const left_column,
+ const int width, const int height,
+ const int ystep,
+ const bool upsampled_left) {
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+
+ assert(ystep > 0);
+
+ const int upsample_shift = static_cast<int>(upsampled_left);
+ const int scale_bits = 6 - upsample_shift;
+ const int base_step = 1 << upsample_shift;
+
+ if (width == 4 || height == 4) {
+ // This block can handle all sizes but the specializations for other sizes
+ // are faster.
+ const uint8x8_t all = vcreate_u8(0x0706050403020100);
+ const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
+ const uint8x8_t base_step_v = upsampled_left ? even : all;
+ const uint8x8_t right_step = vadd_u8(base_step_v, vdup_n_u8(1));
+
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+ dst += y * stride + x;
+ uint8x8_t left_v[4], right_v[4], value_v[4];
+ const int ystep_base = ystep * x;
+ const int offset = y * base_step;
+
+ const int index_0 = ystep_base + ystep * 1;
+ LoadStepwise(left + offset + (index_0 >> scale_bits), base_step_v,
+ right_step, &left_v[0], &right_v[0]);
+ value_v[0] = WeightedBlend(left_v[0], right_v[0],
+ ((index_0 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_1 = ystep_base + ystep * 2;
+ LoadStepwise(left + offset + (index_1 >> scale_bits), base_step_v,
+ right_step, &left_v[1], &right_v[1]);
+ value_v[1] = WeightedBlend(left_v[1], right_v[1],
+ ((index_1 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_2 = ystep_base + ystep * 3;
+ LoadStepwise(left + offset + (index_2 >> scale_bits), base_step_v,
+ right_step, &left_v[2], &right_v[2]);
+ value_v[2] = WeightedBlend(left_v[2], right_v[2],
+ ((index_2 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_3 = ystep_base + ystep * 4;
+ LoadStepwise(left + offset + (index_3 >> scale_bits), base_step_v,
+ right_step, &left_v[3], &right_v[3]);
+ value_v[3] = WeightedBlend(left_v[3], right_v[3],
+ ((index_3 << upsample_shift) & 0x3F) >> 1);
+
+ // 8x4 transpose.
+ const uint8x8x2_t b0 = vtrn_u8(value_v[0], value_v[1]);
+ const uint8x8x2_t b1 = vtrn_u8(value_v[2], value_v[3]);
+
+ const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u8(b0.val[0]),
+ vreinterpret_u16_u8(b1.val[0]));
+ const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u8(b0.val[1]),
+ vreinterpret_u16_u8(b1.val[1]));
+
+ StoreLo4(dst, vreinterpret_u8_u16(c0.val[0]));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u16(c1.val[0]));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u16(c0.val[1]));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u16(c1.val[1]));
+
+ if (height > 4) {
+ dst += stride;
+ StoreHi4(dst, vreinterpret_u8_u16(c0.val[0]));
+ dst += stride;
+ StoreHi4(dst, vreinterpret_u8_u16(c1.val[0]));
+ dst += stride;
+ StoreHi4(dst, vreinterpret_u8_u16(c0.val[1]));
+ dst += stride;
+ StoreHi4(dst, vreinterpret_u8_u16(c1.val[1]));
+ }
+ x += 4;
+ } while (x < width);
+ y += 8;
+ } while (y < height);
+ } else { // 8x8 at a time.
+ // Limited improvement for 8x8. ~20% faster for 64x64.
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+ dst += y * stride + x;
+ const int ystep_base = ystep * (x + 1);
+
+ DirectionalZone3_WxH<8>(dst, stride, 8, left + (y << upsample_shift),
+ ystep_base, ystep, upsample_shift);
+ x += 8;
+ } while (x < width);
+ y += 8;
+ } while (y < height);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
+ dsp->directional_intra_predictor_zone2 = DirectionalIntraPredictorZone2_NEON;
+ dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void IntraPredDirectionalInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredDirectionalInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/intrapred_filter_intra_neon.cc b/src/dsp/arm/intrapred_filter_intra_neon.cc
new file mode 100644
index 0000000..411708e
--- /dev/null
+++ b/src/dsp/arm/intrapred_filter_intra_neon.cc
@@ -0,0 +1,176 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+
+namespace low_bitdepth {
+namespace {
+
+// Transpose kFilterIntraTaps and convert the first row to unsigned values.
+//
+// With the original orientation we could multiply all the input values by a
+// single tap, but that required gathering the inputs into one vector, which
+// needs expensive setup operations (shifts, vext, vtbl). All the elements of
+// the result then had to be summed (easy on A64 - vaddvq_s16), with the
+// shifting, rounding, and clamping done in GP registers.
+//
+// Switching to unsigned values allows multiplying the 8 bit inputs directly.
+// If any tap value were negative the inputs would first need vmovl_u8 so that
+// the results kept the proper sign.
+//
+// This is accounted for when summing the values by subtracting the product of
+// the first row, whose taps are stored as the magnitudes of negative values.
+alignas(8) constexpr uint8_t kTransposedTaps[kNumFilterIntraPredictors][7][8] =
+ {{{6, 5, 3, 3, 4, 3, 3, 3}, // Original values are negative.
+ {10, 2, 1, 1, 6, 2, 2, 1},
+ {0, 10, 1, 1, 0, 6, 2, 2},
+ {0, 0, 10, 2, 0, 0, 6, 2},
+ {0, 0, 0, 10, 0, 0, 0, 6},
+ {12, 9, 7, 5, 2, 2, 2, 3},
+ {0, 0, 0, 0, 12, 9, 7, 5}},
+ {{10, 6, 4, 2, 10, 6, 4, 2}, // Original values are negative.
+ {16, 0, 0, 0, 16, 0, 0, 0},
+ {0, 16, 0, 0, 0, 16, 0, 0},
+ {0, 0, 16, 0, 0, 0, 16, 0},
+ {0, 0, 0, 16, 0, 0, 0, 16},
+ {10, 6, 4, 2, 0, 0, 0, 0},
+ {0, 0, 0, 0, 10, 6, 4, 2}},
+ {{8, 8, 8, 8, 4, 4, 4, 4}, // Original values are negative.
+ {8, 0, 0, 0, 4, 0, 0, 0},
+ {0, 8, 0, 0, 0, 4, 0, 0},
+ {0, 0, 8, 0, 0, 0, 4, 0},
+ {0, 0, 0, 8, 0, 0, 0, 4},
+ {16, 16, 16, 16, 0, 0, 0, 0},
+ {0, 0, 0, 0, 16, 16, 16, 16}},
+ {{2, 1, 1, 0, 1, 1, 1, 1}, // Original values are negative.
+ {8, 3, 2, 1, 4, 3, 2, 2},
+ {0, 8, 3, 2, 0, 4, 3, 2},
+ {0, 0, 8, 3, 0, 0, 4, 3},
+ {0, 0, 0, 8, 0, 0, 0, 4},
+ {10, 6, 4, 2, 3, 4, 4, 3},
+ {0, 0, 0, 0, 10, 6, 4, 3}},
+ {{12, 10, 9, 8, 10, 9, 8, 7}, // Original values are negative.
+ {14, 0, 0, 0, 12, 1, 0, 0},
+ {0, 14, 0, 0, 0, 12, 0, 0},
+ {0, 0, 14, 0, 0, 0, 12, 1},
+ {0, 0, 0, 14, 0, 0, 0, 12},
+ {14, 12, 11, 10, 0, 0, 1, 1},
+ {0, 0, 0, 0, 14, 12, 11, 9}}};
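+
+// Scalar sketch of the computation above for a single output lane. This is an
+// illustrative assumption written for documentation; it is not called by the
+// NEON path. Row 0 of |taps| stores the magnitudes of the negative top-left
+// taps, so its product is subtracted before rounding.
+inline uint8_t FilterIntraPixelScalar(const uint8_t taps[7][8], const int lane,
+                                      const uint8_t top_left,
+                                      const uint8_t top[4],
+                                      const uint8_t left[2]) {
+  int sum = 0;
+  for (int i = 0; i < 4; ++i) sum += taps[i + 1][lane] * top[i];
+  for (int i = 0; i < 2; ++i) sum += taps[i + 5][lane] * left[i];
+  sum -= taps[0][lane] * top_left;
+  // Assumes an arithmetic shift for negative sums, matching vrshrq_n_s16().
+  return static_cast<uint8_t>(Clip3((sum + 8) >> 4, 0, 255));
+}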
+
+void FilterIntraPredictor_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column,
+ FilterIntraPredictor pred, int width,
+ int height) {
+ const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+ const uint8_t* const left = static_cast<const uint8_t*>(left_column);
+
+ assert(width <= 32 && height <= 32);
+
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ uint8x8_t transposed_taps[7];
+ for (int i = 0; i < 7; ++i) {
+ transposed_taps[i] = vld1_u8(kTransposedTaps[pred][i]);
+ }
+
+ uint8_t relative_top_left = top[-1];
+ const uint8_t* relative_top = top;
+ uint8_t relative_left[2] = {left[0], left[1]};
+
+ int y = 0;
+ do {
+ uint8_t* row_dst = dst;
+ int x = 0;
+ do {
+ uint16x8_t sum = vdupq_n_u16(0);
+ const uint16x8_t subtrahend =
+ vmull_u8(transposed_taps[0], vdup_n_u8(relative_top_left));
+ for (int i = 1; i < 5; ++i) {
+ sum = vmlal_u8(sum, transposed_taps[i], vdup_n_u8(relative_top[i - 1]));
+ }
+ for (int i = 5; i < 7; ++i) {
+ sum =
+ vmlal_u8(sum, transposed_taps[i], vdup_n_u8(relative_left[i - 5]));
+ }
+
+ const int16x8_t sum_signed =
+ vreinterpretq_s16_u16(vsubq_u16(sum, subtrahend));
+ const int16x8_t sum_shifted = vrshrq_n_s16(sum_signed, 4);
+
+ uint8x8_t sum_saturated = vqmovun_s16(sum_shifted);
+
+ StoreLo4(row_dst, sum_saturated);
+ StoreHi4(row_dst + stride, sum_saturated);
+
+      // Progress across the row.
+ relative_top_left = relative_top[3];
+ relative_top += 4;
+ relative_left[0] = row_dst[3];
+ relative_left[1] = row_dst[3 + stride];
+ row_dst += 4;
+ x += 4;
+ } while (x < width);
+
+ // Progress down.
+ relative_top_left = left[y + 1];
+ relative_top = dst + stride;
+ relative_left[0] = left[y + 2];
+ relative_left[1] = left[y + 3];
+
+ dst += 2 * stride;
+ y += 2;
+ } while (y < height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->filter_intra_predictor = FilterIntraPredictor_NEON;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void IntraPredFilterIntraInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredFilterIntraInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/intrapred_neon.cc b/src/dsp/arm/intrapred_neon.cc
new file mode 100644
index 0000000..c967d82
--- /dev/null
+++ b/src/dsp/arm/intrapred_neon.cc
@@ -0,0 +1,1144 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// DcPredFuncs_NEON
+
+using DcSumFunc = uint32x2_t (*)(const void* ref_0, const int ref_0_size_log2,
+ const bool use_ref_1, const void* ref_1,
+ const int ref_1_size_log2);
+using DcStoreFunc = void (*)(void* dest, ptrdiff_t stride, const uint32x2_t dc);
+
+// DC intra-predictors for square and rectangular blocks.
+template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
+ DcStoreFunc storefn>
+struct DcPredFuncs_NEON {
+ DcPredFuncs_NEON() = delete;
+
+ static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+};
+
+template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
+ DcStoreFunc storefn>
+void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn,
+ storefn>::DcTop(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* /*left_column*/) {
+ const uint32x2_t sum = sumfn(top_row, block_width_log2, false, nullptr, 0);
+ const uint32x2_t dc = vrshr_n_u32(sum, block_width_log2);
+ storefn(dest, stride, dc);
+}
+
+template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
+ DcStoreFunc storefn>
+void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn,
+ storefn>::DcLeft(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const uint32x2_t sum =
+ sumfn(left_column, block_height_log2, false, nullptr, 0);
+ const uint32x2_t dc = vrshr_n_u32(sum, block_height_log2);
+ storefn(dest, stride, dc);
+}
+
+template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
+ DcStoreFunc storefn>
+void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::Dc(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* const left_column) {
+ const uint32x2_t sum =
+ sumfn(top_row, block_width_log2, true, left_column, block_height_log2);
+ if (block_width_log2 == block_height_log2) {
+ const uint32x2_t dc = vrshr_n_u32(sum, block_width_log2 + 1);
+ storefn(dest, stride, dc);
+ } else {
+ // TODO(johannkoenig): Compare this to mul/shift in vectors.
+ const int divisor = (1 << block_width_log2) + (1 << block_height_log2);
+ uint32_t dc = vget_lane_u32(sum, 0);
+ dc += divisor >> 1;
+ dc /= divisor;
+ storefn(dest, stride, vdup_n_u32(dc));
+ }
+}
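+
+// Worked example of the rectangular path above (illustrative sizes): for a
+// 4x8 block the divisor is 4 + 8 = 12, so dc = (sum + 6) / 12, i.e. a rounded
+// integer division rather than the single shift used for square blocks.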
+
+// Sum all the elements in the vector into the low 32 bits.
+inline uint32x2_t Sum(const uint16x4_t val) {
+ const uint32x2_t sum = vpaddl_u16(val);
+ return vpadd_u32(sum, sum);
+}
+
+// Sum all the elements in the vector into the low 32 bits.
+inline uint32x2_t Sum(const uint16x8_t val) {
+ const uint32x4_t sum_0 = vpaddlq_u16(val);
+ const uint64x2_t sum_1 = vpaddlq_u32(sum_0);
+ return vadd_u32(vget_low_u32(vreinterpretq_u32_u64(sum_1)),
+ vget_high_u32(vreinterpretq_u32_u64(sum_1)));
+}
+
+} // namespace
+
+//------------------------------------------------------------------------------
+namespace low_bitdepth {
+namespace {
+
+// Add and expand the elements in the |val_[01]| to uint16_t but do not sum the
+// entire vector.
+inline uint16x8_t Add(const uint8x16_t val_0, const uint8x16_t val_1) {
+ const uint16x8_t sum_0 = vpaddlq_u8(val_0);
+ const uint16x8_t sum_1 = vpaddlq_u8(val_1);
+ return vaddq_u16(sum_0, sum_1);
+}
+
+// Add and expand the elements in the |val_[0123]| to uint16_t but do not sum
+// the entire vector.
+inline uint16x8_t Add(const uint8x16_t val_0, const uint8x16_t val_1,
+ const uint8x16_t val_2, const uint8x16_t val_3) {
+ const uint16x8_t sum_0 = Add(val_0, val_1);
+ const uint16x8_t sum_1 = Add(val_2, val_3);
+ return vaddq_u16(sum_0, sum_1);
+}
+
+// Load and combine 32 uint8_t values.
+inline uint16x8_t LoadAndAdd32(const uint8_t* buf) {
+ const uint8x16_t val_0 = vld1q_u8(buf);
+ const uint8x16_t val_1 = vld1q_u8(buf + 16);
+ return Add(val_0, val_1);
+}
+
+// Load and combine 64 uint8_t values.
+inline uint16x8_t LoadAndAdd64(const uint8_t* buf) {
+ const uint8x16_t val_0 = vld1q_u8(buf);
+ const uint8x16_t val_1 = vld1q_u8(buf + 16);
+ const uint8x16_t val_2 = vld1q_u8(buf + 32);
+ const uint8x16_t val_3 = vld1q_u8(buf + 48);
+ return Add(val_0, val_1, val_2, val_3);
+}
+
+// |ref_[01]| each point to 1 << |ref_[01]_size_log2| packed uint8_t values.
+// If |use_ref_1| is false then only sum |ref_0|.
+// For |ref_[01]_size_log2| == 2 this relies on |ref_[01]| being aligned to
+// uint32_t.
+inline uint32x2_t DcSum_NEON(const void* ref_0, const int ref_0_size_log2,
+ const bool use_ref_1, const void* ref_1,
+ const int ref_1_size_log2) {
+ const auto* const ref_0_u8 = static_cast<const uint8_t*>(ref_0);
+ const auto* const ref_1_u8 = static_cast<const uint8_t*>(ref_1);
+ if (ref_0_size_log2 == 2) {
+ uint8x8_t val = Load4(ref_0_u8);
+ if (use_ref_1) {
+ if (ref_1_size_log2 == 2) { // 4x4
+ val = Load4<1>(ref_1_u8, val);
+ return Sum(vpaddl_u8(val));
+ } else if (ref_1_size_log2 == 3) { // 4x8
+ const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+ const uint16x4_t sum_0 = vpaddl_u8(val);
+ const uint16x4_t sum_1 = vpaddl_u8(val_1);
+ return Sum(vadd_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 4) { // 4x16
+ const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+ return Sum(vaddw_u8(vpaddlq_u8(val_1), val));
+ }
+ }
+ // 4x1
+ const uint16x4_t sum = vpaddl_u8(val);
+ return vpaddl_u16(sum);
+ } else if (ref_0_size_log2 == 3) {
+ const uint8x8_t val_0 = vld1_u8(ref_0_u8);
+ if (use_ref_1) {
+ if (ref_1_size_log2 == 2) { // 8x4
+ const uint8x8_t val_1 = Load4(ref_1_u8);
+ const uint16x4_t sum_0 = vpaddl_u8(val_0);
+ const uint16x4_t sum_1 = vpaddl_u8(val_1);
+ return Sum(vadd_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 3) { // 8x8
+ const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+ const uint16x4_t sum_0 = vpaddl_u8(val_0);
+ const uint16x4_t sum_1 = vpaddl_u8(val_1);
+ return Sum(vadd_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 4) { // 8x16
+ const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+ return Sum(vaddw_u8(vpaddlq_u8(val_1), val_0));
+ } else if (ref_1_size_log2 == 5) { // 8x32
+ return Sum(vaddw_u8(LoadAndAdd32(ref_1_u8), val_0));
+ }
+ }
+ // 8x1
+ return Sum(vpaddl_u8(val_0));
+ } else if (ref_0_size_log2 == 4) {
+ const uint8x16_t val_0 = vld1q_u8(ref_0_u8);
+ if (use_ref_1) {
+ if (ref_1_size_log2 == 2) { // 16x4
+ const uint8x8_t val_1 = Load4(ref_1_u8);
+ return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
+ } else if (ref_1_size_log2 == 3) { // 16x8
+ const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+ return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
+ } else if (ref_1_size_log2 == 4) { // 16x16
+ const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+ return Sum(Add(val_0, val_1));
+ } else if (ref_1_size_log2 == 5) { // 16x32
+ const uint16x8_t sum_0 = vpaddlq_u8(val_0);
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 6) { // 16x64
+ const uint16x8_t sum_0 = vpaddlq_u8(val_0);
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ }
+ // 16x1
+ return Sum(vpaddlq_u8(val_0));
+ } else if (ref_0_size_log2 == 5) {
+ const uint16x8_t sum_0 = LoadAndAdd32(ref_0_u8);
+ if (use_ref_1) {
+ if (ref_1_size_log2 == 3) { // 32x8
+ const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+ return Sum(vaddw_u8(sum_0, val_1));
+ } else if (ref_1_size_log2 == 4) { // 32x16
+ const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+ const uint16x8_t sum_1 = vpaddlq_u8(val_1);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 5) { // 32x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 6) { // 32x64
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ }
+ // 32x1
+ return Sum(sum_0);
+ }
+
+ assert(ref_0_size_log2 == 6);
+ const uint16x8_t sum_0 = LoadAndAdd64(ref_0_u8);
+ if (use_ref_1) {
+ if (ref_1_size_log2 == 4) { // 64x16
+ const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+ const uint16x8_t sum_1 = vpaddlq_u8(val_1);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 5) { // 64x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 6) { // 64x64
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ }
+ // 64x1
+ return Sum(sum_0);
+}
+
+template <int width, int height>
+inline void DcStore_NEON(void* const dest, ptrdiff_t stride,
+ const uint32x2_t dc) {
+ const uint8x16_t dc_dup = vdupq_lane_u8(vreinterpret_u8_u32(dc), 0);
+ auto* dst = static_cast<uint8_t*>(dest);
+ if (width == 4) {
+ int i = height - 1;
+ do {
+ StoreLo4(dst, vget_low_u8(dc_dup));
+ dst += stride;
+ } while (--i != 0);
+ StoreLo4(dst, vget_low_u8(dc_dup));
+ } else if (width == 8) {
+ int i = height - 1;
+ do {
+ vst1_u8(dst, vget_low_u8(dc_dup));
+ dst += stride;
+ } while (--i != 0);
+ vst1_u8(dst, vget_low_u8(dc_dup));
+ } else if (width == 16) {
+ int i = height - 1;
+ do {
+ vst1q_u8(dst, dc_dup);
+ dst += stride;
+ } while (--i != 0);
+ vst1q_u8(dst, dc_dup);
+ } else if (width == 32) {
+ int i = height - 1;
+ do {
+ vst1q_u8(dst, dc_dup);
+ vst1q_u8(dst + 16, dc_dup);
+ dst += stride;
+ } while (--i != 0);
+ vst1q_u8(dst, dc_dup);
+ vst1q_u8(dst + 16, dc_dup);
+ } else {
+ assert(width == 64);
+ int i = height - 1;
+ do {
+ vst1q_u8(dst, dc_dup);
+ vst1q_u8(dst + 16, dc_dup);
+ vst1q_u8(dst + 32, dc_dup);
+ vst1q_u8(dst + 48, dc_dup);
+ dst += stride;
+ } while (--i != 0);
+ vst1q_u8(dst, dc_dup);
+ vst1q_u8(dst + 16, dc_dup);
+ vst1q_u8(dst + 32, dc_dup);
+ vst1q_u8(dst + 48, dc_dup);
+ }
+}
+
+template <int width, int height>
+inline void Paeth4Or8xN_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ auto* dest_u8 = static_cast<uint8_t*>(dest);
+ const auto* const top_row_u8 = static_cast<const uint8_t*>(top_row);
+ const auto* const left_col_u8 = static_cast<const uint8_t*>(left_column);
+
+ const uint8x8_t top_left = vdup_n_u8(top_row_u8[-1]);
+ const uint16x8_t top_left_x2 = vdupq_n_u16(top_row_u8[-1] + top_row_u8[-1]);
+ uint8x8_t top;
+ if (width == 4) {
+ top = Load4(top_row_u8);
+ } else { // width == 8
+ top = vld1_u8(top_row_u8);
+ }
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t left = vdup_n_u8(left_col_u8[y]);
+
+ const uint8x8_t left_dist = vabd_u8(top, top_left);
+ const uint8x8_t top_dist = vabd_u8(left, top_left);
+ const uint16x8_t top_left_dist =
+ vabdq_u16(vaddl_u8(top, left), top_left_x2);
+
+ const uint8x8_t left_le_top = vcle_u8(left_dist, top_dist);
+ const uint8x8_t left_le_top_left =
+ vmovn_u16(vcleq_u16(vmovl_u8(left_dist), top_left_dist));
+ const uint8x8_t top_le_top_left =
+ vmovn_u16(vcleq_u16(vmovl_u8(top_dist), top_left_dist));
+
+ // if (left_dist <= top_dist && left_dist <= top_left_dist)
+ const uint8x8_t left_mask = vand_u8(left_le_top, left_le_top_left);
+ // dest[x] = left_column[y];
+ // Fill all the unused spaces with 'top'. They will be overwritten when
+ // the positions for top_left are known.
+ uint8x8_t result = vbsl_u8(left_mask, left, top);
+ // else if (top_dist <= top_left_dist)
+ // dest[x] = top_row[x];
+ // Add these values to the mask. They were already set.
+ const uint8x8_t left_or_top_mask = vorr_u8(left_mask, top_le_top_left);
+ // else
+ // dest[x] = top_left;
+ result = vbsl_u8(left_or_top_mask, result, top_left);
+
+ if (width == 4) {
+ StoreLo4(dest_u8, result);
+ } else { // width == 8
+ vst1_u8(dest_u8, result);
+ }
+ dest_u8 += stride;
+ }
+}
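+
+// Scalar form of the selection above, recorded as an illustrative reference
+// (an assumption for documentation; the NEON code applies the same rule to a
+// full vector of pixels at once).
+inline int PaethAbsDiff(const int a, const int b) {
+  return (a > b) ? a - b : b - a;
+}
+
+inline uint8_t PaethScalar(const uint8_t left, const uint8_t top,
+                           const uint8_t top_left) {
+  const int left_dist = PaethAbsDiff(top, top_left);
+  const int top_dist = PaethAbsDiff(left, top_left);
+  const int top_left_dist = PaethAbsDiff(top + left, 2 * top_left);
+  if (left_dist <= top_dist && left_dist <= top_left_dist) return left;
+  if (top_dist <= top_left_dist) return top;
+  return top_left;
+}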
+
+// Calculate X distance <= TopLeft distance and pack the resulting masks into
+// a uint8x16_t.
+inline uint8x16_t XLeTopLeft(const uint8x16_t x_dist,
+ const uint16x8_t top_left_dist_low,
+ const uint16x8_t top_left_dist_high) {
+ // TODO(johannkoenig): cle() should work with vmovn(top_left_dist) instead of
+ // using movl(x_dist).
+ const uint8x8_t x_le_top_left_low =
+ vmovn_u16(vcleq_u16(vmovl_u8(vget_low_u8(x_dist)), top_left_dist_low));
+ const uint8x8_t x_le_top_left_high =
+ vmovn_u16(vcleq_u16(vmovl_u8(vget_high_u8(x_dist)), top_left_dist_high));
+ return vcombine_u8(x_le_top_left_low, x_le_top_left_high);
+}
+
+// Select the closest values and collect them.
+inline uint8x16_t SelectPaeth(const uint8x16_t top, const uint8x16_t left,
+ const uint8x16_t top_left,
+ const uint8x16_t left_le_top,
+ const uint8x16_t left_le_top_left,
+ const uint8x16_t top_le_top_left) {
+ // if (left_dist <= top_dist && left_dist <= top_left_dist)
+ const uint8x16_t left_mask = vandq_u8(left_le_top, left_le_top_left);
+ // dest[x] = left_column[y];
+ // Fill all the unused spaces with 'top'. They will be overwritten when
+ // the positions for top_left are known.
+ uint8x16_t result = vbslq_u8(left_mask, left, top);
+ // else if (top_dist <= top_left_dist)
+ // dest[x] = top_row[x];
+ // Add these values to the mask. They were already set.
+ const uint8x16_t left_or_top_mask = vorrq_u8(left_mask, top_le_top_left);
+ // else
+ // dest[x] = top_left;
+ return vbslq_u8(left_or_top_mask, result, top_left);
+}
+
+// Generate numbered and high/low versions of top_left_dist.
+#define TOP_LEFT_DIST(num) \
+ const uint16x8_t top_left_##num##_dist_low = vabdq_u16( \
+ vaddl_u8(vget_low_u8(top[num]), vget_low_u8(left)), top_left_x2); \
+ const uint16x8_t top_left_##num##_dist_high = vabdq_u16( \
+ vaddl_u8(vget_high_u8(top[num]), vget_low_u8(left)), top_left_x2)
+
+// Generate numbered versions of XLeTopLeft with x = left.
+#define LEFT_LE_TOP_LEFT(num) \
+ const uint8x16_t left_le_top_left_##num = \
+ XLeTopLeft(left_##num##_dist, top_left_##num##_dist_low, \
+ top_left_##num##_dist_high)
+
+// Generate numbered versions of XLeTopLeft with x = top.
+#define TOP_LE_TOP_LEFT(num) \
+ const uint8x16_t top_le_top_left_##num = XLeTopLeft( \
+ top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high)
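+
+// For reference, TOP_LEFT_DIST(0) expands to
+//   const uint16x8_t top_left_0_dist_low = vabdq_u16(
+//       vaddl_u8(vget_low_u8(top[0]), vget_low_u8(left)), top_left_x2);
+//   const uint16x8_t top_left_0_dist_high = vabdq_u16(
+//       vaddl_u8(vget_high_u8(top[0]), vget_low_u8(left)), top_left_x2);
+// and LEFT_LE_TOP_LEFT(0) / TOP_LE_TOP_LEFT(0) expand to the matching
+// XLeTopLeft() calls that produce left_le_top_left_0 and top_le_top_left_0.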
+
+template <int width, int height>
+inline void Paeth16PlusxN_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ auto* dest_u8 = static_cast<uint8_t*>(dest);
+ const auto* const top_row_u8 = static_cast<const uint8_t*>(top_row);
+ const auto* const left_col_u8 = static_cast<const uint8_t*>(left_column);
+
+ const uint8x16_t top_left = vdupq_n_u8(top_row_u8[-1]);
+ const uint16x8_t top_left_x2 = vdupq_n_u16(top_row_u8[-1] + top_row_u8[-1]);
+ uint8x16_t top[4];
+ top[0] = vld1q_u8(top_row_u8);
+ if (width > 16) {
+ top[1] = vld1q_u8(top_row_u8 + 16);
+ if (width == 64) {
+ top[2] = vld1q_u8(top_row_u8 + 32);
+ top[3] = vld1q_u8(top_row_u8 + 48);
+ }
+ }
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x16_t left = vdupq_n_u8(left_col_u8[y]);
+
+ const uint8x16_t top_dist = vabdq_u8(left, top_left);
+
+ const uint8x16_t left_0_dist = vabdq_u8(top[0], top_left);
+ TOP_LEFT_DIST(0);
+ const uint8x16_t left_0_le_top = vcleq_u8(left_0_dist, top_dist);
+ LEFT_LE_TOP_LEFT(0);
+ TOP_LE_TOP_LEFT(0);
+
+ const uint8x16_t result_0 =
+ SelectPaeth(top[0], left, top_left, left_0_le_top, left_le_top_left_0,
+ top_le_top_left_0);
+ vst1q_u8(dest_u8, result_0);
+
+ if (width > 16) {
+ const uint8x16_t left_1_dist = vabdq_u8(top[1], top_left);
+ TOP_LEFT_DIST(1);
+ const uint8x16_t left_1_le_top = vcleq_u8(left_1_dist, top_dist);
+ LEFT_LE_TOP_LEFT(1);
+ TOP_LE_TOP_LEFT(1);
+
+ const uint8x16_t result_1 =
+ SelectPaeth(top[1], left, top_left, left_1_le_top, left_le_top_left_1,
+ top_le_top_left_1);
+ vst1q_u8(dest_u8 + 16, result_1);
+
+ if (width == 64) {
+ const uint8x16_t left_2_dist = vabdq_u8(top[2], top_left);
+ TOP_LEFT_DIST(2);
+ const uint8x16_t left_2_le_top = vcleq_u8(left_2_dist, top_dist);
+ LEFT_LE_TOP_LEFT(2);
+ TOP_LE_TOP_LEFT(2);
+
+ const uint8x16_t result_2 =
+ SelectPaeth(top[2], left, top_left, left_2_le_top,
+ left_le_top_left_2, top_le_top_left_2);
+ vst1q_u8(dest_u8 + 32, result_2);
+
+ const uint8x16_t left_3_dist = vabdq_u8(top[3], top_left);
+ TOP_LEFT_DIST(3);
+ const uint8x16_t left_3_le_top = vcleq_u8(left_3_dist, top_dist);
+ LEFT_LE_TOP_LEFT(3);
+ TOP_LE_TOP_LEFT(3);
+
+ const uint8x16_t result_3 =
+ SelectPaeth(top[3], left, top_left, left_3_le_top,
+ left_le_top_left_3, top_le_top_left_3);
+ vst1q_u8(dest_u8 + 48, result_3);
+ }
+ }
+
+ dest_u8 += stride;
+ }
+}
+
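+// Maps each supported block size to a DcPredFuncs_NEON instantiation. The
+// first two template arguments are log2(width) and log2(height) of the block
+// (e.g. _8x32 uses <3, 5, ...>), matching the size_log2 values consumed by
+// DcSum_NEON.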
+struct DcDefs {
+ DcDefs() = delete;
+
+ using _4x4 = DcPredFuncs_NEON<2, 2, DcSum_NEON, DcStore_NEON<4, 4>>;
+ using _4x8 = DcPredFuncs_NEON<2, 3, DcSum_NEON, DcStore_NEON<4, 8>>;
+ using _4x16 = DcPredFuncs_NEON<2, 4, DcSum_NEON, DcStore_NEON<4, 16>>;
+ using _8x4 = DcPredFuncs_NEON<3, 2, DcSum_NEON, DcStore_NEON<8, 4>>;
+ using _8x8 = DcPredFuncs_NEON<3, 3, DcSum_NEON, DcStore_NEON<8, 8>>;
+ using _8x16 = DcPredFuncs_NEON<3, 4, DcSum_NEON, DcStore_NEON<8, 16>>;
+ using _8x32 = DcPredFuncs_NEON<3, 5, DcSum_NEON, DcStore_NEON<8, 32>>;
+ using _16x4 = DcPredFuncs_NEON<4, 2, DcSum_NEON, DcStore_NEON<16, 4>>;
+ using _16x8 = DcPredFuncs_NEON<4, 3, DcSum_NEON, DcStore_NEON<16, 8>>;
+ using _16x16 = DcPredFuncs_NEON<4, 4, DcSum_NEON, DcStore_NEON<16, 16>>;
+ using _16x32 = DcPredFuncs_NEON<4, 5, DcSum_NEON, DcStore_NEON<16, 32>>;
+ using _16x64 = DcPredFuncs_NEON<4, 6, DcSum_NEON, DcStore_NEON<16, 64>>;
+ using _32x8 = DcPredFuncs_NEON<5, 3, DcSum_NEON, DcStore_NEON<32, 8>>;
+ using _32x16 = DcPredFuncs_NEON<5, 4, DcSum_NEON, DcStore_NEON<32, 16>>;
+ using _32x32 = DcPredFuncs_NEON<5, 5, DcSum_NEON, DcStore_NEON<32, 32>>;
+ using _32x64 = DcPredFuncs_NEON<5, 6, DcSum_NEON, DcStore_NEON<32, 64>>;
+ using _64x16 = DcPredFuncs_NEON<6, 4, DcSum_NEON, DcStore_NEON<64, 16>>;
+ using _64x32 = DcPredFuncs_NEON<6, 5, DcSum_NEON, DcStore_NEON<64, 32>>;
+ using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
+};
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ // 4x4
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ DcDefs::_4x4::DcTop;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ DcDefs::_4x4::DcLeft;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+ DcDefs::_4x4::Dc;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+ Paeth4Or8xN_NEON<4, 4>;
+
+ // 4x8
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+ DcDefs::_4x8::DcTop;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+ DcDefs::_4x8::DcLeft;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+ DcDefs::_4x8::Dc;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+ Paeth4Or8xN_NEON<4, 8>;
+
+ // 4x16
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+ DcDefs::_4x16::DcTop;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+ DcDefs::_4x16::DcLeft;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+ DcDefs::_4x16::Dc;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+ Paeth4Or8xN_NEON<4, 16>;
+
+ // 8x4
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+ DcDefs::_8x4::DcTop;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+ DcDefs::_8x4::DcLeft;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+ DcDefs::_8x4::Dc;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+ Paeth4Or8xN_NEON<8, 4>;
+
+ // 8x8
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+ DcDefs::_8x8::DcTop;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+ DcDefs::_8x8::DcLeft;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+ DcDefs::_8x8::Dc;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+ Paeth4Or8xN_NEON<8, 8>;
+
+ // 8x16
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+ DcDefs::_8x16::DcTop;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+ DcDefs::_8x16::DcLeft;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+ DcDefs::_8x16::Dc;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+ Paeth4Or8xN_NEON<8, 16>;
+
+ // 8x32
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+ DcDefs::_8x32::DcTop;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+ DcDefs::_8x32::DcLeft;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+ DcDefs::_8x32::Dc;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+ Paeth4Or8xN_NEON<8, 32>;
+
+ // 16x4
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+ DcDefs::_16x4::DcTop;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+ DcDefs::_16x4::DcLeft;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+ DcDefs::_16x4::Dc;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<16, 4>;
+
+ // 16x8
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+ DcDefs::_16x8::DcTop;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+ DcDefs::_16x8::DcLeft;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+ DcDefs::_16x8::Dc;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<16, 8>;
+
+ // 16x16
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+ DcDefs::_16x16::DcTop;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+ DcDefs::_16x16::DcLeft;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+ DcDefs::_16x16::Dc;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<16, 16>;
+
+ // 16x32
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+ DcDefs::_16x32::DcTop;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+ DcDefs::_16x32::DcLeft;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+ DcDefs::_16x32::Dc;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<16, 32>;
+
+ // 16x64
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+ DcDefs::_16x64::DcTop;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+ DcDefs::_16x64::DcLeft;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+ DcDefs::_16x64::Dc;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<16, 64>;
+
+ // 32x8
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+ DcDefs::_32x8::DcTop;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+ DcDefs::_32x8::DcLeft;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+ DcDefs::_32x8::Dc;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<32, 8>;
+
+ // 32x16
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+ DcDefs::_32x16::DcTop;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+ DcDefs::_32x16::DcLeft;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+ DcDefs::_32x16::Dc;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<32, 16>;
+
+ // 32x32
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+ DcDefs::_32x32::DcTop;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+ DcDefs::_32x32::DcLeft;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+ DcDefs::_32x32::Dc;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<32, 32>;
+
+ // 32x64
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+ DcDefs::_32x64::DcTop;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+ DcDefs::_32x64::DcLeft;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+ DcDefs::_32x64::Dc;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<32, 64>;
+
+ // 64x16
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+ DcDefs::_64x16::DcTop;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+ DcDefs::_64x16::DcLeft;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+ DcDefs::_64x16::Dc;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<64, 16>;
+
+ // 64x32
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+ DcDefs::_64x32::DcTop;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+ DcDefs::_64x32::DcLeft;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+ DcDefs::_64x32::Dc;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<64, 32>;
+
+ // 64x64
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+ DcDefs::_64x64::DcTop;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+ DcDefs::_64x64::DcLeft;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+ DcDefs::_64x64::Dc;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<64, 64>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Add the four given vectors together lane by lane; the result is not
+// reduced to a single sum.
+inline uint16x8_t Add(const uint16x8_t val_0, const uint16x8_t val_1,
+ const uint16x8_t val_2, const uint16x8_t val_3) {
+ const uint16x8_t sum_0 = vaddq_u16(val_0, val_1);
+ const uint16x8_t sum_1 = vaddq_u16(val_2, val_3);
+ return vaddq_u16(sum_0, sum_1);
+}
+
+// Load and combine 16 uint16_t values.
+inline uint16x8_t LoadAndAdd16(const uint16_t* buf) {
+ const uint16x8_t val_0 = vld1q_u16(buf);
+ const uint16x8_t val_1 = vld1q_u16(buf + 8);
+ return vaddq_u16(val_0, val_1);
+}
+
+// Load and combine 32 uint16_t values.
+inline uint16x8_t LoadAndAdd32(const uint16_t* buf) {
+ const uint16x8_t val_0 = vld1q_u16(buf);
+ const uint16x8_t val_1 = vld1q_u16(buf + 8);
+ const uint16x8_t val_2 = vld1q_u16(buf + 16);
+ const uint16x8_t val_3 = vld1q_u16(buf + 24);
+ return Add(val_0, val_1, val_2, val_3);
+}
+
+// Load and combine 64 uint16_t values.
+inline uint16x8_t LoadAndAdd64(const uint16_t* buf) {
+ const uint16x8_t val_0 = vld1q_u16(buf);
+ const uint16x8_t val_1 = vld1q_u16(buf + 8);
+ const uint16x8_t val_2 = vld1q_u16(buf + 16);
+ const uint16x8_t val_3 = vld1q_u16(buf + 24);
+ const uint16x8_t val_4 = vld1q_u16(buf + 32);
+ const uint16x8_t val_5 = vld1q_u16(buf + 40);
+ const uint16x8_t val_6 = vld1q_u16(buf + 48);
+ const uint16x8_t val_7 = vld1q_u16(buf + 56);
+ const uint16x8_t sum_0 = Add(val_0, val_1, val_2, val_3);
+ const uint16x8_t sum_1 = Add(val_4, val_5, val_6, val_7);
+ return vaddq_u16(sum_0, sum_1);
+}
+
+// |ref_0| and |ref_1| each point to 1 << |ref_[01]_size_log2| packed uint16_t
+// values. If |use_ref_1| is false, only |ref_0| is summed.
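+// Only the raw sum is returned; the shared DcPredFuncs_NEON wrapper (defined
+// earlier in this file) is assumed to average it over the width + height
+// contributing samples to produce the DC value.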
+inline uint32x2_t DcSum_NEON(const void* ref_0, const int ref_0_size_log2,
+ const bool use_ref_1, const void* ref_1,
+ const int ref_1_size_log2) {
+ const auto* ref_0_u16 = static_cast<const uint16_t*>(ref_0);
+ const auto* ref_1_u16 = static_cast<const uint16_t*>(ref_1);
+ if (ref_0_size_log2 == 2) {
+ const uint16x4_t val_0 = vld1_u16(ref_0_u16);
+ if (use_ref_1) {
+ if (ref_1_size_log2 == 2) { // 4x4
+ const uint16x4_t val_1 = vld1_u16(ref_1_u16);
+ return Sum(vadd_u16(val_0, val_1));
+ } else if (ref_1_size_log2 == 3) { // 4x8
+ const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
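+ // Zero-pad the 4-lane |val_0| to 8 lanes so it can be added to the
+ // 8-lane |val_1|; the zero lanes do not affect the final sum.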
+ const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0);
+ return Sum(vaddq_u16(sum_0, val_1));
+ } else if (ref_1_size_log2 == 4) { // 4x16
+ const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0);
+ const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ }
+ // 4x1
+ return Sum(val_0);
+ } else if (ref_0_size_log2 == 3) {
+ const uint16x8_t val_0 = vld1q_u16(ref_0_u16);
+ if (use_ref_1) {
+ if (ref_1_size_log2 == 2) { // 8x4
+ const uint16x4_t val_1 = vld1_u16(ref_1_u16);
+ const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1);
+ return Sum(vaddq_u16(val_0, sum_1));
+ } else if (ref_1_size_log2 == 3) { // 8x8
+ const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
+ return Sum(vaddq_u16(val_0, val_1));
+ } else if (ref_1_size_log2 == 4) { // 8x16
+ const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+ return Sum(vaddq_u16(val_0, sum_1));
+ } else if (ref_1_size_log2 == 5) { // 8x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+ return Sum(vaddq_u16(val_0, sum_1));
+ }
+ }
+ // 8x1
+ return Sum(val_0);
+ } else if (ref_0_size_log2 == 4) {
+ const uint16x8_t sum_0 = LoadAndAdd16(ref_0_u16);
+ if (use_ref_1) {
+ if (ref_1_size_log2 == 2) { // 16x4
+ const uint16x4_t val_1 = vld1_u16(ref_1_u16);
+ const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 3) { // 16x8
+ const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, val_1));
+ } else if (ref_1_size_log2 == 4) { // 16x16
+ const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 5) { // 16x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 6) { // 16x64
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ }
+ // 16x1
+ return Sum(sum_0);
+ } else if (ref_0_size_log2 == 5) {
+ const uint16x8_t sum_0 = LoadAndAdd32(ref_0_u16);
+ if (use_ref_1) {
+ if (ref_1_size_log2 == 3) { // 32x8
+ const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, val_1));
+ } else if (ref_1_size_log2 == 4) { // 32x16
+ const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 5) { // 32x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 6) { // 32x64
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ }
+ // 32x1
+ return Sum(sum_0);
+ }
+
+ assert(ref_0_size_log2 == 6);
+ const uint16x8_t sum_0 = LoadAndAdd64(ref_0_u16);
+ if (use_ref_1) {
+ if (ref_1_size_log2 == 4) { // 64x16
+ const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 5) { // 64x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 6) { // 64x64
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ }
+ // 64x1
+ return Sum(sum_0);
+}
+
+template <int width, int height>
+inline void DcStore_NEON(void* const dest, ptrdiff_t stride,
+ const uint32x2_t dc) {
+ auto* dest_u16 = static_cast<uint16_t*>(dest);
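+ // |stride| is given in bytes; convert it to uint16_t units.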
+ ptrdiff_t stride_u16 = stride >> 1;
+ const uint16x8_t dc_dup = vdupq_lane_u16(vreinterpret_u16_u32(dc), 0);
+ if (width == 4) {
+ int i = height - 1;
+ do {
+ vst1_u16(dest_u16, vget_low_u16(dc_dup));
+ dest_u16 += stride_u16;
+ } while (--i != 0);
+ vst1_u16(dest_u16, vget_low_u16(dc_dup));
+ } else if (width == 8) {
+ int i = height - 1;
+ do {
+ vst1q_u16(dest_u16, dc_dup);
+ dest_u16 += stride_u16;
+ } while (--i != 0);
+ vst1q_u16(dest_u16, dc_dup);
+ } else if (width == 16) {
+ int i = height - 1;
+ do {
+ vst1q_u16(dest_u16, dc_dup);
+ vst1q_u16(dest_u16 + 8, dc_dup);
+ dest_u16 += stride_u16;
+ } while (--i != 0);
+ vst1q_u16(dest_u16, dc_dup);
+ vst1q_u16(dest_u16 + 8, dc_dup);
+ } else if (width == 32) {
+ int i = height - 1;
+ do {
+ vst1q_u16(dest_u16, dc_dup);
+ vst1q_u16(dest_u16 + 8, dc_dup);
+ vst1q_u16(dest_u16 + 16, dc_dup);
+ vst1q_u16(dest_u16 + 24, dc_dup);
+ dest_u16 += stride_u16;
+ } while (--i != 0);
+ vst1q_u16(dest_u16, dc_dup);
+ vst1q_u16(dest_u16 + 8, dc_dup);
+ vst1q_u16(dest_u16 + 16, dc_dup);
+ vst1q_u16(dest_u16 + 24, dc_dup);
+ } else {
+ assert(width == 64);
+ int i = height - 1;
+ do {
+ vst1q_u16(dest_u16, dc_dup);
+ vst1q_u16(dest_u16 + 8, dc_dup);
+ vst1q_u16(dest_u16 + 16, dc_dup);
+ vst1q_u16(dest_u16 + 24, dc_dup);
+ vst1q_u16(dest_u16 + 32, dc_dup);
+ vst1q_u16(dest_u16 + 40, dc_dup);
+ vst1q_u16(dest_u16 + 48, dc_dup);
+ vst1q_u16(dest_u16 + 56, dc_dup);
+ dest_u16 += stride_u16;
+ } while (--i != 0);
+ vst1q_u16(dest_u16, dc_dup);
+ vst1q_u16(dest_u16 + 8, dc_dup);
+ vst1q_u16(dest_u16 + 16, dc_dup);
+ vst1q_u16(dest_u16 + 24, dc_dup);
+ vst1q_u16(dest_u16 + 32, dc_dup);
+ vst1q_u16(dest_u16 + 40, dc_dup);
+ vst1q_u16(dest_u16 + 48, dc_dup);
+ vst1q_u16(dest_u16 + 56, dc_dup);
+ }
+}
+
+struct DcDefs {
+ DcDefs() = delete;
+
+ using _4x4 = DcPredFuncs_NEON<2, 2, DcSum_NEON, DcStore_NEON<4, 4>>;
+ using _4x8 = DcPredFuncs_NEON<2, 3, DcSum_NEON, DcStore_NEON<4, 8>>;
+ using _4x16 = DcPredFuncs_NEON<2, 4, DcSum_NEON, DcStore_NEON<4, 16>>;
+ using _8x4 = DcPredFuncs_NEON<3, 2, DcSum_NEON, DcStore_NEON<8, 4>>;
+ using _8x8 = DcPredFuncs_NEON<3, 3, DcSum_NEON, DcStore_NEON<8, 8>>;
+ using _8x16 = DcPredFuncs_NEON<3, 4, DcSum_NEON, DcStore_NEON<8, 16>>;
+ using _8x32 = DcPredFuncs_NEON<3, 5, DcSum_NEON, DcStore_NEON<8, 32>>;
+ using _16x4 = DcPredFuncs_NEON<4, 2, DcSum_NEON, DcStore_NEON<16, 4>>;
+ using _16x8 = DcPredFuncs_NEON<4, 3, DcSum_NEON, DcStore_NEON<16, 8>>;
+ using _16x16 = DcPredFuncs_NEON<4, 4, DcSum_NEON, DcStore_NEON<16, 16>>;
+ using _16x32 = DcPredFuncs_NEON<4, 5, DcSum_NEON, DcStore_NEON<16, 32>>;
+ using _16x64 = DcPredFuncs_NEON<4, 6, DcSum_NEON, DcStore_NEON<16, 64>>;
+ using _32x8 = DcPredFuncs_NEON<5, 3, DcSum_NEON, DcStore_NEON<32, 8>>;
+ using _32x16 = DcPredFuncs_NEON<5, 4, DcSum_NEON, DcStore_NEON<32, 16>>;
+ using _32x32 = DcPredFuncs_NEON<5, 5, DcSum_NEON, DcStore_NEON<32, 32>>;
+ using _32x64 = DcPredFuncs_NEON<5, 6, DcSum_NEON, DcStore_NEON<32, 64>>;
+ using _64x16 = DcPredFuncs_NEON<6, 4, DcSum_NEON, DcStore_NEON<64, 16>>;
+ using _64x32 = DcPredFuncs_NEON<6, 5, DcSum_NEON, DcStore_NEON<64, 32>>;
+ using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
+};
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
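+ // 4x4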
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ DcDefs::_4x4::DcTop;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ DcDefs::_4x4::DcLeft;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+ DcDefs::_4x4::Dc;
+
+ // 4x8
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+ DcDefs::_4x8::DcTop;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+ DcDefs::_4x8::DcLeft;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+ DcDefs::_4x8::Dc;
+
+ // 4x16
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+ DcDefs::_4x16::DcTop;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+ DcDefs::_4x16::DcLeft;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+ DcDefs::_4x16::Dc;
+
+ // 8x4
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+ DcDefs::_8x4::DcTop;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+ DcDefs::_8x4::DcLeft;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+ DcDefs::_8x4::Dc;
+
+ // 8x8
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+ DcDefs::_8x8::DcTop;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+ DcDefs::_8x8::DcLeft;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+ DcDefs::_8x8::Dc;
+
+ // 8x16
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+ DcDefs::_8x16::DcTop;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+ DcDefs::_8x16::DcLeft;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+ DcDefs::_8x16::Dc;
+
+ // 8x32
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+ DcDefs::_8x32::DcTop;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+ DcDefs::_8x32::DcLeft;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+ DcDefs::_8x32::Dc;
+
+ // 16x4
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+ DcDefs::_16x4::DcTop;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+ DcDefs::_16x4::DcLeft;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+ DcDefs::_16x4::Dc;
+
+ // 16x8
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+ DcDefs::_16x8::DcTop;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+ DcDefs::_16x8::DcLeft;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+ DcDefs::_16x8::Dc;
+
+ // 16x16
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+ DcDefs::_16x16::DcTop;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+ DcDefs::_16x16::DcLeft;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+ DcDefs::_16x16::Dc;
+
+ // 16x32
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+ DcDefs::_16x32::DcTop;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+ DcDefs::_16x32::DcLeft;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+ DcDefs::_16x32::Dc;
+
+ // 16x64
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+ DcDefs::_16x64::DcTop;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+ DcDefs::_16x64::DcLeft;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+ DcDefs::_16x64::Dc;
+
+ // 32x8
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+ DcDefs::_32x8::DcTop;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+ DcDefs::_32x8::DcLeft;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+ DcDefs::_32x8::Dc;
+
+ // 32x16
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+ DcDefs::_32x16::DcTop;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+ DcDefs::_32x16::DcLeft;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+ DcDefs::_32x16::Dc;
+
+ // 32x32
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+ DcDefs::_32x32::DcTop;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+ DcDefs::_32x32::DcLeft;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+ DcDefs::_32x32::Dc;
+
+ // 32x64
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+ DcDefs::_32x64::DcTop;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+ DcDefs::_32x64::DcLeft;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+ DcDefs::_32x64::Dc;
+
+ // 64x16
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+ DcDefs::_64x16::DcTop;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+ DcDefs::_64x16::DcLeft;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+ DcDefs::_64x16::Dc;
+
+ // 64x32
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+ DcDefs::_64x32::DcTop;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+ DcDefs::_64x32::DcLeft;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+ DcDefs::_64x32::Dc;
+
+ // 64x64
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+ DcDefs::_64x64::DcTop;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+ DcDefs::_64x64::DcLeft;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+ DcDefs::_64x64::Dc;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/intrapred_neon.h b/src/dsp/arm/intrapred_neon.h
new file mode 100644
index 0000000..16f858c
--- /dev/null
+++ b/src/dsp/arm/intrapred_neon.h
@@ -0,0 +1,418 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*,
+// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and
+// Dsp::filter_intra_predictor. See the defines below for specifics. These
+// functions are not thread-safe.
+void IntraPredCflInit_NEON();
+void IntraPredDirectionalInit_NEON();
+void IntraPredFilterIntraInit_NEON();
+void IntraPredInit_NEON();
+void IntraPredSmoothInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
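+// Each define below records that the named Dsp entry (bitdepth, transform
+// size, predictor) has a NEON implementation by mapping it to
+// LIBGAV1_CPU_NEON; the base dsp headers are assumed to consult these when
+// deciding whether a C fallback is still required.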
+// 8 bit
+#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
+
+// 4x4
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x64
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x64
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+// 64x16
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+// 64x32
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+// 64x64
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+// 10 bit
+// 4x4
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 16x64
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 32x64
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 64x16
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 64x32
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 64x64
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_
diff --git a/src/dsp/arm/intrapred_smooth_neon.cc b/src/dsp/arm/intrapred_smooth_neon.cc
new file mode 100644
index 0000000..abc93e8
--- /dev/null
+++ b/src/dsp/arm/intrapred_smooth_neon.cc
@@ -0,0 +1,616 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+
+namespace libgav1 {
+namespace dsp {
+
+namespace low_bitdepth {
+namespace {
+
+// Note these constants are duplicated from intrapred.cc to give the compiler
+// visibility of the values. This helps reduce loads and simplifies creating
+// the inverse weights.
+constexpr uint8_t kSmoothWeights[] = {
+ // block dimension = 4
+ 255, 149, 85, 64,
+ // block dimension = 8
+ 255, 197, 146, 105, 73, 50, 37, 32,
+ // block dimension = 16
+ 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+ // block dimension = 32
+ 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+ 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+ // block dimension = 64
+ 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+ 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
+ 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
+ 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4};
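+
+// The weights for block dimension d start at offset d - 4 in this table
+// (offsets 0, 4, 12, 28 and 60 for d = 4, 8, 16, 32 and 64), which is why
+// the functions below index it as kSmoothWeights + width - 4 and
+// kSmoothWeights + height - 4.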
+
+// TODO(b/150459137): Keeping the intermediate values in uint16_t would allow
+// processing more values at once. At the high end, it could do 4x4 or 8x2 at a
+// time.
+inline uint16x4_t CalculatePred(const uint16x4_t weighted_top,
+ const uint16x4_t weighted_left,
+ const uint16x4_t weighted_bl,
+ const uint16x4_t weighted_tr) {
+ const uint32x4_t pred_0 = vaddl_u16(weighted_top, weighted_left);
+ const uint32x4_t pred_1 = vaddl_u16(weighted_bl, weighted_tr);
+ const uint32x4_t pred_2 = vaddq_u32(pred_0, pred_1);
+ return vrshrn_n_u32(pred_2, kSmoothWeightScale + 1);
+}
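+
+// Together with the products formed by its callers, CalculatePred() yields,
+// per pixel (assuming kSmoothWeightScale == 8, i.e. weights in units of
+// 1/256):
+//   pred = RightShiftWithRounding(w_y * top + w_x * left +
+//                                 (256 - w_x) * top_right +
+//                                 (256 - w_y) * bottom_left,
+//                                 kSmoothWeightScale + 1)
+// The two weight pairs each sum to 256, hence the extra bit in the shift.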
+
+template <int width, int height>
+inline void Smooth4Or8xN_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+ const uint8_t* const left = static_cast<const uint8_t*>(left_column);
+ const uint8_t top_right = top[width - 1];
+ const uint8_t bottom_left = left[height - 1];
+ const uint8_t* const weights_y = kSmoothWeights + height - 4;
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ uint8x8_t top_v;
+ if (width == 4) {
+ top_v = Load4(top);
+ } else { // width == 8
+ top_v = vld1_u8(top);
+ }
+ const uint8x8_t top_right_v = vdup_n_u8(top_right);
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+ // Over-reads for 4xN but still within the array.
+ const uint8x8_t weights_x_v = vld1_u8(kSmoothWeights + width - 4);
+ // 256 - weights = vneg_s8(weights): negation is performed modulo 256, so
+ // -w is the same 8-bit value as 256 - w for the nonzero weights used here.
+ const uint8x8_t scaled_weights_x =
+ vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v)));
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t left_v = vdup_n_u8(left[y]);
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+ const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+ const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+
+ const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);
+ const uint16x8_t weighted_left = vmull_u8(weights_x_v, left_v);
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+ const uint16x4_t dest_0 =
+ CalculatePred(vget_low_u16(weighted_top), vget_low_u16(weighted_left),
+ vget_low_u16(weighted_tr), vget_low_u16(weighted_bl));
+
+ if (width == 4) {
+ StoreLo4(dst, vmovn_u16(vcombine_u16(dest_0, dest_0)));
+ } else { // width == 8
+ const uint16x4_t dest_1 = CalculatePred(
+ vget_high_u16(weighted_top), vget_high_u16(weighted_left),
+ vget_high_u16(weighted_tr), vget_high_u16(weighted_bl));
+ vst1_u8(dst, vmovn_u16(vcombine_u16(dest_0, dest_1)));
+ }
+ dst += stride;
+ }
+}
+
+inline uint8x16_t CalculateWeightsAndPred(
+ const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
+ const uint8x8_t weights_y, const uint8x16_t weights_x,
+ const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
+ const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top));
+ const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
+ const uint16x8_t weighted_tr_low =
+ vmull_u8(vget_low_u8(scaled_weights_x), top_right);
+ const uint16x4_t dest_0 = CalculatePred(
+ vget_low_u16(weighted_top_low), vget_low_u16(weighted_left_low),
+ vget_low_u16(weighted_tr_low), vget_low_u16(weighted_bl));
+ const uint16x4_t dest_1 = CalculatePred(
+ vget_high_u16(weighted_top_low), vget_high_u16(weighted_left_low),
+ vget_high_u16(weighted_tr_low), vget_high_u16(weighted_bl));
+ const uint8x8_t dest_0_u8 = vmovn_u16(vcombine_u16(dest_0, dest_1));
+
+ const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top));
+ const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
+ const uint16x8_t weighted_tr_high =
+ vmull_u8(vget_high_u8(scaled_weights_x), top_right);
+ const uint16x4_t dest_2 = CalculatePred(
+ vget_low_u16(weighted_top_high), vget_low_u16(weighted_left_high),
+ vget_low_u16(weighted_tr_high), vget_low_u16(weighted_bl));
+ const uint16x4_t dest_3 = CalculatePred(
+ vget_high_u16(weighted_top_high), vget_high_u16(weighted_left_high),
+ vget_high_u16(weighted_tr_high), vget_high_u16(weighted_bl));
+ const uint8x8_t dest_1_u8 = vmovn_u16(vcombine_u16(dest_2, dest_3));
+
+ return vcombine_u8(dest_0_u8, dest_1_u8);
+}
+
+template <int width, int height>
+inline void Smooth16PlusxN_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+ const uint8_t* const left = static_cast<const uint8_t*>(left_column);
+ const uint8_t top_right = top[width - 1];
+ const uint8_t bottom_left = left[height - 1];
+ const uint8_t* const weights_y = kSmoothWeights + height - 4;
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ uint8x16_t top_v[4];
+ top_v[0] = vld1q_u8(top);
+ if (width > 16) {
+ top_v[1] = vld1q_u8(top + 16);
+ if (width == 64) {
+ top_v[2] = vld1q_u8(top + 32);
+ top_v[3] = vld1q_u8(top + 48);
+ }
+ }
+
+ const uint8x8_t top_right_v = vdup_n_u8(top_right);
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+
+ // TODO(johannkoenig): Consider re-reading top_v and weights_x_v in the loop.
+ // The performance slope is currently similar to Paeth, so this does not
+ // appear to be register-bound on arm64.
+ uint8x16_t weights_x_v[4];
+ weights_x_v[0] = vld1q_u8(kSmoothWeights + width - 4);
+ if (width > 16) {
+ weights_x_v[1] = vld1q_u8(kSmoothWeights + width + 16 - 4);
+ if (width == 64) {
+ weights_x_v[2] = vld1q_u8(kSmoothWeights + width + 32 - 4);
+ weights_x_v[3] = vld1q_u8(kSmoothWeights + width + 48 - 4);
+ }
+ }
+
+ uint8x16_t scaled_weights_x[4];
+ scaled_weights_x[0] =
+ vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[0])));
+ if (width > 16) {
+ scaled_weights_x[1] =
+ vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[1])));
+ if (width == 64) {
+ scaled_weights_x[2] =
+ vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[2])));
+ scaled_weights_x[3] =
+ vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[3])));
+ }
+ }
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t left_v = vdup_n_u8(left[y]);
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+ const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+ const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+
+ vst1q_u8(dst, CalculateWeightsAndPred(top_v[0], left_v, top_right_v,
+ weights_y_v, weights_x_v[0],
+ scaled_weights_x[0], weighted_bl));
+
+ if (width > 16) {
+ vst1q_u8(dst + 16, CalculateWeightsAndPred(
+ top_v[1], left_v, top_right_v, weights_y_v,
+ weights_x_v[1], scaled_weights_x[1], weighted_bl));
+ if (width == 64) {
+ vst1q_u8(dst + 32,
+ CalculateWeightsAndPred(top_v[2], left_v, top_right_v,
+ weights_y_v, weights_x_v[2],
+ scaled_weights_x[2], weighted_bl));
+ vst1q_u8(dst + 48,
+ CalculateWeightsAndPred(top_v[3], left_v, top_right_v,
+ weights_y_v, weights_x_v[3],
+ scaled_weights_x[3], weighted_bl));
+ }
+ }
+
+ dst += stride;
+ }
+}
+
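+// SMOOTH_V predictor for 4- and 8-wide blocks: each row blends the top row
+// with the bottom-left corner pixel using that row's vertical weight and its
+// complement.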
+template <int width, int height>
+inline void SmoothVertical4Or8xN_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+ const uint8_t* const left = static_cast<const uint8_t*>(left_column);
+ const uint8_t bottom_left = left[height - 1];
+ const uint8_t* const weights_y = kSmoothWeights + height - 4;
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ uint8x8_t top_v;
+ if (width == 4) {
+ top_v = Load4(top);
+ } else { // width == 8
+ top_v = vld1_u8(top);
+ }
+
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+ const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+
+ const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);
+ const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+ const uint16x8_t pred = vaddq_u16(weighted_top, weighted_bl);
+ const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale);
+
+ if (width == 4) {
+ StoreLo4(dst, pred_scaled);
+ } else { // width == 8
+ vst1_u8(dst, pred_scaled);
+ }
+ dst += stride;
+ }
+}
+
+inline uint8x16_t CalculateVerticalWeightsAndPred(
+ const uint8x16_t top, const uint8x8_t weights_y,
+ const uint16x8_t weighted_bl) {
+ const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top));
+ const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top));
+ const uint16x8_t pred_low = vaddq_u16(weighted_top_low, weighted_bl);
+ const uint16x8_t pred_high = vaddq_u16(weighted_top_high, weighted_bl);
+ const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale);
+ const uint8x8_t pred_scaled_high =
+ vrshrn_n_u16(pred_high, kSmoothWeightScale);
+ return vcombine_u8(pred_scaled_low, pred_scaled_high);
+}
+
+template <int width, int height>
+inline void SmoothVertical16PlusxN_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+ const uint8_t* const left = static_cast<const uint8_t*>(left_column);
+ const uint8_t bottom_left = left[height - 1];
+ const uint8_t* const weights_y = kSmoothWeights + height - 4;
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ uint8x16_t top_v[4];
+ top_v[0] = vld1q_u8(top);
+ if (width > 16) {
+ top_v[1] = vld1q_u8(top + 16);
+ if (width == 64) {
+ top_v[2] = vld1q_u8(top + 32);
+ top_v[3] = vld1q_u8(top + 48);
+ }
+ }
+
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+ const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+ const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+
+ const uint8x16_t pred_0 =
+ CalculateVerticalWeightsAndPred(top_v[0], weights_y_v, weighted_bl);
+ vst1q_u8(dst, pred_0);
+
+ if (width > 16) {
+ const uint8x16_t pred_1 =
+ CalculateVerticalWeightsAndPred(top_v[1], weights_y_v, weighted_bl);
+ vst1q_u8(dst + 16, pred_1);
+
+ if (width == 64) {
+ const uint8x16_t pred_2 =
+ CalculateVerticalWeightsAndPred(top_v[2], weights_y_v, weighted_bl);
+ vst1q_u8(dst + 32, pred_2);
+
+ const uint8x16_t pred_3 =
+ CalculateVerticalWeightsAndPred(top_v[3], weights_y_v, weighted_bl);
+ vst1q_u8(dst + 48, pred_3);
+ }
+ }
+
+ dst += stride;
+ }
+}
+
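+// SMOOTH_H predictor for 4- and 8-wide blocks: each pixel blends the
+// left-column pixel of its row with the top-right corner pixel using that
+// column's horizontal weight and its complement.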
+template <int width, int height>
+inline void SmoothHorizontal4Or8xN_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+ const uint8_t* const left = static_cast<const uint8_t*>(left_column);
+ const uint8_t top_right = top[width - 1];
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ const uint8x8_t top_right_v = vdup_n_u8(top_right);
+ // Over-reads for 4xN but still within the array.
+ const uint8x8_t weights_x = vld1_u8(kSmoothWeights + width - 4);
+  // 256 - weights = vneg_s8(weights). The negation is modulo 256, which is
+  // exactly the unsigned complement needed to weight the top_right pixel.
+ const uint8x8_t scaled_weights_x =
+ vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x)));
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t left_v = vdup_n_u8(left[y]);
+
+ const uint16x8_t weighted_left = vmull_u8(weights_x, left_v);
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+ const uint16x8_t pred = vaddq_u16(weighted_left, weighted_tr);
+ const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale);
+
+ if (width == 4) {
+ StoreLo4(dst, pred_scaled);
+ } else { // width == 8
+ vst1_u8(dst, pred_scaled);
+ }
+ dst += stride;
+ }
+}
+
+inline uint8x16_t CalculateHorizontalWeightsAndPred(
+ const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x,
+ const uint8x16_t scaled_weights_x) {
+ const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
+ const uint16x8_t weighted_tr_low =
+ vmull_u8(vget_low_u8(scaled_weights_x), top_right);
+ const uint16x8_t pred_low = vaddq_u16(weighted_left_low, weighted_tr_low);
+ const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale);
+
+ const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
+ const uint16x8_t weighted_tr_high =
+ vmull_u8(vget_high_u8(scaled_weights_x), top_right);
+ const uint16x8_t pred_high = vaddq_u16(weighted_left_high, weighted_tr_high);
+ const uint8x8_t pred_scaled_high =
+ vrshrn_n_u16(pred_high, kSmoothWeightScale);
+
+ return vcombine_u8(pred_scaled_low, pred_scaled_high);
+}
+
+template <int width, int height>
+inline void SmoothHorizontal16PlusxN_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+ const uint8_t* const left = static_cast<const uint8_t*>(left_column);
+ const uint8_t top_right = top[width - 1];
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ const uint8x8_t top_right_v = vdup_n_u8(top_right);
+
+ uint8x16_t weights_x[4];
+ weights_x[0] = vld1q_u8(kSmoothWeights + width - 4);
+ if (width > 16) {
+ weights_x[1] = vld1q_u8(kSmoothWeights + width + 16 - 4);
+ if (width == 64) {
+ weights_x[2] = vld1q_u8(kSmoothWeights + width + 32 - 4);
+ weights_x[3] = vld1q_u8(kSmoothWeights + width + 48 - 4);
+ }
+ }
+
+ uint8x16_t scaled_weights_x[4];
+ scaled_weights_x[0] =
+ vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[0])));
+ if (width > 16) {
+ scaled_weights_x[1] =
+ vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[1])));
+ if (width == 64) {
+ scaled_weights_x[2] =
+ vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[2])));
+ scaled_weights_x[3] =
+ vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[3])));
+ }
+ }
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t left_v = vdup_n_u8(left[y]);
+
+ const uint8x16_t pred_0 = CalculateHorizontalWeightsAndPred(
+ left_v, top_right_v, weights_x[0], scaled_weights_x[0]);
+ vst1q_u8(dst, pred_0);
+
+ if (width > 16) {
+ const uint8x16_t pred_1 = CalculateHorizontalWeightsAndPred(
+ left_v, top_right_v, weights_x[1], scaled_weights_x[1]);
+ vst1q_u8(dst + 16, pred_1);
+
+ if (width == 64) {
+ const uint8x16_t pred_2 = CalculateHorizontalWeightsAndPred(
+ left_v, top_right_v, weights_x[2], scaled_weights_x[2]);
+ vst1q_u8(dst + 32, pred_2);
+
+ const uint8x16_t pred_3 = CalculateHorizontalWeightsAndPred(
+ left_v, top_right_v, weights_x[3], scaled_weights_x[3]);
+ vst1q_u8(dst + 48, pred_3);
+ }
+ }
+ dst += stride;
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ // 4x4
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+ Smooth4Or8xN_NEON<4, 4>;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical4Or8xN_NEON<4, 4>;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4Or8xN_NEON<4, 4>;
+
+ // 4x8
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+ Smooth4Or8xN_NEON<4, 8>;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical4Or8xN_NEON<4, 8>;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4Or8xN_NEON<4, 8>;
+
+ // 4x16
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+ Smooth4Or8xN_NEON<4, 16>;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical4Or8xN_NEON<4, 16>;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4Or8xN_NEON<4, 16>;
+
+ // 8x4
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+ Smooth4Or8xN_NEON<8, 4>;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical4Or8xN_NEON<8, 4>;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4Or8xN_NEON<8, 4>;
+
+ // 8x8
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+ Smooth4Or8xN_NEON<8, 8>;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical4Or8xN_NEON<8, 8>;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4Or8xN_NEON<8, 8>;
+
+ // 8x16
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+ Smooth4Or8xN_NEON<8, 16>;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical4Or8xN_NEON<8, 16>;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4Or8xN_NEON<8, 16>;
+
+ // 8x32
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+ Smooth4Or8xN_NEON<8, 32>;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical4Or8xN_NEON<8, 32>;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4Or8xN_NEON<8, 32>;
+
+ // 16x4
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<16, 4>;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<16, 4>;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<16, 4>;
+
+ // 16x8
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<16, 8>;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<16, 8>;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<16, 8>;
+
+ // 16x16
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<16, 16>;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<16, 16>;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<16, 16>;
+
+ // 16x32
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<16, 32>;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<16, 32>;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<16, 32>;
+
+ // 16x64
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<16, 64>;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<16, 64>;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<16, 64>;
+
+ // 32x8
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<32, 8>;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<32, 8>;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<32, 8>;
+
+ // 32x16
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<32, 16>;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<32, 16>;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<32, 16>;
+
+ // 32x32
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<32, 32>;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<32, 32>;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<32, 32>;
+
+ // 32x64
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<32, 64>;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<32, 64>;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<32, 64>;
+
+ // 64x16
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<64, 16>;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<64, 16>;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<64, 16>;
+
+ // 64x32
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<64, 32>;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<64, 32>;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<64, 32>;
+
+ // 64x64
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<64, 64>;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<64, 64>;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<64, 64>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void IntraPredSmoothInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredSmoothInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/inverse_transform_neon.cc b/src/dsp/arm/inverse_transform_neon.cc
new file mode 100644
index 0000000..072991a
--- /dev/null
+++ b/src/dsp/arm/inverse_transform_neon.cc
@@ -0,0 +1,3128 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+//------------------------------------------------------------------------------
+
+// TODO(slavarnway): Move transpose functions to transpose_neon.h or
+// common_neon.h.
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int16x8_t in[4],
+ int16x8_t out[4]) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+ const int16x4_t a0 = vget_low_s16(in[0]);
+ const int16x4_t a1 = vget_low_s16(in[1]);
+ const int16x4_t a2 = vget_low_s16(in[2]);
+ const int16x4_t a3 = vget_low_s16(in[3]);
+
+ const int16x4x2_t b0 = vtrn_s16(a0, a1);
+ const int16x4x2_t b1 = vtrn_s16(a2, a3);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
+ vreinterpret_s32_s16(b1.val[0]));
+ const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
+ vreinterpret_s32_s16(b1.val[1]));
+
+ const int16x4_t d0 = vreinterpret_s16_s32(c0.val[0]);
+ const int16x4_t d1 = vreinterpret_s16_s32(c1.val[0]);
+ const int16x4_t d2 = vreinterpret_s16_s32(c0.val[1]);
+ const int16x4_t d3 = vreinterpret_s16_s32(c1.val[1]);
+
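+  // Only the low four lanes of each result are meaningful; duplicate them
+  // into both halves of the output registers.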
+ out[0] = vcombine_s16(d0, d0);
+ out[1] = vcombine_s16(d1, d1);
+ out[2] = vcombine_s16(d2, d2);
+ out[3] = vcombine_s16(d3, d3);
+}
+
+// Note: this out-of-place version is only used in the final stage of Dct32/64
+// and Adst16 because the in-place version causes additional stack usage with
+// clang.
+LIBGAV1_ALWAYS_INLINE void Transpose8x8(const int16x8_t in[8],
+ int16x8_t out[8]) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]);
+ const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]);
+ const int16x8x2_t b2 = vtrnq_s16(in[4], in[5]);
+ const int16x8x2_t b3 = vtrnq_s16(in[6], in[7]);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+ const int16x8x2_t d0 = VtrnqS64(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = VtrnqS64(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = VtrnqS64(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = VtrnqS64(c1.val[1], c3.val[1]);
+
+ out[0] = d0.val[0];
+ out[1] = d1.val[0];
+ out[2] = d2.val[0];
+ out[3] = d3.val[0];
+ out[4] = d0.val[1];
+ out[5] = d1.val[1];
+ out[6] = d2.val[1];
+ out[7] = d3.val[1];
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x8To8x4(const uint16x8_t in[8],
+ uint16x8_t out[4]) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // a4: 40 41 42 43
+ // a5: 50 51 52 53
+ // a6: 60 61 62 63
+ // a7: 70 71 72 73
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+ // b2.val[0]: 40 50 42 52
+ // b2.val[1]: 41 51 43 53
+ // b3.val[0]: 60 70 62 72
+ // b3.val[1]: 61 71 63 73
+
+ uint16x4x2_t b0 = vtrn_u16(vget_low_u16(in[0]), vget_low_u16(in[1]));
+ uint16x4x2_t b1 = vtrn_u16(vget_low_u16(in[2]), vget_low_u16(in[3]));
+ uint16x4x2_t b2 = vtrn_u16(vget_low_u16(in[4]), vget_low_u16(in[5]));
+ uint16x4x2_t b3 = vtrn_u16(vget_low_u16(in[6]), vget_low_u16(in[7]));
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+ // c2.val[0]: 40 50 60 70
+ // c2.val[1]: 42 52 62 72
+ // c3.val[0]: 41 51 61 71
+ // c3.val[1]: 43 53 63 73
+
+ uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
+ vreinterpret_u32_u16(b1.val[0]));
+ uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]),
+ vreinterpret_u32_u16(b1.val[1]));
+ uint32x2x2_t c2 = vtrn_u32(vreinterpret_u32_u16(b2.val[0]),
+ vreinterpret_u32_u16(b3.val[0]));
+ uint32x2x2_t c3 = vtrn_u32(vreinterpret_u32_u16(b2.val[1]),
+ vreinterpret_u32_u16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // o0: 00 10 20 30 40 50 60 70
+ // o1: 01 11 21 31 41 51 61 71
+ // o2: 02 12 22 32 42 52 62 72
+ // o3: 03 13 23 33 43 53 63 73
+
+ out[0] = vcombine_u16(vreinterpret_u16_u32(c0.val[0]),
+ vreinterpret_u16_u32(c2.val[0]));
+ out[1] = vcombine_u16(vreinterpret_u16_u32(c1.val[0]),
+ vreinterpret_u16_u32(c3.val[0]));
+ out[2] = vcombine_u16(vreinterpret_u16_u32(c0.val[1]),
+ vreinterpret_u16_u32(c2.val[1]));
+ out[3] = vcombine_u16(vreinterpret_u16_u32(c1.val[1]),
+ vreinterpret_u16_u32(c3.val[1]));
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x8To8x4(const int16x8_t in[8],
+ int16x8_t out[4]) {
+ Transpose4x8To8x4(reinterpret_cast<const uint16x8_t*>(in),
+ reinterpret_cast<uint16x8_t*>(out));
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose8x4To4x8(const int16x8_t in[4],
+ int16x8_t out[8]) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]);
+ const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+
+ // The upper 8 bytes are don't cares.
+ // out[0]: 00 10 20 30 04 14 24 34
+ // out[1]: 01 11 21 31 05 15 25 35
+ // out[2]: 02 12 22 32 06 16 26 36
+ // out[3]: 03 13 23 33 07 17 27 37
+ // out[4]: 04 14 24 34 04 14 24 34
+ // out[5]: 05 15 25 35 05 15 25 35
+ // out[6]: 06 16 26 36 06 16 26 36
+ // out[7]: 07 17 27 37 07 17 27 37
+ out[0] = vreinterpretq_s16_s32(c0.val[0]);
+ out[1] = vreinterpretq_s16_s32(c1.val[0]);
+ out[2] = vreinterpretq_s16_s32(c0.val[1]);
+ out[3] = vreinterpretq_s16_s32(c1.val[1]);
+ out[4] = vreinterpretq_s16_s32(
+ vcombine_s32(vget_high_s32(c0.val[0]), vget_high_s32(c0.val[0])));
+ out[5] = vreinterpretq_s16_s32(
+ vcombine_s32(vget_high_s32(c1.val[0]), vget_high_s32(c1.val[0])));
+ out[6] = vreinterpretq_s16_s32(
+ vcombine_s32(vget_high_s32(c0.val[1]), vget_high_s32(c0.val[1])));
+ out[7] = vreinterpretq_s16_s32(
+ vcombine_s32(vget_high_s32(c1.val[1]), vget_high_s32(c1.val[1])));
+}
+
+//------------------------------------------------------------------------------
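+// Stores |store_count| registers from |s| to consecutive rows of |dst| at
+// column |idx|. |store_width| is in bytes: 16 stores all 8 lanes of each
+// register, 8 stores only the low 4 lanes.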
+template <int store_width, int store_count>
+LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* dst, int32_t stride, int32_t idx,
+ const int16x8_t* const s) {
+ assert(store_count % 4 == 0);
+ assert(store_width == 8 || store_width == 16);
+ // NOTE: It is expected that the compiler will unroll these loops.
+ if (store_width == 16) {
+ for (int i = 0; i < store_count; i += 4) {
+ vst1q_s16(&dst[i * stride + idx], (s[i]));
+ vst1q_s16(&dst[(i + 1) * stride + idx], (s[i + 1]));
+ vst1q_s16(&dst[(i + 2) * stride + idx], (s[i + 2]));
+ vst1q_s16(&dst[(i + 3) * stride + idx], (s[i + 3]));
+ }
+ } else {
+ // store_width == 8
+ for (int i = 0; i < store_count; i += 4) {
+ vst1_s16(&dst[i * stride + idx], vget_low_s16(s[i]));
+ vst1_s16(&dst[(i + 1) * stride + idx], vget_low_s16(s[i + 1]));
+ vst1_s16(&dst[(i + 2) * stride + idx], vget_low_s16(s[i + 2]));
+ vst1_s16(&dst[(i + 3) * stride + idx], vget_low_s16(s[i + 3]));
+ }
+ }
+}
+
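+// Mirror of StoreDst: |load_width| 16 fills all 8 lanes of each register,
+// while 8 loads 8 bytes into the low half and leaves the high half zero.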
+template <int load_width, int load_count>
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int16_t* src, int32_t stride,
+ int32_t idx, int16x8_t* x) {
+ assert(load_count % 4 == 0);
+ assert(load_width == 8 || load_width == 16);
+ // NOTE: It is expected that the compiler will unroll these loops.
+ if (load_width == 16) {
+ for (int i = 0; i < load_count; i += 4) {
+ x[i] = vld1q_s16(&src[i * stride + idx]);
+ x[i + 1] = vld1q_s16(&src[(i + 1) * stride + idx]);
+ x[i + 2] = vld1q_s16(&src[(i + 2) * stride + idx]);
+ x[i + 3] = vld1q_s16(&src[(i + 3) * stride + idx]);
+ }
+ } else {
+ // load_width == 8
+ const int64x2_t zero = vdupq_n_s64(0);
+ for (int i = 0; i < load_count; i += 4) {
+      // The src buffer is aligned to 32 bytes. Each load will always be
+      // 8-byte aligned.
+ x[i] = vreinterpretq_s16_s64(vld1q_lane_s64(
+ reinterpret_cast<const int64_t*>(&src[i * stride + idx]), zero, 0));
+ x[i + 1] = vreinterpretq_s16_s64(vld1q_lane_s64(
+ reinterpret_cast<const int64_t*>(&src[(i + 1) * stride + idx]), zero,
+ 0));
+ x[i + 2] = vreinterpretq_s16_s64(vld1q_lane_s64(
+ reinterpret_cast<const int64_t*>(&src[(i + 2) * stride + idx]), zero,
+ 0));
+ x[i + 3] = vreinterpretq_s16_s64(vld1q_lane_s64(
+ reinterpret_cast<const int64_t*>(&src[(i + 3) * stride + idx]), zero,
+ 0));
+ }
+ }
+}
+
+// Butterfly rotate 4 values.
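+// Computes x = (a * cos128(angle) - b * sin128(angle) + 2048) >> 12 and
+// y = (a * sin128(angle) + b * cos128(angle) + 2048) >> 12 on the low four
+// lanes, with saturation, then writes them back (swapped when |flip| is set).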
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(int16x8_t* a, int16x8_t* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const int32x4_t acc_x = vmull_n_s16(vget_low_s16(*a), cos128);
+ const int32x4_t acc_y = vmull_n_s16(vget_low_s16(*a), sin128);
+ const int32x4_t x0 = vmlsl_n_s16(acc_x, vget_low_s16(*b), sin128);
+ const int32x4_t y0 = vmlal_n_s16(acc_y, vget_low_s16(*b), cos128);
+ const int16x4_t x1 = vqrshrn_n_s32(x0, 12);
+ const int16x4_t y1 = vqrshrn_n_s32(y0, 12);
+ const int16x8_t x = vcombine_s16(x1, x1);
+ const int16x8_t y = vcombine_s16(y1, y1);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
+// Butterfly rotate 8 values.
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(int16x8_t* a, int16x8_t* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const int32x4_t acc_x = vmull_n_s16(vget_low_s16(*a), cos128);
+ const int32x4_t acc_y = vmull_n_s16(vget_low_s16(*a), sin128);
+ const int32x4_t x0 = vmlsl_n_s16(acc_x, vget_low_s16(*b), sin128);
+ const int32x4_t y0 = vmlal_n_s16(acc_y, vget_low_s16(*b), cos128);
+ const int16x4_t x1 = vqrshrn_n_s32(x0, 12);
+ const int16x4_t y1 = vqrshrn_n_s32(y0, 12);
+
+ const int32x4_t acc_x_hi = vmull_n_s16(vget_high_s16(*a), cos128);
+ const int32x4_t acc_y_hi = vmull_n_s16(vget_high_s16(*a), sin128);
+ const int32x4_t x0_hi = vmlsl_n_s16(acc_x_hi, vget_high_s16(*b), sin128);
+ const int32x4_t y0_hi = vmlal_n_s16(acc_y_hi, vget_high_s16(*b), cos128);
+ const int16x4_t x1_hi = vqrshrn_n_s32(x0_hi, 12);
+ const int16x4_t y1_hi = vqrshrn_n_s32(y0_hi, 12);
+
+ const int16x8_t x = vcombine_s16(x1, x1_hi);
+ const int16x8_t y = vcombine_s16(y1, y1_hi);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
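+// Butterfly rotation for the case where the first input is known to be zero,
+// so x = -b * sin128(angle) and y = b * cos128(angle). vqrdmulhq_n_s16 with
+// the constant pre-shifted left by 3 computes a saturating, rounded
+// (value * constant) >> 12.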
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int16x8_t* a,
+ int16x8_t* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ // For this function, the max value returned by Sin128() is 4091, which fits
+ // inside 12 bits. This leaves room for the sign bit and the 3 left shifted
+ // bits.
+ assert(sin128 <= 0xfff);
+ const int16x8_t x = vqrdmulhq_n_s16(*b, -sin128 << 3);
+ const int16x8_t y = vqrdmulhq_n_s16(*b, cos128 << 3);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
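+// Butterfly rotation for the case where the second input is known to be zero,
+// so x = a * cos128(angle) and y = a * sin128(angle), using the same
+// fixed-point trick as above.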
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(int16x8_t* a,
+ int16x8_t* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const int16x8_t x = vqrdmulhq_n_s16(*a, cos128 << 3);
+ const int16x8_t y = vqrdmulhq_n_s16(*a, sin128 << 3);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
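+// Saturating butterfly: a' = a + b and b' = a - b. When |flip| is set the
+// outputs become a' = b - a and b' = b + a instead.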
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int16x8_t* a, int16x8_t* b,
+ bool flip) {
+ int16x8_t x, y;
+ if (flip) {
+ y = vqaddq_s16(*b, *a);
+ x = vqsubq_s16(*b, *a);
+ } else {
+ x = vqaddq_s16(*a, *b);
+ y = vqsubq_s16(*a, *b);
+ }
+ *a = x;
+ *b = y;
+}
+
+using ButterflyRotationFunc = void (*)(int16x8_t* a, int16x8_t* b, int angle,
+ bool flip);
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
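+// DC-only fast path for the row transform. When |adjusted_tx_height| is 1,
+// dst[0] is (optionally) pre-rounded, scaled by cos128(32), shifted right by
+// |row_shift| and broadcast across the first row; otherwise this returns
+// false so the full transform runs.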
+template <int width>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16x8_t v_src = vdupq_n_s16(dst[0]);
+ const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
+ const int16x8_t v_src_round =
+ vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
+ const int16x8_t s0 = vbslq_s16(v_mask, v_src_round, v_src);
+ const int16_t cos128 = Cos128(32);
+ const int16x8_t xy = vqrdmulhq_n_s16(s0, cos128 << 3);
+  // vqrshlq_s16 will shift right if the shift value is negative.
+ const int16x8_t xy_shifted = vqrshlq_s16(xy, vdupq_n_s16(-row_shift));
+
+ if (width == 4) {
+ vst1_s16(dst, vget_low_s16(xy_shifted));
+ } else {
+ for (int i = 0; i < width; i += 8) {
+ vst1q_s16(dst, xy_shifted);
+ dst += 8;
+ }
+ }
+ return true;
+}
+
+template <int height>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16_t cos128 = Cos128(32);
+
+ // Calculate dc values for first row.
+ if (width == 4) {
+ const int16x4_t v_src = vld1_s16(dst);
+ const int16x4_t xy = vqrdmulh_n_s16(v_src, cos128 << 3);
+ vst1_s16(dst, xy);
+ } else {
+ int i = 0;
+ do {
+ const int16x8_t v_src = vld1q_s16(&dst[i]);
+ const int16x8_t xy = vqrdmulhq_n_s16(v_src, cos128 << 3);
+ vst1q_s16(&dst[i], xy);
+ i += 8;
+ } while (i < width);
+ }
+
+ // Copy first row to the rest of the block.
+ for (int y = 1; y < height; ++y) {
+ memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
+ }
+ return true;
+}
+
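+// The stage numbers in the Dct*Stages helpers below are not consecutive
+// within a single helper: together with the stages in Dct64_NEON they form
+// one shared numbering for the combined DCT, and each helper keeps its
+// stages' positions in that sequence.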
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct4Stages(int16x8_t* s) {
+ // stage 12.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
+ ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
+ } else {
+ butterfly_rotation(&s[0], &s[1], 32, true);
+ butterfly_rotation(&s[2], &s[3], 48, false);
+ }
+
+ // stage 17.
+ HadamardRotation(&s[0], &s[3], false);
+ HadamardRotation(&s[1], &s[2], false);
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int16x8_t s[4], x[4];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ int16x8_t input[8];
+ LoadSrc<8, 8>(dst, step, 0, input);
+ Transpose4x8To8x4(input, x);
+ } else {
+ LoadSrc<16, 4>(dst, step, 0, x);
+ }
+ } else {
+ LoadSrc<8, 4>(dst, step, 0, x);
+ if (transpose) {
+ Transpose4x4(x, x);
+ }
+ }
+
+ // stage 1.
+ // kBitReverseLookup 0, 2, 1, 3
+ s[0] = x[0];
+ s[1] = x[2];
+ s[2] = x[1];
+ s[3] = x[3];
+
+ Dct4Stages<butterfly_rotation>(s);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ int16x8_t output[8];
+ Transpose8x4To4x8(s, output);
+ StoreDst<8, 8>(dst, step, 0, output);
+ } else {
+ StoreDst<16, 4>(dst, step, 0, s);
+ }
+ } else {
+ if (transpose) {
+ Transpose4x4(s, s);
+ }
+ StoreDst<8, 4>(dst, step, 0, s);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct8Stages(int16x8_t* s) {
+ // stage 8.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
+ ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
+ } else {
+ butterfly_rotation(&s[4], &s[7], 56, false);
+ butterfly_rotation(&s[5], &s[6], 24, false);
+ }
+
+ // stage 13.
+ HadamardRotation(&s[4], &s[5], false);
+ HadamardRotation(&s[6], &s[7], true);
+
+ // stage 18.
+ butterfly_rotation(&s[6], &s[5], 32, true);
+
+ // stage 22.
+ HadamardRotation(&s[0], &s[7], false);
+ HadamardRotation(&s[1], &s[6], false);
+ HadamardRotation(&s[2], &s[5], false);
+ HadamardRotation(&s[3], &s[4], false);
+}
+
+// Process dct8 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int16x8_t s[8], x[8];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ int16x8_t input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8(input, x);
+ } else {
+ LoadSrc<8, 8>(dst, step, 0, x);
+ }
+ } else if (transpose) {
+ LoadSrc<16, 8>(dst, step, 0, x);
+ dsp::Transpose8x8(x);
+ } else {
+ LoadSrc<16, 8>(dst, step, 0, x);
+ }
+
+ // stage 1.
+ // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
+ s[0] = x[0];
+ s[1] = x[4];
+ s[2] = x[2];
+ s[3] = x[6];
+ s[4] = x[1];
+ s[5] = x[5];
+ s[6] = x[3];
+ s[7] = x[7];
+
+ Dct4Stages<butterfly_rotation>(s);
+ Dct8Stages<butterfly_rotation>(s);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ int16x8_t output[4];
+ Transpose4x8To8x4(s, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ } else {
+ StoreDst<8, 8>(dst, step, 0, s);
+ }
+ } else if (transpose) {
+ dsp::Transpose8x8(s);
+ StoreDst<16, 8>(dst, step, 0, s);
+ } else {
+ StoreDst<16, 8>(dst, step, 0, s);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct16Stages(int16x8_t* s) {
+ // stage 5.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
+ ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
+ ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
+ ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
+ } else {
+ butterfly_rotation(&s[8], &s[15], 60, false);
+ butterfly_rotation(&s[9], &s[14], 28, false);
+ butterfly_rotation(&s[10], &s[13], 44, false);
+ butterfly_rotation(&s[11], &s[12], 12, false);
+ }
+
+ // stage 9.
+ HadamardRotation(&s[8], &s[9], false);
+ HadamardRotation(&s[10], &s[11], true);
+ HadamardRotation(&s[12], &s[13], false);
+ HadamardRotation(&s[14], &s[15], true);
+
+ // stage 14.
+ butterfly_rotation(&s[14], &s[9], 48, true);
+ butterfly_rotation(&s[13], &s[10], 112, true);
+
+ // stage 19.
+ HadamardRotation(&s[8], &s[11], false);
+ HadamardRotation(&s[9], &s[10], false);
+ HadamardRotation(&s[12], &s[15], true);
+ HadamardRotation(&s[13], &s[14], true);
+
+ // stage 23.
+ butterfly_rotation(&s[13], &s[10], 32, true);
+ butterfly_rotation(&s[12], &s[11], 32, true);
+
+ // stage 26.
+ HadamardRotation(&s[0], &s[15], false);
+ HadamardRotation(&s[1], &s[14], false);
+ HadamardRotation(&s[2], &s[13], false);
+ HadamardRotation(&s[3], &s[12], false);
+ HadamardRotation(&s[4], &s[11], false);
+ HadamardRotation(&s[5], &s[10], false);
+ HadamardRotation(&s[6], &s[9], false);
+ HadamardRotation(&s[7], &s[8], false);
+}
+
+// Process dct16 rows or columns, depending on the |is_row| flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int16x8_t s[16], x[16];
+
+ if (stage_is_rectangular) {
+ if (is_row) {
+ int16x8_t input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8(input, x);
+ LoadSrc<16, 4>(dst, step, 8, input);
+ Transpose8x4To4x8(input, &x[8]);
+ } else {
+ LoadSrc<8, 16>(dst, step, 0, x);
+ }
+ } else if (is_row) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+ dsp::Transpose8x8(&x[idx]);
+ }
+ } else {
+ LoadSrc<16, 16>(dst, step, 0, x);
+ }
+
+ // stage 1
+ // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+ s[0] = x[0];
+ s[1] = x[8];
+ s[2] = x[4];
+ s[3] = x[12];
+ s[4] = x[2];
+ s[5] = x[10];
+ s[6] = x[6];
+ s[7] = x[14];
+ s[8] = x[1];
+ s[9] = x[9];
+ s[10] = x[5];
+ s[11] = x[13];
+ s[12] = x[3];
+ s[13] = x[11];
+ s[14] = x[7];
+ s[15] = x[15];
+
+ Dct4Stages<butterfly_rotation>(s);
+ Dct8Stages<butterfly_rotation>(s);
+ Dct16Stages<butterfly_rotation>(s);
+
+ if (is_row) {
+ const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+ for (int i = 0; i < 16; ++i) {
+ s[i] = vqrshlq_s16(s[i], v_row_shift);
+ }
+ }
+
+ if (stage_is_rectangular) {
+ if (is_row) {
+ int16x8_t output[4];
+ Transpose4x8To8x4(s, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ Transpose4x8To8x4(&s[8], output);
+ StoreDst<16, 4>(dst, step, 8, output);
+ } else {
+ StoreDst<8, 16>(dst, step, 0, s);
+ }
+ } else if (is_row) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ dsp::Transpose8x8(&s[idx]);
+ StoreDst<16, 8>(dst, step, idx, &s[idx]);
+ }
+ } else {
+ StoreDst<16, 16>(dst, step, 0, s);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct32Stages(int16x8_t* s) {
+ // stage 3
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
+ ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
+ ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
+ ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
+ ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
+ ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
+ ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
+ ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
+ } else {
+ butterfly_rotation(&s[16], &s[31], 62, false);
+ butterfly_rotation(&s[17], &s[30], 30, false);
+ butterfly_rotation(&s[18], &s[29], 46, false);
+ butterfly_rotation(&s[19], &s[28], 14, false);
+ butterfly_rotation(&s[20], &s[27], 54, false);
+ butterfly_rotation(&s[21], &s[26], 22, false);
+ butterfly_rotation(&s[22], &s[25], 38, false);
+ butterfly_rotation(&s[23], &s[24], 6, false);
+ }
+ // stage 6.
+ HadamardRotation(&s[16], &s[17], false);
+ HadamardRotation(&s[18], &s[19], true);
+ HadamardRotation(&s[20], &s[21], false);
+ HadamardRotation(&s[22], &s[23], true);
+ HadamardRotation(&s[24], &s[25], false);
+ HadamardRotation(&s[26], &s[27], true);
+ HadamardRotation(&s[28], &s[29], false);
+ HadamardRotation(&s[30], &s[31], true);
+
+ // stage 10.
+ butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+ butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+ butterfly_rotation(&s[26], &s[21], 24, true);
+ butterfly_rotation(&s[25], &s[22], 24 + 64, true);
+
+ // stage 15.
+ HadamardRotation(&s[16], &s[19], false);
+ HadamardRotation(&s[17], &s[18], false);
+ HadamardRotation(&s[20], &s[23], true);
+ HadamardRotation(&s[21], &s[22], true);
+ HadamardRotation(&s[24], &s[27], false);
+ HadamardRotation(&s[25], &s[26], false);
+ HadamardRotation(&s[28], &s[31], true);
+ HadamardRotation(&s[29], &s[30], true);
+
+ // stage 20.
+ butterfly_rotation(&s[29], &s[18], 48, true);
+ butterfly_rotation(&s[28], &s[19], 48, true);
+ butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+ butterfly_rotation(&s[26], &s[21], 48 + 64, true);
+
+ // stage 24.
+ HadamardRotation(&s[16], &s[23], false);
+ HadamardRotation(&s[17], &s[22], false);
+ HadamardRotation(&s[18], &s[21], false);
+ HadamardRotation(&s[19], &s[20], false);
+ HadamardRotation(&s[24], &s[31], true);
+ HadamardRotation(&s[25], &s[30], true);
+ HadamardRotation(&s[26], &s[29], true);
+ HadamardRotation(&s[27], &s[28], true);
+
+ // stage 27.
+ butterfly_rotation(&s[27], &s[20], 32, true);
+ butterfly_rotation(&s[26], &s[21], 32, true);
+ butterfly_rotation(&s[25], &s[22], 32, true);
+ butterfly_rotation(&s[24], &s[23], 32, true);
+
+ // stage 29.
+ HadamardRotation(&s[0], &s[31], false);
+ HadamardRotation(&s[1], &s[30], false);
+ HadamardRotation(&s[2], &s[29], false);
+ HadamardRotation(&s[3], &s[28], false);
+ HadamardRotation(&s[4], &s[27], false);
+ HadamardRotation(&s[5], &s[26], false);
+ HadamardRotation(&s[6], &s[25], false);
+ HadamardRotation(&s[7], &s[24], false);
+ HadamardRotation(&s[8], &s[23], false);
+ HadamardRotation(&s[9], &s[22], false);
+ HadamardRotation(&s[10], &s[21], false);
+ HadamardRotation(&s[11], &s[20], false);
+ HadamardRotation(&s[12], &s[19], false);
+ HadamardRotation(&s[13], &s[18], false);
+ HadamardRotation(&s[14], &s[17], false);
+ HadamardRotation(&s[15], &s[16], false);
+}
+
+// Process dct32 rows or columns, depending on the |is_row| flag.
+LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step,
+ const bool is_row, int row_shift) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int16x8_t s[32], x[32];
+
+ if (is_row) {
+ for (int idx = 0; idx < 32; idx += 8) {
+ LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+ dsp::Transpose8x8(&x[idx]);
+ }
+ } else {
+ LoadSrc<16, 32>(dst, step, 0, x);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+ s[0] = x[0];
+ s[1] = x[16];
+ s[2] = x[8];
+ s[3] = x[24];
+ s[4] = x[4];
+ s[5] = x[20];
+ s[6] = x[12];
+ s[7] = x[28];
+ s[8] = x[2];
+ s[9] = x[18];
+ s[10] = x[10];
+ s[11] = x[26];
+ s[12] = x[6];
+ s[13] = x[22];
+ s[14] = x[14];
+ s[15] = x[30];
+
+ // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+ s[16] = x[1];
+ s[17] = x[17];
+ s[18] = x[9];
+ s[19] = x[25];
+ s[20] = x[5];
+ s[21] = x[21];
+ s[22] = x[13];
+ s[23] = x[29];
+ s[24] = x[3];
+ s[25] = x[19];
+ s[26] = x[11];
+ s[27] = x[27];
+ s[28] = x[7];
+ s[29] = x[23];
+ s[30] = x[15];
+ s[31] = x[31];
+
+ Dct4Stages<ButterflyRotation_8>(s);
+ Dct8Stages<ButterflyRotation_8>(s);
+ Dct16Stages<ButterflyRotation_8>(s);
+ Dct32Stages<ButterflyRotation_8>(s);
+
+ if (is_row) {
+ const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+ for (int idx = 0; idx < 32; idx += 8) {
+ int16x8_t output[8];
+ Transpose8x8(&s[idx], output);
+ for (int i = 0; i < 8; ++i) {
+ output[i] = vqrshlq_s16(output[i], v_row_shift);
+ }
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 32>(dst, step, 0, s);
+ }
+}
+
+// Allow the compiler to call this function instead of forcing it to be
+// inlined. Tests show this is slightly faster.
+void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int16x8_t s[64], x[32];
+
+ if (is_row) {
+ // The last 32 values of every row are always zero if the |tx_width| is
+ // 64.
+ for (int idx = 0; idx < 32; idx += 8) {
+ LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+ dsp::Transpose8x8(&x[idx]);
+ }
+ } else {
+ // The last 32 values of every column are always zero if the |tx_height| is
+ // 64.
+ LoadSrc<16, 32>(dst, step, 0, x);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+ s[0] = x[0];
+ s[2] = x[16];
+ s[4] = x[8];
+ s[6] = x[24];
+ s[8] = x[4];
+ s[10] = x[20];
+ s[12] = x[12];
+ s[14] = x[28];
+
+ // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+ s[16] = x[2];
+ s[18] = x[18];
+ s[20] = x[10];
+ s[22] = x[26];
+ s[24] = x[6];
+ s[26] = x[22];
+ s[28] = x[14];
+ s[30] = x[30];
+
+ // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+ s[32] = x[1];
+ s[34] = x[17];
+ s[36] = x[9];
+ s[38] = x[25];
+ s[40] = x[5];
+ s[42] = x[21];
+ s[44] = x[13];
+ s[46] = x[29];
+
+ // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
+ s[48] = x[3];
+ s[50] = x[19];
+ s[52] = x[11];
+ s[54] = x[27];
+ s[56] = x[7];
+ s[58] = x[23];
+ s[60] = x[15];
+ s[62] = x[31];
+
+ Dct4Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+ Dct8Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+ Dct16Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+ Dct32Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+
+ //-- start dct 64 stages
+ // stage 2.
+ ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
+ ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
+ ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
+ ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
+ ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
+ ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
+ ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
+ ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
+ ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
+ ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
+ ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
+ ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
+ ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
+ ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
+ ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
+ ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
+
+ // stage 4.
+ HadamardRotation(&s[32], &s[33], false);
+ HadamardRotation(&s[34], &s[35], true);
+ HadamardRotation(&s[36], &s[37], false);
+ HadamardRotation(&s[38], &s[39], true);
+ HadamardRotation(&s[40], &s[41], false);
+ HadamardRotation(&s[42], &s[43], true);
+ HadamardRotation(&s[44], &s[45], false);
+ HadamardRotation(&s[46], &s[47], true);
+ HadamardRotation(&s[48], &s[49], false);
+ HadamardRotation(&s[50], &s[51], true);
+ HadamardRotation(&s[52], &s[53], false);
+ HadamardRotation(&s[54], &s[55], true);
+ HadamardRotation(&s[56], &s[57], false);
+ HadamardRotation(&s[58], &s[59], true);
+ HadamardRotation(&s[60], &s[61], false);
+ HadamardRotation(&s[62], &s[63], true);
+
+ // stage 7.
+ ButterflyRotation_8(&s[62], &s[33], 60 - 0, true);
+ ButterflyRotation_8(&s[61], &s[34], 60 - 0 + 64, true);
+ ButterflyRotation_8(&s[58], &s[37], 60 - 32, true);
+ ButterflyRotation_8(&s[57], &s[38], 60 - 32 + 64, true);
+ ButterflyRotation_8(&s[54], &s[41], 60 - 16, true);
+ ButterflyRotation_8(&s[53], &s[42], 60 - 16 + 64, true);
+ ButterflyRotation_8(&s[50], &s[45], 60 - 48, true);
+ ButterflyRotation_8(&s[49], &s[46], 60 - 48 + 64, true);
+
+ // stage 11.
+ HadamardRotation(&s[32], &s[35], false);
+ HadamardRotation(&s[33], &s[34], false);
+ HadamardRotation(&s[36], &s[39], true);
+ HadamardRotation(&s[37], &s[38], true);
+ HadamardRotation(&s[40], &s[43], false);
+ HadamardRotation(&s[41], &s[42], false);
+ HadamardRotation(&s[44], &s[47], true);
+ HadamardRotation(&s[45], &s[46], true);
+ HadamardRotation(&s[48], &s[51], false);
+ HadamardRotation(&s[49], &s[50], false);
+ HadamardRotation(&s[52], &s[55], true);
+ HadamardRotation(&s[53], &s[54], true);
+ HadamardRotation(&s[56], &s[59], false);
+ HadamardRotation(&s[57], &s[58], false);
+ HadamardRotation(&s[60], &s[63], true);
+ HadamardRotation(&s[61], &s[62], true);
+
+ // stage 16.
+ ButterflyRotation_8(&s[61], &s[34], 56, true);
+ ButterflyRotation_8(&s[60], &s[35], 56, true);
+ ButterflyRotation_8(&s[59], &s[36], 56 + 64, true);
+ ButterflyRotation_8(&s[58], &s[37], 56 + 64, true);
+ ButterflyRotation_8(&s[53], &s[42], 56 - 32, true);
+ ButterflyRotation_8(&s[52], &s[43], 56 - 32, true);
+ ButterflyRotation_8(&s[51], &s[44], 56 - 32 + 64, true);
+ ButterflyRotation_8(&s[50], &s[45], 56 - 32 + 64, true);
+
+ // stage 21.
+ HadamardRotation(&s[32], &s[39], false);
+ HadamardRotation(&s[33], &s[38], false);
+ HadamardRotation(&s[34], &s[37], false);
+ HadamardRotation(&s[35], &s[36], false);
+ HadamardRotation(&s[40], &s[47], true);
+ HadamardRotation(&s[41], &s[46], true);
+ HadamardRotation(&s[42], &s[45], true);
+ HadamardRotation(&s[43], &s[44], true);
+ HadamardRotation(&s[48], &s[55], false);
+ HadamardRotation(&s[49], &s[54], false);
+ HadamardRotation(&s[50], &s[53], false);
+ HadamardRotation(&s[51], &s[52], false);
+ HadamardRotation(&s[56], &s[63], true);
+ HadamardRotation(&s[57], &s[62], true);
+ HadamardRotation(&s[58], &s[61], true);
+ HadamardRotation(&s[59], &s[60], true);
+
+ // stage 25.
+ ButterflyRotation_8(&s[59], &s[36], 48, true);
+ ButterflyRotation_8(&s[58], &s[37], 48, true);
+ ButterflyRotation_8(&s[57], &s[38], 48, true);
+ ButterflyRotation_8(&s[56], &s[39], 48, true);
+ ButterflyRotation_8(&s[55], &s[40], 112, true);
+ ButterflyRotation_8(&s[54], &s[41], 112, true);
+ ButterflyRotation_8(&s[53], &s[42], 112, true);
+ ButterflyRotation_8(&s[52], &s[43], 112, true);
+
+ // stage 28.
+ HadamardRotation(&s[32], &s[47], false);
+ HadamardRotation(&s[33], &s[46], false);
+ HadamardRotation(&s[34], &s[45], false);
+ HadamardRotation(&s[35], &s[44], false);
+ HadamardRotation(&s[36], &s[43], false);
+ HadamardRotation(&s[37], &s[42], false);
+ HadamardRotation(&s[38], &s[41], false);
+ HadamardRotation(&s[39], &s[40], false);
+ HadamardRotation(&s[48], &s[63], true);
+ HadamardRotation(&s[49], &s[62], true);
+ HadamardRotation(&s[50], &s[61], true);
+ HadamardRotation(&s[51], &s[60], true);
+ HadamardRotation(&s[52], &s[59], true);
+ HadamardRotation(&s[53], &s[58], true);
+ HadamardRotation(&s[54], &s[57], true);
+ HadamardRotation(&s[55], &s[56], true);
+
+ // stage 30.
+ ButterflyRotation_8(&s[55], &s[40], 32, true);
+ ButterflyRotation_8(&s[54], &s[41], 32, true);
+ ButterflyRotation_8(&s[53], &s[42], 32, true);
+ ButterflyRotation_8(&s[52], &s[43], 32, true);
+ ButterflyRotation_8(&s[51], &s[44], 32, true);
+ ButterflyRotation_8(&s[50], &s[45], 32, true);
+ ButterflyRotation_8(&s[49], &s[46], 32, true);
+ ButterflyRotation_8(&s[48], &s[47], 32, true);
+
+ // stage 31.
+ for (int i = 0; i < 32; i += 4) {
+ HadamardRotation(&s[i], &s[63 - i], false);
+ HadamardRotation(&s[i + 1], &s[63 - i - 1], false);
+ HadamardRotation(&s[i + 2], &s[63 - i - 2], false);
+ HadamardRotation(&s[i + 3], &s[63 - i - 3], false);
+ }
+ //-- end dct 64 stages
+
+ if (is_row) {
+ const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+ for (int idx = 0; idx < 64; idx += 8) {
+ int16x8_t output[8];
+ Transpose8x8(&s[idx], output);
+ for (int i = 0; i < 8; ++i) {
+ output[i] = vqrshlq_s16(output[i], v_row_shift);
+ }
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 64>(dst, step, 0, s);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
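+// As with the DCTs, each ADST size has a full row/column implementation plus
+// DcOnly fast paths for the case where only the DC coefficient is nonzero.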
+template <bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int32x4_t s[8];
+ int16x8_t x[4];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ int16x8_t input[8];
+ LoadSrc<8, 8>(dst, step, 0, input);
+ Transpose4x8To8x4(input, x);
+ } else {
+ LoadSrc<16, 4>(dst, step, 0, x);
+ }
+ } else {
+ LoadSrc<8, 4>(dst, step, 0, x);
+ if (transpose) {
+ Transpose4x4(x, x);
+ }
+ }
+
+ // stage 1.
+ s[5] = vmull_n_s16(vget_low_s16(x[3]), kAdst4Multiplier[1]);
+ s[6] = vmull_n_s16(vget_low_s16(x[3]), kAdst4Multiplier[3]);
+
+ // stage 2.
+ const int32x4_t a7 = vsubl_s16(vget_low_s16(x[0]), vget_low_s16(x[2]));
+ const int32x4_t b7 = vaddw_s16(a7, vget_low_s16(x[3]));
+
+ // stage 3.
+ s[0] = vmull_n_s16(vget_low_s16(x[0]), kAdst4Multiplier[0]);
+ s[1] = vmull_n_s16(vget_low_s16(x[0]), kAdst4Multiplier[1]);
+ // s[0] = s[0] + s[3]
+ s[0] = vmlal_n_s16(s[0], vget_low_s16(x[2]), kAdst4Multiplier[3]);
+ // s[1] = s[1] - s[4]
+ s[1] = vmlsl_n_s16(s[1], vget_low_s16(x[2]), kAdst4Multiplier[0]);
+
+ s[3] = vmull_n_s16(vget_low_s16(x[1]), kAdst4Multiplier[2]);
+ s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]);
+
+ // stage 4.
+ s[0] = vaddq_s32(s[0], s[5]);
+ s[1] = vsubq_s32(s[1], s[6]);
+
+ // stages 5 and 6.
+ const int32x4_t x0 = vaddq_s32(s[0], s[3]);
+ const int32x4_t x1 = vaddq_s32(s[1], s[3]);
+ const int32x4_t x3_a = vaddq_s32(s[0], s[1]);
+ const int32x4_t x3 = vsubq_s32(x3_a, s[3]);
+ const int16x4_t dst_0 = vqrshrn_n_s32(x0, 12);
+ const int16x4_t dst_1 = vqrshrn_n_s32(x1, 12);
+ const int16x4_t dst_2 = vqrshrn_n_s32(s[2], 12);
+ const int16x4_t dst_3 = vqrshrn_n_s32(x3, 12);
+
+ x[0] = vcombine_s16(dst_0, dst_0);
+ x[1] = vcombine_s16(dst_1, dst_1);
+ x[2] = vcombine_s16(dst_2, dst_2);
+ x[3] = vcombine_s16(dst_3, dst_3);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ int16x8_t output[8];
+ Transpose8x4To4x8(x, output);
+ StoreDst<8, 8>(dst, step, 0, output);
+ } else {
+ StoreDst<16, 4>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ Transpose4x4(x, x);
+ }
+ StoreDst<8, 4>(dst, step, 0, x);
+ }
+}
+
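+// Laid out as {k0, k1, k2, k1} (see the product comment below) so that a
+// single vmull_s16 against the broadcast DC value produces all four products
+// at once.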
+alignas(8) constexpr int16_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344,
+ 2482};
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int32x4_t s[2];
+
+ const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+ const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
+ const int16x4_t v_src_round =
+ vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+ const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0);
+ const int16x4_t kAdst4DcOnlyMultipliers = vld1_s16(kAdst4DcOnlyMultiplier);
+ s[1] = vdupq_n_s32(0);
+
+ // s0*k0 s0*k1 s0*k2 s0*k1
+ s[0] = vmull_s16(kAdst4DcOnlyMultipliers, v_src);
+ // 0 0 0 s0*k0
+ s[1] = vextq_s32(s[1], s[0], 1);
+
+ const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+ const int16x4_t dst_0 = vqrshrn_n_s32(x3, 12);
+
+  // vqrshl_s16 will shift right if the shift value is negative.
+ vst1_s16(dst, vqrshl_s16(dst_0, vdup_n_s16(-row_shift)));
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int32x4_t s[4];
+
+ int i = 0;
+ do {
+ const int16x4_t v_src = vld1_s16(&dst[i]);
+
+ s[0] = vmull_n_s16(v_src, kAdst4Multiplier[0]);
+ s[1] = vmull_n_s16(v_src, kAdst4Multiplier[1]);
+ s[2] = vmull_n_s16(v_src, kAdst4Multiplier[2]);
+
+ const int32x4_t x0 = s[0];
+ const int32x4_t x1 = s[1];
+ const int32x4_t x2 = s[2];
+ const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+ const int16x4_t dst_0 = vqrshrn_n_s32(x0, 12);
+ const int16x4_t dst_1 = vqrshrn_n_s32(x1, 12);
+ const int16x4_t dst_2 = vqrshrn_n_s32(x2, 12);
+ const int16x4_t dst_3 = vqrshrn_n_s32(x3, 12);
+
+ vst1_s16(&dst[i], dst_0);
+ vst1_s16(&dst[i + width * 1], dst_1);
+ vst1_s16(&dst[i + width * 2], dst_2);
+ vst1_s16(&dst[i + width * 3], dst_3);
+
+ i += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int16x8_t s[8], x[8];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ int16x8_t input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8(input, x);
+ } else {
+ LoadSrc<8, 8>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ LoadSrc<16, 8>(dst, step, 0, x);
+ dsp::Transpose8x8(x);
+ } else {
+ LoadSrc<16, 8>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1.
+ s[0] = x[7];
+ s[1] = x[0];
+ s[2] = x[5];
+ s[3] = x[2];
+ s[4] = x[3];
+ s[5] = x[4];
+ s[6] = x[1];
+ s[7] = x[6];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 60 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 60 - 16, true);
+ butterfly_rotation(&s[4], &s[5], 60 - 32, true);
+ butterfly_rotation(&s[6], &s[7], 60 - 48, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[4], false);
+ HadamardRotation(&s[1], &s[5], false);
+ HadamardRotation(&s[2], &s[6], false);
+ HadamardRotation(&s[3], &s[7], false);
+
+ // stage 4.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[2], false);
+ HadamardRotation(&s[4], &s[6], false);
+ HadamardRotation(&s[1], &s[3], false);
+ HadamardRotation(&s[5], &s[7], false);
+
+ // stage 6.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ x[0] = s[0];
+ x[1] = vqnegq_s16(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s16(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s16(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s16(s[1]);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ int16x8_t output[4];
+ Transpose4x8To8x4(x, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ } else {
+ StoreDst<8, 8>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ dsp::Transpose8x8(x);
+ StoreDst<16, 8>(dst, step, 0, x);
+ } else {
+ StoreDst<16, 8>(dst, step, 0, x);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int16x8_t s[8];
+
+ const int16x8_t v_src = vdupq_n_s16(dst[0]);
+ const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
+ const int16x8_t v_src_round =
+ vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
+ // stage 1.
+ s[1] = vbslq_s16(v_mask, v_src_round, v_src);
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
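+  // s[0] is deliberately left uninitialized; per its name,
+  // ButterflyRotation_FirstIsZero assumes its first operand is zero and
+  // derives both outputs from s[1] alone.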
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
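+  // With every other input zero in this DC-only path, the HadamardRotation
+  // stages of the full Adst8 collapse to plain copies, which is what stages
+  // 3 and 5 do here.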
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ int16x8_t x[8];
+ x[0] = s[0];
+ x[1] = vqnegq_s16(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s16(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s16(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s16(s[1]);
+
+ for (int i = 0; i < 8; ++i) {
+ // vqrshlq_s16 will shift right if shift value is negative.
+ x[i] = vqrshlq_s16(x[i], vdupq_n_s16(-row_shift));
+ vst1q_lane_s16(&dst[i], x[i], 0);
+ }
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int16x8_t s[8];
+
+ int i = 0;
+ do {
+ const int16x8_t v_src = vld1q_s16(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ int16x8_t x[8];
+ x[0] = s[0];
+ x[1] = vqnegq_s16(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s16(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s16(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s16(s[1]);
+
+ for (int j = 0; j < 8; ++j) {
+ vst1_s16(&dst[j * width], vget_low_s16(x[j]));
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int16x8_t s[16], x[16];
+
+ if (stage_is_rectangular) {
+ if (is_row) {
+ int16x8_t input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8(input, x);
+ LoadSrc<16, 4>(dst, step, 8, input);
+ Transpose8x4To4x8(input, &x[8]);
+ } else {
+ LoadSrc<8, 16>(dst, step, 0, x);
+ }
+ } else {
+ if (is_row) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+ dsp::Transpose8x8(&x[idx]);
+ }
+ } else {
+ LoadSrc<16, 16>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1.
+ s[0] = x[15];
+ s[1] = x[0];
+ s[2] = x[13];
+ s[3] = x[2];
+ s[4] = x[11];
+ s[5] = x[4];
+ s[6] = x[9];
+ s[7] = x[6];
+ s[8] = x[7];
+ s[9] = x[8];
+ s[10] = x[5];
+ s[11] = x[10];
+ s[12] = x[3];
+ s[13] = x[12];
+ s[14] = x[1];
+ s[15] = x[14];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 62 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 62 - 8, true);
+ butterfly_rotation(&s[4], &s[5], 62 - 16, true);
+ butterfly_rotation(&s[6], &s[7], 62 - 24, true);
+ butterfly_rotation(&s[8], &s[9], 62 - 32, true);
+ butterfly_rotation(&s[10], &s[11], 62 - 40, true);
+ butterfly_rotation(&s[12], &s[13], 62 - 48, true);
+ butterfly_rotation(&s[14], &s[15], 62 - 56, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[8], false);
+ HadamardRotation(&s[1], &s[9], false);
+ HadamardRotation(&s[2], &s[10], false);
+ HadamardRotation(&s[3], &s[11], false);
+ HadamardRotation(&s[4], &s[12], false);
+ HadamardRotation(&s[5], &s[13], false);
+ HadamardRotation(&s[6], &s[14], false);
+ HadamardRotation(&s[7], &s[15], false);
+
+ // stage 4.
+ butterfly_rotation(&s[8], &s[9], 56 - 0, true);
+ butterfly_rotation(&s[13], &s[12], 8 + 0, true);
+ butterfly_rotation(&s[10], &s[11], 56 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 8 + 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[4], false);
+ HadamardRotation(&s[8], &s[12], false);
+ HadamardRotation(&s[1], &s[5], false);
+ HadamardRotation(&s[9], &s[13], false);
+ HadamardRotation(&s[2], &s[6], false);
+ HadamardRotation(&s[10], &s[14], false);
+ HadamardRotation(&s[3], &s[7], false);
+ HadamardRotation(&s[11], &s[15], false);
+
+ // stage 6.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[12], &s[13], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 48 - 32, true);
+
+ // stage 7.
+ HadamardRotation(&s[0], &s[2], false);
+ HadamardRotation(&s[4], &s[6], false);
+ HadamardRotation(&s[8], &s[10], false);
+ HadamardRotation(&s[12], &s[14], false);
+ HadamardRotation(&s[1], &s[3], false);
+ HadamardRotation(&s[5], &s[7], false);
+ HadamardRotation(&s[9], &s[11], false);
+ HadamardRotation(&s[13], &s[15], false);
+
+ // stage 8.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+ butterfly_rotation(&s[10], &s[11], 32, true);
+ butterfly_rotation(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ x[0] = s[0];
+ x[1] = vqnegq_s16(s[8]);
+ x[2] = s[12];
+ x[3] = vqnegq_s16(s[4]);
+ x[4] = s[6];
+ x[5] = vqnegq_s16(s[14]);
+ x[6] = s[10];
+ x[7] = vqnegq_s16(s[2]);
+ x[8] = s[3];
+ x[9] = vqnegq_s16(s[11]);
+ x[10] = s[15];
+ x[11] = vqnegq_s16(s[7]);
+ x[12] = s[5];
+ x[13] = vqnegq_s16(s[13]);
+ x[14] = s[9];
+ x[15] = vqnegq_s16(s[1]);
+
+ if (stage_is_rectangular) {
+ if (is_row) {
+ const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+ int16x8_t output[4];
+ Transpose4x8To8x4(x, output);
+ for (int i = 0; i < 4; ++i) {
+ output[i] = vqrshlq_s16(output[i], v_row_shift);
+ }
+ StoreDst<16, 4>(dst, step, 0, output);
+ Transpose4x8To8x4(&x[8], output);
+ for (int i = 0; i < 4; ++i) {
+ output[i] = vqrshlq_s16(output[i], v_row_shift);
+ }
+ StoreDst<16, 4>(dst, step, 8, output);
+ } else {
+ StoreDst<8, 16>(dst, step, 0, x);
+ }
+ } else {
+ if (is_row) {
+ const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+ for (int idx = 0; idx < 16; idx += 8) {
+ int16x8_t output[8];
+ Transpose8x8(&x[idx], output);
+ for (int i = 0; i < 8; ++i) {
+ output[i] = vqrshlq_s16(output[i], v_row_shift);
+ }
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 16>(dst, step, 0, x);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(int16x8_t* s, int16x8_t* x) {
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);
+
+ // stage 3.
+ s[8] = s[0];
+ s[9] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[8], &s[9], 56, true);
+
+ // stage 5.
+ s[4] = s[0];
+ s[12] = s[8];
+ s[5] = s[1];
+ s[13] = s[9];
+
+ // stage 6.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+ ButterflyRotation_4(&s[12], &s[13], 48, true);
+
+ // stage 7.
+ s[2] = s[0];
+ s[6] = s[4];
+ s[10] = s[8];
+ s[14] = s[12];
+ s[3] = s[1];
+ s[7] = s[5];
+ s[11] = s[9];
+ s[15] = s[13];
+
+ // stage 8.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+ ButterflyRotation_4(&s[10], &s[11], 32, true);
+ ButterflyRotation_4(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ x[0] = s[0];
+ x[1] = vqnegq_s16(s[8]);
+ x[2] = s[12];
+ x[3] = vqnegq_s16(s[4]);
+ x[4] = s[6];
+ x[5] = vqnegq_s16(s[14]);
+ x[6] = s[10];
+ x[7] = vqnegq_s16(s[2]);
+ x[8] = s[3];
+ x[9] = vqnegq_s16(s[11]);
+ x[10] = s[15];
+ x[11] = vqnegq_s16(s[7]);
+ x[12] = s[5];
+ x[13] = vqnegq_s16(s[13]);
+ x[14] = s[9];
+ x[15] = vqnegq_s16(s[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int16x8_t s[16];
+ int16x8_t x[16];
+
+ const int16x8_t v_src = vdupq_n_s16(dst[0]);
+ const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
+ const int16x8_t v_src_round =
+ vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
+ // stage 1.
+ s[1] = vbslq_s16(v_mask, v_src_round, v_src);
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int i = 0; i < 16; ++i) {
+ // vqrshlq_s16 will shift right if shift value is negative.
+ x[i] = vqrshlq_s16(x[i], vdupq_n_s16(-row_shift));
+ vst1q_lane_s16(&dst[i], x[i], 0);
+ }
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
+ int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int i = 0;
+ do {
+ int16x8_t s[16];
+ int16x8_t x[16];
+ const int16x8_t v_src = vld1q_s16(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int j = 0; j < 16; ++j) {
+ vst1_s16(&dst[j * width], vget_low_s16(x[j]));
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+//------------------------------------------------------------------------------
+// Identity Transforms.
+
+template <bool is_row_shift>
+LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ if (is_row_shift) {
+ const int shift = 1;
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int16x4_t v_multiplier = vdup_n_s16(kIdentity4Multiplier);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
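+    // With shift == 1, v_dual_round is 2048 + 4096: the rounding term for
+    // the Q12 multiply plus the rounding term for the row shift, so the
+    // single shift by -(12 + shift) below completes both steps at once.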
+ for (int i = 0; i < 4; i += 2) {
+ const int16x8_t v_src = vld1q_s16(&dst[i * step]);
+ const int32x4_t v_src_mult_lo =
+ vmlal_s16(v_dual_round, vget_low_s16(v_src), v_multiplier);
+ const int32x4_t v_src_mult_hi =
+ vmlal_s16(v_dual_round, vget_high_s16(v_src), v_multiplier);
+ const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+ const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift);
+ vst1q_s16(&dst[i * step],
+ vcombine_s16(vqmovn_s32(shift_lo), vqmovn_s32(shift_hi)));
+ }
+ } else {
+ for (int i = 0; i < 4; i += 2) {
+ const int16x8_t v_src = vld1q_s16(&dst[i * step]);
+ const int16x8_t a =
+ vqrdmulhq_n_s16(v_src, kIdentity4MultiplierFraction << 3);
+ const int16x8_t b = vqaddq_s16(v_src, a);
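+      // vqrdmulhq_n_s16 with the fraction shifted left by 3 produces
+      // (x * kIdentity4MultiplierFraction + 2048) >> 12, so b = x + a applies
+      // the identity4 scale of (1 + fraction / 4096) without leaving 16 bits.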
+ vst1q_s16(&dst[i * step], b);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int tx_height) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+ const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
+ const int16x4_t v_src_round =
+ vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+ const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0);
+ const int shift = tx_height < 16 ? 0 : 1;
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int16x4_t v_multiplier = vdup_n_s16(kIdentity4Multiplier);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+ const int32x4_t v_src_mult_lo = vmlal_s16(v_dual_round, v_src, v_multiplier);
+ const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift);
+ vst1_lane_s16(dst, vqmovn_s32(dst_0), 0);
+ return true;
+}
+
+template <int identity_size>
+LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source) {
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+
+ if (identity_size < 32) {
+ if (tx_width == 4) {
+ uint8x8_t frame_data = vdup_n_u8(0);
+ int i = 0;
+ do {
+ const int16x4_t v_src = vld1_s16(&source[i * tx_width]);
+
+ int16x4_t v_dst_i;
+ if (identity_size == 4) {
+ const int16x4_t v_src_fraction =
+ vqrdmulh_n_s16(v_src, kIdentity4MultiplierFraction << 3);
+ v_dst_i = vqadd_s16(v_src, v_src_fraction);
+ } else if (identity_size == 8) {
+ v_dst_i = vqadd_s16(v_src, v_src);
+ } else { // identity_size == 16
+ const int16x4_t v_src_mult =
+ vqrdmulh_n_s16(v_src, kIdentity4MultiplierFraction << 4);
+ const int16x4_t v_srcx2 = vqadd_s16(v_src, v_src);
+ v_dst_i = vqadd_s16(v_srcx2, v_src_mult);
+ }
+
+ frame_data = Load4<0>(dst, frame_data);
+ const int16x4_t a = vrshr_n_s16(v_dst_i, 4);
+ const uint16x8_t b =
+ vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(a, a)), frame_data);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+ StoreLo4(dst, d);
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const int16x8_t v_src = vld1q_s16(&source[row + j]);
+
+ int16x8_t v_dst_i;
+ if (identity_size == 4) {
+ const int16x8_t v_src_fraction =
+ vqrdmulhq_n_s16(v_src, kIdentity4MultiplierFraction << 3);
+ v_dst_i = vqaddq_s16(v_src, v_src_fraction);
+ } else if (identity_size == 8) {
+ v_dst_i = vqaddq_s16(v_src, v_src);
+ } else { // identity_size == 16
+ const int16x8_t v_src_mult =
+ vqrdmulhq_n_s16(v_src, kIdentity4MultiplierFraction << 4);
+ const int16x8_t v_srcx2 = vqaddq_s16(v_src, v_src);
+ v_dst_i = vqaddq_s16(v_src_mult, v_srcx2);
+ }
+
+ const uint8x8_t frame_data = vld1_u8(dst + j);
+ const int16x8_t a = vrshrq_n_s16(v_dst_i, 4);
+ const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+ vst1_u8(dst + j, d);
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const int16x8_t v_dst_i = vld1q_s16(&source[row + j]);
+ const uint8x8_t frame_data = vld1_u8(dst + j);
+ const int16x8_t a = vrshrq_n_s16(v_dst_i, 2);
+ const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+ vst1_u8(dst + j, d);
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source) {
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+
+ if (tx_width == 4) {
+ uint8x8_t frame_data = vdup_n_u8(0);
+ int i = 0;
+ do {
+ const int16x4_t v_src = vld1_s16(&source[i * tx_width]);
+ const int16x4_t v_src_mult =
+ vqrdmulh_n_s16(v_src, kIdentity4MultiplierFraction << 3);
+ const int16x4_t v_dst_row = vqadd_s16(v_src, v_src_mult);
+ const int16x4_t v_src_mult2 =
+ vqrdmulh_n_s16(v_dst_row, kIdentity4MultiplierFraction << 3);
+ const int16x4_t v_dst_col = vqadd_s16(v_dst_row, v_src_mult2);
+ frame_data = Load4<0>(dst, frame_data);
+ const int16x4_t a = vrshr_n_s16(v_dst_col, 4);
+ const uint16x8_t b =
+ vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(a, a)), frame_data);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+ StoreLo4(dst, d);
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const int16x8_t v_src = vld1q_s16(&source[row + j]);
+ const int16x8_t v_src_round =
+ vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
+ const int16x8_t v_dst_row = vqaddq_s16(v_src_round, v_src_round);
+ const int16x8_t v_src_mult2 =
+ vqrdmulhq_n_s16(v_dst_row, kIdentity4MultiplierFraction << 3);
+ const int16x8_t v_dst_col = vqaddq_s16(v_dst_row, v_src_mult2);
+ const uint8x8_t frame_data = vld1_u8(dst + j);
+ const int16x8_t a = vrshrq_n_s16(v_dst_col, 4);
+ const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+ vst1_u8(dst + j, d);
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height equal to 32 can be simplified from
+  // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
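+  // vrshrq_n_s16(v, 1) computes exactly (A + 1) >> 1, so the rounding shift
+  // below implements the simplified form directly.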
+ for (int i = 0; i < 4; ++i) {
+ const int16x8_t v_src = vld1q_s16(&dst[i * step]);
+ const int16x8_t a = vrshrq_n_s16(v_src, 1);
+ vst1q_s16(&dst[i * step], a);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ for (int i = 0; i < 4; ++i) {
+ const int16x8_t v_src = vld1q_s16(&dst[i * step]);
+ // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
+ // saturating add here is ok.
+ const int16x8_t v_srcx2 = vqaddq_s16(v_src, v_src);
+ vst1q_s16(&dst[i * step], v_srcx2);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+ const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
+ const int16x4_t v_src_round =
+ vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+ const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0);
+ const int32x4_t v_srcx2 = vaddl_s16(v_src, v_src);
+ const int32x4_t dst_0 = vqrshlq_s32(v_srcx2, vdupq_n_s32(-row_shift));
+ vst1_lane_s16(dst, vqmovn_s32(dst_0), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, int32_t step,
+ int shift) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ const int16x8_t v_src = vld1q_s16(&dst[i * step + j * 8]);
+ const int32x4_t v_src_mult_lo =
+ vmlal_n_s16(v_dual_round, vget_low_s16(v_src), kIdentity16Multiplier);
+ const int32x4_t v_src_mult_hi = vmlal_n_s16(
+ v_dual_round, vget_high_s16(v_src), kIdentity16Multiplier);
+ const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+ const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift);
+ vst1q_s16(&dst[i * step + j * 8],
+ vcombine_s16(vqmovn_s32(shift_lo), vqmovn_s32(shift_hi)));
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+ const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
+ const int16x4_t v_src_round =
+ vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+ const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0);
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int16x4_t v_multiplier = vdup_n_s16(kIdentity16Multiplier);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+ const int32x4_t v_src_mult_lo =
+      vmlal_s16(v_dual_round, v_src, v_multiplier);
+ const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift);
+ vst1_lane_s16(dst, vqmovn_s32(dst_0), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity32Row16_NEON(void* dest,
+ const int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ // When combining the identity32 multiplier with the row shift, the
+ // calculation for tx_height equal to 16 can be simplified from
+  // (((A * 4) + 1) >> 1) to (A * 2).
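+  // (A * 4) is even, so the +1 rounding bit is dropped by the shift and the
+  // result is just 2 * A; the saturating add below produces that doubling
+  // with 16-bit clamping.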
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 32; j += 8) {
+ const int16x8_t v_src = vld1q_s16(&dst[i * step + j]);
+ // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
+ // saturating add here is ok.
+ const int16x8_t v_dst_i = vqaddq_s16(v_src, v_src);
+ vst1q_s16(&dst[i * step + j], v_dst_i);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest,
+ int adjusted_tx_height) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+ const int16x4_t v_src = vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+ // When combining the identity32 multiplier with the row shift, the
+ // calculation for tx_height equal to 16 can be simplified from
+  // (((A * 4) + 1) >> 1) to (A * 2).
+ const int16x4_t v_dst_0 = vqadd_s16(v_src, v_src);
+ vst1_lane_s16(dst, v_dst_0, 0);
+ return true;
+}
+
+//------------------------------------------------------------------------------
+// Walsh Hadamard Transform.
+
+// Transposes a 4x4 matrix and then permutes the rows of the transposed matrix
+// for the WHT. The input matrix is in two "wide" int16x8_t variables. The
+// output matrix is in four int16x4_t variables.
+//
+// Input:
+// in[0]: 00 01 02 03 10 11 12 13
+// in[1]: 20 21 22 23 30 31 32 33
+// Output:
+// out[0]: 00 10 20 30
+// out[1]: 03 13 23 33
+// out[2]: 01 11 21 31
+// out[3]: 02 12 22 32
+LIBGAV1_ALWAYS_INLINE void TransposeAndPermute4x4WideInput(
+ const int16x8_t in[2], int16x4_t out[4]) {
+ // Swap 32 bit elements. Goes from:
+ // in[0]: 00 01 02 03 10 11 12 13
+ // in[1]: 20 21 22 23 30 31 32 33
+ // to:
+ // b0.val[0]: 00 01 20 21 10 11 30 31
+ // b0.val[1]: 02 03 22 23 12 13 32 33
+
+ const int32x4x2_t b0 =
+ vtrnq_s32(vreinterpretq_s32_s16(in[0]), vreinterpretq_s32_s16(in[1]));
+
+ // Swap 16 bit elements. Goes from:
+ // vget_low_s32(b0.val[0]): 00 01 20 21
+ // vget_high_s32(b0.val[0]): 10 11 30 31
+ // vget_low_s32(b0.val[1]): 02 03 22 23
+ // vget_high_s32(b0.val[1]): 12 13 32 33
+ // to:
+ // c0.val[0]: 00 10 20 30
+  // c0.val[1]: 01 11 21 31
+ // c1.val[0]: 02 12 22 32
+ // c1.val[1]: 03 13 23 33
+
+ const int16x4x2_t c0 =
+ vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[0])),
+ vreinterpret_s16_s32(vget_high_s32(b0.val[0])));
+ const int16x4x2_t c1 =
+ vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[1])),
+ vreinterpret_s16_s32(vget_high_s32(b0.val[1])));
+
+ out[0] = c0.val[0];
+ out[1] = c1.val[1];
+ out[2] = c0.val[1];
+ out[3] = c1.val[0];
+}
+
+// Process 4 wht4 rows and columns.
+LIBGAV1_ALWAYS_INLINE void Wht4_NEON(uint8_t* dst, const int dst_stride,
+ const void* source,
+ const int adjusted_tx_height) {
+ const auto* const src = static_cast<const int16_t*>(source);
+ int16x4_t s[4];
+
+ if (adjusted_tx_height == 1) {
+ // Special case: only src[0] is nonzero.
+ // src[0] 0 0 0
+ // 0 0 0 0
+ // 0 0 0 0
+ // 0 0 0 0
+ //
+ // After the row and column transforms are applied, we have:
+ // f h h h
+ // g i i i
+ // g i i i
+ // g i i i
+ // where f, g, h, i are computed as follows.
+ int16_t f = (src[0] >> 2) - (src[0] >> 3);
+ const int16_t g = f >> 1;
+ f = f - (f >> 1);
+ const int16_t h = (src[0] >> 3) - (src[0] >> 4);
+ const int16_t i = (src[0] >> 4);
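+    // These follow from pushing (src[0] >> 2, 0, 0, 0) plus three zero rows
+    // through the row and column passes below: f is the top-left output, g
+    // fills the rest of column 0, h the rest of row 0, and i the inner 3x3.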
+ s[0] = vdup_n_s16(h);
+ s[0] = vset_lane_s16(f, s[0], 0);
+ s[1] = vdup_n_s16(i);
+ s[1] = vset_lane_s16(g, s[1], 0);
+ s[2] = s[3] = s[1];
+ } else {
+ // Load the 4x4 source in transposed form.
+ int16x4x4_t columns = vld4_s16(src);
+ // Shift right and permute the columns for the WHT.
+ s[0] = vshr_n_s16(columns.val[0], 2);
+ s[2] = vshr_n_s16(columns.val[1], 2);
+ s[3] = vshr_n_s16(columns.val[2], 2);
+ s[1] = vshr_n_s16(columns.val[3], 2);
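+    // The column order (0, 3, 1, 2) matches the permutation produced by
+    // TransposeAndPermute4x4WideInput, so the row and column passes share
+    // the same butterfly ordering.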
+
+ // Row transforms.
+ s[0] = vadd_s16(s[0], s[2]);
+ s[3] = vsub_s16(s[3], s[1]);
+ int16x4_t e = vhsub_s16(s[0], s[3]); // e = (s[0] - s[3]) >> 1
+ s[1] = vsub_s16(e, s[1]);
+ s[2] = vsub_s16(e, s[2]);
+ s[0] = vsub_s16(s[0], s[1]);
+ s[3] = vadd_s16(s[3], s[2]);
+
+ int16x8_t x[2];
+ x[0] = vcombine_s16(s[0], s[1]);
+ x[1] = vcombine_s16(s[2], s[3]);
+ TransposeAndPermute4x4WideInput(x, s);
+
+ // Column transforms.
+ s[0] = vadd_s16(s[0], s[2]);
+ s[3] = vsub_s16(s[3], s[1]);
+ e = vhsub_s16(s[0], s[3]); // e = (s[0] - s[3]) >> 1
+ s[1] = vsub_s16(e, s[1]);
+ s[2] = vsub_s16(e, s[2]);
+ s[0] = vsub_s16(s[0], s[1]);
+ s[3] = vadd_s16(s[3], s[2]);
+ }
+
+ // Store to frame.
+ uint8x8_t frame_data = vdup_n_u8(0);
+ for (int row = 0; row < 4; row += 2) {
+ frame_data = Load4<0>(dst, frame_data);
+ frame_data = Load4<1>(dst + dst_stride, frame_data);
+ const int16x8_t residual = vcombine_s16(s[row], s[row + 1]);
+ const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(residual), frame_data);
+ frame_data = vqmovun_s16(vreinterpretq_s16_u16(b));
+ StoreLo4(dst, frame_data);
+ dst += dst_stride;
+ StoreHi4(dst, frame_data);
+ dst += dst_stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loops
+
+template <int tx_height>
+LIBGAV1_ALWAYS_INLINE void FlipColumns(int16_t* source, int tx_width) {
+ if (tx_width >= 16) {
+ int i = 0;
+ do {
+ const int16x8_t a = vld1q_s16(&source[i]);
+ const int16x8_t b = vld1q_s16(&source[i + 8]);
+ const int16x8_t c = vrev64q_s16(a);
+ const int16x8_t d = vrev64q_s16(b);
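+      // vrev64q reverses within each 64-bit half; recombining high-then-low
+      // completes the 8-lane reverse. Storing reversed |b| to the first slot
+      // and reversed |a| to the second mirrors all 16 columns.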
+ vst1q_s16(&source[i], vcombine_s16(vget_high_s16(d), vget_low_s16(d)));
+ vst1q_s16(&source[i + 8],
+ vcombine_s16(vget_high_s16(c), vget_low_s16(c)));
+ i += 16;
+ } while (i < tx_width * tx_height);
+ } else if (tx_width == 8) {
+ for (int i = 0; i < 8 * tx_height; i += 8) {
+ const int16x8_t a = vld1q_s16(&source[i]);
+ const int16x8_t b = vrev64q_s16(a);
+ vst1q_s16(&source[i], vcombine_s16(vget_high_s16(b), vget_low_s16(b)));
+ }
+ } else {
+ // Process two rows per iteration.
+ for (int i = 0; i < 4 * tx_height; i += 8) {
+ const int16x8_t a = vld1q_s16(&source[i]);
+ vst1q_s16(&source[i], vrev64q_s16(a));
+ }
+ }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void ApplyRounding(int16_t* source, int num_rows) {
+ if (tx_width == 4) {
+ // Process two rows per iteration.
+ int i = 0;
+ do {
+ const int16x8_t a = vld1q_s16(&source[i]);
+ const int16x8_t b = vqrdmulhq_n_s16(a, kTransformRowMultiplier << 3);
+ vst1q_s16(&source[i], b);
+ i += 8;
+ } while (i < tx_width * num_rows);
+ } else {
+ int i = 0;
+ do {
+ // The last 32 values of every row are always zero if the |tx_width| is
+ // 64.
+ const int non_zero_width = (tx_width < 64) ? tx_width : 32;
+ int j = 0;
+ do {
+ const int16x8_t a = vld1q_s16(&source[i * tx_width + j]);
+ const int16x8_t b = vqrdmulhq_n_s16(a, kTransformRowMultiplier << 3);
+ vst1q_s16(&source[i * tx_width + j], b);
+ j += 8;
+ } while (j < non_zero_width);
+ } while (++i < num_rows);
+ }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void RowShift(int16_t* source, int num_rows,
+ int row_shift) {
+ // vqrshlq_s16 will shift right if shift value is negative.
+ row_shift = -row_shift;
+
+ if (tx_width == 4) {
+ // Process two rows per iteration.
+ int i = 0;
+ do {
+ const int16x8_t residual = vld1q_s16(&source[i]);
+ vst1q_s16(&source[i], vqrshlq_s16(residual, vdupq_n_s16(row_shift)));
+ i += 8;
+ } while (i < tx_width * num_rows);
+ } else {
+ int i = 0;
+ do {
+ for (int j = 0; j < tx_width; j += 8) {
+ const int16x8_t residual = vld1q_s16(&source[i * tx_width + j]);
+ const int16x8_t residual_shifted =
+ vqrshlq_s16(residual, vdupq_n_s16(row_shift));
+ vst1q_s16(&source[i * tx_width + j], residual_shifted);
+ }
+ } while (++i < num_rows);
+ }
+}
+
+template <int tx_height, bool enable_flip_rows = false>
+LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int16_t* source, TransformType tx_type) {
+ const bool flip_rows =
+ enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
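+  // Every path below applies a rounding shift of 4 to the column-transform
+  // output and clamps the frame sum to [0, 255] with vqmovun before writing
+  // it back.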
+
+ // Enable for 4x4, 4x8, 4x16
+ if (tx_height < 32 && tx_width == 4) {
+ uint8x8_t frame_data = vdup_n_u8(0);
+ for (int i = 0; i < tx_height; ++i) {
+ const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
+ const int16x4_t residual = vld1_s16(&source[row]);
+ frame_data = Load4<0>(dst, frame_data);
+ const int16x4_t a = vrshr_n_s16(residual, 4);
+ const uint16x8_t b =
+ vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(a, a)), frame_data);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+ StoreLo4(dst, d);
+ dst += stride;
+ }
+ // Enable for 8x4, 8x8, 8x16, 8x32
+ } else if (tx_height < 64 && tx_width == 8) {
+ for (int i = 0; i < tx_height; ++i) {
+ const int row = flip_rows ? (tx_height - i - 1) * 8 : i * 8;
+ const int16x8_t residual = vld1q_s16(&source[row]);
+ const uint8x8_t frame_data = vld1_u8(dst);
+ const int16x8_t a = vrshrq_n_s16(residual, 4);
+ const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+ vst1_u8(dst, d);
+ dst += stride;
+ }
+ // Remaining widths >= 16.
+ } else {
+ for (int i = 0; i < tx_height; ++i) {
+ const int y = start_y + i;
+ const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
+ int j = 0;
+ do {
+ const int x = start_x + j;
+ const int16x8_t residual = vld1q_s16(&source[row + j]);
+ const int16x8_t residual_hi = vld1q_s16(&source[row + j + 8]);
+ const uint8x16_t frame_data = vld1q_u8(frame[y] + x);
+ const int16x8_t a = vrshrq_n_s16(residual, 4);
+ const int16x8_t a_hi = vrshrq_n_s16(residual_hi, 4);
+ const uint16x8_t b =
+ vaddw_u8(vreinterpretq_u16_s16(a), vget_low_u8(frame_data));
+ const uint16x8_t b_hi =
+ vaddw_u8(vreinterpretq_u16_s16(a_hi), vget_high_u8(frame_data));
+ vst1q_u8(frame[y] + x,
+ vcombine_u8(vqmovun_s16(vreinterpretq_s16_u16(b)),
+ vqmovun_s16(vreinterpretq_s16_u16(b_hi))));
+ j += 16;
+ } while (j < tx_width);
+ }
+ }
+}
+
+void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+ const int row_shift = (tx_height == 16);
+
+ if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height == 4) {
+ // Process 4 1d dct4 rows in parallel.
+ Dct4_NEON<ButterflyRotation_4, false>(src, /*step=*/4, /*transpose=*/true);
+ } else {
+ // Process 8 1d dct4 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Dct4_NEON<ButterflyRotation_8, true>(data, /*step=*/4,
+ /*transpose=*/true);
+ data += 32;
+ i -= 8;
+ } while (i != 0);
+ }
+ if (tx_height == 16) {
+ RowShift<4>(src, adjusted_tx_height, 1);
+ }
+}
+
+void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d dct4 columns in parallel.
+ Dct4_NEON<ButterflyRotation_4, false>(src, tx_width, /*transpose=*/false);
+ } else {
+ // Process 8 1d dct4 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct4_NEON<ButterflyRotation_8, true>(data, tx_width,
+ /*transpose=*/false);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ }
+
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<4>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height == 4) {
+ // Process 4 1d dct8 rows in parallel.
+ Dct8_NEON<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
+ } else {
+ // Process 8 1d dct8 rows in parallel per iteration.
+ assert(adjusted_tx_height % 8 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Dct8_NEON<ButterflyRotation_8, false>(data, /*step=*/8,
+ /*transpose=*/true);
+ data += 64;
+ i -= 8;
+ } while (i != 0);
+ }
+ if (row_shift > 0) {
+ RowShift<8>(src, adjusted_tx_height, row_shift);
+ }
+}
+
+void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d dct8 columns in parallel.
+ Dct8_NEON<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ // Process 8 1d dct8 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct8_NEON<ButterflyRotation_8, false>(data, tx_width,
+ /*transpose=*/false);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<8>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height == 4) {
+ // Process 4 1d dct16 rows in parallel.
+ Dct16_NEON<ButterflyRotation_4, true>(src, 16, /*is_row=*/true, row_shift);
+ } else {
+ assert(adjusted_tx_height % 8 == 0);
+ int i = adjusted_tx_height;
+ do {
+ // Process 8 1d dct16 rows in parallel per iteration.
+ Dct16_NEON<ButterflyRotation_8, false>(src, 16, /*is_row=*/true,
+ row_shift);
+ src += 128;
+ i -= 8;
+ } while (i != 0);
+ }
+}
+
+void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d dct16 columns in parallel.
+ Dct16_NEON<ButterflyRotation_4, true>(src, 4, /*is_row=*/false,
+ /*row_shift=*/0);
+ } else {
+ int i = tx_width;
+ auto* data = src;
+ do {
+ // Process 8 1d dct16 columns in parallel per iteration.
+ Dct16_NEON<ButterflyRotation_8, false>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<16>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<32>(src, adjusted_tx_height);
+ }
+ // Process 8 1d dct32 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct32_NEON(&src[i * 32], 32, /*is_row=*/true, row_shift);
+ i += 8;
+ } while (i < adjusted_tx_height);
+}
+
+void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
+ // Process 8 1d dct32 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct32_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<32>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<64>(src, adjusted_tx_height);
+ }
+ // Process 8 1d dct64 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct64_NEON(&src[i * 64], 64, /*is_row=*/true, row_shift);
+ i += 8;
+ } while (i < adjusted_tx_height);
+}
+
+void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
+ // Process 8 1d dct64 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct64_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<64>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const int row_shift = static_cast<int>(tx_height == 16);
+ const bool should_round = (tx_height == 8);
+
+ if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d adst4 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Adst4_NEON<false>(data, /*step=*/4, /*transpose=*/true);
+ data += 16;
+ i -= 4;
+ } while (i != 0);
+
+ if (tx_height == 16) {
+ RowShift<4>(src, adjusted_tx_height, 1);
+ }
+}
+
+void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d adst4 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Adst4_NEON<false>(data, tx_width, /*transpose=*/false);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<4, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height == 4) {
+ // Process 4 1d adst8 rows in parallel.
+ Adst8_NEON<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
+ } else {
+ // Process 8 1d adst8 rows in parallel per iteration.
+ assert(adjusted_tx_height % 8 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Adst8_NEON<ButterflyRotation_8, false>(data, /*step=*/8,
+ /*transpose=*/true);
+ data += 64;
+ i -= 8;
+ } while (i != 0);
+ }
+ if (row_shift > 0) {
+ RowShift<8>(src, adjusted_tx_height, row_shift);
+ }
+}
+
+void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d adst8 columns in parallel.
+ Adst8_NEON<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ // Process 8 1d adst8 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Adst8_NEON<ButterflyRotation_8, false>(data, tx_width,
+ /*transpose=*/false);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<8, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height == 4) {
+ // Process 4 1d adst16 rows in parallel.
+ Adst16_NEON<ButterflyRotation_4, true>(src, 16, /*is_row=*/true, row_shift);
+ } else {
+ assert(adjusted_tx_height % 8 == 0);
+ int i = adjusted_tx_height;
+ do {
+ // Process 8 1d adst16 rows in parallel per iteration.
+ Adst16_NEON<ButterflyRotation_8, false>(src, 16, /*is_row=*/true,
+ row_shift);
+ src += 128;
+ i -= 8;
+ } while (i != 0);
+ }
+}
+
+void Adst16TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d adst16 columns in parallel.
+ Adst16_NEON<ButterflyRotation_4, true>(src, 4, /*is_row=*/false,
+ /*row_shift=*/0);
+ } else {
+ int i = tx_width;
+ auto* data = src;
+ do {
+ // Process 8 1d adst16 columns in parallel per iteration.
+ Adst16_NEON<ButterflyRotation_8, false>(
+ data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<16, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Identity4TransformLoopRow_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize4x4) {
+ return;
+ }
+
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+
+ if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+ if (tx_height < 16) {
+ int i = adjusted_tx_height;
+ do {
+ Identity4_NEON<false>(src, /*step=*/4);
+ src += 16;
+ i -= 4;
+ } while (i != 0);
+ } else {
+ int i = adjusted_tx_height;
+ do {
+ Identity4_NEON<true>(src, /*step=*/4);
+ src += 16;
+ i -= 4;
+ } while (i != 0);
+ }
+}
+
+void Identity4TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ // Special case: Process row calculations during column transform call.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
+ Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+ return;
+ }
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity8TransformLoopRow_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize8x4) {
+ return;
+ }
+
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 16 can be simplified
+  // from (((A * 2) + 1) >> 1) to A.
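+  // (0x18 selects tx_height 8 or 16, the cases where the row pass is a
+  // no-op.)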
+ if ((tx_height & 0x18) != 0) {
+ return;
+ }
+ if (tx_height == 32) {
+ int i = adjusted_tx_height;
+ do {
+ Identity8Row32_NEON(src, /*step=*/8);
+ src += 32;
+ i -= 4;
+ } while (i != 0);
+ return;
+ }
+
+ assert(tx_size == kTransformSize8x4);
+ int i = adjusted_tx_height;
+ do {
+ Identity8Row4_NEON(src, /*step=*/8);
+ src += 32;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity8TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+ int i = adjusted_tx_height;
+ do {
+ Identity16Row_NEON(src, /*step=*/16, kTransformRowShift[tx_size]);
+ src += 64;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity16TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity32TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ const int tx_height = kTransformHeight[tx_size];
+
+ // When combining the identity32 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 32 can be simplified
+  // from (((A * 4) + 2) >> 2) to A.
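+  // (0x28 selects tx_height 8 or 32, the cases where the row pass is a
+  // no-op.)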
+ if ((tx_height & 0x28) != 0) {
+ return;
+ }
+
+ // Process kTransformSize32x16. The src is always rounded before the
+ // identity transform and shifted by 1 afterwards.
+ auto* src = static_cast<int16_t*>(src_buffer);
+ if (Identity32DcOnly(src, adjusted_tx_height)) {
+ return;
+ }
+
+ assert(tx_size == kTransformSize32x16);
+ ApplyRounding<32>(src, adjusted_tx_height);
+ int i = adjusted_tx_height;
+ do {
+ Identity32Row16_NEON(src, /*step=*/32);
+ src += 128;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity32TransformLoopColumn_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ IdentityColumnStoreToFrame<32>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Wht4TransformLoopRow_NEON(TransformType tx_type, TransformSize tx_size,
+ int /*adjusted_tx_height*/, void* /*src_buffer*/,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ assert(tx_type == kTransformTypeDctDct);
+ assert(tx_size == kTransformSize4x4);
+ static_cast<void>(tx_type);
+ static_cast<void>(tx_size);
+ // Do both row and column transforms in the column-transform pass.
+}
+
+void Wht4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ assert(tx_type == kTransformTypeDctDct);
+ assert(tx_size == kTransformSize4x4);
+ static_cast<void>(tx_type);
+ static_cast<void>(tx_size);
+
+ // Process 4 1d wht4 rows and columns in parallel.
+ const auto* src = static_cast<int16_t*>(src_buffer);
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ uint8_t* dst = frame[start_y] + start_x;
+ const int dst_stride = frame.columns();
+ Wht4_NEON(dst, dst_stride, src, adjusted_tx_height);
+}
+
+//------------------------------------------------------------------------------
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ // Maximum transform size for Dct is 64.
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+ Dct4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ Dct4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+ Dct8TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ Dct8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+ Dct16TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ Dct16TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ Dct32TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ Dct32TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ Dct64TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ Dct64TransformLoopColumn_NEON;
+
+ // Maximum transform size for Adst is 16.
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+ Adst4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ Adst4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+ Adst8TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ Adst8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ Adst16TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ Adst16TransformLoopColumn_NEON;
+
+ // Maximum transform size for Identity transform is 32.
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+ Identity4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ Identity4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+ Identity8TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ Identity8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+ Identity16TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ Identity16TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+ Identity32TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+ Identity32TransformLoopColumn_NEON;
+
+ // Maximum transform size for Wht is 4.
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+ Wht4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+ Wht4TransformLoopColumn_NEON;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void InverseTransformInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void InverseTransformInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/inverse_transform_neon.h b/src/dsp/arm/inverse_transform_neon.h
new file mode 100644
index 0000000..af647e8
--- /dev/null
+++ b/src/dsp/arm/inverse_transform_neon.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::inverse_transforms, see the defines below for specifics.
+// This function is not thread-safe.
+void InverseTransformInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_1DTransformSize64_1DTransformDct LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformAdst LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_
diff --git a/src/dsp/arm/loop_filter_neon.cc b/src/dsp/arm/loop_filter_neon.cc
new file mode 100644
index 0000000..146c983
--- /dev/null
+++ b/src/dsp/arm/loop_filter_neon.cc
@@ -0,0 +1,1190 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
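+// Convention used throughout this file: each uint8x8_t holds four p-side
+// pixels in its low 32 bits and the four corresponding q-side pixels in its
+// high 32 bits (e.g. |p0q0| = {p0[0..3], q0[0..3]}). RightShift<32> and
+// Transpose32 move the q half into the p lanes so that both sides can be
+// tested or accumulated with a single 64-bit operation.
+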
+// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
+inline uint8x8_t Hev(const uint8x8_t abd_p0p1_q0q1, const uint8_t thresh) {
+ const uint8x8_t a = vcgt_u8(abd_p0p1_q0q1, vdup_n_u8(thresh));
+ return vorr_u8(a, RightShift<32>(a));
+}
+
+// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
+inline uint8x8_t OuterThreshold(const uint8x8_t p0q0, const uint8x8_t p1q1,
+ const uint8_t outer_thresh) {
+ const uint8x8x2_t a = Interleave32(p0q0, p1q1);
+ const uint8x8_t b = vabd_u8(a.val[0], a.val[1]);
+ const uint8x8_t p0q0_double = vqadd_u8(b, b);
+ const uint8x8_t p1q1_half = RightShift<32>(vshr_n_u8(b, 1));
+ const uint8x8_t c = vqadd_u8(p0q0_double, p1q1_half);
+ return vcle_u8(c, vdup_n_u8(outer_thresh));
+}
+
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+// OuterThreshold()
+inline uint8x8_t NeedsFilter4(const uint8x8_t abd_p0p1_q0q1,
+ const uint8x8_t p0q0, const uint8x8_t p1q1,
+ const uint8_t inner_thresh,
+ const uint8_t outer_thresh) {
+ const uint8x8_t a = vcle_u8(abd_p0p1_q0q1, vdup_n_u8(inner_thresh));
+ const uint8x8_t inner_mask = vand_u8(a, RightShift<32>(a));
+ const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
+ return vand_u8(inner_mask, outer_mask);
+}
+
+inline void Filter4Masks(const uint8x8_t p0q0, const uint8x8_t p1q1,
+ const uint8_t hev_thresh, const uint8_t outer_thresh,
+ const uint8_t inner_thresh, uint8x8_t* const hev_mask,
+ uint8x8_t* const needs_filter4_mask) {
+ const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
+ // This includes cases where NeedsFilter4() is not true and so Filter2() will
+ // not be applied.
+ const uint8x8_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh);
+
+ *needs_filter4_mask =
+ NeedsFilter4(p0p1_q0q1, p0q0, p1q1, inner_thresh, outer_thresh);
+
+ // Filter2() will only be applied if both NeedsFilter4() and Hev() are true.
+ *hev_mask = vand_u8(hev_tmp_mask, *needs_filter4_mask);
+}
+
+// Calculate Filter4() or Filter2() based on |hev_mask|.
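+// A scalar sketch of the update performed below, following the in-code
+// formulas (Filter2() is the Hev() path, where only p0/q0 change):
+//   a  = 3 * (q0 - p0) + Clip3(p1 - q1)   // the Clip3 term only when hev
+//   a1 = Clip3(a + 4) >> 3;  a2 = Clip3(a + 3) >> 3;  a3 = (a1 + 1) >> 1
+//   p0 += a2;  q0 -= a1;  when !hev also: p1 += a3;  q1 -= a3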
+inline void Filter4(const uint8x8_t q0p1, const uint8x8_t p0q1,
+ const uint8x8_t hev_mask, uint8x8_t* const p1q1_result,
+ uint8x8_t* const p0q0_result) {
+ const int16x4_t zero = vdup_n_s16(0);
+
+ // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+ const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubl_u8(q0p1, p0q1));
+ const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
+
+ // If this is for Filter2() then include |p1mq1|. Otherwise zero it.
+ const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
+ const int8x8_t p1mq1_saturated = vqmovn_s16(vcombine_s16(p1mq1, zero));
+ const int8x8_t hev_option =
+ vand_s8(vreinterpret_s8_u8(hev_mask), p1mq1_saturated);
+
+ const int16x4_t a =
+ vget_low_s16(vaddw_s8(vcombine_s16(q0mp0_3, zero), hev_option));
+
+  // We cannot shift with rounding because the clamp comes *before* the shift.
+  // a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+  // a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+ const int16x4_t plus_four = vadd_s16(a, vdup_n_s16(4));
+ const int16x4_t plus_three = vadd_s16(a, vdup_n_s16(3));
+ const int8x8_t a2_a1 =
+ vshr_n_s8(vqmovn_s16(vcombine_s16(plus_three, plus_four)), 3);
+
+ // a3 is in the high 4 values.
+ // a3 = (a1 + 1) >> 1;
+ const int8x8_t a3 = vrshr_n_s8(a2_a1, 1);
+
+ const int16x8_t p0q1_l = vreinterpretq_s16_u16(vmovl_u8(p0q1));
+ const int16x8_t q0p1_l = vreinterpretq_s16_u16(vmovl_u8(q0p1));
+
+ const int16x8_t p1q1_l =
+ vcombine_s16(vget_high_s16(q0p1_l), vget_high_s16(p0q1_l));
+
+ const int8x8_t a3_ma3 = InterleaveHigh32(a3, vneg_s8(a3));
+ const int16x8_t p1q1_a3 = vaddw_s8(p1q1_l, a3_ma3);
+
+ const int16x8_t p0q0_l =
+ vcombine_s16(vget_low_s16(p0q1_l), vget_low_s16(q0p1_l));
+ // Need to shift the second term or we end up with a2_ma2.
+ const int8x8_t a2_ma1 =
+ InterleaveLow32(a2_a1, RightShift<32>(vneg_s8(a2_a1)));
+ const int16x8_t p0q0_a = vaddw_s8(p0q0_l, a2_ma1);
+
+ *p1q1_result = vqmovun_s16(p1q1_a3);
+ *p0q0_result = vqmovun_s16(p0q0_a);
+}
+
+void Horizontal4_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ const uint8x8_t p1_v = Load4(dst - 2 * stride);
+ const uint8x8_t p0_v = Load4(dst - stride);
+ const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+ const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+
+ uint8x8_t hev_mask;
+ uint8x8_t needs_filter4_mask;
+ Filter4Masks(p0q0, p1q1, hev_thresh, outer_thresh, inner_thresh, &hev_mask,
+ &needs_filter4_mask);
+
+ // Copy the masks to the high bits for packed comparisons later.
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+ needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask);
+
+#if defined(__aarch64__)
+ // This provides a good speedup for the unit test. Not sure how applicable it
+ // is to valid streams though.
+ // Consider doing this on armv7 if there is a quick way to check if a vector
+ // is zero.
+ if (vaddv_u8(needs_filter4_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t f_p1q1;
+ uint8x8_t f_p0q0;
+ const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+ Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+
+ // Already integrated the Hev mask when calculating the filtered values.
+ const uint8x8_t p0q0_output = vbsl_u8(needs_filter4_mask, f_p0q0, p0q0);
+
+  // p1/q1 are left unmodified when Hev() is true, since only Filter2() applies
+  // there. The xor works because |hev_mask| was and'd with
+  // |needs_filter4_mask| above, leaving needs_filter4 && !hev.
+ const uint8x8_t p1q1_mask = veor_u8(hev_mask, needs_filter4_mask);
+ const uint8x8_t p1q1_output = vbsl_u8(p1q1_mask, f_p1q1, p1q1);
+
+ StoreLo4(dst - 2 * stride, p1q1_output);
+ StoreLo4(dst - stride, p0q0_output);
+ StoreHi4(dst, p0q0_output);
+ StoreHi4(dst + stride, p1q1_output);
+}
+
+void Vertical4_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ // Move |dst| to the left side of the filter window.
+ dst -= 2;
+
+ // |p1q0| and |p0q1| are named for the values they will contain after the
+ // transpose.
+ const uint8x8_t row0 = Load4(dst);
+ uint8x8_t p1q0 = Load4<1>(dst + stride, row0);
+ const uint8x8_t row2 = Load4(dst + 2 * stride);
+ uint8x8_t p0q1 = Load4<1>(dst + 3 * stride, row2);
+
+ Transpose4x4(&p1q0, &p0q1);
+  // Rearrange the transposed columns into packed |p1q1| and |p0q0| vectors.
+ const uint8x8x2_t p1q1xq0p0 = Interleave32(p1q0, Transpose32(p0q1));
+ const uint8x8x2_t p1q1xp0q0 = {p1q1xq0p0.val[0],
+ Transpose32(p1q1xq0p0.val[1])};
+
+ uint8x8_t hev_mask;
+ uint8x8_t needs_filter4_mask;
+ Filter4Masks(p1q1xp0q0.val[1], p1q1xp0q0.val[0], hev_thresh, outer_thresh,
+ inner_thresh, &hev_mask, &needs_filter4_mask);
+
+ // Copy the masks to the high bits for packed comparisons later.
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+ needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask);
+
+#if defined(__aarch64__)
+ // This provides a good speedup for the unit test. Not sure how applicable it
+ // is to valid streams though.
+ // Consider doing this on armv7 if there is a quick way to check if a vector
+ // is zero.
+ if (vaddv_u8(needs_filter4_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t f_p1q1;
+ uint8x8_t f_p0q0;
+ Filter4(Transpose32(p1q0), p0q1, hev_mask, &f_p1q1, &f_p0q0);
+
+ // Already integrated the Hev mask when calculating the filtered values.
+ const uint8x8_t p0q0_output =
+ vbsl_u8(needs_filter4_mask, f_p0q0, p1q1xp0q0.val[1]);
+
+  // p1/q1 are left unmodified when Hev() is true, since only Filter2() applies
+  // there. The xor works because |hev_mask| was and'd with
+  // |needs_filter4_mask| above, leaving needs_filter4 && !hev.
+ const uint8x8_t p1q1_mask = veor_u8(hev_mask, needs_filter4_mask);
+ const uint8x8_t p1q1_output = vbsl_u8(p1q1_mask, f_p1q1, p1q1xp0q0.val[0]);
+
+ // Put things back in order to reverse the transpose.
+ const uint8x8x2_t p1p0xq1q0 = Interleave32(p1q1_output, p0q0_output);
+ uint8x8_t output_0 = p1p0xq1q0.val[0],
+ output_1 = Transpose32(p1p0xq1q0.val[1]);
+
+ Transpose4x4(&output_0, &output_1);
+
+ StoreLo4(dst, output_0);
+ StoreLo4(dst + stride, output_1);
+ StoreHi4(dst + 2 * stride, output_0);
+ StoreHi4(dst + 3 * stride, output_1);
+}
+
+// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh &&
+// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh
+// |flat_thresh| == 1 for 8 bit decode.
+inline uint8x8_t IsFlat3(const uint8x8_t abd_p0p1_q0q1,
+ const uint8x8_t abd_p0p2_q0q2) {
+ const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p0p2_q0q2);
+ const uint8x8_t b = vcle_u8(a, vdup_n_u8(1));
+ return vand_u8(b, RightShift<32>(b));
+}
+
+// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
+// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
+// OuterThreshold()
+inline uint8x8_t NeedsFilter6(const uint8x8_t abd_p0p1_q0q1,
+ const uint8x8_t abd_p1p2_q1q2,
+ const uint8x8_t p0q0, const uint8x8_t p1q1,
+ const uint8_t inner_thresh,
+ const uint8_t outer_thresh) {
+ const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
+ const uint8x8_t b = vcle_u8(a, vdup_n_u8(inner_thresh));
+ const uint8x8_t inner_mask = vand_u8(b, RightShift<32>(b));
+ const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
+ return vand_u8(inner_mask, outer_mask);
+}
+
+inline void Filter6Masks(const uint8x8_t p2q2, const uint8x8_t p1q1,
+ const uint8x8_t p0q0, const uint8_t hev_thresh,
+ const uint8_t outer_thresh, const uint8_t inner_thresh,
+ uint8x8_t* const needs_filter6_mask,
+ uint8x8_t* const is_flat3_mask,
+ uint8x8_t* const hev_mask) {
+ const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
+ *hev_mask = Hev(p0p1_q0q1, hev_thresh);
+ *is_flat3_mask = IsFlat3(p0p1_q0q1, vabd_u8(p0q0, p2q2));
+ *needs_filter6_mask = NeedsFilter6(p0p1_q0q1, vabd_u8(p1q1, p2q2), p0q0, p1q1,
+ inner_thresh, outer_thresh);
+}
+
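+// Each half of the packed registers accumulates its own side's sum. Where a
+// tap comes from the opposite side (q0 for the p1 output, p0 for the q1
+// output), Transpose32() swaps the halves to line it up.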
+inline void Filter6(const uint8x8_t p2q2, const uint8x8_t p1q1,
+ const uint8x8_t p0q0, uint8x8_t* const p1q1_output,
+ uint8x8_t* const p0q0_output) {
+ // Sum p1 and q1 output from opposite directions
+ // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+ // ^^^^^^^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+ // ^^^^^^^^
+ const uint16x8_t p2q2_double = vaddl_u8(p2q2, p2q2);
+ uint16x8_t sum = vaddw_u8(p2q2_double, p2q2);
+
+ // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+ // ^^^^^^^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+ // ^^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p1q1, p1q1), sum);
+
+ // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+ // ^^^^^^^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+ // ^^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p0q0, p0q0), sum);
+
+ // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+ // ^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+ // ^^
+ const uint8x8_t q0p0 = Transpose32(p0q0);
+ sum = vaddw_u8(sum, q0p0);
+
+ *p1q1_output = vrshrn_n_u16(sum, 3);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - (2 * p2) + q0 + q1
+ // q0 = q1 - (2 * q2) + p0 + p1
+ sum = vsubq_u16(sum, p2q2_double);
+ const uint8x8_t q1p1 = Transpose32(p1q1);
+ sum = vaddq_u16(vaddl_u8(q0p0, q1p1), sum);
+
+ *p0q0_output = vrshrn_n_u16(sum, 3);
+}
+
+void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ const uint8x8_t p2_v = Load4(dst - 3 * stride);
+ const uint8x8_t p1_v = Load4(dst - 2 * stride);
+ const uint8x8_t p0_v = Load4(dst - stride);
+ const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+ const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+ const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
+
+ uint8x8_t needs_filter6_mask, is_flat3_mask, hev_mask;
+ Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+ &needs_filter6_mask, &is_flat3_mask, &hev_mask);
+
+ needs_filter6_mask = InterleaveLow32(needs_filter6_mask, needs_filter6_mask);
+ is_flat3_mask = InterleaveLow32(is_flat3_mask, is_flat3_mask);
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+ // This provides a good speedup for the unit test. Not sure how applicable it
+ // is to valid streams though.
+ // Consider doing this on armv7 if there is a quick way to check if a vector
+ // is zero.
+ if (vaddv_u8(needs_filter6_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t f_p1q1;
+ uint8x8_t f_p0q0;
+ const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+ Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+ // Reset the outer values if only a Hev() mask was required.
+ f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+ uint8x8_t f6_p1q1, f6_p0q0;
+#if defined(__aarch64__)
+ if (vaddv_u8(vand_u8(is_flat3_mask, needs_filter6_mask)) == 0) {
+ // Filter6() does not apply.
+ const uint8x8_t zero = vdup_n_u8(0);
+ f6_p1q1 = zero;
+ f6_p0q0 = zero;
+ } else {
+#endif // defined(__aarch64__)
+ Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t p1q1_output = vbsl_u8(is_flat3_mask, f6_p1q1, f_p1q1);
+ p1q1_output = vbsl_u8(needs_filter6_mask, p1q1_output, p1q1);
+ StoreLo4(dst - 2 * stride, p1q1_output);
+ StoreHi4(dst + stride, p1q1_output);
+
+ uint8x8_t p0q0_output = vbsl_u8(is_flat3_mask, f6_p0q0, f_p0q0);
+ p0q0_output = vbsl_u8(needs_filter6_mask, p0q0_output, p0q0);
+ StoreLo4(dst - stride, p0q0_output);
+ StoreHi4(dst, p0q0_output);
+}
+
+void Vertical6_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ // Move |dst| to the left side of the filter window.
+ dst -= 3;
+
+ // |p2q1|, |p1q2|, |p0xx| and |q0xx| are named for the values they will
+ // contain after the transpose.
+ // These over-read by 2 bytes. We only need 6.
+ uint8x8_t p2q1 = vld1_u8(dst);
+ uint8x8_t p1q2 = vld1_u8(dst + stride);
+ uint8x8_t p0xx = vld1_u8(dst + 2 * stride);
+ uint8x8_t q0xx = vld1_u8(dst + 3 * stride);
+
+ Transpose8x4(&p2q1, &p1q2, &p0xx, &q0xx);
+
+ const uint8x8x2_t p2q2xq1p1 = Interleave32(p2q1, Transpose32(p1q2));
+ const uint8x8_t p2q2 = p2q2xq1p1.val[0];
+ const uint8x8_t p1q1 = Transpose32(p2q2xq1p1.val[1]);
+ const uint8x8_t p0q0 = InterleaveLow32(p0xx, q0xx);
+
+ uint8x8_t needs_filter6_mask, is_flat3_mask, hev_mask;
+ Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+ &needs_filter6_mask, &is_flat3_mask, &hev_mask);
+
+ needs_filter6_mask = InterleaveLow32(needs_filter6_mask, needs_filter6_mask);
+ is_flat3_mask = InterleaveLow32(is_flat3_mask, is_flat3_mask);
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+ // This provides a good speedup for the unit test. Not sure how applicable it
+ // is to valid streams though.
+ // Consider doing this on armv7 if there is a quick way to check if a vector
+ // is zero.
+ if (vaddv_u8(needs_filter6_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t f_p1q1;
+ uint8x8_t f_p0q0;
+ const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+ Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+ // Reset the outer values if only a Hev() mask was required.
+ f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+ uint8x8_t f6_p1q1, f6_p0q0;
+#if defined(__aarch64__)
+ if (vaddv_u8(vand_u8(is_flat3_mask, needs_filter6_mask)) == 0) {
+ // Filter6() does not apply.
+ const uint8x8_t zero = vdup_n_u8(0);
+ f6_p1q1 = zero;
+ f6_p0q0 = zero;
+ } else {
+#endif // defined(__aarch64__)
+ Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t p1q1_output = vbsl_u8(is_flat3_mask, f6_p1q1, f_p1q1);
+ p1q1_output = vbsl_u8(needs_filter6_mask, p1q1_output, p1q1);
+
+ uint8x8_t p0q0_output = vbsl_u8(is_flat3_mask, f6_p0q0, f_p0q0);
+ p0q0_output = vbsl_u8(needs_filter6_mask, p0q0_output, p0q0);
+
+  // The six tap filter reads six input pixels, but only p1-q1 are written, so
+  // advance |dst| to the p1 column.
+ dst += 1;
+ // Put things back in order to reverse the transpose.
+ const uint8x8x2_t p1p0xq1q0 = Interleave32(p1q1_output, p0q0_output);
+ uint8x8_t output_0 = p1p0xq1q0.val[0];
+ uint8x8_t output_1 = Transpose32(p1p0xq1q0.val[1]);
+
+ Transpose4x4(&output_0, &output_1);
+
+ StoreLo4(dst, output_0);
+ StoreLo4(dst + stride, output_1);
+ StoreHi4(dst + 2 * stride, output_0);
+ StoreHi4(dst + 3 * stride, output_1);
+}
+
+// IsFlat4 uses N=1, IsFlatOuter4 uses N=4.
+// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
+// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
+// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh
+// |flat_thresh| == 1 for 8 bit decode.
+inline uint8x8_t IsFlat4(const uint8x8_t abd_p0n0_q0n0,
+ const uint8x8_t abd_p0n1_q0n1,
+ const uint8x8_t abd_p0n2_q0n2) {
+ const uint8x8_t a = vmax_u8(abd_p0n0_q0n0, abd_p0n1_q0n1);
+ const uint8x8_t b = vmax_u8(a, abd_p0n2_q0n2);
+ const uint8x8_t c = vcle_u8(b, vdup_n_u8(1));
+ return vand_u8(c, RightShift<32>(c));
+}
+
+// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh &&
+// OuterThreshold()
+inline uint8x8_t NeedsFilter8(const uint8x8_t abd_p0p1_q0q1,
+ const uint8x8_t abd_p1p2_q1q2,
+ const uint8x8_t abd_p2p3_q2q3,
+ const uint8x8_t p0q0, const uint8x8_t p1q1,
+ const uint8_t inner_thresh,
+ const uint8_t outer_thresh) {
+ const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
+ const uint8x8_t b = vmax_u8(a, abd_p2p3_q2q3);
+ const uint8x8_t c = vcle_u8(b, vdup_n_u8(inner_thresh));
+ const uint8x8_t inner_mask = vand_u8(c, RightShift<32>(c));
+ const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
+ return vand_u8(inner_mask, outer_mask);
+}
+
+inline void Filter8Masks(const uint8x8_t p3q3, const uint8x8_t p2q2,
+ const uint8x8_t p1q1, const uint8x8_t p0q0,
+ const uint8_t hev_thresh, const uint8_t outer_thresh,
+ const uint8_t inner_thresh,
+ uint8x8_t* const needs_filter8_mask,
+ uint8x8_t* const is_flat4_mask,
+ uint8x8_t* const hev_mask) {
+ const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
+ *hev_mask = Hev(p0p1_q0q1, hev_thresh);
+ *is_flat4_mask = IsFlat4(p0p1_q0q1, vabd_u8(p0q0, p2q2), vabd_u8(p0q0, p3q3));
+ *needs_filter8_mask =
+ NeedsFilter8(p0p1_q0q1, vabd_u8(p1q1, p2q2), vabd_u8(p2q2, p3q3), p0q0,
+ p1q1, inner_thresh, outer_thresh);
+}
+
+inline void Filter8(const uint8x8_t p3q3, const uint8x8_t p2q2,
+ const uint8x8_t p1q1, const uint8x8_t p0q0,
+ uint8x8_t* const p2q2_output, uint8x8_t* const p1q1_output,
+ uint8x8_t* const p0q0_output) {
+ // Sum p2 and q2 output from opposite directions
+ // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+ // ^^^^^^^^
+ // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+ // ^^^^^^^^
+ uint16x8_t sum = vaddw_u8(vaddl_u8(p3q3, p3q3), p3q3);
+
+ // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+ // ^^^^^^^^
+ // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+ // ^^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p2q2, p2q2), sum);
+
+ // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+ // ^^^^^^^
+ // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+ // ^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p1q1, p0q0), sum);
+
+ // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+ // ^^
+ // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+ // ^^
+ const uint8x8_t q0p0 = Transpose32(p0q0);
+ sum = vaddw_u8(sum, q0p0);
+
+ *p2q2_output = vrshrn_n_u16(sum, 3);
+
+ // Convert to p1 and q1 output:
+ // p1 = p2 - p3 - p2 + p1 + q1
+  // q1 = q2 - q3 - q2 + q1 + p1
+ sum = vsubq_u16(sum, vaddl_u8(p3q3, p2q2));
+ const uint8x8_t q1p1 = Transpose32(p1q1);
+ sum = vaddq_u16(vaddl_u8(p1q1, q1p1), sum);
+
+ *p1q1_output = vrshrn_n_u16(sum, 3);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - p3 - p1 + p0 + q2
+ // q0 = q1 - q3 - q1 + q0 + p2
+ sum = vsubq_u16(sum, vaddl_u8(p3q3, p1q1));
+ const uint8x8_t q2p2 = Transpose32(p2q2);
+ sum = vaddq_u16(vaddl_u8(p0q0, q2p2), sum);
+
+ *p0q0_output = vrshrn_n_u16(sum, 3);
+}
+
+void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ const uint8x8_t p3_v = Load4(dst - 4 * stride);
+ const uint8x8_t p2_v = Load4(dst - 3 * stride);
+ const uint8x8_t p1_v = Load4(dst - 2 * stride);
+ const uint8x8_t p0_v = Load4(dst - stride);
+ const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+ const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+ const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
+ const uint8x8_t p3q3 = Load4<1>(dst + 3 * stride, p3_v);
+
+ uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+ &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+ needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+ is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+ is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+ // This provides a good speedup for the unit test. Not sure how applicable it
+ // is to valid streams though.
+ // Consider doing this on armv7 if there is a quick way to check if a vector
+ // is zero.
+ if (vaddv_u8(needs_filter8_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t f_p1q1;
+ uint8x8_t f_p0q0;
+ const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+ Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+ // Reset the outer values if only a Hev() mask was required.
+ f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+ uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+#if defined(__aarch64__)
+ if (vaddv_u8(is_flat4_mask) == 0) {
+ // Filter8() does not apply.
+ const uint8x8_t zero = vdup_n_u8(0);
+ f8_p2q2 = zero;
+ f8_p1q1 = zero;
+ f8_p0q0 = zero;
+ } else {
+#endif // defined(__aarch64__)
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+ const uint8x8_t p2p2_output = vbsl_u8(is_flat4_mask, f8_p2q2, p2q2);
+ StoreLo4(dst - 3 * stride, p2p2_output);
+ StoreHi4(dst + 2 * stride, p2p2_output);
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t p1q1_output = vbsl_u8(is_flat4_mask, f8_p1q1, f_p1q1);
+ p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
+ StoreLo4(dst - 2 * stride, p1q1_output);
+ StoreHi4(dst + stride, p1q1_output);
+
+ uint8x8_t p0q0_output = vbsl_u8(is_flat4_mask, f8_p0q0, f_p0q0);
+ p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
+ StoreLo4(dst - stride, p0q0_output);
+ StoreHi4(dst, p0q0_output);
+}
+
+void Vertical8_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ // Move |dst| to the left side of the filter window.
+ dst -= 4;
+
+ // |p3q0|, |p2q1|, |p1q2| and |p0q3| are named for the values they will
+ // contain after the transpose.
+ uint8x8_t p3q0 = vld1_u8(dst);
+ uint8x8_t p2q1 = vld1_u8(dst + stride);
+ uint8x8_t p1q2 = vld1_u8(dst + 2 * stride);
+ uint8x8_t p0q3 = vld1_u8(dst + 3 * stride);
+
+ Transpose8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+ const uint8x8x2_t p3q3xq0p0 = Interleave32(p3q0, Transpose32(p0q3));
+ const uint8x8_t p3q3 = p3q3xq0p0.val[0];
+ const uint8x8_t p0q0 = Transpose32(p3q3xq0p0.val[1]);
+ const uint8x8x2_t p2q2xq1p1 = Interleave32(p2q1, Transpose32(p1q2));
+ const uint8x8_t p2q2 = p2q2xq1p1.val[0];
+ const uint8x8_t p1q1 = Transpose32(p2q2xq1p1.val[1]);
+
+ uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+ &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+ needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+ is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+ is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+ // This provides a good speedup for the unit test. Not sure how applicable it
+ // is to valid streams though.
+ // Consider doing this on armv7 if there is a quick way to check if a vector
+ // is zero.
+ if (vaddv_u8(needs_filter8_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t f_p1q1;
+ uint8x8_t f_p0q0;
+ const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+ Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+ // Reset the outer values if only a Hev() mask was required.
+ f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+ uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+#if defined(__aarch64__)
+ if (vaddv_u8(is_flat4_mask) == 0) {
+ // Filter8() does not apply.
+ const uint8x8_t zero = vdup_n_u8(0);
+ f8_p2q2 = zero;
+ f8_p1q1 = zero;
+ f8_p0q0 = zero;
+ } else {
+#endif // defined(__aarch64__)
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+
+ // Always prepare and store p2/q2 because we need to transpose it anyway.
+ const uint8x8_t p2q2_output = vbsl_u8(is_flat4_mask, f8_p2q2, p2q2);
+
+ uint8x8_t p1q1_output = vbsl_u8(is_flat4_mask, f8_p1q1, f_p1q1);
+ p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
+
+ uint8x8_t p0q0_output = vbsl_u8(is_flat4_mask, f8_p0q0, f_p0q0);
+ p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
+
+ // Write out p3/q3 as well. There isn't a good way to write out 6 bytes.
+ // Variable names reflect the values before transposition.
+ const uint8x8x2_t p3q0xq3p0_output =
+ Interleave32(p3q3, Transpose32(p0q0_output));
+ uint8x8_t p3q0_output = p3q0xq3p0_output.val[0];
+ uint8x8_t p0q3_output = Transpose32(p3q0xq3p0_output.val[1]);
+ const uint8x8x2_t p2q1xq2p1_output =
+ Interleave32(p2q2_output, Transpose32(p1q1_output));
+ uint8x8_t p2q1_output = p2q1xq2p1_output.val[0];
+ uint8x8_t p1q2_output = Transpose32(p2q1xq2p1_output.val[1]);
+
+ Transpose8x4(&p3q0_output, &p2q1_output, &p1q2_output, &p0q3_output);
+
+ vst1_u8(dst, p3q0_output);
+ vst1_u8(dst + stride, p2q1_output);
+ vst1_u8(dst + 2 * stride, p1q2_output);
+ vst1_u8(dst + 3 * stride, p0q3_output);
+}
+
+inline void Filter14(const uint8x8_t p6q6, const uint8x8_t p5q5,
+ const uint8x8_t p4q4, const uint8x8_t p3q3,
+ const uint8x8_t p2q2, const uint8x8_t p1q1,
+ const uint8x8_t p0q0, uint8x8_t* const p5q5_output,
+ uint8x8_t* const p4q4_output, uint8x8_t* const p3q3_output,
+ uint8x8_t* const p2q2_output, uint8x8_t* const p1q1_output,
+ uint8x8_t* const p0q0_output) {
+ // Sum p5 and q5 output from opposite directions
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^^
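+  // 7 * p6q6 is computed as (p6q6 << 3) - p6q6.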
+ uint16x8_t sum = vsubw_u8(vshll_n_u8(p6q6, 3), p6q6);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p5q5, p5q5), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p4q4, p4q4), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p3q3, p2q2), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p1q1, p0q0), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^
+ const uint8x8_t q0p0 = Transpose32(p0q0);
+ sum = vaddw_u8(sum, q0p0);
+
+ *p5q5_output = vrshrn_n_u16(sum, 4);
+
+ // Convert to p4 and q4 output:
+ // p4 = p5 - (2 * p6) + p3 + q1
+ // q4 = q5 - (2 * q6) + q3 + p1
+ sum = vsubq_u16(sum, vaddl_u8(p6q6, p6q6));
+ const uint8x8_t q1p1 = Transpose32(p1q1);
+ sum = vaddq_u16(vaddl_u8(p3q3, q1p1), sum);
+
+ *p4q4_output = vrshrn_n_u16(sum, 4);
+
+ // Convert to p3 and q3 output:
+ // p3 = p4 - p6 - p5 + p2 + q2
+ // q3 = q4 - q6 - q5 + q2 + p2
+ sum = vsubq_u16(sum, vaddl_u8(p6q6, p5q5));
+ const uint8x8_t q2p2 = Transpose32(p2q2);
+ sum = vaddq_u16(vaddl_u8(p2q2, q2p2), sum);
+
+ *p3q3_output = vrshrn_n_u16(sum, 4);
+
+ // Convert to p2 and q2 output:
+ // p2 = p3 - p6 - p4 + p1 + q3
+ // q2 = q3 - q6 - q4 + q1 + p3
+ sum = vsubq_u16(sum, vaddl_u8(p6q6, p4q4));
+ const uint8x8_t q3p3 = Transpose32(p3q3);
+ sum = vaddq_u16(vaddl_u8(p1q1, q3p3), sum);
+
+ *p2q2_output = vrshrn_n_u16(sum, 4);
+
+ // Convert to p1 and q1 output:
+ // p1 = p2 - p6 - p3 + p0 + q4
+ // q1 = q2 - q6 - q3 + q0 + p4
+ sum = vsubq_u16(sum, vaddl_u8(p6q6, p3q3));
+ const uint8x8_t q4p4 = Transpose32(p4q4);
+ sum = vaddq_u16(vaddl_u8(p0q0, q4p4), sum);
+
+ *p1q1_output = vrshrn_n_u16(sum, 4);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - p6 - p2 + q0 + q5
+ // q0 = q1 - q6 - q2 + p0 + p5
+ sum = vsubq_u16(sum, vaddl_u8(p6q6, p2q2));
+ const uint8x8_t q5p5 = Transpose32(p5q5);
+ sum = vaddq_u16(vaddl_u8(q0p0, q5p5), sum);
+
+ *p0q0_output = vrshrn_n_u16(sum, 4);
+}
+
+void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ const uint8x8_t p6_v = Load4(dst - 7 * stride);
+ const uint8x8_t p5_v = Load4(dst - 6 * stride);
+ const uint8x8_t p4_v = Load4(dst - 5 * stride);
+ const uint8x8_t p3_v = Load4(dst - 4 * stride);
+ const uint8x8_t p2_v = Load4(dst - 3 * stride);
+ const uint8x8_t p1_v = Load4(dst - 2 * stride);
+ const uint8x8_t p0_v = Load4(dst - stride);
+ const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+ const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+ const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
+ const uint8x8_t p3q3 = Load4<1>(dst + 3 * stride, p3_v);
+ const uint8x8_t p4q4 = Load4<1>(dst + 4 * stride, p4_v);
+ const uint8x8_t p5q5 = Load4<1>(dst + 5 * stride, p5_v);
+ const uint8x8_t p6q6 = Load4<1>(dst + 6 * stride, p6_v);
+
+ uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+ &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+ needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+ is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+ is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+ // This provides a good speedup for the unit test. Not sure how applicable it
+ // is to valid streams though.
+ // Consider doing this on armv7 if there is a quick way to check if a vector
+ // is zero.
+ if (vaddv_u8(needs_filter8_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Decide between Filter8() and Filter14().
+ uint8x8_t is_flat_outer4_mask =
+ IsFlat4(vabd_u8(p0q0, p4q4), vabd_u8(p0q0, p5q5), vabd_u8(p0q0, p6q6));
+ is_flat_outer4_mask = vand_u8(is_flat4_mask, is_flat_outer4_mask);
+ is_flat_outer4_mask =
+ InterleaveLow32(is_flat_outer4_mask, is_flat_outer4_mask);
+
+ uint8x8_t f_p1q1;
+ uint8x8_t f_p0q0;
+ const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+ Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+ // Reset the outer values if only a Hev() mask was required.
+ f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+ uint8x8_t f8_p1q1, f8_p0q0;
+ uint8x8_t f14_p2q2, f14_p1q1, f14_p0q0;
+#if defined(__aarch64__)
+ if (vaddv_u8(is_flat4_mask) == 0) {
+ // Filter8() and Filter14() do not apply.
+ const uint8x8_t zero = vdup_n_u8(0);
+ f8_p1q1 = zero;
+ f8_p0q0 = zero;
+ f14_p1q1 = zero;
+ f14_p0q0 = zero;
+ } else {
+#endif // defined(__aarch64__)
+ uint8x8_t f8_p2q2;
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+#if defined(__aarch64__)
+ if (vaddv_u8(is_flat_outer4_mask) == 0) {
+ // Filter14() does not apply.
+ const uint8x8_t zero = vdup_n_u8(0);
+ f14_p2q2 = zero;
+ f14_p1q1 = zero;
+ f14_p0q0 = zero;
+ } else {
+#endif // defined(__aarch64__)
+ uint8x8_t f14_p5q5, f14_p4q4, f14_p3q3;
+ Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+ &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+
+ const uint8x8_t p5q5_output =
+ vbsl_u8(is_flat_outer4_mask, f14_p5q5, p5q5);
+ StoreLo4(dst - 6 * stride, p5q5_output);
+ StoreHi4(dst + 5 * stride, p5q5_output);
+
+ const uint8x8_t p4q4_output =
+ vbsl_u8(is_flat_outer4_mask, f14_p4q4, p4q4);
+ StoreLo4(dst - 5 * stride, p4q4_output);
+ StoreHi4(dst + 4 * stride, p4q4_output);
+
+ const uint8x8_t p3q3_output =
+ vbsl_u8(is_flat_outer4_mask, f14_p3q3, p3q3);
+ StoreLo4(dst - 4 * stride, p3q3_output);
+ StoreHi4(dst + 3 * stride, p3q3_output);
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t p2q2_output = vbsl_u8(is_flat_outer4_mask, f14_p2q2, f8_p2q2);
+ p2q2_output = vbsl_u8(is_flat4_mask, p2q2_output, p2q2);
+ StoreLo4(dst - 3 * stride, p2q2_output);
+ StoreHi4(dst + 2 * stride, p2q2_output);
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t p1q1_output = vbsl_u8(is_flat_outer4_mask, f14_p1q1, f8_p1q1);
+ p1q1_output = vbsl_u8(is_flat4_mask, p1q1_output, f_p1q1);
+ p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
+ StoreLo4(dst - 2 * stride, p1q1_output);
+ StoreHi4(dst + stride, p1q1_output);
+
+ uint8x8_t p0q0_output = vbsl_u8(is_flat_outer4_mask, f14_p0q0, f8_p0q0);
+ p0q0_output = vbsl_u8(is_flat4_mask, p0q0_output, f_p0q0);
+ p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
+ StoreLo4(dst - stride, p0q0_output);
+ StoreHi4(dst, p0q0_output);
+}
+
+void Vertical14_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ dst -= 8;
+ // input
+ // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+ const uint8x16_t x0 = vld1q_u8(dst);
+ dst += stride;
+ const uint8x16_t x1 = vld1q_u8(dst);
+ dst += stride;
+ const uint8x16_t x2 = vld1q_u8(dst);
+ dst += stride;
+ const uint8x16_t x3 = vld1q_u8(dst);
+ dst -= (stride * 3);
+
+ // re-order input
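+  // vcreate_u8() places the least significant byte of its u64 argument in
+  // lane 0, so index_qp3toqp0 gathers bytes {7, 6, 5, 4, 8, 9, 10, 11}
+  // (p0 p1 p2 p3 q0 q1 q2 q3) and index_qp7toqp4 gathers
+  // bytes {3, 2, 1, 0, 12, 13, 14, 15} (p4 p5 p6 p7 q4 q5 q6 q7).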
+#if defined(__aarch64__)
+ const uint8x8_t index_qp3toqp0 = vcreate_u8(0x0b0a090804050607);
+ const uint8x8_t index_qp7toqp4 = vcreate_u8(0x0f0e0d0c00010203);
+ const uint8x16_t index_qp7toqp0 = vcombine_u8(index_qp3toqp0, index_qp7toqp4);
+
+ uint8x16_t input_0 = vqtbl1q_u8(x0, index_qp7toqp0);
+ uint8x16_t input_1 = vqtbl1q_u8(x1, index_qp7toqp0);
+ uint8x16_t input_2 = vqtbl1q_u8(x2, index_qp7toqp0);
+ uint8x16_t input_3 = vqtbl1q_u8(x3, index_qp7toqp0);
+#else
+ const uint8x8_t index_qp3toqp0 = vcreate_u8(0x0b0a090804050607);
+ const uint8x8_t index_qp7toqp4 = vcreate_u8(0x0f0e0d0c00010203);
+
+ const uint8x8_t x0_qp3qp0 = VQTbl1U8(x0, index_qp3toqp0);
+ const uint8x8_t x1_qp3qp0 = VQTbl1U8(x1, index_qp3toqp0);
+ const uint8x8_t x2_qp3qp0 = VQTbl1U8(x2, index_qp3toqp0);
+ const uint8x8_t x3_qp3qp0 = VQTbl1U8(x3, index_qp3toqp0);
+
+ const uint8x8_t x0_qp7qp4 = VQTbl1U8(x0, index_qp7toqp4);
+ const uint8x8_t x1_qp7qp4 = VQTbl1U8(x1, index_qp7toqp4);
+ const uint8x8_t x2_qp7qp4 = VQTbl1U8(x2, index_qp7toqp4);
+ const uint8x8_t x3_qp7qp4 = VQTbl1U8(x3, index_qp7toqp4);
+
+ const uint8x16_t input_0 = vcombine_u8(x0_qp3qp0, x0_qp7qp4);
+ const uint8x16_t input_1 = vcombine_u8(x1_qp3qp0, x1_qp7qp4);
+ const uint8x16_t input_2 = vcombine_u8(x2_qp3qp0, x2_qp7qp4);
+ const uint8x16_t input_3 = vcombine_u8(x3_qp3qp0, x3_qp7qp4);
+#endif
+ // input after re-order
+ // p0 p1 p2 p3 q0 q1 q2 q3 p4 p5 p6 p7 q4 q5 q6 q7
+
+ const uint8x16x2_t in01 = vtrnq_u8(input_0, input_1);
+ const uint8x16x2_t in23 = vtrnq_u8(input_2, input_3);
+ const uint16x8x2_t in02 = vtrnq_u16(vreinterpretq_u16_u8(in01.val[0]),
+ vreinterpretq_u16_u8(in23.val[0]));
+ const uint16x8x2_t in13 = vtrnq_u16(vreinterpretq_u16_u8(in01.val[1]),
+ vreinterpretq_u16_u8(in23.val[1]));
+
+ const uint8x8_t p0q0 = vget_low_u8(vreinterpretq_u8_u16(in02.val[0]));
+ const uint8x8_t p1q1 = vget_low_u8(vreinterpretq_u8_u16(in13.val[0]));
+
+ const uint8x8_t p2q2 = vget_low_u8(vreinterpretq_u8_u16(in02.val[1]));
+ const uint8x8_t p3q3 = vget_low_u8(vreinterpretq_u8_u16(in13.val[1]));
+
+ const uint8x8_t p4q4 = vget_high_u8(vreinterpretq_u8_u16(in02.val[0]));
+ const uint8x8_t p5q5 = vget_high_u8(vreinterpretq_u8_u16(in13.val[0]));
+
+ const uint8x8_t p6q6 = vget_high_u8(vreinterpretq_u8_u16(in02.val[1]));
+ const uint8x8_t p7q7 = vget_high_u8(vreinterpretq_u8_u16(in13.val[1]));
+
+ uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+ &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+ needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+ is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+ is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+ // This provides a good speedup for the unit test. Not sure how applicable it
+ // is to valid streams though.
+ // Consider doing this on armv7 if there is a quick way to check if a vector
+ // is zero.
+ if (vaddv_u8(needs_filter8_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Decide between Filter8() and Filter14().
+ uint8x8_t is_flat_outer4_mask =
+ IsFlat4(vabd_u8(p0q0, p4q4), vabd_u8(p0q0, p5q5), vabd_u8(p0q0, p6q6));
+ is_flat_outer4_mask = vand_u8(is_flat4_mask, is_flat_outer4_mask);
+ is_flat_outer4_mask =
+ InterleaveLow32(is_flat_outer4_mask, is_flat_outer4_mask);
+
+ uint8x8_t f_p0q0, f_p1q1;
+ const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+ Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+ // Reset the outer values if only a Hev() mask was required.
+ f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+ uint8x8_t p1q1_output, p0q0_output;
+ uint8x8_t p5q5_output, p4q4_output, p3q3_output, p2q2_output;
+
+#if defined(__aarch64__)
+ if (vaddv_u8(is_flat4_mask) == 0) {
+ // Filter8() and Filter14() do not apply.
+ p1q1_output = p1q1;
+ p0q0_output = p0q0;
+
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = p2q2;
+ } else {
+#endif // defined(__aarch64__)
+ uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+#if defined(__aarch64__)
+ if (vaddv_u8(is_flat_outer4_mask) == 0) {
+ // Filter14() does not apply.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = f8_p2q2;
+ p1q1_output = f8_p1q1;
+ p0q0_output = f8_p0q0;
+ } else {
+#endif // defined(__aarch64__)
+ uint8x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+ Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+ &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+
+ p5q5_output = vbsl_u8(is_flat_outer4_mask, f14_p5q5, p5q5);
+ p4q4_output = vbsl_u8(is_flat_outer4_mask, f14_p4q4, p4q4);
+ p3q3_output = vbsl_u8(is_flat_outer4_mask, f14_p3q3, p3q3);
+ p2q2_output = vbsl_u8(is_flat_outer4_mask, f14_p2q2, f8_p2q2);
+ p1q1_output = vbsl_u8(is_flat_outer4_mask, f14_p1q1, f8_p1q1);
+ p0q0_output = vbsl_u8(is_flat_outer4_mask, f14_p0q0, f8_p0q0);
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+ p2q2_output = vbsl_u8(is_flat4_mask, p2q2_output, p2q2);
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+
+ p1q1_output = vbsl_u8(is_flat4_mask, p1q1_output, f_p1q1);
+ p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
+ p0q0_output = vbsl_u8(is_flat4_mask, p0q0_output, f_p0q0);
+ p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
+
+ const uint8x16_t p0q0_p4q4 = vcombine_u8(p0q0_output, p4q4_output);
+ const uint8x16_t p2q2_p6q6 = vcombine_u8(p2q2_output, p6q6);
+ const uint8x16_t p1q1_p5q5 = vcombine_u8(p1q1_output, p5q5_output);
+ const uint8x16_t p3q3_p7q7 = vcombine_u8(p3q3_output, p7q7);
+
+ const uint16x8x2_t out02 = vtrnq_u16(vreinterpretq_u16_u8(p0q0_p4q4),
+ vreinterpretq_u16_u8(p2q2_p6q6));
+ const uint16x8x2_t out13 = vtrnq_u16(vreinterpretq_u16_u8(p1q1_p5q5),
+ vreinterpretq_u16_u8(p3q3_p7q7));
+ const uint8x16x2_t out01 = vtrnq_u8(vreinterpretq_u8_u16(out02.val[0]),
+ vreinterpretq_u8_u16(out13.val[0]));
+ const uint8x16x2_t out23 = vtrnq_u8(vreinterpretq_u8_u16(out02.val[1]),
+ vreinterpretq_u8_u16(out13.val[1]));
+
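+  // Reverse the shuffle for the stores: index_p7top0 gathers bytes
+  // {11, 10, 9, 8, 3, 2, 1, 0} (p7 p6 p5 p4 p3 p2 p1 p0) and index_q7toq0
+  // gathers bytes {4, 5, 6, 7, 12, 13, 14, 15} (q0 q1 q2 q3 q4 q5 q6 q7),
+  // restoring the original p7..p0 q0..q7 row order.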
+#if defined(__aarch64__)
+ const uint8x8_t index_p7top0 = vcreate_u8(0x0001020308090a0b);
+ const uint8x8_t index_q7toq0 = vcreate_u8(0x0f0e0d0c07060504);
+ const uint8x16_t index_p7toq7 = vcombine_u8(index_p7top0, index_q7toq0);
+
+ const uint8x16_t output_0 = vqtbl1q_u8(out01.val[0], index_p7toq7);
+ const uint8x16_t output_1 = vqtbl1q_u8(out01.val[1], index_p7toq7);
+ const uint8x16_t output_2 = vqtbl1q_u8(out23.val[0], index_p7toq7);
+ const uint8x16_t output_3 = vqtbl1q_u8(out23.val[1], index_p7toq7);
+#else
+ const uint8x8_t index_p7top0 = vcreate_u8(0x0001020308090a0b);
+ const uint8x8_t index_q7toq0 = vcreate_u8(0x0f0e0d0c07060504);
+
+ const uint8x8_t x0_p7p0 = VQTbl1U8(out01.val[0], index_p7top0);
+ const uint8x8_t x1_p7p0 = VQTbl1U8(out01.val[1], index_p7top0);
+ const uint8x8_t x2_p7p0 = VQTbl1U8(out23.val[0], index_p7top0);
+ const uint8x8_t x3_p7p0 = VQTbl1U8(out23.val[1], index_p7top0);
+
+ const uint8x8_t x0_q7q0 = VQTbl1U8(out01.val[0], index_q7toq0);
+ const uint8x8_t x1_q7q0 = VQTbl1U8(out01.val[1], index_q7toq0);
+ const uint8x8_t x2_q7q0 = VQTbl1U8(out23.val[0], index_q7toq0);
+ const uint8x8_t x3_q7q0 = VQTbl1U8(out23.val[1], index_q7toq0);
+
+ const uint8x16_t output_0 = vcombine_u8(x0_p7p0, x0_q7q0);
+ const uint8x16_t output_1 = vcombine_u8(x1_p7p0, x1_q7q0);
+ const uint8x16_t output_2 = vcombine_u8(x2_p7p0, x2_q7q0);
+ const uint8x16_t output_3 = vcombine_u8(x3_p7p0, x3_q7q0);
+#endif
+
+ vst1q_u8(dst, output_0);
+ dst += stride;
+ vst1q_u8(dst, output_1);
+ dst += stride;
+ vst1q_u8(dst, output_2);
+ dst += stride;
+ vst1q_u8(dst, output_3);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Horizontal4_NEON;
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON;
+
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Horizontal6_NEON;
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON;
+
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Horizontal8_NEON;
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON;
+
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Horizontal14_NEON;
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Vertical14_NEON;
+}
+} // namespace
+} // namespace low_bitdepth
+
+void LoopFilterInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void LoopFilterInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/loop_filter_neon.h b/src/dsp/arm/loop_filter_neon.h
new file mode 100644
index 0000000..5f79200
--- /dev/null
+++ b/src/dsp/arm/loop_filter_neon.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_LOOP_FILTER_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_LOOP_FILTER_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_filters, see the defines below for specifics. This
+// function is not thread-safe.
+void LoopFilterInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+
+#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_LOOP_FILTER_NEON_H_
diff --git a/src/dsp/arm/loop_restoration_neon.cc b/src/dsp/arm/loop_restoration_neon.cc
new file mode 100644
index 0000000..337c9b4
--- /dev/null
+++ b/src/dsp/arm/loop_restoration_neon.cc
@@ -0,0 +1,1901 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
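+// Returns the concatenation of src.val[0] (low half) and src.val[1] (high
+// half) shifted right by |bytes| bytes, truncated to one register's width.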
+template <int bytes>
+inline uint8x8_t VshrU128(const uint8x8x2_t src) {
+ return vext_u8(src.val[0], src.val[1], bytes);
+}
+
+template <int bytes>
+inline uint16x8_t VshrU128(const uint16x8x2_t src) {
+ return vextq_u16(src.val[0], src.val[1], bytes / 2);
+}
+
+// Wiener
+
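+// The Wiener filter is applied in two separable passes: the horizontal taps
+// write 16-bit intermediate rows into |wiener_buffer|, and the vertical taps
+// read them back, round, and store the 8-bit result to |dst|.
+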
+// Make a local copy of the coefficients to let the compiler know they do not
+// overlap other buffers; the 'const' keyword alone is not enough. In practice
+// the compiler does not emit a copy, since there are enough registers here.
+inline void PopulateWienerCoefficients(
+ const RestorationUnitInfo& restoration_info, const int direction,
+ int16_t filter[4]) {
+ // In order to keep the horizontal pass intermediate values within 16 bits we
+ // offset |filter[3]| by 128. The 128 offset will be added back in the loop.
+ for (int i = 0; i < 4; ++i) {
+ filter[i] = restoration_info.wiener_info.filter[direction][i];
+ }
+ if (direction == WienerInfo::kHorizontal) {
+ filter[3] -= 128;
+ }
+}
+
+inline int16x8_t WienerHorizontal2(const uint8x8_t s0, const uint8x8_t s1,
+ const int16_t filter, const int16x8_t sum) {
+ const int16x8_t ss = vreinterpretq_s16_u16(vaddl_u8(s0, s1));
+ return vmlaq_n_s16(sum, ss, filter);
+}
+
+inline int16x8x2_t WienerHorizontal2(const uint8x16_t s0, const uint8x16_t s1,
+ const int16_t filter,
+ const int16x8x2_t sum) {
+ int16x8x2_t d;
+ d.val[0] =
+ WienerHorizontal2(vget_low_u8(s0), vget_low_u8(s1), filter, sum.val[0]);
+ d.val[1] =
+ WienerHorizontal2(vget_high_u8(s0), vget_high_u8(s1), filter, sum.val[1]);
+ return d;
+}
+
+inline void WienerHorizontalSum(const uint8x8_t s[3], const int16_t filter[4],
+ int16x8_t sum, int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit = (offset << 2) - 1;
+ const int16x8_t s_0_2 = vreinterpretq_s16_u16(vaddl_u8(s[0], s[2]));
+ const int16x8_t s_1 = ZeroExtend(s[1]);
+ sum = vmlaq_n_s16(sum, s_0_2, filter[2]);
+ sum = vmlaq_n_s16(sum, s_1, filter[3]);
+  // Add the scaled-down 128 offset correction here, computed as
+  // s_1 << (7 - kInterRoundBitsHorizontal), so that the signed 16-bit
+  // intermediate does not overflow.
+ sum = vrsraq_n_s16(vshlq_n_s16(s_1, 7 - kInterRoundBitsHorizontal), sum,
+ kInterRoundBitsHorizontal);
+ sum = vmaxq_s16(sum, vdupq_n_s16(-offset));
+ sum = vminq_s16(sum, vdupq_n_s16(limit - offset));
+ vst1q_s16(wiener_buffer, sum);
+}
+
+inline void WienerHorizontalSum(const uint8x16_t src[3],
+ const int16_t filter[4], int16x8x2_t sum,
+ int16_t* const wiener_buffer) {
+ uint8x8_t s[3];
+ s[0] = vget_low_u8(src[0]);
+ s[1] = vget_low_u8(src[1]);
+ s[2] = vget_low_u8(src[2]);
+ WienerHorizontalSum(s, filter, sum.val[0], wiener_buffer);
+ s[0] = vget_high_u8(src[0]);
+ s[1] = vget_high_u8(src[1]);
+ s[2] = vget_high_u8(src[2]);
+ WienerHorizontalSum(s, filter, sum.val[1], wiener_buffer + 8);
+}
+
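+// The horizontal taps are applied 16 pixels at a time: |s[0]| holds the
+// current 16 source bytes and vextq_u8() against the next 16 bytes forms the
+// shifted windows, so each output pixel sees its full tap neighborhood.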
+inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4],
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ const uint8_t* src_ptr = src;
+ uint8x16_t s[8];
+ s[0] = vld1q_u8(src_ptr);
+ ptrdiff_t x = width;
+ do {
+ src_ptr += 16;
+ s[7] = vld1q_u8(src_ptr);
+ s[1] = vextq_u8(s[0], s[7], 1);
+ s[2] = vextq_u8(s[0], s[7], 2);
+ s[3] = vextq_u8(s[0], s[7], 3);
+ s[4] = vextq_u8(s[0], s[7], 4);
+ s[5] = vextq_u8(s[0], s[7], 5);
+ s[6] = vextq_u8(s[0], s[7], 6);
+ int16x8x2_t sum;
+ sum.val[0] = sum.val[1] = vdupq_n_s16(0);
+ sum = WienerHorizontal2(s[0], s[6], filter[0], sum);
+ sum = WienerHorizontal2(s[1], s[5], filter[1], sum);
+ WienerHorizontalSum(s + 2, filter, sum, *wiener_buffer);
+ s[0] = s[7];
+ *wiener_buffer += 16;
+ x -= 16;
+ } while (x != 0);
+ src += src_stride;
+ }
+}
+
+inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4],
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ const uint8_t* src_ptr = src;
+ uint8x16_t s[6];
+ s[0] = vld1q_u8(src_ptr);
+ ptrdiff_t x = width;
+ do {
+ src_ptr += 16;
+ s[5] = vld1q_u8(src_ptr);
+ s[1] = vextq_u8(s[0], s[5], 1);
+ s[2] = vextq_u8(s[0], s[5], 2);
+ s[3] = vextq_u8(s[0], s[5], 3);
+ s[4] = vextq_u8(s[0], s[5], 4);
+ int16x8x2_t sum;
+ sum.val[0] = sum.val[1] = vdupq_n_s16(0);
+ sum = WienerHorizontal2(s[0], s[4], filter[1], sum);
+ WienerHorizontalSum(s + 1, filter, sum, *wiener_buffer);
+ s[0] = s[5];
+ *wiener_buffer += 16;
+ x -= 16;
+ } while (x != 0);
+ src += src_stride;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4],
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ const uint8_t* src_ptr = src;
+ uint8x16_t s[4];
+ s[0] = vld1q_u8(src_ptr);
+ ptrdiff_t x = width;
+ do {
+ src_ptr += 16;
+ s[3] = vld1q_u8(src_ptr);
+ s[1] = vextq_u8(s[0], s[3], 1);
+ s[2] = vextq_u8(s[0], s[3], 2);
+ int16x8x2_t sum;
+ sum.val[0] = sum.val[1] = vdupq_n_s16(0);
+ WienerHorizontalSum(s, filter, sum, *wiener_buffer);
+ s[0] = s[3];
+ *wiener_buffer += 16;
+ x -= 16;
+ } while (x != 0);
+ src += src_stride;
+ }
+}
+
+inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ const uint8_t* src_ptr = src;
+ ptrdiff_t x = width;
+ do {
+ const uint8x16_t s = vld1q_u8(src_ptr);
+ const uint8x8_t s0 = vget_low_u8(s);
+ const uint8x8_t s1 = vget_high_u8(s);
+ const int16x8_t d0 = vreinterpretq_s16_u16(vshll_n_u8(s0, 4));
+ const int16x8_t d1 = vreinterpretq_s16_u16(vshll_n_u8(s1, 4));
+ vst1q_s16(*wiener_buffer + 0, d0);
+ vst1q_s16(*wiener_buffer + 8, d1);
+ src_ptr += 16;
+ *wiener_buffer += 16;
+ x -= 16;
+ } while (x != 0);
+ src += src_stride;
+ }
+}
+
+inline int32x4x2_t WienerVertical2(const int16x8_t a0, const int16x8_t a1,
+ const int16_t filter,
+ const int32x4x2_t sum) {
+ const int16x8_t a = vaddq_s16(a0, a1);
+ int32x4x2_t d;
+ d.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(a), filter);
+ d.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(a), filter);
+ return d;
+}
+
+inline uint8x8_t WienerVertical(const int16x8_t a[3], const int16_t filter[4],
+ const int32x4x2_t sum) {
+ int32x4x2_t d = WienerVertical2(a[0], a[2], filter[2], sum);
+ d.val[0] = vmlal_n_s16(d.val[0], vget_low_s16(a[1]), filter[3]);
+ d.val[1] = vmlal_n_s16(d.val[1], vget_high_s16(a[1]), filter[3]);
+ const uint16x4_t sum_lo_16 = vqrshrun_n_s32(d.val[0], 11);
+ const uint16x4_t sum_hi_16 = vqrshrun_n_s32(d.val[1], 11);
+ return vqmovn_u16(vcombine_u16(sum_lo_16, sum_hi_16));
+}
+
+inline uint8x8_t WienerVerticalTap7Kernel(const int16_t* const wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const int16_t filter[4],
+ int16x8_t a[7]) {
+ int32x4x2_t sum;
+ a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride);
+ a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride);
+ a[5] = vld1q_s16(wiener_buffer + 5 * wiener_stride);
+ a[6] = vld1q_s16(wiener_buffer + 6 * wiener_stride);
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ sum = WienerVertical2(a[0], a[6], filter[0], sum);
+ sum = WienerVertical2(a[1], a[5], filter[1], sum);
+ a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride);
+ a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride);
+ a[4] = vld1q_s16(wiener_buffer + 4 * wiener_stride);
+ return WienerVertical(a + 2, filter, sum);
+}
+
+inline uint8x8x2_t WienerVerticalTap7Kernel2(const int16_t* const wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const int16_t filter[4]) {
+ int16x8_t a[8];
+ int32x4x2_t sum;
+ uint8x8x2_t d;
+ d.val[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[7] = vld1q_s16(wiener_buffer + 7 * wiener_stride);
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ sum = WienerVertical2(a[1], a[7], filter[0], sum);
+ sum = WienerVertical2(a[2], a[6], filter[1], sum);
+ d.val[1] = WienerVertical(a + 3, filter, sum);
+ return d;
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y != 0; --y) {
+ uint8_t* dst_ptr = dst;
+ ptrdiff_t x = width;
+ do {
+ uint8x8x2_t d[2];
+ d[0] = WienerVerticalTap7Kernel2(wiener_buffer + 0, width, filter);
+ d[1] = WienerVerticalTap7Kernel2(wiener_buffer + 8, width, filter);
+ vst1q_u8(dst_ptr, vcombine_u8(d[0].val[0], d[1].val[0]));
+ vst1q_u8(dst_ptr + dst_stride, vcombine_u8(d[0].val[1], d[1].val[1]));
+ wiener_buffer += 16;
+ dst_ptr += 16;
+ x -= 16;
+ } while (x != 0);
+ wiener_buffer += width;
+ dst += 2 * dst_stride;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = width;
+ do {
+ int16x8_t a[7];
+ const uint8x8_t d0 =
+ WienerVerticalTap7Kernel(wiener_buffer + 0, width, filter, a);
+ const uint8x8_t d1 =
+ WienerVerticalTap7Kernel(wiener_buffer + 8, width, filter, a);
+ vst1q_u8(dst, vcombine_u8(d0, d1));
+ wiener_buffer += 16;
+ dst += 16;
+ x -= 16;
+ } while (x != 0);
+ }
+}
+
+inline uint8x8_t WienerVerticalTap5Kernel(const int16_t* const wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const int16_t filter[4],
+ int16x8_t a[5]) {
+ a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride);
+ a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride);
+ a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride);
+ a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride);
+ a[4] = vld1q_s16(wiener_buffer + 4 * wiener_stride);
+ int32x4x2_t sum;
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ sum = WienerVertical2(a[0], a[4], filter[1], sum);
+ return WienerVertical(a + 1, filter, sum);
+}
+
+inline uint8x8x2_t WienerVerticalTap5Kernel2(const int16_t* const wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const int16_t filter[4]) {
+ int16x8_t a[6];
+ int32x4x2_t sum;
+ uint8x8x2_t d;
+ d.val[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[5] = vld1q_s16(wiener_buffer + 5 * wiener_stride);
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ sum = WienerVertical2(a[1], a[5], filter[1], sum);
+ d.val[1] = WienerVertical(a + 2, filter, sum);
+ return d;
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y != 0; --y) {
+ uint8_t* dst_ptr = dst;
+ ptrdiff_t x = width;
+ do {
+ uint8x8x2_t d[2];
+ d[0] = WienerVerticalTap5Kernel2(wiener_buffer + 0, width, filter);
+ d[1] = WienerVerticalTap5Kernel2(wiener_buffer + 8, width, filter);
+ vst1q_u8(dst_ptr, vcombine_u8(d[0].val[0], d[1].val[0]));
+ vst1q_u8(dst_ptr + dst_stride, vcombine_u8(d[0].val[1], d[1].val[1]));
+ wiener_buffer += 16;
+ dst_ptr += 16;
+ x -= 16;
+ } while (x != 0);
+ wiener_buffer += width;
+ dst += 2 * dst_stride;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = width;
+ do {
+ int16x8_t a[5];
+ const uint8x8_t d0 =
+ WienerVerticalTap5Kernel(wiener_buffer + 0, width, filter, a);
+ const uint8x8_t d1 =
+ WienerVerticalTap5Kernel(wiener_buffer + 8, width, filter, a);
+ vst1q_u8(dst, vcombine_u8(d0, d1));
+ wiener_buffer += 16;
+ dst += 16;
+ x -= 16;
+ } while (x != 0);
+ }
+}
+
+inline uint8x8_t WienerVerticalTap3Kernel(const int16_t* const wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const int16_t filter[4],
+ int16x8_t a[3]) {
+ a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride);
+ a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride);
+ a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride);
+ int32x4x2_t sum;
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ return WienerVertical(a, filter, sum);
+}
+
+inline uint8x8x2_t WienerVerticalTap3Kernel2(const int16_t* const wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const int16_t filter[4]) {
+ int16x8_t a[4];
+ int32x4x2_t sum;
+ uint8x8x2_t d;
+ d.val[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride);
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ d.val[1] = WienerVertical(a + 1, filter, sum);
+ return d;
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y != 0; --y) {
+ uint8_t* dst_ptr = dst;
+ ptrdiff_t x = width;
+ do {
+ uint8x8x2_t d[2];
+ d[0] = WienerVerticalTap3Kernel2(wiener_buffer + 0, width, filter);
+ d[1] = WienerVerticalTap3Kernel2(wiener_buffer + 8, width, filter);
+ vst1q_u8(dst_ptr, vcombine_u8(d[0].val[0], d[1].val[0]));
+ vst1q_u8(dst_ptr + dst_stride, vcombine_u8(d[0].val[1], d[1].val[1]));
+ wiener_buffer += 16;
+ dst_ptr += 16;
+ x -= 16;
+ } while (x != 0);
+ wiener_buffer += width;
+ dst += 2 * dst_stride;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = width;
+ do {
+ int16x8_t a[3];
+ const uint8x8_t d0 =
+ WienerVerticalTap3Kernel(wiener_buffer + 0, width, filter, a);
+ const uint8x8_t d1 =
+ WienerVerticalTap3Kernel(wiener_buffer + 8, width, filter, a);
+ vst1q_u8(dst, vcombine_u8(d0, d1));
+ wiener_buffer += 16;
+ dst += 16;
+ x -= 16;
+ } while (x != 0);
+ }
+}
+
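+// With only the center vertical tap, the output is the 16-bit intermediate
+// rounded back down by 4 bits and saturated to 8 bits.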
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint8_t* const dst) {
+ const int16x8_t a0 = vld1q_s16(wiener_buffer + 0);
+ const int16x8_t a1 = vld1q_s16(wiener_buffer + 8);
+ const uint8x8_t d0 = vqrshrun_n_s16(a0, 4);
+ const uint8x8_t d1 = vqrshrun_n_s16(a1, 4);
+ vst1q_u8(dst, vcombine_u8(d0, d1));
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y != 0; --y) {
+ uint8_t* dst_ptr = dst;
+ ptrdiff_t x = width;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer, dst_ptr);
+ WienerVerticalTap1Kernel(wiener_buffer + width, dst_ptr + dst_stride);
+ wiener_buffer += 16;
+ dst_ptr += 16;
+ x -= 16;
+ } while (x != 0);
+ wiener_buffer += width;
+ dst += 2 * dst_stride;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = width;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer, dst);
+ wiener_buffer += 16;
+ dst += 16;
+ x -= 16;
+ } while (x != 0);
+ }
+}
+
+// For width 16 and up, store the horizontal results first, then apply the
+// vertical filter row by row. This is faster than processing column by column
+// because of better cache locality.
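+// The horizontal pass writes |height| + kWienerFilterTaps - 1 -
+// 2 * |number_rows_to_skip| rows of 16-bit intermediates into |wiener_buffer|
+// at a stride of Align(width, 16); the vertical pass then reads a sliding
+// window of 7, 5, 3 or 1 of those rows per output row, depending on how many
+// leading filter coefficients are zero.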
+void WienerFilter_NEON(const RestorationUnitInfo& restoration_info,
+ const void* const source, const void* const top_border,
+ const void* const bottom_border, const ptrdiff_t stride,
+ const int width, const int height,
+ RestorationBuffer* const restoration_buffer,
+ void* const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 16);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 13 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+ int16_t filter_horizontal[(kWienerFilterTaps + 1) / 2];
+ int16_t filter_vertical[(kWienerFilterTaps + 1) / 2];
+ PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal,
+ filter_horizontal);
+ PopulateWienerCoefficients(restoration_info, WienerInfo::kVertical,
+ filter_vertical);
+
+  // Horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* const top = static_cast<const uint8_t*>(top_border);
+ const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
+ wiener_stride, height_extra, filter_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ filter_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
+ wiener_stride, height_extra, filter_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ filter_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
+ wiener_stride, height_extra, filter_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ filter_horizontal, &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
+ wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ }
+
+  // Vertical filtering.
+ // Over-writes up to 15 values.
+ auto* dst = static_cast<uint8_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, the top
+    // and bottom rows of |wiener_buffer| can be duplicated accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+inline void Prepare3_8(const uint8x8x2_t src, uint8x8_t dst[3]) {
+ dst[0] = VshrU128<0>(src);
+ dst[1] = VshrU128<1>(src);
+ dst[2] = VshrU128<2>(src);
+}
+
+inline void Prepare3_16(const uint16x8x2_t src, uint16x4_t low[3],
+ uint16x4_t high[3]) {
+ uint16x8_t s[3];
+ s[0] = VshrU128<0>(src);
+ s[1] = VshrU128<2>(src);
+ s[2] = VshrU128<4>(src);
+ low[0] = vget_low_u16(s[0]);
+ low[1] = vget_low_u16(s[1]);
+ low[2] = vget_low_u16(s[2]);
+ high[0] = vget_high_u16(s[0]);
+ high[1] = vget_high_u16(s[1]);
+ high[2] = vget_high_u16(s[2]);
+}
+
+inline void Prepare5_8(const uint8x8x2_t src, uint8x8_t dst[5]) {
+ dst[0] = VshrU128<0>(src);
+ dst[1] = VshrU128<1>(src);
+ dst[2] = VshrU128<2>(src);
+ dst[3] = VshrU128<3>(src);
+ dst[4] = VshrU128<4>(src);
+}
+
+inline void Prepare5_16(const uint16x8x2_t src, uint16x4_t low[5],
+ uint16x4_t high[5]) {
+ Prepare3_16(src, low, high);
+ const uint16x8_t s3 = VshrU128<6>(src);
+ const uint16x8_t s4 = VshrU128<8>(src);
+ low[3] = vget_low_u16(s3);
+ low[4] = vget_low_u16(s4);
+ high[3] = vget_high_u16(s3);
+ high[4] = vget_high_u16(s4);
+}
+
+inline uint16x8_t Sum3_16(const uint16x8_t src0, const uint16x8_t src1,
+ const uint16x8_t src2) {
+ const uint16x8_t sum = vaddq_u16(src0, src1);
+ return vaddq_u16(sum, src2);
+}
+
+inline uint16x8_t Sum3_16(const uint16x8_t src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline uint32x4_t Sum3_32(const uint32x4_t src0, const uint32x4_t src1,
+ const uint32x4_t src2) {
+ const uint32x4_t sum = vaddq_u32(src0, src1);
+ return vaddq_u32(sum, src2);
+}
+
+inline uint32x4x2_t Sum3_32(const uint32x4x2_t src[3]) {
+ uint32x4x2_t d;
+ d.val[0] = Sum3_32(src[0].val[0], src[1].val[0], src[2].val[0]);
+ d.val[1] = Sum3_32(src[0].val[1], src[1].val[1], src[2].val[1]);
+ return d;
+}
+
+inline uint16x8_t Sum3W_16(const uint8x8_t src[3]) {
+ const uint16x8_t sum = vaddl_u8(src[0], src[1]);
+ return vaddw_u8(sum, src[2]);
+}
+
+inline uint32x4_t Sum3W_32(const uint16x4_t src[3]) {
+ const uint32x4_t sum = vaddl_u16(src[0], src[1]);
+ return vaddw_u16(sum, src[2]);
+}
+
+inline uint16x8_t Sum5_16(const uint16x8_t src[5]) {
+ const uint16x8_t sum01 = vaddq_u16(src[0], src[1]);
+ const uint16x8_t sum23 = vaddq_u16(src[2], src[3]);
+ const uint16x8_t sum = vaddq_u16(sum01, sum23);
+ return vaddq_u16(sum, src[4]);
+}
+
+inline uint32x4_t Sum5_32(const uint32x4_t src0, const uint32x4_t src1,
+ const uint32x4_t src2, const uint32x4_t src3,
+ const uint32x4_t src4) {
+ const uint32x4_t sum01 = vaddq_u32(src0, src1);
+ const uint32x4_t sum23 = vaddq_u32(src2, src3);
+ const uint32x4_t sum = vaddq_u32(sum01, sum23);
+ return vaddq_u32(sum, src4);
+}
+
+inline uint32x4x2_t Sum5_32(const uint32x4x2_t src[5]) {
+ uint32x4x2_t d;
+ d.val[0] = Sum5_32(src[0].val[0], src[1].val[0], src[2].val[0], src[3].val[0],
+ src[4].val[0]);
+ d.val[1] = Sum5_32(src[0].val[1], src[1].val[1], src[2].val[1], src[3].val[1],
+ src[4].val[1]);
+ return d;
+}
+
+inline uint32x4_t Sum5W_32(const uint16x4_t src[5]) {
+ const uint32x4_t sum01 = vaddl_u16(src[0], src[1]);
+ const uint32x4_t sum23 = vaddl_u16(src[2], src[3]);
+ const uint32x4_t sum0123 = vaddq_u32(sum01, sum23);
+ return vaddw_u16(sum0123, src[4]);
+}
+
+inline uint16x8_t Sum3Horizontal(const uint8x8x2_t src) {
+ uint8x8_t s[3];
+ Prepare3_8(src, s);
+ return Sum3W_16(s);
+}
+
+inline uint32x4x2_t Sum3WHorizontal(const uint16x8x2_t src) {
+ uint16x4_t low[3], high[3];
+ uint32x4x2_t sum;
+ Prepare3_16(src, low, high);
+ sum.val[0] = Sum3W_32(low);
+ sum.val[1] = Sum3W_32(high);
+ return sum;
+}
+
+inline uint16x8_t Sum5Horizontal(const uint8x8x2_t src) {
+ uint8x8_t s[5];
+ Prepare5_8(src, s);
+ const uint16x8_t sum01 = vaddl_u8(s[0], s[1]);
+ const uint16x8_t sum23 = vaddl_u8(s[2], s[3]);
+ const uint16x8_t sum0123 = vaddq_u16(sum01, sum23);
+ return vaddw_u8(sum0123, s[4]);
+}
+
+inline uint32x4x2_t Sum5WHorizontal(const uint16x8x2_t src) {
+ uint16x4_t low[5], high[5];
+ Prepare5_16(src, low, high);
+ uint32x4x2_t sum;
+ sum.val[0] = Sum5W_32(low);
+ sum.val[1] = Sum5W_32(high);
+ return sum;
+}
+
+void SumHorizontal(const uint16x4_t src[5], uint32x4_t* const row_sq3,
+ uint32x4_t* const row_sq5) {
+ const uint32x4_t sum04 = vaddl_u16(src[0], src[4]);
+ const uint32x4_t sum12 = vaddl_u16(src[1], src[2]);
+ *row_sq3 = vaddw_u16(sum12, src[3]);
+ *row_sq5 = vaddq_u32(sum04, *row_sq3);
+}
+
+void SumHorizontal(const uint8x8x2_t src, const uint16x8x2_t sq,
+ uint16x8_t* const row3, uint16x8_t* const row5,
+ uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) {
+ uint8x8_t s[5];
+ Prepare5_8(src, s);
+ const uint16x8_t sum04 = vaddl_u8(s[0], s[4]);
+ const uint16x8_t sum12 = vaddl_u8(s[1], s[2]);
+ *row3 = vaddw_u8(sum12, s[3]);
+ *row5 = vaddq_u16(sum04, *row3);
+ uint16x4_t low[5], high[5];
+ Prepare5_16(sq, low, high);
+ SumHorizontal(low, &row_sq3->val[0], &row_sq5->val[0]);
+ SumHorizontal(high, &row_sq3->val[1], &row_sq5->val[1]);
+}
+
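+// Sum343() and Sum565() compute the weighted horizontal sums
+// 3*a + 4*b + 3*c and 5*a + 6*b + 5*c over three adjacent positions; the *W
+// variants do the same on the widened (squared-sum) values. These are the
+// row weightings used by the SGR pass 2 and pass 1 box filters, respectively.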
+inline uint16x8_t Sum343(const uint8x8x2_t src) {
+ uint8x8_t s[3];
+ Prepare3_8(src, s);
+ const uint16x8_t sum = Sum3W_16(s);
+ const uint16x8_t sum3 = Sum3_16(sum, sum, sum);
+ return vaddw_u8(sum3, s[1]);
+}
+
+inline uint32x4_t Sum343W(const uint16x4_t src[3]) {
+ const uint32x4_t sum = Sum3W_32(src);
+ const uint32x4_t sum3 = Sum3_32(sum, sum, sum);
+ return vaddw_u16(sum3, src[1]);
+}
+
+inline uint32x4x2_t Sum343W(const uint16x8x2_t src) {
+ uint16x4_t low[3], high[3];
+ uint32x4x2_t d;
+ Prepare3_16(src, low, high);
+ d.val[0] = Sum343W(low);
+ d.val[1] = Sum343W(high);
+ return d;
+}
+
+inline uint16x8_t Sum565(const uint8x8x2_t src) {
+ uint8x8_t s[3];
+ Prepare3_8(src, s);
+ const uint16x8_t sum = Sum3W_16(s);
+ const uint16x8_t sum4 = vshlq_n_u16(sum, 2);
+ const uint16x8_t sum5 = vaddq_u16(sum4, sum);
+ return vaddw_u8(sum5, s[1]);
+}
+
+inline uint32x4_t Sum565W(const uint16x4_t src[3]) {
+ const uint32x4_t sum = Sum3W_32(src);
+ const uint32x4_t sum4 = vshlq_n_u32(sum, 2);
+ const uint32x4_t sum5 = vaddq_u32(sum4, sum);
+ return vaddw_u16(sum5, src[1]);
+}
+
+inline uint32x4x2_t Sum565W(const uint16x8x2_t src) {
+ uint16x4_t low[3], high[3];
+ uint32x4x2_t d;
+ Prepare3_16(src, low, high);
+ d.val[0] = Sum565W(low);
+ d.val[1] = Sum565W(high);
+ return d;
+}
+
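+// For each input row, BoxSum() stores the horizontal sums of 3 and 5
+// consecutive pixels and of their squares into |sum3|/|sum5| and
+// |square_sum3|/|square_sum5|; the templated overload below computes only
+// one of the two window sizes.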
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const int height, const ptrdiff_t sum_stride, uint16_t* sum3,
+ uint16_t* sum5, uint32_t* square_sum3,
+ uint32_t* square_sum5) {
+ int y = height;
+ do {
+ uint8x8x2_t s;
+ uint16x8x2_t sq;
+ s.val[0] = vld1_u8(src);
+ sq.val[0] = vmull_u8(s.val[0], s.val[0]);
+ ptrdiff_t x = 0;
+ do {
+ uint16x8_t row3, row5;
+ uint32x4x2_t row_sq3, row_sq5;
+ s.val[1] = vld1_u8(src + x + 8);
+ sq.val[1] = vmull_u8(s.val[1], s.val[1]);
+ SumHorizontal(s, sq, &row3, &row5, &row_sq3, &row_sq5);
+ vst1q_u16(sum3, row3);
+ vst1q_u16(sum5, row5);
+ vst1q_u32(square_sum3 + 0, row_sq3.val[0]);
+ vst1q_u32(square_sum3 + 4, row_sq3.val[1]);
+ vst1q_u32(square_sum5 + 0, row_sq5.val[0]);
+ vst1q_u32(square_sum5 + 4, row_sq5.val[1]);
+ s.val[0] = s.val[1];
+ sq.val[0] = sq.val[1];
+ sum3 += 8;
+ sum5 += 8;
+ square_sum3 += 8;
+ square_sum5 += 8;
+ x += 8;
+ } while (x < sum_stride);
+ src += src_stride;
+ } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const int height, const ptrdiff_t sum_stride, uint16_t* sums,
+ uint32_t* square_sums) {
+ static_assert(size == 3 || size == 5, "");
+ int y = height;
+ do {
+ uint8x8x2_t s;
+ uint16x8x2_t sq;
+ s.val[0] = vld1_u8(src);
+ sq.val[0] = vmull_u8(s.val[0], s.val[0]);
+ ptrdiff_t x = 0;
+ do {
+ uint16x8_t row;
+ uint32x4x2_t row_sq;
+ s.val[1] = vld1_u8(src + x + 8);
+ sq.val[1] = vmull_u8(s.val[1], s.val[1]);
+ if (size == 3) {
+ row = Sum3Horizontal(s);
+ row_sq = Sum3WHorizontal(sq);
+ } else {
+ row = Sum5Horizontal(s);
+ row_sq = Sum5WHorizontal(sq);
+ }
+ vst1q_u16(sums, row);
+ vst1q_u32(square_sums + 0, row_sq.val[0]);
+ vst1q_u32(square_sums + 4, row_sq.val[1]);
+ s.val[0] = s.val[1];
+ sq.val[0] = sq.val[1];
+ sums += 8;
+ square_sums += 8;
+ x += 8;
+ } while (x < sum_stride);
+ src += src_stride;
+ } while (--y != 0);
+}
+
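+// CalculateMa() and CalculateIntermediate() derive per-pixel |ma| and |b|
+// from the box sums. A scalar sketch (illustrative only):
+//   p  = std::max(0, n * sum_sq - sum * sum)
+//   z  = RoundShift(p * scale, kSgrProjScaleBits)
+//   ma = kSgrMaLookup[std::min(z, 255)]
+//   b  = RoundShift(ma * sum * one_over_n, kSgrProjReciprocalBits)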
+template <int n>
+inline uint16x4_t CalculateMa(const uint16x4_t sum, const uint32x4_t sum_sq,
+ const uint32_t scale) {
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const uint32x4_t dxd = vmull_u16(sum, sum);
+ const uint32x4_t axn = vmulq_n_u32(sum_sq, n);
+ // Ensure |p| does not underflow by using saturating subtraction.
+ const uint32x4_t p = vqsubq_u32(axn, dxd);
+ const uint32x4_t pxs = vmulq_n_u32(p, scale);
+  // vrshrn_n_u32() (rounding narrowing shift) can shift by at most 16, but
+  // kSgrProjScaleBits is 20, so do the full rounding shift first and then
+  // narrow separately.
+ const uint32x4_t shifted = vrshrq_n_u32(pxs, kSgrProjScaleBits);
+ return vmovn_u32(shifted);
+}
+
+template <int n>
+inline void CalculateIntermediate(const uint16x8_t sum,
+ const uint32x4x2_t sum_sq,
+ const uint32_t scale, uint8x8_t* const ma,
+ uint16x8_t* const b) {
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+ const uint16x4_t z0 = CalculateMa<n>(vget_low_u16(sum), sum_sq.val[0], scale);
+ const uint16x4_t z1 =
+ CalculateMa<n>(vget_high_u16(sum), sum_sq.val[1], scale);
+ const uint16x8_t z01 = vcombine_u16(z0, z1);
+ // Using vqmovn_u16() needs an extra sign extension instruction.
+ const uint16x8_t z = vminq_u16(z01, vdupq_n_u16(255));
+ // Using vgetq_lane_s16() can save the sign extension instruction.
+ const uint8_t lookup[8] = {
+ kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 0)],
+ kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 1)],
+ kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 2)],
+ kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 3)],
+ kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 4)],
+ kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 5)],
+ kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 6)],
+ kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 7)]};
+ *ma = vld1_u8(lookup);
+  // b = ma * sum * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const uint16x8_t maq = vmovl_u8(*ma);
+ const uint32x4_t m0 = vmull_u16(vget_low_u16(maq), vget_low_u16(sum));
+ const uint32x4_t m1 = vmull_u16(vget_high_u16(maq), vget_high_u16(sum));
+ const uint32x4_t m2 = vmulq_n_u32(m0, one_over_n);
+ const uint32x4_t m3 = vmulq_n_u32(m1, one_over_n);
+ const uint16x4_t b_lo = vrshrn_n_u32(m2, kSgrProjReciprocalBits);
+ const uint16x4_t b_hi = vrshrn_n_u32(m3, kSgrProjReciprocalBits);
+ *b = vcombine_u16(b_lo, b_hi);
+}
+
+inline void CalculateIntermediate5(const uint16x8_t s5[5],
+ const uint32x4x2_t sq5[5],
+ const uint32_t scale, uint8x8_t* const ma,
+ uint16x8_t* const b) {
+ const uint16x8_t sum = Sum5_16(s5);
+ const uint32x4x2_t sum_sq = Sum5_32(sq5);
+ CalculateIntermediate<25>(sum, sum_sq, scale, ma, b);
+}
+
+inline void CalculateIntermediate3(const uint16x8_t s3[3],
+ const uint32x4x2_t sq3[3],
+ const uint32_t scale, uint8x8_t* const ma,
+ uint16x8_t* const b) {
+ const uint16x8_t sum = Sum3_16(s3);
+ const uint32x4x2_t sum_sq = Sum3_32(sq3);
+ CalculateIntermediate<9>(sum, sum_sq, scale, ma, b);
+}
+
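+// Store343_444() turns one row of 3x3 box outputs into both the 4-4-4 and
+// 3-4-3 weighted sums needed by pass 2: sum444 = 4 * (a + b + c) and
+// sum343 = sum444 - (a + b + c) + b = 3*a + 4*b + 3*c, applied to |ma| and
+// |b| alike, and writes the results to |ma343|/|ma444|/|b343|/|b444|.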
+inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
+ const ptrdiff_t x, uint16x8_t* const sum_ma343,
+ uint16x8_t* const sum_ma444,
+ uint32x4x2_t* const sum_b343,
+ uint32x4x2_t* const sum_b444, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ uint8x8_t s[3];
+ Prepare3_8(ma3, s);
+ const uint16x8_t sum_ma111 = Sum3W_16(s);
+ *sum_ma444 = vshlq_n_u16(sum_ma111, 2);
+ const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111);
+ *sum_ma343 = vaddw_u8(sum333, s[1]);
+ uint16x4_t low[3], high[3];
+ uint32x4x2_t sum_b111;
+ Prepare3_16(b3, low, high);
+ sum_b111.val[0] = Sum3W_32(low);
+ sum_b111.val[1] = Sum3W_32(high);
+ sum_b444->val[0] = vshlq_n_u32(sum_b111.val[0], 2);
+ sum_b444->val[1] = vshlq_n_u32(sum_b111.val[1], 2);
+ sum_b343->val[0] = vsubq_u32(sum_b444->val[0], sum_b111.val[0]);
+ sum_b343->val[1] = vsubq_u32(sum_b444->val[1], sum_b111.val[1]);
+ sum_b343->val[0] = vaddw_u16(sum_b343->val[0], low[1]);
+ sum_b343->val[1] = vaddw_u16(sum_b343->val[1], high[1]);
+ vst1q_u16(ma343 + x, *sum_ma343);
+ vst1q_u16(ma444 + x, *sum_ma444);
+ vst1q_u32(b343 + x + 0, sum_b343->val[0]);
+ vst1q_u32(b343 + x + 4, sum_b343->val[1]);
+ vst1q_u32(b444 + x + 0, sum_b444->val[0]);
+ vst1q_u32(b444 + x + 4, sum_b444->val[1]);
+}
+
+inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
+ const ptrdiff_t x, uint16x8_t* const sum_ma343,
+ uint32x4x2_t* const sum_b343, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ uint16x8_t sum_ma444;
+ uint32x4x2_t sum_b444;
+ Store343_444(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ uint16x8_t sum_ma343;
+ uint32x4x2_t sum_b343;
+ Store343_444(ma3, b3, x, &sum_ma343, &sum_b343, ma343, ma444, b343, b444);
+}
+
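+// The BoxFilterPreProcess*() helpers load the next 8 pixels of the incoming
+// row(s), append the 3- or 5-wide horizontal sums and squared sums to the
+// circular row buffers, reload the previously stored rows, and reduce the
+// whole vertical window to one |ma|/|b| pair per pixel via
+// CalculateIntermediate3()/CalculateIntermediate5().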
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x,
+ const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], uint8x8x2_t s[2], uint16x8x2_t sq[2],
+ uint8x8_t* const ma, uint16x8_t* const b) {
+ uint16x8_t s5[5];
+ uint32x4x2_t sq5[5];
+ s[0].val[1] = vld1_u8(src0 + x + 8);
+ s[1].val[1] = vld1_u8(src1 + x + 8);
+ sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]);
+ sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]);
+ s5[3] = Sum5Horizontal(s[0]);
+ s5[4] = Sum5Horizontal(s[1]);
+ sq5[3] = Sum5WHorizontal(sq[0]);
+ sq5[4] = Sum5WHorizontal(sq[1]);
+ vst1q_u16(sum5[3] + x, s5[3]);
+ vst1q_u16(sum5[4] + x, s5[4]);
+ vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]);
+ s5[0] = vld1q_u16(sum5[0] + x);
+ s5[1] = vld1q_u16(sum5[1] + x);
+ s5[2] = vld1q_u16(sum5[2] + x);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
+ CalculateIntermediate5(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+ const uint8_t* const src, const ptrdiff_t x, const uint32_t scale,
+ const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+ uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma,
+ uint16x8_t* const b) {
+ uint16x8_t s5[5];
+ uint32x4x2_t sq5[5];
+ s->val[1] = vld1_u8(src + x + 8);
+ sq->val[1] = vmull_u8(s->val[1], s->val[1]);
+ s5[3] = s5[4] = Sum5Horizontal(*s);
+ sq5[3] = sq5[4] = Sum5WHorizontal(*sq);
+ s5[0] = vld1q_u16(sum5[0] + x);
+ s5[1] = vld1q_u16(sum5[1] + x);
+ s5[2] = vld1q_u16(sum5[2] + x);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
+ CalculateIntermediate5(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const uint8_t* const src, const ptrdiff_t x, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma,
+ uint16x8_t* const b) {
+ uint16x8_t s3[3];
+ uint32x4x2_t sq3[3];
+ s->val[1] = vld1_u8(src + x + 8);
+ sq->val[1] = vmull_u8(s->val[1], s->val[1]);
+ s3[2] = Sum3Horizontal(*s);
+ sq3[2] = Sum3WHorizontal(*sq);
+ vst1q_u16(sum3[2] + x, s3[2]);
+ vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]);
+ s3[0] = vld1q_u16(sum3[0] + x);
+ s3[1] = vld1q_u16(sum3[1] + x);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
+ CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint8x8x2_t s[2], uint16x8x2_t sq[2], uint8x8_t* const ma3_0,
+ uint8x8_t* const ma3_1, uint16x8_t* const b3_0, uint16x8_t* const b3_1,
+ uint8x8_t* const ma5, uint16x8_t* const b5) {
+ uint16x8_t s3[4], s5[5];
+ uint32x4x2_t sq3[4], sq5[5];
+ s[0].val[1] = vld1_u8(src0 + x + 8);
+ s[1].val[1] = vld1_u8(src1 + x + 8);
+ sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]);
+ sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]);
+ SumHorizontal(s[0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]);
+ SumHorizontal(s[1], sq[1], &s3[3], &s5[4], &sq3[3], &sq5[4]);
+ vst1q_u16(sum3[2] + x, s3[2]);
+ vst1q_u16(sum3[3] + x, s3[3]);
+ vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]);
+ vst1q_u32(square_sum3[3] + x + 0, sq3[3].val[0]);
+ vst1q_u32(square_sum3[3] + x + 4, sq3[3].val[1]);
+ vst1q_u16(sum5[3] + x, s5[3]);
+ vst1q_u16(sum5[4] + x, s5[4]);
+ vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]);
+ s3[0] = vld1q_u16(sum3[0] + x);
+ s3[1] = vld1q_u16(sum3[1] + x);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
+ s5[0] = vld1q_u16(sum5[0] + x);
+ s5[1] = vld1q_u16(sum5[1] + x);
+ s5[2] = vld1q_u16(sum5[2] + x);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
+ CalculateIntermediate3(s3, sq3, scales[1], ma3_0, b3_0);
+ CalculateIntermediate3(s3 + 1, sq3 + 1, scales[1], ma3_1, b3_1);
+ CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+ const uint8_t* const src, const ptrdiff_t x, const uint16_t scales[2],
+ const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+ uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma3,
+ uint8x8_t* const ma5, uint16x8_t* const b3, uint16x8_t* const b5) {
+ uint16x8_t s3[3], s5[5];
+ uint32x4x2_t sq3[3], sq5[5];
+ s->val[1] = vld1_u8(src + x + 8);
+ sq->val[1] = vmull_u8(s->val[1], s->val[1]);
+ SumHorizontal(*s, *sq, &s3[2], &s5[3], &sq3[2], &sq5[3]);
+ s5[0] = vld1q_u16(sum5[0] + x);
+ s5[1] = vld1q_u16(sum5[1] + x);
+ s5[2] = vld1q_u16(sum5[2] + x);
+ s5[4] = s5[3];
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
+ sq5[4] = sq5[3];
+ CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+ s3[0] = vld1q_u16(sum3[0] + x);
+ s3[1] = vld1q_u16(sum3[1] + x);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
+ CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+ const uint8_t* const src1, const int width,
+ const uint32_t scale,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ uint16_t* ma565, uint32_t* b565) {
+ uint8x8x2_t s[2], mas;
+ uint16x8x2_t sq[2], bs;
+ s[0].val[0] = vld1_u8(src0);
+ s[1].val[0] = vld1_u8(src1);
+ sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
+ sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
+ BoxFilterPreProcess5(src0, src1, 0, scale, sum5, square_sum5, s, sq,
+ &mas.val[0], &bs.val[0]);
+
+ int x = 0;
+ do {
+ s[0].val[0] = s[0].val[1];
+ s[1].val[0] = s[1].val[1];
+ sq[0].val[0] = sq[0].val[1];
+ sq[1].val[0] = sq[1].val[1];
+ BoxFilterPreProcess5(src0, src1, x + 8, scale, sum5, square_sum5, s, sq,
+ &mas.val[1], &bs.val[1]);
+ const uint16x8_t ma = Sum565(mas);
+ const uint32x4x2_t b = Sum565W(bs);
+ vst1q_u16(ma565, ma);
+ vst1q_u32(b565 + 0, b.val[0]);
+ vst1q_u32(b565 + 4, b.val[1]);
+ mas.val[0] = mas.val[1];
+ bs.val[0] = bs.val[1];
+ ma565 += 8;
+ b565 += 8;
+ x += 8;
+ } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+ const uint8_t* const src, const int width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16_t* ma343,
+ uint16_t* ma444, uint32_t* b343, uint32_t* b444) {
+ uint8x8x2_t s, mas;
+ uint16x8x2_t sq, bs;
+ s.val[0] = vld1_u8(src);
+ sq.val[0] = vmull_u8(s.val[0], s.val[0]);
+ BoxFilterPreProcess3(src, 0, scale, sum3, square_sum3, &s, &sq, &mas.val[0],
+ &bs.val[0]);
+
+ int x = 0;
+ do {
+ s.val[0] = s.val[1];
+ sq.val[0] = sq.val[1];
+ BoxFilterPreProcess3(src, x + 8, scale, sum3, square_sum3, &s, &sq,
+ &mas.val[1], &bs.val[1]);
+ if (calculate444) {
+ Store343_444(mas, bs, 0, ma343, ma444, b343, b444);
+ ma444 += 8;
+ b444 += 8;
+ } else {
+ const uint16x8_t ma = Sum343(mas);
+ const uint32x4x2_t b = Sum343W(bs);
+ vst1q_u16(ma343, ma);
+ vst1q_u32(b343 + 0, b.val[0]);
+ vst1q_u32(b343 + 4, b.val[1]);
+ }
+ mas.val[0] = mas.val[1];
+ bs.val[0] = bs.val[1];
+ ma343 += 8;
+ b343 += 8;
+ x += 8;
+ } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1, const int width,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343[4], uint16_t* const ma444[2], uint16_t* ma565,
+ uint32_t* const b343[4], uint32_t* const b444[2], uint32_t* b565) {
+ uint8x8x2_t s[2];
+ uint8x8x2_t ma3[2], ma5;
+ uint16x8x2_t sq[2], b3[2], b5;
+ s[0].val[0] = vld1_u8(src0);
+ s[1].val[0] = vld1_u8(src1);
+ sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
+ sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
+ BoxFilterPreProcess(src0, src1, 0, scales, sum3, sum5, square_sum3,
+ square_sum5, s, sq, &ma3[0].val[0], &ma3[1].val[0],
+ &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]);
+
+ int x = 0;
+ do {
+ s[0].val[0] = s[0].val[1];
+ s[1].val[0] = s[1].val[1];
+ sq[0].val[0] = sq[0].val[1];
+ sq[1].val[0] = sq[1].val[1];
+ BoxFilterPreProcess(src0, src1, x + 8, scales, sum3, sum5, square_sum3,
+ square_sum5, s, sq, &ma3[0].val[1], &ma3[1].val[1],
+ &b3[0].val[1], &b3[1].val[1], &ma5.val[1], &b5.val[1]);
+ uint16x8_t ma = Sum343(ma3[0]);
+ uint32x4x2_t b = Sum343W(b3[0]);
+ vst1q_u16(ma343[0] + x, ma);
+ vst1q_u32(b343[0] + x, b.val[0]);
+ vst1q_u32(b343[0] + x + 4, b.val[1]);
+ Store343_444(ma3[1], b3[1], x, ma343[1], ma444[0], b343[1], b444[0]);
+ ma = Sum565(ma5);
+ b = Sum565W(b5);
+ vst1q_u16(ma565, ma);
+ vst1q_u32(b565 + 0, b.val[0]);
+ vst1q_u32(b565 + 4, b.val[1]);
+ ma3[0].val[0] = ma3[0].val[1];
+ ma3[1].val[0] = ma3[1].val[1];
+ b3[0].val[0] = b3[0].val[1];
+ b3[1].val[0] = b3[1].val[1];
+ ma5.val[0] = ma5.val[1];
+ b5.val[0] = b5.val[1];
+ ma565 += 8;
+ b565 += 8;
+ x += 8;
+ } while (x < width);
+}
+
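+// The per-pass filter output is computed from the weighted |ma|/|b| sums
+// (weights totaling 32, or 16 in the single-row boundary case) as, roughly,
+//   p = RoundShift(b_sum - ma_sum * src,
+//                  kSgrProjSgrBits + shift - kSgrProjRestoreBits)
+// and is later scaled by w0/w2 and added back to the source pixel by the
+// SelfGuided*Multiplier() helpers.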
+template <int shift>
+inline int16x4_t FilterOutput(const uint16x4_t src, const uint16x4_t ma,
+ const uint32x4_t b) {
+ // ma: 255 * 32 = 8160 (13 bits)
+ // b: 65088 * 32 = 2082816 (21 bits)
+ // v: b - ma * 255 (22 bits)
+ const int32x4_t v = vreinterpretq_s32_u32(vmlsl_u16(b, ma, src));
+ // kSgrProjSgrBits = 8
+ // kSgrProjRestoreBits = 4
+ // shift = 4 or 5
+ // v >> 8 or 9 (13 bits)
+ return vrshrn_n_s32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline int16x8_t CalculateFilteredOutput(const uint8x8_t src,
+ const uint16x8_t ma,
+ const uint32x4x2_t b) {
+ const uint16x8_t src_u16 = vmovl_u8(src);
+ const int16x4_t dst_lo =
+ FilterOutput<shift>(vget_low_u16(src_u16), vget_low_u16(ma), b.val[0]);
+ const int16x4_t dst_hi =
+ FilterOutput<shift>(vget_high_u16(src_u16), vget_high_u16(ma), b.val[1]);
+ return vcombine_s16(dst_lo, dst_hi); // 13 bits
+}
+
+inline int16x8_t CalculateFilteredOutputPass1(const uint8x8_t s,
+ uint16x8_t ma[2],
+ uint32x4x2_t b[2]) {
+ const uint16x8_t ma_sum = vaddq_u16(ma[0], ma[1]);
+ uint32x4x2_t b_sum;
+ b_sum.val[0] = vaddq_u32(b[0].val[0], b[1].val[0]);
+ b_sum.val[1] = vaddq_u32(b[0].val[1], b[1].val[1]);
+ return CalculateFilteredOutput<5>(s, ma_sum, b_sum);
+}
+
+inline int16x8_t CalculateFilteredOutputPass2(const uint8x8_t s,
+ uint16x8_t ma[3],
+ uint32x4x2_t b[3]) {
+ const uint16x8_t ma_sum = Sum3_16(ma);
+ const uint32x4x2_t b_sum = Sum3_32(b);
+ return CalculateFilteredOutput<5>(s, ma_sum, b_sum);
+}
+
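+// Final combine, in scalar terms (illustrative only):
+//   dst = ClipU8(src + RoundShift(w0 * p0 [+ w2 * p1],
+//                kSgrProjRestoreBits + kSgrProjPrecisionBits))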
+inline void SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2],
+ uint8_t* const dst) {
+ const int16x4_t v_lo =
+ vrshrn_n_s32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const int16x4_t v_hi =
+ vrshrn_n_s32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const int16x8_t vv = vcombine_s16(v_lo, v_hi);
+ const int16x8_t s = ZeroExtend(src);
+ const int16x8_t d = vaddq_s16(s, vv);
+ vst1_u8(dst, vqmovun_s16(d));
+}
+
+inline void SelfGuidedDoubleMultiplier(const uint8x8_t src,
+ const int16x8_t filter[2], const int w0,
+ const int w2, uint8_t* const dst) {
+ int32x4_t v[2];
+ v[0] = vmull_n_s16(vget_low_s16(filter[0]), w0);
+ v[1] = vmull_n_s16(vget_high_s16(filter[0]), w0);
+ v[0] = vmlal_n_s16(v[0], vget_low_s16(filter[1]), w2);
+ v[1] = vmlal_n_s16(v[1], vget_high_s16(filter[1]), w2);
+ SelfGuidedFinal(src, v, dst);
+}
+
+inline void SelfGuidedSingleMultiplier(const uint8x8_t src,
+ const int16x8_t filter, const int w0,
+ uint8_t* const dst) {
+ // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+ int32x4_t v[2];
+ v[0] = vmull_n_s16(vget_low_s16(filter), w0);
+ v[1] = vmull_n_s16(vget_high_s16(filter), w0);
+ SelfGuidedFinal(src, v, dst);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width, const uint32_t scale,
+ const int16_t w0, uint16_t* const ma565[2], uint32_t* const b565[2],
+ uint8_t* const dst) {
+ uint8x8x2_t s[2], mas;
+ uint16x8x2_t sq[2], bs;
+ s[0].val[0] = vld1_u8(src0);
+ s[1].val[0] = vld1_u8(src1);
+ sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
+ sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
+ BoxFilterPreProcess5(src0, src1, 0, scale, sum5, square_sum5, s, sq,
+ &mas.val[0], &bs.val[0]);
+
+ int x = 0;
+ do {
+ s[0].val[0] = s[0].val[1];
+ s[1].val[0] = s[1].val[1];
+ sq[0].val[0] = sq[0].val[1];
+ sq[1].val[0] = sq[1].val[1];
+ BoxFilterPreProcess5(src0, src1, x + 8, scale, sum5, square_sum5, s, sq,
+ &mas.val[1], &bs.val[1]);
+ uint16x8_t ma[2];
+ uint32x4x2_t b[2];
+ ma[1] = Sum565(mas);
+ b[1] = Sum565W(bs);
+ vst1q_u16(ma565[1] + x, ma[1]);
+ vst1q_u32(b565[1] + x + 0, b[1].val[0]);
+ vst1q_u32(b565[1] + x + 4, b[1].val[1]);
+ const uint8x8_t sr0 = vld1_u8(src + x);
+ const uint8x8_t sr1 = vld1_u8(src + stride + x);
+ int16x8_t p0, p1;
+ ma[0] = vld1q_u16(ma565[0] + x);
+ b[0].val[0] = vld1q_u32(b565[0] + x + 0);
+ b[0].val[1] = vld1q_u32(b565[0] + x + 4);
+ p0 = CalculateFilteredOutputPass1(sr0, ma, b);
+ p1 = CalculateFilteredOutput<4>(sr1, ma[1], b[1]);
+ SelfGuidedSingleMultiplier(sr0, p0, w0, dst + x);
+ SelfGuidedSingleMultiplier(sr1, p1, w0, dst + stride + x);
+ mas.val[0] = mas.val[1];
+ bs.val[0] = bs.val[1];
+ x += 8;
+ } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(const uint8_t* const src,
+ const uint8_t* const src0, const int width,
+ const uint32_t scale, const int16_t w0,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ uint16_t* ma565, uint32_t* b565,
+ uint8_t* const dst) {
+ uint8x8x2_t s, mas;
+ uint16x8x2_t sq, bs;
+ s.val[0] = vld1_u8(src0);
+ sq.val[0] = vmull_u8(s.val[0], s.val[0]);
+ BoxFilterPreProcess5LastRow(src0, 0, scale, sum5, square_sum5, &s, &sq,
+ &mas.val[0], &bs.val[0]);
+
+ int x = 0;
+ do {
+ s.val[0] = s.val[1];
+ sq.val[0] = sq.val[1];
+ BoxFilterPreProcess5LastRow(src0, x + 8, scale, sum5, square_sum5, &s, &sq,
+ &mas.val[1], &bs.val[1]);
+ uint16x8_t ma[2];
+ uint32x4x2_t b[2];
+ ma[1] = Sum565(mas);
+ b[1] = Sum565W(bs);
+ mas.val[0] = mas.val[1];
+ bs.val[0] = bs.val[1];
+ ma[0] = vld1q_u16(ma565);
+ b[0].val[0] = vld1q_u32(b565 + 0);
+ b[0].val[1] = vld1q_u32(b565 + 4);
+ const uint8x8_t sr = vld1_u8(src + x);
+ const int16x8_t p = CalculateFilteredOutputPass1(sr, ma, b);
+ SelfGuidedSingleMultiplier(sr, p, w0, dst + x);
+ ma565 += 8;
+ b565 += 8;
+ x += 8;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const uint32_t scale, const int16_t w0, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], uint16_t* const ma343[3],
+ uint16_t* const ma444[2], uint32_t* const b343[3], uint32_t* const b444[2],
+ uint8_t* const dst) {
+ uint8x8x2_t s, mas;
+ uint16x8x2_t sq, bs;
+ s.val[0] = vld1_u8(src0);
+ sq.val[0] = vmull_u8(s.val[0], s.val[0]);
+ BoxFilterPreProcess3(src0, 0, scale, sum3, square_sum3, &s, &sq, &mas.val[0],
+ &bs.val[0]);
+
+ int x = 0;
+ do {
+ s.val[0] = s.val[1];
+ sq.val[0] = sq.val[1];
+ BoxFilterPreProcess3(src0, x + 8, scale, sum3, square_sum3, &s, &sq,
+ &mas.val[1], &bs.val[1]);
+ uint16x8_t ma[3];
+ uint32x4x2_t b[3];
+ Store343_444(mas, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2],
+ b444[1]);
+ const uint8x8_t sr = vld1_u8(src + x);
+ ma[0] = vld1q_u16(ma343[0] + x);
+ ma[1] = vld1q_u16(ma444[0] + x);
+ b[0].val[0] = vld1q_u32(b343[0] + x + 0);
+ b[0].val[1] = vld1q_u32(b343[0] + x + 4);
+ b[1].val[0] = vld1q_u32(b444[0] + x + 0);
+ b[1].val[1] = vld1q_u32(b444[0] + x + 4);
+ const int16x8_t p = CalculateFilteredOutputPass2(sr, ma, b);
+ SelfGuidedSingleMultiplier(sr, p, w0, dst + x);
+ mas.val[0] = mas.val[1];
+ bs.val[0] = bs.val[1];
+ x += 8;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343[4], uint16_t* const ma444[3],
+ uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
+ uint32_t* const b565[2], uint8_t* const dst) {
+ uint8x8x2_t s[2], ma3[2], ma5;
+ uint16x8x2_t sq[2], b3[2], b5;
+ s[0].val[0] = vld1_u8(src0);
+ s[1].val[0] = vld1_u8(src1);
+ sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
+ sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
+ BoxFilterPreProcess(src0, src1, 0, scales, sum3, sum5, square_sum3,
+ square_sum5, s, sq, &ma3[0].val[0], &ma3[1].val[0],
+ &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]);
+
+ int x = 0;
+ do {
+ s[0].val[0] = s[0].val[1];
+ s[1].val[0] = s[1].val[1];
+ sq[0].val[0] = sq[0].val[1];
+ sq[1].val[0] = sq[1].val[1];
+ BoxFilterPreProcess(src0, src1, x + 8, scales, sum3, sum5, square_sum3,
+ square_sum5, s, sq, &ma3[0].val[1], &ma3[1].val[1],
+ &b3[0].val[1], &b3[1].val[1], &ma5.val[1], &b5.val[1]);
+ uint16x8_t ma[3][3];
+ uint32x4x2_t b[3][3];
+ Store343_444(ma3[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1],
+ ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444(ma3[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2],
+ b343[3], b444[2]);
+ ma[0][1] = Sum565(ma5);
+ b[0][1] = Sum565W(b5);
+ vst1q_u16(ma565[1] + x, ma[0][1]);
+ vst1q_u32(b565[1] + x, b[0][1].val[0]);
+ vst1q_u32(b565[1] + x + 4, b[0][1].val[1]);
+ ma3[0].val[0] = ma3[0].val[1];
+ ma3[1].val[0] = ma3[1].val[1];
+ b3[0].val[0] = b3[0].val[1];
+ b3[1].val[0] = b3[1].val[1];
+ ma5.val[0] = ma5.val[1];
+ b5.val[0] = b5.val[1];
+ int16x8_t p[2][2];
+ const uint8x8_t sr0 = vld1_u8(src + x);
+ const uint8x8_t sr1 = vld1_u8(src + stride + x);
+ ma[0][0] = vld1q_u16(ma565[0] + x);
+ b[0][0].val[0] = vld1q_u32(b565[0] + x);
+ b[0][0].val[1] = vld1q_u32(b565[0] + x + 4);
+ p[0][0] = CalculateFilteredOutputPass1(sr0, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1, ma[0][1], b[0][1]);
+ ma[1][0] = vld1q_u16(ma343[0] + x);
+ ma[1][1] = vld1q_u16(ma444[0] + x);
+ b[1][0].val[0] = vld1q_u32(b343[0] + x);
+ b[1][0].val[1] = vld1q_u32(b343[0] + x + 4);
+ b[1][1].val[0] = vld1q_u32(b444[0] + x);
+ b[1][1].val[1] = vld1q_u32(b444[0] + x + 4);
+ p[0][1] = CalculateFilteredOutputPass2(sr0, ma[1], b[1]);
+ ma[2][0] = vld1q_u16(ma343[1] + x);
+ b[2][0].val[0] = vld1q_u32(b343[1] + x);
+ b[2][0].val[1] = vld1q_u32(b343[1] + x + 4);
+ p[1][1] = CalculateFilteredOutputPass2(sr1, ma[2], b[2]);
+ SelfGuidedDoubleMultiplier(sr0, p[0], w0, w2, dst + x);
+ SelfGuidedDoubleMultiplier(sr1, p[1], w0, w2, dst + stride + x);
+ x += 8;
+ } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343[4], uint16_t* const ma444[3],
+ uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
+ uint32_t* const b565[2], uint8_t* const dst) {
+ uint8x8x2_t s, ma3, ma5;
+ uint16x8x2_t sq, b3, b5;
+ uint16x8_t ma[3];
+ uint32x4x2_t b[3];
+ s.val[0] = vld1_u8(src0);
+ sq.val[0] = vmull_u8(s.val[0], s.val[0]);
+ BoxFilterPreProcessLastRow(src0, 0, scales, sum3, sum5, square_sum3,
+ square_sum5, &s, &sq, &ma3.val[0], &ma5.val[0],
+ &b3.val[0], &b5.val[0]);
+
+ int x = 0;
+ do {
+ s.val[0] = s.val[1];
+ sq.val[0] = sq.val[1];
+ BoxFilterPreProcessLastRow(src0, x + 8, scales, sum3, sum5, square_sum3,
+ square_sum5, &s, &sq, &ma3.val[1], &ma5.val[1],
+ &b3.val[1], &b5.val[1]);
+ ma[1] = Sum565(ma5);
+ b[1] = Sum565W(b5);
+ ma5.val[0] = ma5.val[1];
+ b5.val[0] = b5.val[1];
+ ma[2] = Sum343(ma3);
+ b[2] = Sum343W(b3);
+ ma3.val[0] = ma3.val[1];
+ b3.val[0] = b3.val[1];
+ const uint8x8_t sr = vld1_u8(src + x);
+ int16x8_t p[2];
+ ma[0] = vld1q_u16(ma565[0] + x);
+ b[0].val[0] = vld1q_u32(b565[0] + x + 0);
+ b[0].val[1] = vld1q_u32(b565[0] + x + 4);
+ p[0] = CalculateFilteredOutputPass1(sr, ma, b);
+ ma[0] = vld1q_u16(ma343[0] + x);
+ ma[1] = vld1q_u16(ma444[0] + x);
+ b[0].val[0] = vld1q_u32(b343[0] + x + 0);
+ b[0].val[1] = vld1q_u32(b343[0] + x + 4);
+ b[1].val[0] = vld1q_u32(b444[0] + x + 0);
+ b[1].val[1] = vld1q_u32(b444[0] + x + 4);
+ p[1] = CalculateFilteredOutputPass2(sr, ma, b);
+ SelfGuidedDoubleMultiplier(sr, p, w0, w2, dst + x);
+ x += 8;
+ } while (x < width);
+}
+
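+// BoxFilterProcess() applies both SGR passes: it consumes two source rows per
+// iteration, rotating the circular sum buffers (sum3/sum5, square_sum3/5) and
+// the intermediate rows (ma343/ma444/ma565 and b343/b444/b565), and finishes
+// with the bottom border rows, falling back to BoxFilterLastRow() when
+// |height| is odd.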
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+ const RestorationUnitInfo& restoration_info, const uint8_t* src,
+ const uint8_t* const top_border, const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const ptrdiff_t sum_stride = temp_stride + 8;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum(top_border, stride, 2, sum_stride, sum3[0], sum5[1], square_sum3[0],
+ square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+ square_sum5, ma343, ma444, ma565[0], b343, b444,
+ b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5, ma343,
+ ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+ square_sum3, square_sum5, ma343, ma444, ma565, b343, b444, b565,
+ dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxFilterLastRow(src + 3, bottom_border + stride, width, scales, w0, w2,
+ sum3, sum5, square_sum3, square_sum5, ma343, ma444, ma565,
+ b343, b444, b565, dst);
+ }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src,
+ const uint8_t* const top_border,
+ const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height, SgrBuffer* const sgr_buffer,
+ uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const ptrdiff_t sum_stride = temp_stride + 8;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<5>(top_border, stride, 2, sum_stride, sum5[1], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, ma565[0],
+ b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+ square_sum5, width, scale, w0, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+ scale, w0, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxFilterPass1LastRow(src + 3, bottom_border + stride, width, scale, w0,
+ sum5, square_sum5, ma565[0], b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src,
+ const uint8_t* const top_border,
+ const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height, SgrBuffer* const sgr_buffer,
+ uint8_t* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const ptrdiff_t sum_stride = temp_stride + 8;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<3>(top_border, stride, 2, sum_stride, sum3[0], square_sum3[0]);
+ BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, ma343[0],
+ nullptr, b343[0], nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const uint8_t* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += stride;
+ }
+ BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, ma343[1],
+ ma444[0], b343[1], b444[0]);
+
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src + 2, src + 2 * stride, width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ src += 2;
+ int y = std::min(height, 2);
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src, bottom_border, width, scale, w0, sum3, square_sum3,
+ ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
+
+// If |width| is not a multiple of 8, up to 7 extra pixels are written to
+// |dest| at the end of each row. It is safe to overwrite this output as it
+// will not be part of the visible frame.
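+// Depending on |restoration_info.sgr_proj_info.index|, the filter below runs
+// pass 1 only (radius_pass_1 == 0), pass 2 only (radius_pass_0 == 0), or both
+// passes.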
+void SelfGuidedFilter_NEON(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const void* const top_border, const void* const bottom_border,
+ const ptrdiff_t stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* top = static_cast<const uint8_t*>(top_border);
+ const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+ auto* const dst = static_cast<uint8_t*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3,
+ stride, width, height, sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2,
+ stride, width, height, sgr_buffer, dst);
+ } else {
+ BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride,
+ width, height, sgr_buffer, dst);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->loop_restorations[0] = WienerFilter_NEON;
+ dsp->loop_restorations[1] = SelfGuidedFilter_NEON;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void LoopRestorationInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/loop_restoration_neon.h b/src/dsp/arm/loop_restoration_neon.h
new file mode 100644
index 0000000..b551610
--- /dev/null
+++ b/src/dsp/arm/loop_restoration_neon.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_LOOP_RESTORATION_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_LOOP_RESTORATION_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_restorations, see the defines below for specifics.
+// This function is not thread-safe.
+void LoopRestorationInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+
+#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_LOOP_RESTORATION_NEON_H_
diff --git a/src/dsp/arm/mask_blend_neon.cc b/src/dsp/arm/mask_blend_neon.cc
new file mode 100644
index 0000000..084f42f
--- /dev/null
+++ b/src/dsp/arm/mask_blend_neon.cc
@@ -0,0 +1,444 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/mask_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// TODO(b/150461164): Consider combining with GetInterIntraMask4x2().
+// Compound predictors use int16_t values and need to multiply long because the
+// Convolve range * 64 is 20 bits. Unfortunately there is no multiply int16_t by
+// int8_t and accumulate into int32_t instruction.
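+// The mask weight is at most 64 (2^6), so with the 14-bit convolve range the
+// products need roughly 20 bits. The mask is therefore widened to int16x8_t
+// here and the blend uses vmull_s16()/vmlal_s16() into int32x4_t lanes.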
+template <int subsampling_x, int subsampling_y>
+inline int16x8_t GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) {
+ if (subsampling_x == 1) {
+ const int16x4_t mask_val0 = vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask)));
+ const int16x4_t mask_val1 = vreinterpret_s16_u16(
+ vpaddl_u8(vld1_u8(mask + (mask_stride << subsampling_y))));
+ int16x8_t final_val;
+ if (subsampling_y == 1) {
+ const int16x4_t next_mask_val0 =
+ vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask + mask_stride)));
+ const int16x4_t next_mask_val1 =
+ vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask + mask_stride * 3)));
+ final_val = vaddq_s16(vcombine_s16(mask_val0, mask_val1),
+ vcombine_s16(next_mask_val0, next_mask_val1));
+ } else {
+ final_val = vreinterpretq_s16_u16(
+ vpaddlq_u8(vreinterpretq_u8_s16(vcombine_s16(mask_val0, mask_val1))));
+ }
+ return vrshrq_n_s16(final_val, subsampling_y + 1);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ const uint8x8_t mask_val0 = Load4(mask);
+ const uint8x8_t mask_val = Load4<1>(mask + mask_stride, mask_val0);
+ return vreinterpretq_s16_u16(vmovl_u8(mask_val));
+}
+
+template <int subsampling_x, int subsampling_y>
+inline int16x8_t GetMask8(const uint8_t* mask, ptrdiff_t mask_stride) {
+ if (subsampling_x == 1) {
+ int16x8_t mask_val = vreinterpretq_s16_u16(vpaddlq_u8(vld1q_u8(mask)));
+ if (subsampling_y == 1) {
+ const int16x8_t next_mask_val =
+ vreinterpretq_s16_u16(vpaddlq_u8(vld1q_u8(mask + mask_stride)));
+ mask_val = vaddq_s16(mask_val, next_mask_val);
+ }
+ return vrshrq_n_s16(mask_val, 1 + subsampling_y);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ const uint8x8_t mask_val = vld1_u8(mask);
+ return vreinterpretq_s16_u16(vmovl_u8(mask_val));
+}
+
+inline void WriteMaskBlendLine4x2(const int16_t* const pred_0,
+ const int16_t* const pred_1,
+ const int16x8_t pred_mask_0,
+ const int16x8_t pred_mask_1, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const int16x8_t pred_val_0 = vld1q_s16(pred_0);
+ const int16x8_t pred_val_1 = vld1q_s16(pred_1);
+ // int res = (mask_value * prediction_0[x] +
+ // (64 - mask_value) * prediction_1[x]) >> 6;
+ const int32x4_t weighted_pred_0_lo =
+ vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0));
+ const int32x4_t weighted_pred_0_hi =
+ vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0));
+ const int32x4_t weighted_combo_lo = vmlal_s16(
+ weighted_pred_0_lo, vget_low_s16(pred_mask_1), vget_low_s16(pred_val_1));
+ const int32x4_t weighted_combo_hi =
+ vmlal_s16(weighted_pred_0_hi, vget_high_s16(pred_mask_1),
+ vget_high_s16(pred_val_1));
+ // dst[x] = static_cast<Pixel>(
+ // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+ // (1 << kBitdepth8) - 1));
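+  // The >> 6 from |res| above is done by the truncating vshrn_n_s32(); the
+  // rounding shift by inter_post_round_bits (4 here) and the clip to
+  // [0, 255] are folded into the saturating vqrshrun_n_s16(..., 4) below.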
+ const uint8x8_t result =
+ vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6),
+ vshrn_n_s32(weighted_combo_hi, 6)),
+ 4);
+ StoreLo4(dst, result);
+ StoreHi4(dst + dst_stride, result);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4x4_NEON(const int16_t* pred_0, const int16_t* pred_1,
+ const uint8_t* mask,
+ const ptrdiff_t mask_stride, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const int16x8_t mask_inverter = vdupq_n_s16(64);
+ int16x8_t pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ // TODO(b/150461164): Arm tends to do better with load(val); val += stride
+ // It may be possible to turn this into a loop with a templated height.
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4xH_NEON(const int16_t* pred_0, const int16_t* pred_1,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ const uint8_t* mask = mask_ptr;
+ if (height == 4) {
+ MaskBlending4x4_NEON<subsampling_x, subsampling_y>(
+ pred_0, pred_1, mask, mask_stride, dst, dst_stride);
+ return;
+ }
+ const int16x8_t mask_inverter = vdupq_n_s16(64);
+ int y = 0;
+ do {
+ int16x8_t pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+ y += 8;
+ } while (y < height);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend_NEON(const void* prediction_0, const void* prediction_1,
+ const ptrdiff_t /*prediction_stride_1*/,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride, const int width,
+ const int height, void* dest,
+ const ptrdiff_t dst_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ if (width == 4) {
+ MaskBlending4xH_NEON<subsampling_x, subsampling_y>(
+ pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride);
+ return;
+ }
+ const uint8_t* mask = mask_ptr;
+ const int16x8_t mask_inverter = vdupq_n_s16(64);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const int16x8_t pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride);
+ // 64 - mask
+ const int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+ const int16x8_t pred_val_0 = vld1q_s16(pred_0 + x);
+ const int16x8_t pred_val_1 = vld1q_s16(pred_1 + x);
+ uint8x8_t result;
+ // int res = (mask_value * prediction_0[x] +
+ // (64 - mask_value) * prediction_1[x]) >> 6;
+ const int32x4_t weighted_pred_0_lo =
+ vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0));
+ const int32x4_t weighted_pred_0_hi =
+ vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0));
+ const int32x4_t weighted_combo_lo =
+ vmlal_s16(weighted_pred_0_lo, vget_low_s16(pred_mask_1),
+ vget_low_s16(pred_val_1));
+ const int32x4_t weighted_combo_hi =
+ vmlal_s16(weighted_pred_0_hi, vget_high_s16(pred_mask_1),
+ vget_high_s16(pred_val_1));
+
+ // dst[x] = static_cast<Pixel>(
+ // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+ // (1 << kBitdepth8) - 1));
+ result = vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6),
+ vshrn_n_s32(weighted_combo_hi, 6)),
+ 4);
+ vst1_u8(dst + x, result);
+
+ x += 8;
+ } while (x < width);
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+ mask += mask_stride << subsampling_y;
+ } while (++y < height);
+}
+
+// TODO(b/150461164): This is much faster for inter_intra (input is Pixel
+// values) but regresses compound versions (input is int16_t). Try to
+// consolidate these.
+template <int subsampling_x, int subsampling_y>
+inline uint8x8_t GetInterIntraMask4x2(const uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ if (subsampling_x == 1) {
+ const uint8x8_t mask_val =
+ vpadd_u8(vld1_u8(mask), vld1_u8(mask + (mask_stride << subsampling_y)));
+ if (subsampling_y == 1) {
+ const uint8x8_t next_mask_val = vpadd_u8(vld1_u8(mask + mask_stride),
+ vld1_u8(mask + mask_stride * 3));
+
+ // Use a saturating add to work around the case where all |mask| values
+ // are 64. Together with the rounding shift this ensures the correct
+ // result.
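+      // For example, with four mask values of 64 the pairwise sums are 128
+      // and 128, so a plain vadd_u8() would wrap 256 to 0; vqadd_u8()
+      // saturates to 255 and vrshr_n_u8(255, 2) rounds back to the exact
+      // average of 64.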
+ const uint8x8_t sum = vqadd_u8(mask_val, next_mask_val);
+ return vrshr_n_u8(sum, /*subsampling_x=*/1 + subsampling_y);
+ }
+
+ return vrshr_n_u8(mask_val, /*subsampling_x=*/1);
+ }
+
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ const uint8x8_t mask_val0 = Load4(mask);
+ // TODO(b/150461164): Investigate the source of |mask| and see if the stride
+ // can be removed.
+ // TODO(b/150461164): The unit tests start at 8x8. Does this get run?
+ return Load4<1>(mask + mask_stride, mask_val0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline uint8x8_t GetInterIntraMask8(const uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ if (subsampling_x == 1) {
+ const uint8x16_t mask_val = vld1q_u8(mask);
+ const uint8x8_t mask_paired =
+ vpadd_u8(vget_low_u8(mask_val), vget_high_u8(mask_val));
+ if (subsampling_y == 1) {
+ const uint8x16_t next_mask_val = vld1q_u8(mask + mask_stride);
+ const uint8x8_t next_mask_paired =
+ vpadd_u8(vget_low_u8(next_mask_val), vget_high_u8(next_mask_val));
+
+ // Use a saturating add to work around the case where all |mask| values
+ // are 64. Together with the rounding shift this ensures the correct
+ // result.
+ const uint8x8_t sum = vqadd_u8(mask_paired, next_mask_paired);
+ return vrshr_n_u8(sum, /*subsampling_x=*/1 + subsampling_y);
+ }
+
+ return vrshr_n_u8(mask_paired, /*subsampling_x=*/1);
+ }
+
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ return vld1_u8(mask);
+}
+
+inline void InterIntraWriteMaskBlendLine8bpp4x2(const uint8_t* const pred_0,
+ uint8_t* const pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8x8_t pred_mask_0,
+ const uint8x8_t pred_mask_1) {
+ const uint8x8_t pred_val_0 = vld1_u8(pred_0);
+ uint8x8_t pred_val_1 = Load4(pred_1);
+ pred_val_1 = Load4<1>(pred_1 + pred_stride_1, pred_val_1);
+
+ const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0);
+ const uint16x8_t weighted_combo =
+ vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1);
+ const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6);
+ StoreLo4(pred_1, result);
+ StoreHi4(pred_1 + pred_stride_1, result);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4x4_NEON(const uint8_t* pred_0,
+ uint8_t* pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8_t* mask,
+ const ptrdiff_t mask_stride) {
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ uint8x8_t pred_mask_1 =
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1);
+ InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1);
+ pred_0 += 4 << 1;
+ pred_1 += pred_stride_1 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+
+ pred_mask_1 =
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1);
+ InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4xH_NEON(
+ const uint8_t* pred_0, uint8_t* pred_1, const ptrdiff_t pred_stride_1,
+ const uint8_t* mask, const ptrdiff_t mask_stride, const int height) {
+ if (height == 4) {
+ InterIntraMaskBlending8bpp4x4_NEON<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride);
+ return;
+ }
+ int y = 0;
+ do {
+ InterIntraMaskBlending8bpp4x4_NEON<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride);
+ pred_0 += 4 << 2;
+ pred_1 += pred_stride_1 << 2;
+ mask += mask_stride << (2 + subsampling_y);
+
+ InterIntraMaskBlending8bpp4x4_NEON<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride);
+ pred_0 += 4 << 2;
+ pred_1 += pred_stride_1 << 2;
+ mask += mask_stride << (2 + subsampling_y);
+ y += 8;
+ } while (y < height);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend8bpp_NEON(const uint8_t* prediction_0,
+ uint8_t* prediction_1,
+ const ptrdiff_t prediction_stride_1,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride,
+ const int width, const int height) {
+ if (width == 4) {
+ InterIntraMaskBlending8bpp4xH_NEON<subsampling_x, subsampling_y>(
+ prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
+ height);
+ return;
+ }
+ const uint8_t* mask = mask_ptr;
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ // TODO(b/150461164): Consider a 16 wide specialization (at least for the
+ // unsampled version) to take advantage of vld1q_u8().
+ const uint8x8_t pred_mask_1 =
+ GetInterIntraMask8<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride);
+ // 64 - mask
+ const uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1);
+ const uint8x8_t pred_val_0 = vld1_u8(prediction_0);
+ prediction_0 += 8;
+ const uint8x8_t pred_val_1 = vld1_u8(prediction_1 + x);
+ const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0);
+ // weighted_pred0 + weighted_pred1
+ const uint16x8_t weighted_combo =
+ vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1);
+ const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6);
+ vst1_u8(prediction_1 + x, result);
+
+ x += 8;
+ } while (x < width);
+ prediction_1 += prediction_stride_1;
+ mask += mask_stride << subsampling_y;
+ } while (++y < height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->mask_blend[0][0] = MaskBlend_NEON<0, 0>;
+ dsp->mask_blend[1][0] = MaskBlend_NEON<1, 0>;
+ dsp->mask_blend[2][0] = MaskBlend_NEON<1, 1>;
+  // For 8-bit, the is_inter_intra dimension of mask_blend[][] is replaced by
+  // the separate inter_intra_mask_blend_8bpp[] entries.
+ dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_NEON<0, 0>;
+ dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_NEON<1, 0>;
+ dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_NEON<1, 1>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void MaskBlendInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void MaskBlendInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/mask_blend_neon.h b/src/dsp/arm/mask_blend_neon.h
new file mode 100644
index 0000000..3829274
--- /dev/null
+++ b/src/dsp/arm/mask_blend_neon.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_MASK_BLEND_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_MASK_BLEND_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mask_blend. This function is not thread-safe.
+void MaskBlendInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_MaskBlend444 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_MaskBlend422 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_MaskBlend420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420 LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_MASK_BLEND_NEON_H_
diff --git a/src/dsp/arm/motion_field_projection_neon.cc b/src/dsp/arm/motion_field_projection_neon.cc
new file mode 100644
index 0000000..8caba7d
--- /dev/null
+++ b/src/dsp/arm/motion_field_projection_neon.cc
@@ -0,0 +1,393 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
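+// Gathers one 16-bit division factor per lane: |reference_offset| is doubled
+// to form byte offsets into |division_table|, expanded to adjacent
+// {low, high} byte index pairs via the 0x0100... constant, and looked up with
+// vtbl2_s8().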
+inline int16x8_t LoadDivision(const int8x8x2_t division_table,
+ const int8x8_t reference_offset) {
+ const int8x8_t kOne = vcreate_s8(0x0100010001000100);
+ const int8x16_t kOneQ = vcombine_s8(kOne, kOne);
+ const int8x8_t t = vadd_s8(reference_offset, reference_offset);
+ const int8x8x2_t tt = vzip_s8(t, t);
+ const int8x16_t t1 = vcombine_s8(tt.val[0], tt.val[1]);
+ const int8x16_t idx = vaddq_s8(t1, kOneQ);
+ const int8x8_t idx_low = vget_low_s8(idx);
+ const int8x8_t idx_high = vget_high_s8(idx);
+ const int16x4_t d0 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_low));
+ const int16x4_t d1 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_high));
+ return vcombine_s16(d0, d1);
+}
+
+inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator,
+ const int numerator) {
+ const int32x4_t m0 = vmull_s16(mv, denominator);
+ const int32x4_t m = vmulq_n_s32(m0, numerator);
+ // Add the sign (0 or -1) to round towards zero.
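+  // vsraq_n_s32(m, m, 31) adds m >> 31, i.e. 0 for non-negative lanes and -1
+  // for negative lanes, so the rounding narrowing shift below treats negative
+  // projections the same way as the scalar GetMvProjection().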
+ const int32x4_t add_sign = vsraq_n_s32(m, m, 31);
+ return vqrshrn_n_s32(add_sign, 14);
+}
+
+inline int16x8_t MvProjectionClip(const int16x8_t mv,
+ const int16x8_t denominator,
+ const int numerator) {
+ const int16x4_t mv0 = vget_low_s16(mv);
+ const int16x4_t mv1 = vget_high_s16(mv);
+ const int16x4_t s0 = MvProjection(mv0, vget_low_s16(denominator), numerator);
+ const int16x4_t s1 = MvProjection(mv1, vget_high_s16(denominator), numerator);
+ const int16x8_t projection = vcombine_s16(s0, s1);
+ const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp);
+ const int16x8_t clamp = vminq_s16(projection, projection_mv_clamp);
+ return vmaxq_s16(clamp, vnegq_s16(projection_mv_clamp));
+}
+
+inline int8x8_t Project_NEON(const int16x8_t delta, const int16x8_t dst_sign) {
+ // Add 63 to negative delta so that it shifts towards zero.
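+  // delta_sign is 0 or -1 per lane; shifting its unsigned bit pattern right
+  // by 10 yields 0 or 63, so negative lanes become delta + 63 before the
+  // >> 6, e.g. delta = -1 maps to 62 >> 6 = 0 rather than -1. The final
+  // xor/subtract with dst_sign conditionally negates the result when dst_sign
+  // is -1.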
+ const int16x8_t delta_sign = vshrq_n_s16(delta, 15);
+ const uint16x8_t delta_u = vreinterpretq_u16_s16(delta);
+ const uint16x8_t delta_sign_u = vreinterpretq_u16_s16(delta_sign);
+ const uint16x8_t delta_adjust_u = vsraq_n_u16(delta_u, delta_sign_u, 10);
+ const int16x8_t delta_adjust = vreinterpretq_s16_u16(delta_adjust_u);
+ const int16x8_t offset0 = vshrq_n_s16(delta_adjust, 6);
+ const int16x8_t offset1 = veorq_s16(offset0, dst_sign);
+ const int16x8_t offset2 = vsubq_s16(offset1, dst_sign);
+ return vqmovn_s16(offset2);
+}
+
+inline void GetPosition(
+ const int8x8x2_t division_table, const MotionVector* const mv,
+ const int numerator, const int x8_start, const int x8_end, const int x8,
+ const int8x8_t r_offsets, const int8x8_t source_reference_type8,
+ const int8x8_t skip_r, const int8x8_t y8_floor8, const int8x8_t y8_ceiling8,
+ const int16x8_t d_sign, const int delta, int8x8_t* const r,
+ int8x8_t* const position_y8, int8x8_t* const position_x8,
+ int64_t* const skip_64, int32x4_t mvs[2]) {
+ const auto* const mv_int = reinterpret_cast<const int32_t*>(mv + x8);
+ *r = vtbl1_s8(r_offsets, source_reference_type8);
+ const int16x8_t denorm = LoadDivision(division_table, source_reference_type8);
+ int16x8_t projection_mv[2];
+ mvs[0] = vld1q_s32(mv_int + 0);
+ mvs[1] = vld1q_s32(mv_int + 4);
+  // Deinterleave the x and y components.
+ const int16x8_t mv0 = vreinterpretq_s16_s32(mvs[0]);
+ const int16x8_t mv1 = vreinterpretq_s16_s32(mvs[1]);
+ const int16x8x2_t mv_yx = vuzpq_s16(mv0, mv1);
+ // numerator could be 0.
+ projection_mv[0] = MvProjectionClip(mv_yx.val[0], denorm, numerator);
+ projection_mv[1] = MvProjectionClip(mv_yx.val[1], denorm, numerator);
+ // Do not update the motion vector if the block position is not valid or
+ // if position_x8 is outside the current range of x8_start and x8_end.
+ // Note that position_y8 will always be within the range of y8_start and
+ // y8_end.
+ // After subtracting the base, valid projections are within 8-bit.
+ *position_y8 = Project_NEON(projection_mv[0], d_sign);
+ const int8x8_t position_x = Project_NEON(projection_mv[1], d_sign);
+ const int8x8_t k01234567 = vcreate_s8(uint64_t{0x0706050403020100});
+ *position_x8 = vqadd_s8(position_x, k01234567);
+ const int8x16_t position_xy = vcombine_s8(*position_x8, *position_y8);
+ const int x8_floor = std::max(
+ x8_start - x8, delta - kProjectionMvMaxHorizontalOffset); // [-8, 8]
+ const int x8_ceiling = std::min(
+ x8_end - x8, delta + 8 + kProjectionMvMaxHorizontalOffset); // [0, 16]
+ const int8x8_t x8_floor8 = vdup_n_s8(x8_floor);
+ const int8x8_t x8_ceiling8 = vdup_n_s8(x8_ceiling);
+ const int8x16_t floor_xy = vcombine_s8(x8_floor8, y8_floor8);
+ const int8x16_t ceiling_xy = vcombine_s8(x8_ceiling8, y8_ceiling8);
+ const uint8x16_t underflow = vcltq_s8(position_xy, floor_xy);
+ const uint8x16_t overflow = vcgeq_s8(position_xy, ceiling_xy);
+ const int8x16_t out = vreinterpretq_s8_u8(vorrq_u8(underflow, overflow));
+ const int8x8_t skip_low = vorr_s8(skip_r, vget_low_s8(out));
+ const int8x8_t skip = vorr_s8(skip_low, vget_high_s8(out));
+ *skip_64 = vget_lane_s64(vreinterpret_s64_s8(skip), 0);
+}
+
+template <int idx>
+inline void Store(const int16x8_t position, const int8x8_t reference_offset,
+ const int32x4_t mv, int8_t* dst_reference_offset,
+ MotionVector* dst_mv) {
+ const ptrdiff_t offset = vgetq_lane_s16(position, idx);
+ auto* const d_mv = reinterpret_cast<int32_t*>(&dst_mv[offset]);
+ vst1q_lane_s32(d_mv, mv, idx & 3);
+ vst1_lane_s8(&dst_reference_offset[offset], reference_offset, idx);
+}
+
+template <int idx>
+inline void CheckStore(const int8_t* skips, const int16x8_t position,
+ const int8x8_t reference_offset, const int32x4_t mv,
+ int8_t* dst_reference_offset, MotionVector* dst_mv) {
+ if (skips[idx] == 0) {
+ Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv);
+ }
+}
+
+// 7.9.2.
+void MotionFieldProjectionKernel_NEON(const ReferenceInfo& reference_info,
+ const int reference_to_current_with_sign,
+ const int dst_sign, const int y8_start,
+ const int y8_end, const int x8_start,
+ const int x8_end,
+ TemporalMotionField* const motion_field) {
+ const ptrdiff_t stride = motion_field->mv.columns();
+ // The column range has to be offset by kProjectionMvMaxHorizontalOffset since
+ // coordinates in that range could end up being position_x8 because of
+ // projection.
+ const int adjusted_x8_start =
+ std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
+ const int adjusted_x8_end = std::min(
+ x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
+ const int adjusted_x8_end8 = adjusted_x8_end & ~7;
+ const int leftover = adjusted_x8_end - adjusted_x8_end8;
+ const int8_t* const reference_offsets =
+ reference_info.relative_distance_to.data();
+ const bool* const skip_references = reference_info.skip_references.data();
+ const int16_t* const projection_divisions =
+ reference_info.projection_divisions.data();
+ const ReferenceFrameType* source_reference_types =
+ &reference_info.motion_field_reference_frame[y8_start][0];
+ const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
+ int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
+ MotionVector* dst_mv = motion_field->mv[y8_start];
+ const int16x8_t d_sign = vdupq_n_s16(dst_sign);
+
+ static_assert(sizeof(int8_t) == sizeof(bool), "");
+ static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), "");
+ static_assert(sizeof(int32_t) == sizeof(MotionVector), "");
+ assert(dst_sign == 0 || dst_sign == -1);
+ assert(stride == motion_field->reference_offset.columns());
+ assert((y8_start & 7) == 0);
+ assert((adjusted_x8_start & 7) == 0);
+  // The final position calculation is represented with int16_t. Valid
+  // position_y8 relative to its base is at most 7. After adding the
+  // horizontal offset, which is at most |stride - 1|, we arrive at the
+  // following assertion, which means this optimization works for frame widths
+  // up to 32K pixels (each position is an 8x8 block).
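+  // In the worst case the linearized offset is 7 * stride + (stride - 1) =
+  // 8 * stride - 1, which must fit in int16_t, hence the bound below; with
+  // 8-pixel blocks that is a frame width of up to 32768 pixels.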
+ assert(8 * stride <= 32768);
+ const int8x8_t skip_reference =
+ vld1_s8(reinterpret_cast<const int8_t*>(skip_references));
+ const int8x8_t r_offsets = vld1_s8(reference_offsets);
+ const int8x16_t table = vreinterpretq_s8_s16(vld1q_s16(projection_divisions));
+ int8x8x2_t division_table;
+ division_table.val[0] = vget_low_s8(table);
+ division_table.val[1] = vget_high_s8(table);
+
+ int y8 = y8_start;
+ do {
+ const int y8_floor = (y8 & ~7) - y8; // [-7, 0]
+ const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8); // [1, 8]
+ const int8x8_t y8_floor8 = vdup_n_s8(y8_floor);
+ const int8x8_t y8_ceiling8 = vdup_n_s8(y8_ceiling);
+ int x8;
+
+ for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) {
+ const int8x8_t source_reference_type8 =
+ vld1_s8(reinterpret_cast<const int8_t*>(source_reference_types + x8));
+ const int8x8_t skip_r = vtbl1_s8(skip_reference, source_reference_type8);
+ const int64_t early_skip = vget_lane_s64(vreinterpret_s64_s8(skip_r), 0);
+ // Early termination #1 if all are skips. Chance is typically ~30-40%.
+ if (early_skip == -1) continue;
+ int64_t skip_64;
+ int8x8_t r, position_x8, position_y8;
+ int32x4_t mvs[2];
+ GetPosition(division_table, mv, reference_to_current_with_sign, x8_start,
+ x8_end, x8, r_offsets, source_reference_type8, skip_r,
+ y8_floor8, y8_ceiling8, d_sign, 0, &r, &position_y8,
+ &position_x8, &skip_64, mvs);
+ // Early termination #2 if all are skips.
+ // Chance is typically ~15-25% after Early termination #1.
+ if (skip_64 == -1) continue;
+ const int16x8_t p_y = vmovl_s8(position_y8);
+ const int16x8_t p_x = vmovl_s8(position_x8);
+ const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride);
+ const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8));
+ if (skip_64 == 0) {
+ // Store all. Chance is typically ~70-85% after Early termination #2.
+ Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ } else {
+ // Check and store each.
+ // Chance is typically ~15-30% after Early termination #2.
+ // The compiler is smart enough to not create the local buffer skips[].
+ int8_t skips[8];
+ memcpy(skips, &skip_64, sizeof(skips));
+ CheckStore<0>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ }
+ }
+
+    // The following leftover processing cannot be moved out of the do...while
+    // loop, since doing so may change the order in which results are stored
+    // to the same position.
+ if (leftover > 0) {
+ // Use SIMD only when leftover is at least 4, and there are at least 8
+ // elements in a row.
+ if (leftover >= 4 && adjusted_x8_start < adjusted_x8_end8) {
+ // Process the last 8 elements to avoid loading invalid memory. Some
+ // elements may have been processed in the above loop, which is OK.
+ const int delta = 8 - leftover;
+ x8 = adjusted_x8_end - 8;
+ const int8x8_t source_reference_type8 = vld1_s8(
+ reinterpret_cast<const int8_t*>(source_reference_types + x8));
+ const int8x8_t skip_r =
+ vtbl1_s8(skip_reference, source_reference_type8);
+ const int64_t early_skip =
+ vget_lane_s64(vreinterpret_s64_s8(skip_r), 0);
+ // Early termination #1 if all are skips.
+ if (early_skip != -1) {
+ int64_t skip_64;
+ int8x8_t r, position_x8, position_y8;
+ int32x4_t mvs[2];
+ GetPosition(division_table, mv, reference_to_current_with_sign,
+ x8_start, x8_end, x8, r_offsets, source_reference_type8,
+ skip_r, y8_floor8, y8_ceiling8, d_sign, delta, &r,
+ &position_y8, &position_x8, &skip_64, mvs);
+ // Early termination #2 if all are skips.
+ if (skip_64 != -1) {
+ const int16x8_t p_y = vmovl_s8(position_y8);
+ const int16x8_t p_x = vmovl_s8(position_x8);
+ const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride);
+ const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8));
+ // Store up to 7 elements since leftover is at most 7.
+ if (skip_64 == 0) {
+ // Store all.
+ Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ } else {
+ // Check and store each.
+ // The compiler is smart enough to not create the local buffer
+ // skips[].
+ int8_t skips[8];
+ memcpy(skips, &skip_64, sizeof(skips));
+ CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ }
+ }
+ }
+ } else {
+ for (; x8 < adjusted_x8_end; ++x8) {
+ const int source_reference_type = source_reference_types[x8];
+ if (skip_references[source_reference_type]) continue;
+ MotionVector projection_mv;
+ // reference_to_current_with_sign could be 0.
+ GetMvProjection(mv[x8], reference_to_current_with_sign,
+ projection_divisions[source_reference_type],
+ &projection_mv);
+ // Do not update the motion vector if the block position is not valid
+ // or if position_x8 is outside the current range of x8_start and
+ // x8_end. Note that position_y8 will always be within the range of
+ // y8_start and y8_end.
+ const int position_y8 = Project(0, projection_mv.mv[0], dst_sign);
+ if (position_y8 < y8_floor || position_y8 >= y8_ceiling) continue;
+ const int x8_base = x8 & ~7;
+ const int x8_floor =
+ std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset);
+ const int x8_ceiling =
+ std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset);
+ const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign);
+ if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue;
+ dst_mv[position_y8 * stride + position_x8] = mv[x8];
+ dst_reference_offset[position_y8 * stride + position_x8] =
+ reference_offsets[source_reference_type];
+ }
+ }
+ }
+
+ source_reference_types += stride;
+ mv += stride;
+ dst_reference_offset += stride;
+ dst_mv += stride;
+ } while (++y8 < y8_end);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_NEON;
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_NEON;
+}
+#endif
+
+} // namespace
+
+void MotionFieldProjectionInit_NEON() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void MotionFieldProjectionInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/motion_field_projection_neon.h b/src/dsp/arm/motion_field_projection_neon.h
new file mode 100644
index 0000000..41ab6a6
--- /dev/null
+++ b/src/dsp/arm/motion_field_projection_neon.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_MOTION_FIELD_PROJECTION_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_MOTION_FIELD_PROJECTION_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::motion_field_projection_kernel. This function is not
+// thread-safe.
+void MotionFieldProjectionInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+
+#define LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_MOTION_FIELD_PROJECTION_NEON_H_
diff --git a/src/dsp/arm/motion_vector_search_neon.cc b/src/dsp/arm/motion_vector_search_neon.cc
new file mode 100644
index 0000000..8a403a6
--- /dev/null
+++ b/src/dsp/arm/motion_vector_search_neon.cc
@@ -0,0 +1,267 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator,
+ const int32x4_t numerator) {
+ const int32x4_t m0 = vmull_s16(mv, denominator);
+ const int32x4_t m = vmulq_s32(m0, numerator);
+ // Add the sign (0 or -1) to round towards zero.
+ const int32x4_t add_sign = vsraq_n_s32(m, m, 31);
+ return vqrshrn_n_s32(add_sign, 14);
+}
+
+inline int16x4_t MvProjectionCompound(const int16x4_t mv,
+ const int temporal_reference_offsets,
+ const int reference_offsets[2]) {
+ const int16x4_t denominator =
+ vdup_n_s16(kProjectionMvDivisionLookup[temporal_reference_offsets]);
+ const int32x2_t offset = vld1_s32(reference_offsets);
+ const int32x2x2_t offsets = vzip_s32(offset, offset);
+ const int32x4_t numerator = vcombine_s32(offsets.val[0], offsets.val[1]);
+ return MvProjection(mv, denominator, numerator);
+}
+
+inline int16x8_t ProjectionClip(const int16x4_t mv0, const int16x4_t mv1) {
+ const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp);
+ const int16x8_t mv = vcombine_s16(mv0, mv1);
+ const int16x8_t clamp = vminq_s16(mv, projection_mv_clamp);
+ return vmaxq_s16(clamp, vnegq_s16(projection_mv_clamp));
+}
+
+inline int16x8_t MvProjectionCompoundClip(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets,
+ const int reference_offsets[2]) {
+ const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
+ const int32x2_t temporal_mv = vld1_s32(tmvs);
+ const int16x4_t tmv0 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 0));
+ const int16x4_t tmv1 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 1));
+ const int16x4_t mv0 = MvProjectionCompound(
+ tmv0, temporal_reference_offsets[0], reference_offsets);
+ const int16x4_t mv1 = MvProjectionCompound(
+ tmv1, temporal_reference_offsets[1], reference_offsets);
+ return ProjectionClip(mv0, mv1);
+}
+
+inline int16x8_t MvProjectionSingleClip(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets, const int reference_offset,
+ int16x4_t* const lookup) {
+ const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs);
+ const int16x8_t temporal_mv = vld1q_s16(tmvs);
+ *lookup = vld1_lane_s16(
+ &kProjectionMvDivisionLookup[temporal_reference_offsets[0]], *lookup, 0);
+ *lookup = vld1_lane_s16(
+ &kProjectionMvDivisionLookup[temporal_reference_offsets[1]], *lookup, 1);
+ *lookup = vld1_lane_s16(
+ &kProjectionMvDivisionLookup[temporal_reference_offsets[2]], *lookup, 2);
+ *lookup = vld1_lane_s16(
+ &kProjectionMvDivisionLookup[temporal_reference_offsets[3]], *lookup, 3);
+ const int16x4x2_t denominator = vzip_s16(*lookup, *lookup);
+ const int16x4_t tmv0 = vget_low_s16(temporal_mv);
+ const int16x4_t tmv1 = vget_high_s16(temporal_mv);
+ const int32x4_t numerator = vdupq_n_s32(reference_offset);
+ const int16x4_t mv0 = MvProjection(tmv0, denominator.val[0], numerator);
+ const int16x4_t mv1 = MvProjection(tmv1, denominator.val[1], numerator);
+ return ProjectionClip(mv0, mv1);
+}
+
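+// Motion vectors are stored in 1/8-pel units. Adding the sign bit and then
+// clearing bit 0 rounds odd components toward zero, reducing the candidates
+// to 1/4-pel precision.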
+inline void LowPrecision(const int16x8_t mv, void* const candidate_mvs) {
+ const int16x8_t kRoundDownMask = vdupq_n_s16(1);
+ const uint16x8_t mvu = vreinterpretq_u16_s16(mv);
+ const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15));
+ const int16x8_t mv1 = vbicq_s16(mv0, kRoundDownMask);
+ vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv1);
+}
+
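+// Snaps each component to a full-pel multiple of 8: adding the sign bit and
+// then 3 before clearing the three low bits is equivalent to
+// sign * (((|mv| + 3) >> 3) << 3).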
+inline void ForceInteger(const int16x8_t mv, void* const candidate_mvs) {
+ const int16x8_t kRoundDownMask = vdupq_n_s16(7);
+ const uint16x8_t mvu = vreinterpretq_u16_s16(mv);
+ const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15));
+ const int16x8_t mv1 = vaddq_s16(mv0, vdupq_n_s16(3));
+ const int16x8_t mv2 = vbicq_s16(mv1, kRoundDownMask);
+ vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv2);
+}
+
+void MvProjectionCompoundLowPrecision_NEON(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true, so it is skipped
+  // here. To help the compiler, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One more element could be calculated.
+ int loop_count = (count + 1) >> 1;
+ do {
+ const int16x8_t mv = MvProjectionCompoundClip(
+ temporal_mvs, temporal_reference_offsets, offsets);
+ LowPrecision(mv, candidate_mvs);
+ temporal_mvs += 2;
+ temporal_reference_offsets += 2;
+ candidate_mvs += 2;
+ } while (--loop_count);
+}
+
+void MvProjectionCompoundForceInteger_NEON(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true, so it is skipped
+  // here. To help the compiler, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One more element could be calculated.
+ int loop_count = (count + 1) >> 1;
+ do {
+ const int16x8_t mv = MvProjectionCompoundClip(
+ temporal_mvs, temporal_reference_offsets, offsets);
+ ForceInteger(mv, candidate_mvs);
+ temporal_mvs += 2;
+ temporal_reference_offsets += 2;
+ candidate_mvs += 2;
+ } while (--loop_count);
+}
+
+void MvProjectionCompoundHighPrecision_NEON(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true, so it is skipped
+  // here. To help the compiler, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One more element could be calculated.
+ int loop_count = (count + 1) >> 1;
+ do {
+ const int16x8_t mv = MvProjectionCompoundClip(
+ temporal_mvs, temporal_reference_offsets, offsets);
+ vst1q_s16(reinterpret_cast<int16_t*>(candidate_mvs), mv);
+ temporal_mvs += 2;
+ temporal_reference_offsets += 2;
+ candidate_mvs += 2;
+ } while (--loop_count);
+}
+
+void MvProjectionSingleLowPrecision_NEON(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ // Up to three more elements could be calculated.
+ int loop_count = (count + 3) >> 2;
+ int16x4_t lookup = vdup_n_s16(0);
+ do {
+ const int16x8_t mv = MvProjectionSingleClip(
+ temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
+ LowPrecision(mv, candidate_mvs);
+ temporal_mvs += 4;
+ temporal_reference_offsets += 4;
+ candidate_mvs += 4;
+ } while (--loop_count);
+}
+
+void MvProjectionSingleForceInteger_NEON(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ // Up to three more elements could be calculated.
+ int loop_count = (count + 3) >> 2;
+ int16x4_t lookup = vdup_n_s16(0);
+ do {
+ const int16x8_t mv = MvProjectionSingleClip(
+ temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
+ ForceInteger(mv, candidate_mvs);
+ temporal_mvs += 4;
+ temporal_reference_offsets += 4;
+ candidate_mvs += 4;
+ } while (--loop_count);
+}
+
+void MvProjectionSingleHighPrecision_NEON(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ // Up to three more elements could be calculated.
+ int loop_count = (count + 3) >> 2;
+ int16x4_t lookup = vdup_n_s16(0);
+ do {
+ const int16x8_t mv = MvProjectionSingleClip(
+ temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
+ vst1q_s16(reinterpret_cast<int16_t*>(candidate_mvs), mv);
+ temporal_mvs += 4;
+ temporal_reference_offsets += 4;
+ candidate_mvs += 4;
+ } while (--loop_count);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_NEON;
+ dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_NEON;
+ dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_NEON;
+ dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_NEON;
+ dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_NEON;
+ dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_NEON;
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_NEON;
+ dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_NEON;
+ dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_NEON;
+ dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_NEON;
+ dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_NEON;
+ dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_NEON;
+}
+#endif
+
+} // namespace
+
+void MotionVectorSearchInit_NEON() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void MotionVectorSearchInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/motion_vector_search_neon.h b/src/dsp/arm/motion_vector_search_neon.h
new file mode 100644
index 0000000..19b4519
--- /dev/null
+++ b/src/dsp/arm/motion_vector_search_neon.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_MOTION_VECTOR_SEARCH_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_MOTION_VECTOR_SEARCH_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This
+// function is not thread-safe.
+void MotionVectorSearchInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+
+#define LIBGAV1_Dsp8bpp_MotionVectorSearch LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_MOTION_VECTOR_SEARCH_NEON_H_
diff --git a/src/dsp/arm/obmc_neon.cc b/src/dsp/arm/obmc_neon.cc
new file mode 100644
index 0000000..66ad663
--- /dev/null
+++ b/src/dsp/arm/obmc_neon.cc
@@ -0,0 +1,392 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
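+// Blends one 4-pixel row: pred = (pred_mask * pred + obmc_pred_mask *
+// obmc_pred + 32) >> 6, computed with a widening multiply-accumulate and a
+// rounding narrowing shift.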
+inline void WriteObmcLine4(uint8_t* const pred, const uint8_t* const obmc_pred,
+ const uint8x8_t pred_mask,
+ const uint8x8_t obmc_pred_mask) {
+ const uint8x8_t pred_val = Load4(pred);
+ const uint8x8_t obmc_pred_val = Load4(obmc_pred);
+ const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+ const uint8x8_t result =
+ vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+ StoreLo4(pred, result);
+}
+
+template <bool from_left>
+inline void OverlapBlend2xH_NEON(uint8_t* const prediction,
+ const ptrdiff_t prediction_stride,
+ const int height,
+ const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ const uint8_t* obmc_pred = obmc_prediction;
+ uint8x8_t pred_mask;
+ uint8x8_t obmc_pred_mask;
+ int compute_height;
+ const int mask_offset = height - 2;
+ if (from_left) {
+ pred_mask = Load2(kObmcMask);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ compute_height = height;
+ } else {
+ // Weights for the last line are all 64, which is a no-op.
+ compute_height = height - 1;
+ }
+ uint8x8_t pred_val = vdup_n_u8(0);
+ uint8x8_t obmc_pred_val = vdup_n_u8(0);
+ int y = 0;
+ do {
+ if (!from_left) {
+ pred_mask = vdup_n_u8(kObmcMask[mask_offset + y]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ }
+ pred_val = Load2<0>(pred, pred_val);
+ const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+ obmc_pred_val = Load2<0>(obmc_pred, obmc_pred_val);
+ const uint8x8_t result =
+ vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+ Store2<0>(pred, result);
+
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (++y != compute_height);
+}
+
+inline void OverlapBlendFromLeft4xH_NEON(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ const uint8x8_t pred_mask = Load4(kObmcMask + 2);
+ // 64 - mask
+ const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ int y = 0;
+ do {
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ y += 2;
+ } while (y != height);
+}
+
+inline void OverlapBlendFromLeft8xH_NEON(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ const uint8x8_t pred_mask = vld1_u8(kObmcMask + 6);
+ // 64 - mask
+ const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ int y = 0;
+ do {
+ const uint8x8_t pred_val = vld1_u8(pred);
+ const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+ const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred);
+ const uint8x8_t result =
+ vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+
+ vst1_u8(pred, result);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (++y != height);
+}
+
+void OverlapBlendFromLeft_NEON(void* const prediction,
+ const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint8_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+
+ if (width == 2) {
+ OverlapBlend2xH_NEON<true>(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+ if (width == 4) {
+ OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+ if (width == 8) {
+ OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
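+  // Wider blocks are processed in 16-pixel columns. The mask varies
+  // horizontally, so each column loads its own 16 mask values.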
+ const uint8x16_t mask_inverter = vdupq_n_u8(64);
+ const uint8_t* mask = kObmcMask + width - 2;
+ int x = 0;
+ do {
+ pred = static_cast<uint8_t*>(prediction) + x;
+ obmc_pred = static_cast<const uint8_t*>(obmc_prediction) + x;
+ const uint8x16_t pred_mask = vld1q_u8(mask + x);
+ // 64 - mask
+ const uint8x16_t obmc_pred_mask = vsubq_u8(mask_inverter, pred_mask);
+ int y = 0;
+ do {
+ const uint8x16_t pred_val = vld1q_u8(pred);
+ const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred);
+ const uint16x8_t weighted_pred_lo =
+ vmull_u8(vget_low_u8(pred_mask), vget_low_u8(pred_val));
+ const uint8x8_t result_lo =
+ vrshrn_n_u16(vmlal_u8(weighted_pred_lo, vget_low_u8(obmc_pred_mask),
+ vget_low_u8(obmc_pred_val)),
+ 6);
+ const uint16x8_t weighted_pred_hi =
+ vmull_u8(vget_high_u8(pred_mask), vget_high_u8(pred_val));
+ const uint8x8_t result_hi =
+ vrshrn_n_u16(vmlal_u8(weighted_pred_hi, vget_high_u8(obmc_pred_mask),
+ vget_high_u8(obmc_pred_val)),
+ 6);
+ vst1q_u8(pred, vcombine_u8(result_lo, result_hi));
+
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (++y < height);
+ x += 16;
+ } while (x < width);
+}
+
+inline void OverlapBlendFromTop4x4_NEON(uint8_t* const prediction,
+ const ptrdiff_t prediction_stride,
+ const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride,
+ const int height) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ uint8x8_t pred_mask = vdup_n_u8(kObmcMask[height - 2]);
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ if (height == 2) {
+ return;
+ }
+
+ pred_mask = vdup_n_u8(kObmcMask[3]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ pred_mask = vdup_n_u8(kObmcMask[4]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+}
+
+inline void OverlapBlendFromTop4xH_NEON(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ if (height < 8) {
+ OverlapBlendFromTop4x4_NEON(prediction, prediction_stride, obmc_prediction,
+ obmc_prediction_stride, height);
+ return;
+ }
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const uint8_t* mask = kObmcMask + height - 2;
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ int y = 0;
+ // Compute 6 lines for height 8, or 12 lines for height 16. The remaining
+ // lines are unchanged as the corresponding mask value is 64.
+ do {
+ uint8x8_t pred_mask = vdup_n_u8(mask[y]);
+ uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ pred_mask = vdup_n_u8(mask[y + 1]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ pred_mask = vdup_n_u8(mask[y + 2]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ pred_mask = vdup_n_u8(mask[y + 3]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ pred_mask = vdup_n_u8(mask[y + 4]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ pred_mask = vdup_n_u8(mask[y + 5]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+    // Advance the mask index past the six lines just processed.
+ y += 6;
+ } while (y < height - 4);
+}
+
+inline void OverlapBlendFromTop8xH_NEON(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ const uint8_t* mask = kObmcMask + height - 2;
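+  // The mask values for the bottom quarter of the block are 64 (a no-op
+  // blend), so only height - height / 4 rows need to be computed.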
+ const int compute_height = height - (height >> 2);
+ int y = 0;
+ do {
+ const uint8x8_t pred_mask = vdup_n_u8(mask[y]);
+ // 64 - mask
+ const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ const uint8x8_t pred_val = vld1_u8(pred);
+ const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+ const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred);
+ const uint8x8_t result =
+ vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+
+ vst1_u8(pred, result);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (++y != compute_height);
+}
+
+void OverlapBlendFromTop_NEON(void* const prediction,
+ const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint8_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+
+ if (width == 2) {
+ OverlapBlend2xH_NEON<false>(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+ if (width == 4) {
+ OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+
+ if (width == 8) {
+ OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+
+ const uint8_t* mask = kObmcMask + height - 2;
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+  // Stop when the mask value becomes 64. The 4xH path relies on this
+  // implicitly.
+ const int compute_height = height - (height >> 2);
+ int y = 0;
+ do {
+ const uint8x8_t pred_mask = vdup_n_u8(mask[y]);
+ // 64 - mask
+ const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ int x = 0;
+ do {
+ const uint8x16_t pred_val = vld1q_u8(pred + x);
+ const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred + x);
+ const uint16x8_t weighted_pred_lo =
+ vmull_u8(pred_mask, vget_low_u8(pred_val));
+ const uint8x8_t result_lo =
+ vrshrn_n_u16(vmlal_u8(weighted_pred_lo, obmc_pred_mask,
+ vget_low_u8(obmc_pred_val)),
+ 6);
+ const uint16x8_t weighted_pred_hi =
+ vmull_u8(pred_mask, vget_high_u8(pred_val));
+ const uint8x8_t result_hi =
+ vrshrn_n_u16(vmlal_u8(weighted_pred_hi, obmc_pred_mask,
+ vget_high_u8(obmc_pred_val)),
+ 6);
+ vst1q_u8(pred + x, vcombine_u8(result_lo, result_hi));
+
+ x += 16;
+ } while (x < width);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (++y < compute_height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_NEON;
+ dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_NEON;
+}
+
+} // namespace
+
+void ObmcInit_NEON() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void ObmcInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/obmc_neon.h b/src/dsp/arm/obmc_neon.h
new file mode 100644
index 0000000..d5c9d9c
--- /dev/null
+++ b/src/dsp/arm/obmc_neon.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_OBMC_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_OBMC_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::obmc_blend. This function is not thread-safe.
+void ObmcInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If NEON is enabled, signal that the NEON implementation should be used.
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_ObmcVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ObmcHorizontal LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_OBMC_NEON_H_
diff --git a/src/dsp/arm/super_res_neon.cc b/src/dsp/arm/super_res_neon.cc
new file mode 100644
index 0000000..1680450
--- /dev/null
+++ b/src/dsp/arm/super_res_neon.cc
@@ -0,0 +1,166 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/super_res.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+
+namespace low_bitdepth {
+namespace {
+
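+// Gathers the 8-tap upscale filters for each group of 8 output pixels and
+// transposes them so that tap k of all 8 filters is stored contiguously.
+// SuperRes() below then consumes two taps' worth of coefficients per 16-byte
+// load.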
+void SuperResCoefficients_NEON(const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const coefficients) {
+ auto* dst = static_cast<uint8_t*>(coefficients);
+ int subpixel_x = initial_subpixel_x;
+ int x = RightShiftWithCeiling(upscaled_width, 3);
+ do {
+ uint8x8_t filter[8];
+ uint8x16_t d[kSuperResFilterTaps / 2];
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ filter[i] =
+ vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >>
+ kSuperResExtraBits]);
+ }
+ Transpose8x8(filter, d);
+ vst1q_u8(dst, d[0]);
+ dst += 16;
+ vst1q_u8(dst, d[1]);
+ dst += 16;
+ vst1q_u8(dst, d[2]);
+ dst += 16;
+ vst1q_u8(dst, d[3]);
+ dst += 16;
+ } while (--x != 0);
+}
+
+// Maximum sum of positive taps: 171 = 7 + 86 + 71 + 7
+// Maximum sum: 255*171 == 0xAA55
+// The sum is clipped to [0, 255], so adding all positive and then
+// subtracting all negative with saturation is sufficient.
+// 0 1 2 3 4 5 6 7
+// tap sign: - + - + + - + -
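+// Equivalently, the positive taps are accumulated in |res| and the negative
+// taps in |temp|; the saturating subtract followed by vqrshrn_n_u16() yields
+// the signed filter sum, rounded by kFilterBits and clipped to [0, 255].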
+inline uint8x8_t SuperRes(const uint8x8_t src[kSuperResFilterTaps],
+ const uint8_t** coefficients) {
+ uint8x16_t f[kSuperResFilterTaps / 2];
+ for (int i = 0; i < kSuperResFilterTaps / 2; ++i, *coefficients += 16) {
+ f[i] = vld1q_u8(*coefficients);
+ }
+ uint16x8_t res = vmull_u8(src[1], vget_high_u8(f[0]));
+ res = vmlal_u8(res, src[3], vget_high_u8(f[1]));
+ res = vmlal_u8(res, src[4], vget_low_u8(f[2]));
+ res = vmlal_u8(res, src[6], vget_low_u8(f[3]));
+ uint16x8_t temp = vmull_u8(src[0], vget_low_u8(f[0]));
+ temp = vmlal_u8(temp, src[2], vget_low_u8(f[1]));
+ temp = vmlal_u8(temp, src[5], vget_high_u8(f[2]));
+ temp = vmlal_u8(temp, src[7], vget_high_u8(f[3]));
+ res = vqsubq_u16(res, temp);
+ return vqrshrn_n_u16(res, kFilterBits);
+}
+
+void SuperRes_NEON(const void* const coefficients, void* const source,
+ const ptrdiff_t stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const dest) {
+ auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = height;
+ do {
+ const auto* filter = static_cast<const uint8_t*>(coefficients);
+ uint8_t* dst_ptr = dst;
+ ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+ int subpixel_x = initial_subpixel_x;
+ uint8x8_t sr[8];
+ uint8x16_t s[8];
+ int x = RightShiftWithCeiling(upscaled_width, 4);
+    // The code below calculates up to 15 extra upscaled pixels, which will
+    // over-read up to 15 downscaled pixels at the end of each row.
+    // kSuperResHorizontalBorder accounts for this.
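+    // Each iteration below gathers the 8 source taps for 16 consecutive
+    // output pixels and transposes them so that tap k of all 16 pixels is
+    // contiguous; two SuperRes() calls then produce 8 results each.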
+ do {
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ sr[i] = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
+ }
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ const uint8x8_t s_hi = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
+ s[i] = vcombine_u8(sr[i], s_hi);
+ }
+ Transpose8x16(s);
+      // Do not use a loop for the following 8 instructions, since the
+      // compiler will generate redundant code.
+ sr[0] = vget_low_u8(s[0]);
+ sr[1] = vget_low_u8(s[1]);
+ sr[2] = vget_low_u8(s[2]);
+ sr[3] = vget_low_u8(s[3]);
+ sr[4] = vget_low_u8(s[4]);
+ sr[5] = vget_low_u8(s[5]);
+ sr[6] = vget_low_u8(s[6]);
+ sr[7] = vget_low_u8(s[7]);
+ const uint8x8_t d0 = SuperRes(sr, &filter);
+      // Do not use a loop for the following 8 instructions, since the
+      // compiler will generate redundant code.
+ sr[0] = vget_high_u8(s[0]);
+ sr[1] = vget_high_u8(s[1]);
+ sr[2] = vget_high_u8(s[2]);
+ sr[3] = vget_high_u8(s[3]);
+ sr[4] = vget_high_u8(s[4]);
+ sr[5] = vget_high_u8(s[5]);
+ sr[6] = vget_high_u8(s[6]);
+ sr[7] = vget_high_u8(s[7]);
+ const uint8x8_t d1 = SuperRes(sr, &filter);
+ vst1q_u8(dst_ptr, vcombine_u8(d0, d1));
+ dst_ptr += 16;
+ } while (--x != 0);
+ src += stride;
+ dst += stride;
+ } while (--y != 0);
+}
+
+void Init8bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ dsp->super_res_coefficients = SuperResCoefficients_NEON;
+ dsp->super_res = SuperRes_NEON;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void SuperResInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void SuperResInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/super_res_neon.h b/src/dsp/arm/super_res_neon.h
new file mode 100644
index 0000000..f51785d
--- /dev/null
+++ b/src/dsp/arm/super_res_neon.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::super_res. This function is not thread-safe.
+void SuperResInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_SuperResClip LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
diff --git a/src/dsp/arm/warp_neon.cc b/src/dsp/arm/warp_neon.cc
new file mode 100644
index 0000000..7a41998
--- /dev/null
+++ b/src/dsp/arm/warp_neon.cc
@@ -0,0 +1,453 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/warp.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <type_traits>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Number of extra bits of precision in warped filtering.
+constexpr int kWarpedDiffPrecisionBits = 10;
+constexpr int kFirstPassOffset = 1 << 14;
+constexpr int kOffsetRemoval =
+ (kFirstPassOffset >> kInterRoundBitsHorizontal) * 128;
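+// Each horizontally filtered value carries a bias of
+// kFirstPassOffset >> kInterRoundBitsHorizontal. The vertical filter taps sum
+// to 128, so the accumulated bias removed in the second pass is that value
+// multiplied by 128.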
+
+// Applies the horizontal filter to one source row and stores the result in
+// |intermediate_result_row|. |intermediate_result_row| is a row in the 15x8
+// |intermediate_result| two-dimensional array.
+//
+// src_row_centered contains 16 "centered" samples of a source row. (We center
+// the samples by subtracting 128 from each sample.)
+void HorizontalFilter(const int sx4, const int16_t alpha,
+ const int8x16_t src_row_centered,
+ int16_t intermediate_result_row[8]) {
+ int sx = sx4 - MultiplyBy4(alpha);
+ int8x8_t filter[8];
+ for (int x = 0; x < 8; ++x) {
+ const int offset = RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ filter[x] = vld1_s8(kWarpedFilters8[offset]);
+ sx += alpha;
+ }
+ Transpose8x8(filter);
+ // Add kFirstPassOffset to ensure |sum| stays within uint16_t.
+  // Add 128 (offset) * 128 (filter sum) (also 1 << 14) to account for the
+  // centering of the source samples. Combined, these equal 1 << 15, which is
+  // -32768 as an int16_t.
+ int16x8_t sum =
+ vdupq_n_s16(static_cast<int16_t>(kFirstPassOffset + 128 * 128));
+ // Unrolled k = 0..7 loop. We need to manually unroll the loop because the
+ // third argument (an index value) to vextq_s8() must be a constant
+ // (immediate). src_row_window is a sliding window of length 8 into
+ // src_row_centered.
+ // k = 0.
+ int8x8_t src_row_window = vget_low_s8(src_row_centered);
+ sum = vmlal_s8(sum, filter[0], src_row_window);
+ // k = 1.
+ src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 1));
+ sum = vmlal_s8(sum, filter[1], src_row_window);
+ // k = 2.
+ src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 2));
+ sum = vmlal_s8(sum, filter[2], src_row_window);
+ // k = 3.
+ src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 3));
+ sum = vmlal_s8(sum, filter[3], src_row_window);
+ // k = 4.
+ src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 4));
+ sum = vmlal_s8(sum, filter[4], src_row_window);
+ // k = 5.
+ src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 5));
+ sum = vmlal_s8(sum, filter[5], src_row_window);
+ // k = 6.
+ src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 6));
+ sum = vmlal_s8(sum, filter[6], src_row_window);
+ // k = 7.
+ src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 7));
+ sum = vmlal_s8(sum, filter[7], src_row_window);
+ // End of unrolled k = 0..7 loop.
+  // Due to the offset, |sum| is guaranteed to be non-negative.
+ uint16x8_t sum_unsigned = vreinterpretq_u16_s16(sum);
+ sum_unsigned = vrshrq_n_u16(sum_unsigned, kInterRoundBitsHorizontal);
+ // After the shift |sum_unsigned| will fit into int16_t.
+ vst1q_s16(intermediate_result_row, vreinterpretq_s16_u16(sum_unsigned));
+}
+
+template <bool is_compound>
+void Warp_NEON(const void* const source, const ptrdiff_t source_stride,
+ const int source_width, const int source_height,
+ const int* const warp_params, const int subsampling_x,
+ const int subsampling_y, const int block_start_x,
+ const int block_start_y, const int block_width,
+ const int block_height, const int16_t alpha, const int16_t beta,
+ const int16_t gamma, const int16_t delta, void* dest,
+ const ptrdiff_t dest_stride) {
+ constexpr int kRoundBitsVertical =
+ is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
+ union {
+ // Intermediate_result is the output of the horizontal filtering and
+ // rounding. The range is within 13 (= bitdepth + kFilterBits + 1 -
+ // kInterRoundBitsHorizontal) bits (unsigned). We use the signed int16_t
+ // type so that we can multiply it by kWarpedFilters (which has signed
+ // values) using vmlal_s16().
+ int16_t intermediate_result[15][8]; // 15 rows, 8 columns.
+ // In the simple special cases where the samples in each row are all the
+ // same, store one sample per row in a column vector.
+ int16_t intermediate_result_column[15];
+ };
+
+ const auto* const src = static_cast<const uint8_t*>(source);
+ using DestType =
+ typename std::conditional<is_compound, int16_t, uint8_t>::type;
+ auto* dst = static_cast<DestType*>(dest);
+
+ assert(block_width >= 8);
+ assert(block_height >= 8);
+
+ // Warp process applies for each 8x8 block.
+ int start_y = block_start_y;
+ do {
+ int start_x = block_start_x;
+ do {
+ const int src_x = (start_x + 4) << subsampling_x;
+ const int src_y = (start_y + 4) << subsampling_y;
+ const int dst_x =
+ src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
+ const int dst_y =
+ src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
+ const int x4 = dst_x >> subsampling_x;
+ const int y4 = dst_y >> subsampling_y;
+ const int ix4 = x4 >> kWarpedModelPrecisionBits;
+ const int iy4 = y4 >> kWarpedModelPrecisionBits;
+ // A prediction block may fall outside the frame's boundaries. If a
+ // prediction block is calculated using only samples outside the frame's
+ // boundary, the filtering can be simplified. We can divide the plane
+ // into several regions and handle them differently.
+ //
+ // | |
+ // 1 | 3 | 1
+ // | |
+ // -------+-----------+-------
+ // |***********|
+ // 2 |*****4*****| 2
+ // |***********|
+ // -------+-----------+-------
+ // | |
+ // 1 | 3 | 1
+ // | |
+ //
+ // At the center, region 4 represents the frame and is the general case.
+ //
+ // In regions 1 and 2, the prediction block is outside the frame's
+ // boundary horizontally. Therefore the horizontal filtering can be
+      // simplified. Furthermore, in region 1 (at the four corners), the
+ // prediction is outside the frame's boundary both horizontally and
+ // vertically, so we get a constant prediction block.
+ //
+ // In region 3, the prediction block is outside the frame's boundary
+      // vertically. Unfortunately, because we apply the horizontal filters
+ // first, by the time we apply the vertical filters, they no longer see
+ // simple inputs. So the only simplification is that all the rows are
+ // the same, but we still need to apply all the horizontal and vertical
+ // filters.
+
+ // Check for two simple special cases, where the horizontal filter can
+ // be significantly simplified.
+ //
+ // In general, for each row, the horizontal filter is calculated as
+ // follows:
+ // for (int x = -4; x < 4; ++x) {
+ // const int offset = ...;
+ // int sum = first_pass_offset;
+ // for (int k = 0; k < 8; ++k) {
+ // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
+ // sum += kWarpedFilters[offset][k] * src_row[column];
+ // }
+ // ...
+ // }
+ // The column index before clipping, ix4 + x + k - 3, varies in the range
+ // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
+ // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
+ // border index (source_width - 1 or 0, respectively). Then for each x,
+ // the inner for loop of the horizontal filter is reduced to multiplying
+ // the border pixel by the sum of the filter coefficients.
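+      // Since the filter coefficients sum to 128 (1 << kFilterBits), that
+      // product is simply border_pixel << kFilterBits, which regions 1 and 2
+      // below take advantage of.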
+ if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
+ // Regions 1 and 2.
+ // Points to the left or right border of the first row of |src|.
+ const uint8_t* first_row_border =
+ (ix4 + 7 <= 0) ? src : src + source_width - 1;
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+ // Region 1.
+ // Every sample used to calculate the prediction block has the same
+ // value. So the whole prediction block has the same value.
+ const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const uint8_t row_border_pixel =
+ first_row_border[row * source_stride];
+
+ DestType* dst_row = dst + start_x - block_start_x;
+ for (int y = 0; y < 8; ++y) {
+ if (is_compound) {
+ const int16x8_t sum =
+ vdupq_n_s16(row_border_pixel << (kInterRoundBitsVertical -
+ kRoundBitsVertical));
+ vst1q_s16(reinterpret_cast<int16_t*>(dst_row), sum);
+ } else {
+ memset(dst_row, row_border_pixel, 8);
+ }
+ dst_row += dest_stride;
+ }
+ // End of region 1. Continue the |start_x| do-while loop.
+ start_x += 8;
+ continue;
+ }
+
+ // Region 2.
+ // Horizontal filter.
+ // The input values in this region are generated by extending the border
+        // The input values in this region are generated by extending the
+        // border, which makes them identical in the horizontal direction.
+        // This computation could be inlined in the vertical pass, but most
+ // It is not necessary to use the offset values here because the
+ // horizontal pass is a simple shift and the vertical pass will always
+ // require using 32 bits.
+ for (int y = -7; y < 8; ++y) {
+ // We may over-read up to 13 pixels above the top source row, or up
+ // to 13 pixels below the bottom source row. This is proved in
+ // warp.cc.
+ const int row = iy4 + y;
+ int sum = first_row_border[row * source_stride];
+ sum <<= (kFilterBits - kInterRoundBitsHorizontal);
+ intermediate_result_column[y + 7] = sum;
+ }
+ // Vertical filter.
+ DestType* dst_row = dst + start_x - block_start_x;
+ int sy4 =
+ (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+#if defined(__aarch64__)
+ const int16x8_t intermediate =
+ vld1q_s16(&intermediate_result_column[y]);
+ int16_t tmp[8];
+ for (int x = 0; x < 8; ++x) {
+ const int offset =
+ RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ const int16x8_t filter = vld1q_s16(kWarpedFilters[offset]);
+ const int32x4_t product_low =
+ vmull_s16(vget_low_s16(filter), vget_low_s16(intermediate));
+ const int32x4_t product_high =
+ vmull_s16(vget_high_s16(filter), vget_high_s16(intermediate));
+ // vaddvq_s32 is only available on __aarch64__.
+ const int32_t sum =
+ vaddvq_s32(product_low) + vaddvq_s32(product_high);
+ const int16_t sum_descale =
+ RightShiftWithRounding(sum, kRoundBitsVertical);
+ if (is_compound) {
+ dst_row[x] = sum_descale;
+ } else {
+ tmp[x] = sum_descale;
+ }
+ sy += gamma;
+ }
+ if (!is_compound) {
+ const int16x8_t sum = vld1q_s16(tmp);
+ vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
+ }
+#else // !defined(__aarch64__)
+ int16x8_t filter[8];
+ for (int x = 0; x < 8; ++x) {
+ const int offset =
+ RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ filter[x] = vld1q_s16(kWarpedFilters[offset]);
+ sy += gamma;
+ }
+ Transpose8x8(filter);
+ int32x4_t sum_low = vdupq_n_s32(0);
+ int32x4_t sum_high = sum_low;
+ for (int k = 0; k < 8; ++k) {
+ const int16_t intermediate = intermediate_result_column[y + k];
+ sum_low =
+ vmlal_n_s16(sum_low, vget_low_s16(filter[k]), intermediate);
+ sum_high =
+ vmlal_n_s16(sum_high, vget_high_s16(filter[k]), intermediate);
+ }
+ const int16x8_t sum =
+ vcombine_s16(vrshrn_n_s32(sum_low, kRoundBitsVertical),
+ vrshrn_n_s32(sum_high, kRoundBitsVertical));
+ if (is_compound) {
+ vst1q_s16(reinterpret_cast<int16_t*>(dst_row), sum);
+ } else {
+ vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
+ }
+#endif // defined(__aarch64__)
+ dst_row += dest_stride;
+ sy4 += delta;
+ }
+ // End of region 2. Continue the |start_x| do-while loop.
+ start_x += 8;
+ continue;
+ }
+
+ // Regions 3 and 4.
+ // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+ // Region 3.
+ // Horizontal filter.
+ const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const uint8_t* const src_row = src + row * source_stride;
+ // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+ // read but is ignored.
+ //
+ // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+ // bytes after src_row[source_width - 1]. We assume the source frame
+ // has left and right borders of at least 13 bytes that extend the
+ // frame boundary pixels. We also assume there is at least one extra
+ // padding byte after the right border of the last source row.
+ const uint8x16_t src_row_v = vld1q_u8(&src_row[ix4 - 7]);
+ // Convert src_row_v to int8 (subtract 128).
+ const int8x16_t src_row_centered =
+ vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128)));
+ int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ HorizontalFilter(sx4, alpha, src_row_centered,
+ intermediate_result[y + 7]);
+ sx4 += beta;
+ }
+ } else {
+ // Region 4.
+ // Horizontal filter.
+ int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ // We may over-read up to 13 pixels above the top source row, or up
+ // to 13 pixels below the bottom source row. This is proved in
+ // warp.cc.
+ const int row = iy4 + y;
+ const uint8_t* const src_row = src + row * source_stride;
+ // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+ // read but is ignored.
+ //
+ // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+ // bytes after src_row[source_width - 1]. We assume the source frame
+ // has left and right borders of at least 13 bytes that extend the
+ // frame boundary pixels. We also assume there is at least one extra
+ // padding byte after the right border of the last source row.
+ const uint8x16_t src_row_v = vld1q_u8(&src_row[ix4 - 7]);
+ // Convert src_row_v to int8 (subtract 128).
+ const int8x16_t src_row_centered =
+ vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128)));
+ HorizontalFilter(sx4, alpha, src_row_centered,
+ intermediate_result[y + 7]);
+ sx4 += beta;
+ }
+ }
+
+ // Regions 3 and 4.
+ // Vertical filter.
+ DestType* dst_row = dst + start_x - block_start_x;
+ int sy4 =
+ (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+ int16x8_t filter[8];
+ for (int x = 0; x < 8; ++x) {
+ const int offset =
+ RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ filter[x] = vld1q_s16(kWarpedFilters[offset]);
+ sy += gamma;
+ }
+ Transpose8x8(filter);
+ int32x4_t sum_low = vdupq_n_s32(-kOffsetRemoval);
+ int32x4_t sum_high = sum_low;
+ for (int k = 0; k < 8; ++k) {
+ const int16x8_t intermediate = vld1q_s16(intermediate_result[y + k]);
+ sum_low = vmlal_s16(sum_low, vget_low_s16(filter[k]),
+ vget_low_s16(intermediate));
+ sum_high = vmlal_s16(sum_high, vget_high_s16(filter[k]),
+ vget_high_s16(intermediate));
+ }
+ const int16x8_t sum =
+ vcombine_s16(vrshrn_n_s32(sum_low, kRoundBitsVertical),
+ vrshrn_n_s32(sum_high, kRoundBitsVertical));
+ if (is_compound) {
+ vst1q_s16(reinterpret_cast<int16_t*>(dst_row), sum);
+ } else {
+ vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
+ }
+ dst_row += dest_stride;
+ sy4 += delta;
+ }
+ start_x += 8;
+ } while (start_x < block_start_x + block_width);
+ dst += 8 * dest_stride;
+ start_y += 8;
+ } while (start_y < block_start_y + block_height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->warp = Warp_NEON</*is_compound=*/false>;
+ dsp->warp_compound = Warp_NEON</*is_compound=*/true>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void WarpInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void WarpInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/warp_neon.h b/src/dsp/arm/warp_neon.h
new file mode 100644
index 0000000..dbcaa23
--- /dev/null
+++ b/src/dsp/arm/warp_neon.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_WARP_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_WARP_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::warp. This function is not thread-safe.
+void WarpInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_Warp LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WarpCompound LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_WARP_NEON_H_
diff --git a/src/dsp/arm/weight_mask_neon.cc b/src/dsp/arm/weight_mask_neon.cc
new file mode 100644
index 0000000..49d3be0
--- /dev/null
+++ b/src/dsp/arm/weight_mask_neon.cc
@@ -0,0 +1,463 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/arm/weight_mask_neon.h"
+
+#include "src/dsp/weight_mask.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+constexpr int kRoundingBits8bpp = 4;
+
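+// Computes 8 mask values from 8 pairs of compound predictions. Each value is
+//   min(64, 38 + (RightShiftWithRounding(|p0 - p1|, kRoundingBits8bpp) >> 4))
+// and is inverted to 64 - mask when |mask_is_inverse| is set.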
+template <bool mask_is_inverse>
+inline void WeightMask8_NEON(const int16_t* prediction_0,
+ const int16_t* prediction_1, uint8_t* mask) {
+ const int16x8_t pred_0 = vld1q_s16(prediction_0);
+ const int16x8_t pred_1 = vld1q_s16(prediction_1);
+ const uint8x8_t difference_offset = vdup_n_u8(38);
+ const uint8x8_t mask_ceiling = vdup_n_u8(64);
+ const uint16x8_t difference = vrshrq_n_u16(
+ vreinterpretq_u16_s16(vabdq_s16(pred_0, pred_1)), kRoundingBits8bpp);
+ const uint8x8_t adjusted_difference =
+ vqadd_u8(vqshrn_n_u16(difference, 4), difference_offset);
+ const uint8x8_t mask_value = vmin_u8(adjusted_difference, mask_ceiling);
+ if (mask_is_inverse) {
+ const uint8x8_t inverted_mask_value = vsub_u8(mask_ceiling, mask_value);
+ vst1_u8(mask, inverted_mask_value);
+ } else {
+ vst1_u8(mask, mask_value);
+ }
+}
+
+#define WEIGHT8_WITHOUT_STRIDE \
+ WeightMask8_NEON<mask_is_inverse>(pred_0, pred_1, mask)
+
+#define WEIGHT8_AND_STRIDE \
+ WEIGHT8_WITHOUT_STRIDE; \
+ pred_0 += 8; \
+ pred_1 += 8; \
+ mask += mask_stride
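+// These macros blend one 8-pixel row. The prediction buffers are read
+// contiguously (advancing by the row width), while only |mask| advances by
+// |mask_stride|.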
+
+template <bool mask_is_inverse>
+void WeightMask8x8_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y = 0;
+ do {
+ WEIGHT8_AND_STRIDE;
+ } while (++y < 7);
+ WEIGHT8_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x16_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT8_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x32_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_WITHOUT_STRIDE;
+}
+
+#define WEIGHT16_WITHOUT_STRIDE \
+ WeightMask8_NEON<mask_is_inverse>(pred_0, pred_1, mask); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8)
+
+#define WEIGHT16_AND_STRIDE \
+ WEIGHT16_WITHOUT_STRIDE; \
+ pred_0 += 16; \
+ pred_1 += 16; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask16x8_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ } while (++y < 7);
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x16_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x32_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x64_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ } while (++y3 < 21);
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+#define WEIGHT32_WITHOUT_STRIDE \
+ WeightMask8_NEON<mask_is_inverse>(pred_0, pred_1, mask); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24)
+
+#define WEIGHT32_AND_STRIDE \
+ WEIGHT32_WITHOUT_STRIDE; \
+ pred_0 += 32; \
+ pred_1 += 32; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask32x8_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x16_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x32_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x64_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ } while (++y3 < 21);
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+#define WEIGHT64_WITHOUT_STRIDE \
+ WeightMask8_NEON<mask_is_inverse>(pred_0, pred_1, mask); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 32, pred_1 + 32, mask + 32); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 40, pred_1 + 40, mask + 40); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 48, pred_1 + 48, mask + 48); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 56, pred_1 + 56, mask + 56)
+
+#define WEIGHT64_AND_STRIDE \
+ WEIGHT64_WITHOUT_STRIDE; \
+ pred_0 += 64; \
+ pred_1 += 64; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask64x16_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x32_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x64_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y3 < 21);
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x128_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y3 < 42);
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x64_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+ do {
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+ } while (++y3 < 21);
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x128_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+ do {
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+ } while (++y3 < 42);
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+#define INIT_WEIGHT_MASK_8BPP(width, height, w_index, h_index) \
+ dsp->weight_mask[w_index][h_index][0] = \
+ WeightMask##width##x##height##_NEON<0>; \
+ dsp->weight_mask[w_index][h_index][1] = WeightMask##width##x##height##_NEON<1>
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ INIT_WEIGHT_MASK_8BPP(8, 8, 0, 0);
+ INIT_WEIGHT_MASK_8BPP(8, 16, 0, 1);
+ INIT_WEIGHT_MASK_8BPP(8, 32, 0, 2);
+ INIT_WEIGHT_MASK_8BPP(16, 8, 1, 0);
+ INIT_WEIGHT_MASK_8BPP(16, 16, 1, 1);
+ INIT_WEIGHT_MASK_8BPP(16, 32, 1, 2);
+ INIT_WEIGHT_MASK_8BPP(16, 64, 1, 3);
+ INIT_WEIGHT_MASK_8BPP(32, 8, 2, 0);
+ INIT_WEIGHT_MASK_8BPP(32, 16, 2, 1);
+ INIT_WEIGHT_MASK_8BPP(32, 32, 2, 2);
+ INIT_WEIGHT_MASK_8BPP(32, 64, 2, 3);
+ INIT_WEIGHT_MASK_8BPP(64, 16, 3, 1);
+ INIT_WEIGHT_MASK_8BPP(64, 32, 3, 2);
+ INIT_WEIGHT_MASK_8BPP(64, 64, 3, 3);
+ INIT_WEIGHT_MASK_8BPP(64, 128, 3, 4);
+ INIT_WEIGHT_MASK_8BPP(128, 64, 4, 3);
+ INIT_WEIGHT_MASK_8BPP(128, 128, 4, 4);
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void WeightMaskInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void WeightMaskInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/weight_mask_neon.h b/src/dsp/arm/weight_mask_neon.h
new file mode 100644
index 0000000..b4749ec
--- /dev/null
+++ b/src/dsp/arm/weight_mask_neon.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::weight_mask. This function is not thread-safe.
+void WeightMaskInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_8x8 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_8x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_8x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_16x8 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_16x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_16x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_16x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_32x8 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_32x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_32x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_32x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_64x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_64x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_64x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_64x128 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_128x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_128x128 LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_NEON_H_
diff --git a/src/dsp/average_blend.cc b/src/dsp/average_blend.cc
new file mode 100644
index 0000000..a59abb0
--- /dev/null
+++ b/src/dsp/average_blend.cc
@@ -0,0 +1,101 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/average_blend.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int bitdepth, typename Pixel>
+void AverageBlend_C(const void* prediction_0, const void* prediction_1,
+ const int width, const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ // 7.11.3.2 Rounding variables derivation process
+ // 2 * FILTER_BITS(7) - (InterRound0(3|5) + InterRound1(7))
+ constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4;
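+  // For 8bpp this reduces to
+  //   dst[x] = Clip3((pred_0[x] + pred_1[x] + 16) >> 5, 0, 255).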
+ using PredType =
+ typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+ const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+ const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+ auto* dst = static_cast<Pixel*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
+
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ // See warp.cc and convolve.cc for detailed prediction ranges.
+ int res = pred_0[x] + pred_1[x];
+ res -= (bitdepth == 8) ? 0 : kCompoundOffset + kCompoundOffset;
+ dst[x] = static_cast<Pixel>(
+ Clip3(RightShiftWithRounding(res, inter_post_round_bits + 1), 0,
+ (1 << bitdepth) - 1));
+ } while (++x < width);
+
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (++y < height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->average_blend = AverageBlend_C<8, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_AverageBlend
+ dsp->average_blend = AverageBlend_C<8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_AverageBlend
+ dsp->average_blend = AverageBlend_C<10, uint16_t>;
+#endif
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_AverageBlend
+ dsp->average_blend = AverageBlend_C<10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+} // namespace
+
+void AverageBlendInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/average_blend.h b/src/dsp/average_blend.h
new file mode 100644
index 0000000..02ecd09
--- /dev/null
+++ b/src/dsp/average_blend.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_AVERAGE_BLEND_H_
+#define LIBGAV1_SRC_DSP_AVERAGE_BLEND_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/average_blend_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/average_blend_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::average_blend. This function is not thread-safe.
+void AverageBlendInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_AVERAGE_BLEND_H_
diff --git a/src/dsp/cdef.cc b/src/dsp/cdef.cc
new file mode 100644
index 0000000..0b50517
--- /dev/null
+++ b/src/dsp/cdef.cc
@@ -0,0 +1,306 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// Silence unused function warnings when CdefDirection_C is obviated.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp8bpp_CdefDirection) || \
+ (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefDirection))
+constexpr int16_t kDivisionTable[] = {840, 420, 280, 210, 168, 140, 120, 105};
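+// Each entry equals 840 / (i + 1); 840 is the least common multiple of 1
+// through 8, so normalizing a partial sum of i + 1 pixels reduces to an
+// integer multiply.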
+
+int32_t Square(int32_t x) { return x * x; }
+
+template <int bitdepth, typename Pixel>
+void CdefDirection_C(const void* const source, ptrdiff_t stride,
+ uint8_t* const direction, int* const variance) {
+ assert(direction != nullptr);
+ assert(variance != nullptr);
+ const auto* src = static_cast<const Pixel*>(source);
+ stride /= sizeof(Pixel);
+ int32_t cost[8] = {};
+ // |partial| does not have to be int32_t for 8bpp. int16_t will suffice. We
+ // use int32_t to keep it simple since |cost| will have to be int32_t.
+ int32_t partial[8][15] = {};
+ for (int i = 0; i < 8; ++i) {
+ for (int j = 0; j < 8; ++j) {
+ const int x = (src[j] >> (bitdepth - 8)) - 128;
+ partial[0][i + j] += x;
+ partial[1][i + j / 2] += x;
+ partial[2][i] += x;
+ partial[3][3 + i - j / 2] += x;
+ partial[4][7 + i - j] += x;
+ partial[5][3 - i / 2 + j] += x;
+ partial[6][j] += x;
+ partial[7][i / 2 + j] += x;
+ }
+ src += stride;
+ }
+ for (int i = 0; i < 8; ++i) {
+ cost[2] += Square(partial[2][i]);
+ cost[6] += Square(partial[6][i]);
+ }
+ cost[2] *= kDivisionTable[7];
+ cost[6] *= kDivisionTable[7];
+ for (int i = 0; i < 7; ++i) {
+ cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+ kDivisionTable[i];
+ cost[4] += (Square(partial[4][i]) + Square(partial[4][14 - i])) *
+ kDivisionTable[i];
+ }
+ cost[0] += Square(partial[0][7]) * kDivisionTable[7];
+ cost[4] += Square(partial[4][7]) * kDivisionTable[7];
+ for (int i = 1; i < 8; i += 2) {
+ for (int j = 0; j < 5; ++j) {
+ cost[i] += Square(partial[i][3 + j]);
+ }
+ cost[i] *= kDivisionTable[7];
+ for (int j = 0; j < 3; ++j) {
+ cost[i] += (Square(partial[i][j]) + Square(partial[i][10 - j])) *
+ kDivisionTable[2 * j + 1];
+ }
+ }
+ int32_t best_cost = 0;
+ *direction = 0;
+ for (int i = 0; i < 8; ++i) {
+ if (cost[i] > best_cost) {
+ best_cost = cost[i];
+ *direction = i;
+ }
+ }
+ *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
+ // !defined(LIBGAV1_Dsp8bpp_CdefDirection) ||
+ // (LIBGAV1_MAX_BITDEPTH >= 10 &&
+ // !defined(LIBGAV1_Dsp10bpp_CdefDirection))
+
+// Silence unused function warnings when CdefFilter_C is obviated.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp8bpp_CdefFilters) || \
+ (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefFilters))
+
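+// Constrain() maps the difference between a candidate pixel and the center
+// pixel to a bounded correction. For example, with threshold = 4 and
+// damping = 3, Constrain(3, 4, 3) == 3 (small differences pass through) while
+// Constrain(20, 4, 3) == 0 (large differences are treated as real edges and
+// ignored).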
+int Constrain(int diff, int threshold, int damping) {
+ assert(threshold != 0);
+ damping = std::max(0, damping - FloorLog2(threshold));
+ const int sign = (diff < 0) ? -1 : 1;
+ return sign *
+ Clip3(threshold - (std::abs(diff) >> damping), 0, std::abs(diff));
+}
+
+// Filters the source block. It does not check whether a candidate pixel lies
+// inside the frame; instead, it requires the source input to be padded with a
+// constant large value (kCdefLargeValue) at the frame boundary.
+template <int block_width, int bitdepth, typename Pixel,
+ bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_C(const uint16_t* src, const ptrdiff_t src_stride,
+ const int block_height, const int primary_strength,
+ const int secondary_strength, const int damping,
+ const int direction, void* const dest,
+ const ptrdiff_t dest_stride) {
+ static_assert(block_width == 4 || block_width == 8, "Invalid CDEF width.");
+ static_assert(enable_primary || enable_secondary, "");
+ assert(block_height == 4 || block_height == 8);
+ assert(direction >= 0 && direction <= 7);
+ constexpr int coeff_shift = bitdepth - 8;
+ // Section 5.9.19. CDEF params syntax.
+ assert(primary_strength >= 0 && primary_strength <= 15 << coeff_shift);
+ assert(secondary_strength >= 0 && secondary_strength <= 4 << coeff_shift &&
+ secondary_strength != 3 << coeff_shift);
+ assert(primary_strength != 0 || secondary_strength != 0);
+ // damping is decreased by 1 for chroma.
+ assert((damping >= 3 && damping <= 6 + coeff_shift) ||
+ (damping >= 2 && damping <= 5 + coeff_shift));
+  // When only primary_strength or secondary_strength is non-zero, the number
+  // of pixels inspected (4 for primary_strength, 8 for secondary_strength) and
+  // the taps used do not exceed the amount the sum is descaled by (16), so we
+  // can skip tracking and clipping to the minimum and maximum value observed.
+ constexpr bool clipping_required = enable_primary && enable_secondary;
+ static constexpr int kCdefSecondaryTaps[2] = {kCdefSecondaryTap0,
+ kCdefSecondaryTap1};
+ auto* dst = static_cast<Pixel*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
+ int y = block_height;
+ do {
+ int x = 0;
+ do {
+ int16_t sum = 0;
+ const uint16_t pixel_value = src[x];
+ uint16_t max_value = pixel_value;
+ uint16_t min_value = pixel_value;
+ for (int k = 0; k < 2; ++k) {
+ static constexpr int signs[] = {-1, 1};
+ for (const int& sign : signs) {
+ if (enable_primary) {
+ const int dy = sign * kCdefDirections[direction][k][0];
+ const int dx = sign * kCdefDirections[direction][k][1];
+ const uint16_t value = src[dy * src_stride + dx + x];
+ // Note: the summation can ignore the condition check in SIMD
+ // implementation, because Constrain() will return 0 when
+ // value == kCdefLargeValue.
+ if (value != kCdefLargeValue) {
+ sum += Constrain(value - pixel_value, primary_strength, damping) *
+ kCdefPrimaryTaps[(primary_strength >> coeff_shift) & 1][k];
+ if (clipping_required) {
+ max_value = std::max(value, max_value);
+ min_value = std::min(value, min_value);
+ }
+ }
+ }
+
+ if (enable_secondary) {
+ static constexpr int offsets[] = {-2, 2};
+ for (const int& offset : offsets) {
+ const int dy = sign * kCdefDirections[direction + offset][k][0];
+ const int dx = sign * kCdefDirections[direction + offset][k][1];
+ const uint16_t value = src[dy * src_stride + dx + x];
+ // Note: the summation can ignore the condition check in SIMD
+ // implementation.
+ if (value != kCdefLargeValue) {
+ sum += Constrain(value - pixel_value, secondary_strength,
+ damping) *
+ kCdefSecondaryTaps[k];
+ if (clipping_required) {
+ max_value = std::max(value, max_value);
+ min_value = std::min(value, min_value);
+ }
+ }
+ }
+ }
+ }
+ }
+
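+      // Divide the accumulated correction by 16, rounding half away from
+      // zero, e.g. sum == 20 gives an offset of 1 and sum == -20 gives -1.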
+ const int offset = (8 + sum - (sum < 0)) >> 4;
+ if (clipping_required) {
+ dst[x] = static_cast<Pixel>(
+ Clip3(pixel_value + offset, min_value, max_value));
+ } else {
+ dst[x] = static_cast<Pixel>(pixel_value + offset);
+ }
+ } while (++x < block_width);
+
+ src += src_stride;
+ dst += dst_stride;
+ } while (--y != 0);
+}
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
+ // !defined(LIBGAV1_Dsp8bpp_CdefFilters) ||
+ // (LIBGAV1_MAX_BITDEPTH >= 10 &&
+ // !defined(LIBGAV1_Dsp10bpp_CdefFilters))
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->cdef_direction = CdefDirection_C<8, uint8_t>;
+ dsp->cdef_filters[0][0] = CdefFilter_C<4, 8, uint8_t>;
+ dsp->cdef_filters[0][1] = CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] =
+ CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_C<8, 8, uint8_t>;
+ dsp->cdef_filters[1][1] = CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] =
+ CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/false>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_CdefDirection
+ dsp->cdef_direction = CdefDirection_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_CdefFilters
+ dsp->cdef_filters[0][0] = CdefFilter_C<4, 8, uint8_t>;
+ dsp->cdef_filters[0][1] = CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] =
+ CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_C<8, 8, uint8_t>;
+ dsp->cdef_filters[1][1] = CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] =
+ CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/false>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->cdef_direction = CdefDirection_C<10, uint16_t>;
+ dsp->cdef_filters[0][0] = CdefFilter_C<4, 10, uint16_t>;
+ dsp->cdef_filters[0][1] =
+ CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] =
+ CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_C<8, 10, uint16_t>;
+ dsp->cdef_filters[1][1] =
+ CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] =
+ CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/false>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_CdefDirection
+ dsp->cdef_direction = CdefDirection_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_CdefFilters
+ dsp->cdef_filters[0][0] = CdefFilter_C<4, 10, uint16_t>;
+ dsp->cdef_filters[0][1] =
+ CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] =
+ CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_C<8, 10, uint16_t>;
+ dsp->cdef_filters[1][1] =
+ CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] =
+ CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/false>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+} // namespace
+
+void CdefInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/cdef.h b/src/dsp/cdef.h
new file mode 100644
index 0000000..2d70d2c
--- /dev/null
+++ b/src/dsp/cdef.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_CDEF_H_
+#define LIBGAV1_SRC_DSP_CDEF_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/cdef_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/cdef_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
+void CdefInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_CDEF_H_
diff --git a/src/dsp/cdef.inc b/src/dsp/cdef.inc
new file mode 100644
index 0000000..c1a3136
--- /dev/null
+++ b/src/dsp/cdef.inc
@@ -0,0 +1,29 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants used for cdef implementations.
+// This will be included inside an anonymous namespace on files where these are
+// necessary.
+
+const int8_t (*const kCdefDirections)[2][2] = kCdefDirectionsPadded + 2;
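+// The +2 recenters the padded table so that kCdefDirections[direction + offset]
+// is valid for direction in [0, 7] and offset in {-2, 0, 2} without masking.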
+
+// Mirror values and pad to 16 elements.
+alignas(16) constexpr uint32_t kCdefDivisionTable[] = {
+ 840, 420, 280, 210, 168, 140, 120, 105,
+ 120, 140, 168, 210, 280, 420, 840, 0};
+
+// Used when calculating odd |cost[x]| values to mask off unwanted elements.
+// Holds elements 1 3 5 X 5 3 1 X
+alignas(16) constexpr uint32_t kCdefDivisionTableOdd[] = {420, 210, 140, 0,
+ 140, 210, 420, 0};
diff --git a/src/dsp/common.h b/src/dsp/common.h
new file mode 100644
index 0000000..d614a81
--- /dev/null
+++ b/src/dsp/common.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_COMMON_H_
+#define LIBGAV1_SRC_DSP_COMMON_H_
+
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+enum { kSgrStride = kRestorationUnitWidth + 32 }; // anonymous enum
+
+// Self guided projection filter.
+struct SgrProjInfo {
+ int index;
+ int multiplier[2];
+};
+
+struct WienerInfo {
+ static const int kVertical = 0;
+ static const int kHorizontal = 1;
+ int16_t number_leading_zero_coefficients[2];
+ alignas(kMaxAlignment) int16_t filter[2][(kWienerFilterTaps + 1) / 2];
+};
+
+struct RestorationUnitInfo : public MaxAlignedAllocable {
+ LoopRestorationType type;
+ SgrProjInfo sgr_proj_info;
+ WienerInfo wiener_info;
+};
+
+struct SgrBuffer {
+ alignas(kMaxAlignment) uint16_t sum3[4 * kSgrStride];
+ alignas(kMaxAlignment) uint16_t sum5[5 * kSgrStride];
+ alignas(kMaxAlignment) uint32_t square_sum3[4 * kSgrStride];
+ alignas(kMaxAlignment) uint32_t square_sum5[5 * kSgrStride];
+ alignas(kMaxAlignment) uint16_t ma343[4 * kRestorationUnitWidth];
+ alignas(kMaxAlignment) uint16_t ma444[3 * kRestorationUnitWidth];
+ alignas(kMaxAlignment) uint16_t ma565[2 * kRestorationUnitWidth];
+ alignas(kMaxAlignment) uint32_t b343[4 * kRestorationUnitWidth];
+ alignas(kMaxAlignment) uint32_t b444[3 * kRestorationUnitWidth];
+ alignas(kMaxAlignment) uint32_t b565[2 * kRestorationUnitWidth];
+  // The following 2 buffers are only used by the C functions. Since SgrBuffer
+  // is smaller than |wiener_buffer| in RestorationBuffer, which is a union,
+  // it's OK to always keep the following 2 buffers.
+ alignas(kMaxAlignment) uint8_t ma[kSgrStride]; // [0, 255]
+ // b is less than 2^16 for 8-bit. However, making it a template slows down the
+ // C function by 5%. So b is fixed to 32-bit.
+ alignas(kMaxAlignment) uint32_t b[kSgrStride];
+};
+
+union RestorationBuffer {
+ // For self-guided filter.
+ SgrBuffer sgr_buffer;
+ // For wiener filter.
+ // The array |intermediate| in Section 7.17.4, the intermediate results
+ // between the horizontal and vertical filters.
+ alignas(kMaxAlignment) int16_t
+ wiener_buffer[(kRestorationUnitHeight + kWienerFilterTaps - 1) *
+ kRestorationUnitWidth];
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_COMMON_H_
diff --git a/src/dsp/constants.cc b/src/dsp/constants.cc
new file mode 100644
index 0000000..0099ca3
--- /dev/null
+++ b/src/dsp/constants.cc
@@ -0,0 +1,103 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/constants.h"
+
+#include <cstdint>
+
+namespace libgav1 {
+
+// Each set of 7 taps is padded with a 0 to easily align and pack into the high
+// and low 8 bytes. This way, we can load 16 at a time to fit mulhi and mullo.
+const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8] = {
+ {{-6, 10, 0, 0, 0, 12, 0, 0},
+ {-5, 2, 10, 0, 0, 9, 0, 0},
+ {-3, 1, 1, 10, 0, 7, 0, 0},
+ {-3, 1, 1, 2, 10, 5, 0, 0},
+ {-4, 6, 0, 0, 0, 2, 12, 0},
+ {-3, 2, 6, 0, 0, 2, 9, 0},
+ {-3, 2, 2, 6, 0, 2, 7, 0},
+ {-3, 1, 2, 2, 6, 3, 5, 0}},
+ {{-10, 16, 0, 0, 0, 10, 0, 0},
+ {-6, 0, 16, 0, 0, 6, 0, 0},
+ {-4, 0, 0, 16, 0, 4, 0, 0},
+ {-2, 0, 0, 0, 16, 2, 0, 0},
+ {-10, 16, 0, 0, 0, 0, 10, 0},
+ {-6, 0, 16, 0, 0, 0, 6, 0},
+ {-4, 0, 0, 16, 0, 0, 4, 0},
+ {-2, 0, 0, 0, 16, 0, 2, 0}},
+ {{-8, 8, 0, 0, 0, 16, 0, 0},
+ {-8, 0, 8, 0, 0, 16, 0, 0},
+ {-8, 0, 0, 8, 0, 16, 0, 0},
+ {-8, 0, 0, 0, 8, 16, 0, 0},
+ {-4, 4, 0, 0, 0, 0, 16, 0},
+ {-4, 0, 4, 0, 0, 0, 16, 0},
+ {-4, 0, 0, 4, 0, 0, 16, 0},
+ {-4, 0, 0, 0, 4, 0, 16, 0}},
+ {{-2, 8, 0, 0, 0, 10, 0, 0},
+ {-1, 3, 8, 0, 0, 6, 0, 0},
+ {-1, 2, 3, 8, 0, 4, 0, 0},
+ {0, 1, 2, 3, 8, 2, 0, 0},
+ {-1, 4, 0, 0, 0, 3, 10, 0},
+ {-1, 3, 4, 0, 0, 4, 6, 0},
+ {-1, 2, 3, 4, 0, 4, 4, 0},
+ {-1, 2, 2, 3, 4, 3, 3, 0}},
+ {{-12, 14, 0, 0, 0, 14, 0, 0},
+ {-10, 0, 14, 0, 0, 12, 0, 0},
+ {-9, 0, 0, 14, 0, 11, 0, 0},
+ {-8, 0, 0, 0, 14, 10, 0, 0},
+ {-10, 12, 0, 0, 0, 0, 14, 0},
+ {-9, 1, 12, 0, 0, 0, 12, 0},
+ {-8, 0, 0, 12, 0, 1, 11, 0},
+ {-7, 0, 0, 1, 12, 1, 9, 0}}};
+
+// A lookup table replacing the calculation of the variable s in Section 7.17.3
+// (Box filter process). The first index is sgr_proj_index (the lr_sgr_set
+// syntax element in the Spec, saved in the sgr_proj_info.index field of a
+// RestorationUnitInfo struct). The second index is pass (0 or 1).
+//
+// const uint8_t scale = kSgrProjParams[sgr_proj_index][pass * 2 + 1];
+// const uint32_t n2_with_scale = n * n * scale;
+// const uint32_t s =
+// ((1 << kSgrProjScaleBits) + (n2_with_scale >> 1)) / n2_with_scale;
+// 0 is an invalid value, corresponding to radius = 0, where the filter is
+// skipped.
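+//
+// Worked example for sgr_proj_index 0, pass 0 (assuming kSgrProjParams[0] is
+// {2, 12, 1, 4}, i.e. the spec's Sgr_Params[0], and kSgrProjScaleBits == 20):
+// radius 2 gives n = 25, so n2_with_scale = 25 * 25 * 12 = 7500 and
+// s = ((1 << 20) + 3750) / 7500 = 140, the first entry below. Pass 1 uses
+// radius 1, so n = 9, n2_with_scale = 9 * 9 * 4 = 324 and s = 3236.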
+const uint16_t kSgrScaleParameter[16][2] = {
+ {140, 3236}, {112, 2158}, {93, 1618}, {80, 1438}, {70, 1295}, {58, 1177},
+ {47, 1079}, {37, 996}, {30, 925}, {25, 863}, {0, 2589}, {0, 1618},
+ {0, 1177}, {0, 925}, {56, 0}, {22, 0},
+};
+
+const uint8_t kCdefPrimaryTaps[2][2] = {{4, 2}, {3, 3}};
+
+// This is Cdef_Directions (section 7.15.3) with 2 padding entries at the
+// beginning and end of the table. The cdef direction range is [0, 7] and the
+// first index is offset +/-2. This removes the need to constrain the first
+// index to the same range using e.g., & 7.
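+// For example, CdefFilter_C reads kCdefDirections[direction + offset] with
+// direction in [0, 7] and offset in {-2, 0, 2}; direction 7 with offset +2
+// resolves to the trailing padding entry, which duplicates
+// Cdef_Directions[(7 + 2) & 7] == Cdef_Directions[1].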
+const int8_t kCdefDirectionsPadded[12][2][2] = {
+ {{1, 0}, {2, 0}}, // Padding: Cdef_Directions[6]
+ {{1, 0}, {2, -1}}, // Padding: Cdef_Directions[7]
+ {{-1, 1}, {-2, 2}}, // Begin Cdef_Directions
+ {{0, 1}, {-1, 2}}, //
+ {{0, 1}, {0, 2}}, //
+ {{0, 1}, {1, 2}}, //
+ {{1, 1}, {2, 2}}, //
+ {{1, 0}, {2, 1}}, //
+ {{1, 0}, {2, 0}}, //
+ {{1, 0}, {2, -1}}, // End Cdef_Directions
+ {{-1, 1}, {-2, 2}}, // Padding: Cdef_Directions[0]
+ {{0, 1}, {-1, 2}}, // Padding: Cdef_Directions[1]
+};
+
+} // namespace libgav1
diff --git a/src/dsp/constants.h b/src/dsp/constants.h
new file mode 100644
index 0000000..7c1b62c
--- /dev/null
+++ b/src/dsp/constants.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_CONSTANTS_H_
+#define LIBGAV1_SRC_DSP_CONSTANTS_H_
+
+// This file contains DSP related constants that have a direct relationship with
+// a DSP component.
+
+#include <cstdint>
+
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+enum {
+ // Documentation variables.
+ kBitdepth8 = 8,
+ kBitdepth10 = 10,
+ kBitdepth12 = 12,
+ // Weights are quadratic from '1' to '1 / block_size', scaled by
+ // 2^kSmoothWeightScale.
+ kSmoothWeightScale = 8,
+ kCflLumaBufferStride = 32,
+ // InterRound0, Section 7.11.3.2.
+ kInterRoundBitsHorizontal = 3, // 8 & 10-bit.
+ kInterRoundBitsHorizontal12bpp = 5,
+ kInterRoundBitsCompoundVertical = 7, // 8, 10 & 12-bit compound prediction.
+ kInterRoundBitsVertical = 11, // 8 & 10-bit, single prediction.
+ kInterRoundBitsVertical12bpp = 9,
+ // Offset applied to 10bpp and 12bpp predictors to allow storing them in
+ // uint16_t. Removed before blending.
+ kCompoundOffset = (1 << 14) + (1 << 13),
+ kCdefSecondaryTap0 = 2,
+ kCdefSecondaryTap1 = 1,
+}; // anonymous enum
+
+extern const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8];
+
+// Values in this enum can be derived as the sum of subsampling_x and
+// subsampling_y (since the subsampling_x == 0 && subsampling_y == 1 case is
+// never allowed by the bitstream).
+enum SubsamplingType : uint8_t {
+ kSubsamplingType444, // subsampling_x = 0, subsampling_y = 0.
+ kSubsamplingType422, // subsampling_x = 1, subsampling_y = 0.
+ kSubsamplingType420, // subsampling_x = 1, subsampling_y = 1.
+ kNumSubsamplingTypes
+};
+
+extern const uint16_t kSgrScaleParameter[16][2];
+
+extern const uint8_t kCdefPrimaryTaps[2][2];
+
+extern const int8_t kCdefDirectionsPadded[12][2][2];
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_CONSTANTS_H_
diff --git a/src/dsp/convolve.cc b/src/dsp/convolve.cc
new file mode 100644
index 0000000..8c6f68f
--- /dev/null
+++ b/src/dsp/convolve.cc
@@ -0,0 +1,876 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kHorizontalOffset = 3;
+constexpr int kVerticalOffset = 3;
+
+// Compound prediction output ranges from ConvolveTest.ShowRange.
+// Bitdepth: 8 Input range: [ 0, 255]
+// intermediate range: [ -7140, 23460]
+// first pass output range: [ -1785, 5865]
+// intermediate range: [ -328440, 589560]
+// second pass output range: [ 0, 255]
+// compound second pass output range: [ -5132, 9212]
+//
+// Bitdepth: 10 Input range: [ 0, 1023]
+// intermediate range: [ -28644, 94116]
+// first pass output range: [ -7161, 23529]
+// intermediate range: [-1317624, 2365176]
+// second pass output range: [ 0, 1023]
+// compound second pass output range: [ 3988, 61532]
+//
+// Bitdepth: 12 Input range: [ 0, 4095]
+// intermediate range: [ -114660, 376740]
+// first pass output range: [ -7166, 23546]
+// intermediate range: [-1318560, 2366880]
+// second pass output range: [ 0, 4095]
+// compound second pass output range: [ 3974, 61559]
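+//
+// The 10-bit and 12-bit compound ranges above already include kCompoundOffset
+// (24576), which is what allows the compound output to be stored in uint16_t;
+// the 8-bit compound range has no offset applied and fits in int16_t.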
+
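+// |subpixel_x|, |subpixel_y|, |step_x| and |step_y| are expressed in
+// 1/1024th-pel units (kScaleSubPixelBits fractional bits); the 4-bit filter
+// phase is taken from the upper fractional bits via (p >> 6) & kSubPixelMask.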
+template <int bitdepth, typename Pixel>
+void ConvolveScale2D_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index, const int subpixel_x,
+ const int subpixel_y, const int step_x, const int step_y,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ constexpr int kRoundBitsVertical =
+ (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
+ const int intermediate_height =
+ (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+ kScaleSubPixelBits) +
+ kSubPixelTaps;
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+ (2 * kMaxSuperBlockSizeInPixels + 8)];
+ const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+ const int max_pixel_value = (1 << bitdepth) - 1;
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [4, 5].
+ // Similarly for height.
+ int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ int16_t* intermediate = intermediate_result;
+ const auto* src = static_cast<const Pixel*>(reference);
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ auto* dest = static_cast<Pixel*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ // Note: assume the input src is already aligned to the correct start
+ // position.
+ int y = 0;
+ do {
+ int p = subpixel_x;
+ int x = 0;
+ do {
+ int sum = 0;
+ const Pixel* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
+ const int filter_id = (p >> 6) & kSubPixelMask;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src_x[k];
+ }
+ intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+ p += step_x;
+ } while (++x < width);
+
+ src += src_stride;
+ intermediate += intermediate_stride;
+ } while (++y < intermediate_height);
+
+ // Vertical filter.
+ filter_index = GetFilterIndex(vertical_filter_index, height);
+ intermediate = intermediate_result;
+ int p = subpixel_y & 1023;
+ y = 0;
+ do {
+ const int filter_id = (p >> 6) & kSubPixelMask;
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum +=
+ kHalfSubPixelFilters[filter_index][filter_id][k] *
+ intermediate[((p >> kScaleSubPixelBits) + k) * intermediate_stride +
+ x];
+ }
+ dest[x] = Clip3(RightShiftWithRounding(sum, kRoundBitsVertical - 1), 0,
+ max_pixel_value);
+ } while (++x < width);
+
+ dest += dest_stride;
+ p += step_y;
+ } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundScale2D_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int subpixel_x, const int subpixel_y,
+ const int step_x, const int step_y,
+ const int width, const int height,
+ void* prediction, const ptrdiff_t pred_stride) {
+ // All compound functions output to the predictor buffer with |pred_stride|
+ // equal to |width|.
+ assert(pred_stride == width);
+ // Compound functions start at 4x4.
+ assert(width >= 4 && height >= 4);
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ constexpr int kRoundBitsVertical = kInterRoundBitsCompoundVertical;
+ const int intermediate_height =
+ (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+ kScaleSubPixelBits) +
+ kSubPixelTaps;
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+ (2 * kMaxSuperBlockSizeInPixels + 8)];
+ const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [4, 5].
+ // Similarly for height.
+ int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ int16_t* intermediate = intermediate_result;
+ const auto* src = static_cast<const Pixel*>(reference);
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ auto* dest = static_cast<uint16_t*>(prediction);
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ // Note: assume the input src is already aligned to the correct start
+ // position.
+ int y = 0;
+ do {
+ int p = subpixel_x;
+ int x = 0;
+ do {
+ int sum = 0;
+ const Pixel* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
+ const int filter_id = (p >> 6) & kSubPixelMask;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src_x[k];
+ }
+ intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+ p += step_x;
+ } while (++x < width);
+
+ src += src_stride;
+ intermediate += intermediate_stride;
+ } while (++y < intermediate_height);
+
+ // Vertical filter.
+ filter_index = GetFilterIndex(vertical_filter_index, height);
+ intermediate = intermediate_result;
+ int p = subpixel_y & 1023;
+ y = 0;
+ do {
+ const int filter_id = (p >> 6) & kSubPixelMask;
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum +=
+ kHalfSubPixelFilters[filter_index][filter_id][k] *
+ intermediate[((p >> kScaleSubPixelBits) + k) * intermediate_stride +
+ x];
+ }
+ sum = RightShiftWithRounding(sum, kRoundBitsVertical - 1);
+ sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+ dest[x] = sum;
+ } while (++x < width);
+
+ dest += pred_stride;
+ p += step_y;
+ } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCompound2D_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ // All compound functions output to the predictor buffer with |pred_stride|
+ // equal to |width|.
+ assert(pred_stride == width);
+ // Compound functions start at 4x4.
+ assert(width >= 4 && height >= 4);
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ constexpr int kRoundBitsVertical = kInterRoundBitsCompoundVertical;
+ const int intermediate_height = height + kSubPixelTaps - 1;
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+ const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [4, 5].
+ // Similarly for height.
+ int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ int16_t* intermediate = intermediate_result;
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ const auto* src = static_cast<const Pixel*>(reference) -
+ kVerticalOffset * src_stride - kHorizontalOffset;
+ auto* dest = static_cast<uint16_t*>(prediction);
+
+ // If |horizontal_filter_id| == 0 then ConvolveVertical() should be called.
+ assert(horizontal_filter_id != 0);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+ src[x + k];
+ }
+ intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+ } while (++x < width);
+
+ src += src_stride;
+ intermediate += intermediate_stride;
+ } while (++y < intermediate_height);
+
+ // Vertical filter.
+ filter_index = GetFilterIndex(vertical_filter_index, height);
+ intermediate = intermediate_result;
+ // If |vertical_filter_id| == 0 then ConvolveHorizontal() should be called.
+ assert(vertical_filter_id != 0);
+ y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+ intermediate[k * intermediate_stride + x];
+ }
+ sum = RightShiftWithRounding(sum, kRoundBitsVertical - 1);
+ sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+ dest[x] = sum;
+ } while (++x < width);
+
+ dest += pred_stride;
+ intermediate += intermediate_stride;
+ } while (++y < height);
+}
+
+// This function is a simplified version of ConvolveCompound2D_C.
+// It is called in single prediction mode, where both horizontal and vertical
+// filtering are required.
+// The output is the single prediction of the block, clipped to the valid
+// pixel range.
+template <int bitdepth, typename Pixel>
+void Convolve2D_C(const void* const reference, const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id, const int vertical_filter_id,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ constexpr int kRoundBitsVertical =
+ (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
+ const int intermediate_height = height + kSubPixelTaps - 1;
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+ const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+ const int max_pixel_value = (1 << bitdepth) - 1;
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [4, 5].
+ // Similarly for height.
+ int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ int16_t* intermediate = intermediate_result;
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ const auto* src = static_cast<const Pixel*>(reference) -
+ kVerticalOffset * src_stride - kHorizontalOffset;
+ auto* dest = static_cast<Pixel*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+ // If |horizontal_filter_id| == 0 then ConvolveVertical() should be called.
+ assert(horizontal_filter_id != 0);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+ src[x + k];
+ }
+ intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+ } while (++x < width);
+
+ src += src_stride;
+ intermediate += intermediate_stride;
+ } while (++y < intermediate_height);
+
+ // Vertical filter.
+ filter_index = GetFilterIndex(vertical_filter_index, height);
+ intermediate = intermediate_result;
+ // If |vertical_filter_id| == 0 then ConvolveHorizontal() should be called.
+ assert(vertical_filter_id != 0);
+ y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+ intermediate[k * intermediate_stride + x];
+ }
+ dest[x] = Clip3(RightShiftWithRounding(sum, kRoundBitsVertical - 1), 0,
+ max_pixel_value);
+ } while (++x < width);
+
+ dest += dest_stride;
+ intermediate += intermediate_stride;
+ } while (++y < height);
+}
+
+// This function is a simplified version of Convolve2D_C.
+// It is called in single prediction mode, where only horizontal filtering is
+// required.
+// The output is the single prediction of the block, clipped to the valid
+// pixel range.
+template <int bitdepth, typename Pixel>
+void ConvolveHorizontal_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int /*vertical_filter_index*/,
+ const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int bits = kFilterBits - kRoundBitsHorizontal;
+ const auto* src = static_cast<const Pixel*>(reference) - kHorizontalOffset;
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ auto* dest = static_cast<Pixel*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+ const int max_pixel_value = (1 << bitdepth) - 1;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+ src[x + k];
+ }
+ sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+ dest[x] = Clip3(RightShiftWithRounding(sum, bits), 0, max_pixel_value);
+ } while (++x < width);
+
+ src += src_stride;
+ dest += dest_stride;
+ } while (++y < height);
+}
+
+// This function is a simplified version of Convolve2D_C.
+// It is called in single prediction mode, where only vertical filtering is
+// required.
+// The output is the single prediction of the block, clipped to the valid
+// pixel range.
+template <int bitdepth, typename Pixel>
+void ConvolveVertical_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int vertical_filter_index,
+ const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ const auto* src =
+ static_cast<const Pixel*>(reference) - kVerticalOffset * src_stride;
+ auto* dest = static_cast<Pixel*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+ // Copy filters must call ConvolveCopy().
+ assert(vertical_filter_id != 0);
+
+ const int max_pixel_value = (1 << bitdepth) - 1;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+ src[k * src_stride + x];
+ }
+ dest[x] = Clip3(RightShiftWithRounding(sum, kFilterBits - 1), 0,
+ max_pixel_value);
+ } while (++x < width);
+
+ src += src_stride;
+ dest += dest_stride;
+ } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCopy_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
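+  // Both strides are byte strides, so the copy can operate on byte pointers
+  // with a row size of width * sizeof(Pixel), regardless of bitdepth.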
+ int y = 0;
+ do {
+ memcpy(dest, src, width * sizeof(Pixel));
+ src += reference_stride;
+ dest += pred_stride;
+ } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundCopy_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ // All compound functions output to the predictor buffer with |pred_stride|
+ // equal to |width|.
+ assert(pred_stride == width);
+ // Compound functions start at 4x4.
+ assert(width >= 4 && height >= 4);
+ constexpr int kRoundBitsVertical =
+ ((bitdepth == 12) ? kInterRoundBitsVertical12bpp
+ : kInterRoundBitsVertical) -
+ kInterRoundBitsCompoundVertical;
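+  // Note: for 10-bit and 12-bit the bias below, once left-shifted by
+  // |kRoundBitsVertical| (4 and 2 respectively), equals kCompoundOffset
+  // (24576), matching the offset added by the other compound functions.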
+ const auto* src = static_cast<const Pixel*>(reference);
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ auto* dest = static_cast<uint16_t*>(prediction);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = (bitdepth == 8) ? 0 : ((1 << bitdepth) + (1 << (bitdepth - 1)));
+ sum += src[x];
+ dest[x] = sum << kRoundBitsVertical;
+ } while (++x < width);
+ src += src_stride;
+ dest += pred_stride;
+ } while (++y < height);
+}
+
+// This function is a simplified version of ConvolveCompound2D_C.
+// It is called in compound prediction mode, where only horizontal filtering
+// is required.
+// The output is not clipped to the valid pixel range; it will be blended with
+// another predictor to generate the final prediction of the block.
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundHorizontal_C(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int horizontal_filter_index, const int /*vertical_filter_index*/,
+ const int horizontal_filter_id, const int /*vertical_filter_id*/,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ // All compound functions output to the predictor buffer with |pred_stride|
+ // equal to |width|.
+ assert(pred_stride == width);
+ // Compound functions start at 4x4.
+ assert(width >= 4 && height >= 4);
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const auto* src = static_cast<const Pixel*>(reference) - kHorizontalOffset;
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ auto* dest = static_cast<uint16_t*>(prediction);
+ // Copy filters must call ConvolveCopy().
+ assert(horizontal_filter_id != 0);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+ src[x + k];
+ }
+ sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+ sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+ dest[x] = sum;
+ } while (++x < width);
+
+ src += src_stride;
+ dest += pred_stride;
+ } while (++y < height);
+}
+
+// This function is a simplified version of ConvolveCompound2D_C.
+// It is called in compound prediction mode, where only vertical filtering is
+// required.
+// The output is not clipped to the valid pixel range; it will be blended with
+// another predictor to generate the final prediction of the block.
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundVertical_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int vertical_filter_index,
+ const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ // All compound functions output to the predictor buffer with |pred_stride|
+ // equal to |width|.
+ assert(pred_stride == width);
+ // Compound functions start at 4x4.
+ assert(width >= 4 && height >= 4);
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ const auto* src =
+ static_cast<const Pixel*>(reference) - kVerticalOffset * src_stride;
+ auto* dest = static_cast<uint16_t*>(prediction);
+ // Copy filters must call ConvolveCopy().
+ assert(vertical_filter_id != 0);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+ src[k * src_stride + x];
+ }
+ sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+ sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+ dest[x] = sum;
+ } while (++x < width);
+ src += src_stride;
+ dest += pred_stride;
+ } while (++y < height);
+}
+
+// This function is used when intra block copy is present.
+// It is called in single prediction mode for the U/V planes, where the
+// reference block comes from the current frame and both horizontal and
+// vertical filtering are required.
+// The output is the single prediction of the block, clipped to the valid
+// pixel range.
+template <int bitdepth, typename Pixel>
+void ConvolveIntraBlockCopy2D_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/,
+ const int width, const int height,
+ void* prediction, const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const Pixel*>(reference);
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ auto* dest = static_cast<Pixel*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+ const int intermediate_height = height + 1;
+ uint16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + 1)];
+ uint16_t* intermediate = intermediate_result;
+  // Note: vertical access to row height + 1 is allowed. Because this function
+  // is only used for the u/v planes of intra block copy, such access is
+  // guaranteed to be within the prediction block.
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ intermediate[x] = src[x] + src[x + 1];
+ } while (++x < width);
+
+ src += src_stride;
+ intermediate += width;
+ } while (++y < intermediate_height);
+
+ intermediate = intermediate_result;
+ y = 0;
+ do {
+ int x = 0;
+ do {
+ dest[x] =
+ RightShiftWithRounding(intermediate[x] + intermediate[x + width], 2);
+ } while (++x < width);
+
+ intermediate += width;
+ dest += dest_stride;
+ } while (++y < height);
+}
+
+// This function is used when intra block copy is present.
+// It is called in single prediction mode for the U/V planes, where the
+// reference block comes from the current frame and only horizontal or
+// vertical filtering is required.
+// The output is the single prediction of the block, clipped to the valid
+// pixel range.
+// The filtering of intra block copy is simply the average of the current and
+// the next pixel.
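+// For example, neighboring values 10 and 13 produce
+// RightShiftWithRounding(10 + 13, 1) == 12.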
+template <int bitdepth, typename Pixel, bool is_horizontal>
+void ConvolveIntraBlockCopy1D_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/,
+ const int width, const int height,
+ void* prediction, const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const Pixel*>(reference);
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ auto* dest = static_cast<Pixel*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+ const ptrdiff_t offset = is_horizontal ? 1 : src_stride;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ dest[x] = RightShiftWithRounding(src[x] + src[x + offset], 1);
+ } while (++x < width);
+
+ src += src_stride;
+ dest += dest_stride;
+ } while (++y < height);
+}
+
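+// The convolve table filled in below is indexed as
+// convolve[intra_block_copy][is_compound][has_vertical_filter]
+//         [has_horizontal_filter]. The compound intra-block-copy entries stay
+// nullptr because intra block copy is always a single (non-compound)
+// prediction.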
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->convolve[0][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<8, uint8_t>;
+ dsp->convolve[0][0][1][0] = ConvolveVertical_C<8, uint8_t>;
+ dsp->convolve[0][0][1][1] = Convolve2D_C<8, uint8_t>;
+
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<8, uint8_t>;
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<8, uint8_t>;
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<8, uint8_t>;
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<8, uint8_t>;
+
+ dsp->convolve[1][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+ dsp->convolve[1][0][0][1] =
+ ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/true>;
+ dsp->convolve[1][0][1][0] =
+ ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/false>;
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<8, uint8_t>;
+
+ dsp->convolve[1][1][0][0] = nullptr;
+ dsp->convolve[1][1][0][1] = nullptr;
+ dsp->convolve[1][1][1][0] = nullptr;
+ dsp->convolve[1][1][1][1] = nullptr;
+
+ dsp->convolve_scale[0] = ConvolveScale2D_C<8, uint8_t>;
+ dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<8, uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCopy
+ dsp->convolve[0][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical
+ dsp->convolve[0][0][1][0] = ConvolveVertical_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Convolve2D
+ dsp->convolve[0][0][1][1] = Convolve2D_C<8, uint8_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundCopy
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompound2D
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<8, uint8_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy
+ dsp->convolve[1][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyHorizontal
+ dsp->convolve[1][0][0][1] =
+ ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/true>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyVertical
+ dsp->convolve[1][0][1][0] =
+ ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy2D
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<8, uint8_t>;
+#endif
+
+ dsp->convolve[1][1][0][0] = nullptr;
+ dsp->convolve[1][1][0][1] = nullptr;
+ dsp->convolve[1][1][1][0] = nullptr;
+ dsp->convolve[1][1][1][1] = nullptr;
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveScale2D
+ dsp->convolve_scale[0] = ConvolveScale2D_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D
+ dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->convolve[0][0][0][0] = ConvolveCopy_C<10, uint16_t>;
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<10, uint16_t>;
+ dsp->convolve[0][0][1][0] = ConvolveVertical_C<10, uint16_t>;
+ dsp->convolve[0][0][1][1] = Convolve2D_C<10, uint16_t>;
+
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<10, uint16_t>;
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<10, uint16_t>;
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<10, uint16_t>;
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<10, uint16_t>;
+
+ dsp->convolve[1][0][0][0] = ConvolveCopy_C<10, uint16_t>;
+ dsp->convolve[1][0][0][1] =
+ ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/true>;
+ dsp->convolve[1][0][1][0] =
+ ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/false>;
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<10, uint16_t>;
+
+ dsp->convolve[1][1][0][0] = nullptr;
+ dsp->convolve[1][1][0][1] = nullptr;
+ dsp->convolve[1][1][1][0] = nullptr;
+ dsp->convolve[1][1][1][1] = nullptr;
+
+ dsp->convolve_scale[0] = ConvolveScale2D_C<10, uint16_t>;
+ dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<10, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCopy
+ dsp->convolve[0][0][0][0] = ConvolveCopy_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveHorizontal
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveVertical
+ dsp->convolve[0][0][1][0] = ConvolveVertical_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Convolve2D
+ dsp->convolve[0][0][1][1] = Convolve2D_C<10, uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundCopy
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundHorizontal
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundVertical
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompound2D
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<10, uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlockCopy
+ dsp->convolve[1][0][0][0] = ConvolveCopy_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlockHorizontal
+ dsp->convolve[1][0][0][1] =
+ ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/true>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlockVertical
+ dsp->convolve[1][0][1][0] =
+ ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlock2D
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<10, uint16_t>;
+#endif
+
+ dsp->convolve[1][1][0][0] = nullptr;
+ dsp->convolve[1][1][0][1] = nullptr;
+ dsp->convolve[1][1][1][0] = nullptr;
+ dsp->convolve[1][1][1][1] = nullptr;
+
+#ifndef LIBGAV1_Dsp10bpp_ConvolveScale2D
+ dsp->convolve_scale[0] = ConvolveScale2D_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundScale2D
+ dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+} // namespace
+
+void ConvolveInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/convolve.h b/src/dsp/convolve.h
new file mode 100644
index 0000000..5bc0bad
--- /dev/null
+++ b/src/dsp/convolve.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_CONVOLVE_H_
+#define LIBGAV1_SRC_DSP_CONVOLVE_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/convolve_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc.
+// The order of includes is important, as each header tests for a superior
+// version before setting the base.
+// clang-format off
+#include "src/dsp/x86/convolve_avx2.h"
+#include "src/dsp/x86/convolve_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve and Dsp::convolve_scale. This function is not
+// thread-safe.
+void ConvolveInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_CONVOLVE_H_
diff --git a/src/dsp/convolve.inc b/src/dsp/convolve.inc
new file mode 100644
index 0000000..140648b
--- /dev/null
+++ b/src/dsp/convolve.inc
@@ -0,0 +1,50 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants and utility functions used for convolve implementations.
+// This will be included inside an anonymous namespace on files where these are
+// necessary.
+
+int GetNumTapsInFilter(const int filter_index) {
+ if (filter_index < 2) {
+ // Despite the names these only use 6 taps.
+ // kInterpolationFilterEightTap
+ // kInterpolationFilterEightTapSmooth
+ return 6;
+ }
+
+ if (filter_index == 2) {
+ // kInterpolationFilterEightTapSharp
+ return 8;
+ }
+
+ if (filter_index == 3) {
+ // kInterpolationFilterBilinear
+ return 2;
+ }
+
+ assert(filter_index > 3);
+ // For small sizes (width/height <= 4) the large filters are replaced with 4
+ // tap options.
+ // If the original filters were |kInterpolationFilterEightTap| or
+ // |kInterpolationFilterEightTapSharp| then it becomes
+ // |kInterpolationFilterSwitchable|.
+ // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
+ // tap filter.
+ return 4;
+}
+
+constexpr int kIntermediateStride = kMaxSuperBlockSizeInPixels;
+constexpr int kHorizontalOffset = 3;
+constexpr int kFilterIndexShift = 6;
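+
+// A minimal usage sketch (illustrative only; not part of the upstream code):
+// a convolve implementation typically derives its loop bounds from the filter
+// index before filtering. GetFilterIndex() is assumed here for illustration.
+//   const int filter_index = GetFilterIndex(horizontal_filter_id, width);
+//   const int num_taps = GetNumTapsInFilter(filter_index);  // 2, 4, 6 or 8.
+//   // Leftmost tap relative to the current position (8 taps -> 3 pixels).
+//   const int ref_start_x = x - (num_taps / 2 - 1);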
diff --git a/src/dsp/distance_weighted_blend.cc b/src/dsp/distance_weighted_blend.cc
new file mode 100644
index 0000000..a035fbe
--- /dev/null
+++ b/src/dsp/distance_weighted_blend.cc
@@ -0,0 +1,101 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/distance_weighted_blend.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int bitdepth, typename Pixel>
+void DistanceWeightedBlend_C(const void* prediction_0, const void* prediction_1,
+ const uint8_t weight_0, const uint8_t weight_1,
+ const int width, const int height,
+ void* const dest, const ptrdiff_t dest_stride) {
+ // 7.11.3.2 Rounding variables derivation process
+ // 2 * FILTER_BITS(7) - (InterRound0(3|5) + InterRound1(7))
+ constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4;
+ using PredType =
+ typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+ const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+ const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+ auto* dst = static_cast<Pixel*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
+
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ // See warp.cc and convolve.cc for detailed prediction ranges.
+ // weight_0 + weight_1 = 16.
+ int res = pred_0[x] * weight_0 + pred_1[x] * weight_1;
+ res -= (bitdepth == 8) ? 0 : kCompoundOffset * 16;
+ dst[x] = static_cast<Pixel>(
+ Clip3(RightShiftWithRounding(res, inter_post_round_bits + 4), 0,
+ (1 << bitdepth) - 1));
+ } while (++x < width);
+
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (++y < height);
+}
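+
+// A worked example for bitdepth 8 (illustrative values): with weight_0 = 9,
+// weight_1 = 7, pred_0[x] = 2048 and pred_1[x] = 1024:
+//   res = 2048 * 9 + 1024 * 7 = 25600
+// inter_post_round_bits + 4 == 8, so
+//   dst[x] = Clip3((25600 + 128) >> 8, 0, 255) = Clip3(100, 0, 255) = 100.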
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->distance_weighted_blend = DistanceWeightedBlend_C<8, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_DistanceWeightedBlend
+ dsp->distance_weighted_blend = DistanceWeightedBlend_C<8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->distance_weighted_blend = DistanceWeightedBlend_C<10, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_DistanceWeightedBlend
+ dsp->distance_weighted_blend = DistanceWeightedBlend_C<10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+} // namespace
+
+void DistanceWeightedBlendInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/distance_weighted_blend.h b/src/dsp/distance_weighted_blend.h
new file mode 100644
index 0000000..1a782b6
--- /dev/null
+++ b/src/dsp/distance_weighted_blend.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_DISTANCE_WEIGHTED_BLEND_H_
+#define LIBGAV1_SRC_DSP_DISTANCE_WEIGHTED_BLEND_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/distance_weighted_blend_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc.
+// The order of includes is important, as each header tests for a superior
+// version before setting the base.
+// clang-format off
+#include "src/dsp/x86/distance_weighted_blend_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::distance_weighted_blend. This function is not thread-safe.
+void DistanceWeightedBlendInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_DISTANCE_WEIGHTED_BLEND_H_
diff --git a/src/dsp/dsp.cc b/src/dsp/dsp.cc
new file mode 100644
index 0000000..5b54c4e
--- /dev/null
+++ b/src/dsp/dsp.cc
@@ -0,0 +1,150 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/dsp.h"
+
+#include <mutex> // NOLINT (unapproved c++11 header)
+
+#include "src/dsp/arm/weight_mask_neon.h"
+#include "src/dsp/average_blend.h"
+#include "src/dsp/cdef.h"
+#include "src/dsp/convolve.h"
+#include "src/dsp/distance_weighted_blend.h"
+#include "src/dsp/film_grain.h"
+#include "src/dsp/intra_edge.h"
+#include "src/dsp/intrapred.h"
+#include "src/dsp/inverse_transform.h"
+#include "src/dsp/loop_filter.h"
+#include "src/dsp/loop_restoration.h"
+#include "src/dsp/mask_blend.h"
+#include "src/dsp/motion_field_projection.h"
+#include "src/dsp/motion_vector_search.h"
+#include "src/dsp/obmc.h"
+#include "src/dsp/super_res.h"
+#include "src/dsp/warp.h"
+#include "src/dsp/weight_mask.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp_internal {
+
+dsp::Dsp* GetWritableDspTable(int bitdepth) {
+ switch (bitdepth) {
+ case 8: {
+ static dsp::Dsp dsp_8bpp;
+ return &dsp_8bpp;
+ }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ case 10: {
+ static dsp::Dsp dsp_10bpp;
+ return &dsp_10bpp;
+ }
+#endif
+ }
+ return nullptr;
+}
+
+} // namespace dsp_internal
+
+namespace dsp {
+
+void DspInit() {
+ static std::once_flag once;
+ std::call_once(once, []() {
+ AverageBlendInit_C();
+ CdefInit_C();
+ ConvolveInit_C();
+ DistanceWeightedBlendInit_C();
+ FilmGrainInit_C();
+ IntraEdgeInit_C();
+ IntraPredInit_C();
+ InverseTransformInit_C();
+ LoopFilterInit_C();
+ LoopRestorationInit_C();
+ MaskBlendInit_C();
+ MotionFieldProjectionInit_C();
+ MotionVectorSearchInit_C();
+ ObmcInit_C();
+ SuperResInit_C();
+ WarpInit_C();
+ WeightMaskInit_C();
+#if LIBGAV1_ENABLE_SSE4_1 || LIBGAV1_ENABLE_AVX2
+ const uint32_t cpu_features = GetCpuInfo();
+#if LIBGAV1_ENABLE_SSE4_1
+ if ((cpu_features & kSSE4_1) != 0) {
+ AverageBlendInit_SSE4_1();
+ CdefInit_SSE4_1();
+ ConvolveInit_SSE4_1();
+ DistanceWeightedBlendInit_SSE4_1();
+ IntraEdgeInit_SSE4_1();
+ IntraPredInit_SSE4_1();
+ IntraPredCflInit_SSE4_1();
+ IntraPredSmoothInit_SSE4_1();
+ InverseTransformInit_SSE4_1();
+ LoopFilterInit_SSE4_1();
+ LoopRestorationInit_SSE4_1();
+ MaskBlendInit_SSE4_1();
+ MotionFieldProjectionInit_SSE4_1();
+ MotionVectorSearchInit_SSE4_1();
+ ObmcInit_SSE4_1();
+ SuperResInit_SSE4_1();
+ WarpInit_SSE4_1();
+ WeightMaskInit_SSE4_1();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ LoopRestorationInit10bpp_SSE4_1();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ }
+#endif // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_AVX2
+ if ((cpu_features & kAVX2) != 0) {
+ ConvolveInit_AVX2();
+ LoopRestorationInit_AVX2();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ LoopRestorationInit10bpp_AVX2();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ }
+#endif // LIBGAV1_ENABLE_AVX2
+#endif // LIBGAV1_ENABLE_SSE4_1 || LIBGAV1_ENABLE_AVX2
+#if LIBGAV1_ENABLE_NEON
+ AverageBlendInit_NEON();
+ CdefInit_NEON();
+ ConvolveInit_NEON();
+ DistanceWeightedBlendInit_NEON();
+ FilmGrainInit_NEON();
+ IntraEdgeInit_NEON();
+ IntraPredCflInit_NEON();
+ IntraPredDirectionalInit_NEON();
+ IntraPredFilterIntraInit_NEON();
+ IntraPredInit_NEON();
+ IntraPredSmoothInit_NEON();
+ InverseTransformInit_NEON();
+ LoopFilterInit_NEON();
+ LoopRestorationInit_NEON();
+ MaskBlendInit_NEON();
+ MotionFieldProjectionInit_NEON();
+ MotionVectorSearchInit_NEON();
+ ObmcInit_NEON();
+ SuperResInit_NEON();
+ WarpInit_NEON();
+ WeightMaskInit_NEON();
+#endif // LIBGAV1_ENABLE_NEON
+ });
+}
+
+const Dsp* GetDspTable(int bitdepth) {
+ return dsp_internal::GetWritableDspTable(bitdepth);
+}
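+
+// A minimal caller-side sketch (argument values are placeholders): DspInit()
+// must run before any table lookup; afterwards the per-bitdepth table can be
+// fetched once and its entries invoked directly.
+//   DspInit();
+//   const Dsp* const dsp = GetDspTable(8);
+//   assert(dsp != nullptr);
+//   // convolve[0][0][0][0] is the ConvolveCopy entry (no sub-pixel filter,
+//   // single prediction); see ConvolveInit_C().
+//   dsp->convolve[0][0][0][0](reference, reference_stride, 0, 0, 0, 0,
+//                             width, height, prediction, prediction_stride);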
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h
new file mode 100644
index 0000000..fcbac3a
--- /dev/null
+++ b/src/dsp/dsp.h
@@ -0,0 +1,910 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_DSP_H_
+#define LIBGAV1_SRC_DSP_DSP_H_
+
+#include <cstddef> // ptrdiff_t
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/cpu.h"
+#include "src/utils/reference_info.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+
+#if !defined(LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS)
+#define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS 0
+#endif
+
+enum IntraPredictor : uint8_t {
+ kIntraPredictorDcFill,
+ kIntraPredictorDcTop,
+ kIntraPredictorDcLeft,
+ kIntraPredictorDc,
+ kIntraPredictorVertical,
+ kIntraPredictorHorizontal,
+ kIntraPredictorPaeth,
+ kIntraPredictorSmooth,
+ kIntraPredictorSmoothVertical,
+ kIntraPredictorSmoothHorizontal,
+ kNumIntraPredictors
+};
+
+// List of valid 1D transforms.
+enum Transform1D : uint8_t {
+ k1DTransformDct, // Discrete Cosine Transform.
+ k1DTransformAdst, // Asymmetric Discrete Sine Transform.
+ k1DTransformIdentity,
+ k1DTransformWht, // Walsh Hadamard Transform.
+ kNum1DTransforms
+};
+
+// List of valid 1D transform sizes. Not all transforms may be available for all
+// the sizes.
+enum TransformSize1D : uint8_t {
+ k1DTransformSize4,
+ k1DTransformSize8,
+ k1DTransformSize16,
+ k1DTransformSize32,
+ k1DTransformSize64,
+ kNum1DTransformSizes
+};
+
+// The maximum width of the loop filter; fewer pixels may be filtered depending
+// on strength thresholds.
+enum LoopFilterSize : uint8_t {
+ kLoopFilterSize4,
+ kLoopFilterSize6,
+ kLoopFilterSize8,
+ kLoopFilterSize14,
+ kNumLoopFilterSizes
+};
+
+enum : uint8_t {
+ kRow = 0,
+ kColumn = 1,
+};
+
+//------------------------------------------------------------------------------
+// ToString()
+//
+// These functions are meant to be used only in debug logging and within tests.
+// They are defined inline to avoid including the strings in the release
+// library when logging is disabled; unreferenced functions will not be added to
+// any object file in that case.
+
+inline const char* ToString(const IntraPredictor predictor) {
+ switch (predictor) {
+ case kIntraPredictorDcFill:
+ return "kIntraPredictorDcFill";
+ case kIntraPredictorDcTop:
+ return "kIntraPredictorDcTop";
+ case kIntraPredictorDcLeft:
+ return "kIntraPredictorDcLeft";
+ case kIntraPredictorDc:
+ return "kIntraPredictorDc";
+ case kIntraPredictorVertical:
+ return "kIntraPredictorVertical";
+ case kIntraPredictorHorizontal:
+ return "kIntraPredictorHorizontal";
+ case kIntraPredictorPaeth:
+ return "kIntraPredictorPaeth";
+ case kIntraPredictorSmooth:
+ return "kIntraPredictorSmooth";
+ case kIntraPredictorSmoothVertical:
+ return "kIntraPredictorSmoothVertical";
+ case kIntraPredictorSmoothHorizontal:
+ return "kIntraPredictorSmoothHorizontal";
+ case kNumIntraPredictors:
+ return "kNumIntraPredictors";
+ }
+ abort();
+}
+
+inline const char* ToString(const Transform1D transform) {
+ switch (transform) {
+ case k1DTransformDct:
+ return "k1DTransformDct";
+ case k1DTransformAdst:
+ return "k1DTransformAdst";
+ case k1DTransformIdentity:
+ return "k1DTransformIdentity";
+ case k1DTransformWht:
+ return "k1DTransformWht";
+ case kNum1DTransforms:
+ return "kNum1DTransforms";
+ }
+ abort();
+}
+
+inline const char* ToString(const TransformSize1D transform_size) {
+ switch (transform_size) {
+ case k1DTransformSize4:
+ return "k1DTransformSize4";
+ case k1DTransformSize8:
+ return "k1DTransformSize8";
+ case k1DTransformSize16:
+ return "k1DTransformSize16";
+ case k1DTransformSize32:
+ return "k1DTransformSize32";
+ case k1DTransformSize64:
+ return "k1DTransformSize64";
+ case kNum1DTransformSizes:
+ return "kNum1DTransformSizes";
+ }
+ abort();
+}
+
+inline const char* ToString(const LoopFilterSize filter_size) {
+ switch (filter_size) {
+ case kLoopFilterSize4:
+ return "kLoopFilterSize4";
+ case kLoopFilterSize6:
+ return "kLoopFilterSize6";
+ case kLoopFilterSize8:
+ return "kLoopFilterSize8";
+ case kLoopFilterSize14:
+ return "kLoopFilterSize14";
+ case kNumLoopFilterSizes:
+ return "kNumLoopFilterSizes";
+ }
+ abort();
+}
+
+inline const char* ToString(const LoopFilterType filter_type) {
+ switch (filter_type) {
+ case kLoopFilterTypeVertical:
+ return "kLoopFilterTypeVertical";
+ case kLoopFilterTypeHorizontal:
+ return "kLoopFilterTypeHorizontal";
+ case kNumLoopFilterTypes:
+ return "kNumLoopFilterTypes";
+ }
+ abort();
+}
+
+//------------------------------------------------------------------------------
+// Intra predictors. Section 7.11.2.
+// These require access to one or both of the top row and left column. Some may
+// access the top-left (top[-1]), top-right (top[width+N]), bottom-left
+// (left[height+N]) or upper-left (left[-1]).
+
+// Intra predictor function signature. Sections 7.11.2.2, 7.11.2.4 (#10,#11),
+// 7.11.2.5, 7.11.2.6.
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
+// the row above |dst|. |left| is an aligned vector of the column to the left
+// of |dst|. top-left and bottom-left may be accessed.
+using IntraPredictorFunc = void (*)(void* dst, ptrdiff_t stride,
+ const void* top, const void* left);
+using IntraPredictorFuncs =
+ IntraPredictorFunc[kNumTransformSizes][kNumIntraPredictors];
+
+// Directional intra predictor function signature, zone 1 (0 < angle < 90).
+// Section 7.11.2.4 (#7).
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
+// the row above |dst|. |width| and |height| give the dimensions of the block.
+// |xstep| is the scaled starting index to |top| from
+// kDirectionalIntraPredictorDerivative. |upsampled_top| indicates whether
+// |top| has been upsampled as described in '7.11.2.11. Intra edge upsample
+// process'. This can occur in cases with |width| + |height| <= 16. top-right
+// is accessed.
+using DirectionalIntraPredictorZone1Func = void (*)(void* dst, ptrdiff_t stride,
+ const void* top, int width,
+ int height, int xstep,
+ bool upsampled_top);
+
+// Directional intra predictor function signature, zone 2 (90 < angle < 180).
+// Section 7.11.2.4 (#8).
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
+// the row above |dst|. |left| is an aligned vector of the column to the left of
+// |dst|. |width| and |height| give the dimensions of the block. |xstep| and
+// |ystep| are the scaled starting index to |top| and |left|, respectively,
+// from kDirectionalIntraPredictorDerivative. |upsampled_top| and
+// |upsampled_left| indicate whether |top| and |left| have been upsampled as
+// described in '7.11.2.11. Intra edge upsample process'. This can occur in
+// cases with |width| + |height| <= 16. top-left and upper-left are accessed,
+// up to [-2] in each if |upsampled_top/left| are set.
+using DirectionalIntraPredictorZone2Func = void (*)(
+ void* dst, ptrdiff_t stride, const void* top, const void* left, int width,
+ int height, int xstep, int ystep, bool upsampled_top, bool upsampled_left);
+
+// Directional intra predictor function signature, zone 3 (180 < angle < 270).
+// Section 7.11.2.4 (#9).
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |left| is an aligned vector of the
+// column to the left of |dst|. |width| and |height| give the dimensions of the
+// block. |ystep| is the scaled starting index to |left| from
+// kDirectionalIntraPredictorDerivative. |upsampled_left| indicates whether
+// |left| has been upsampled as described in '7.11.2.11. Intra edge upsample
+// process'. This can occur in cases with |width| + |height| <= 16. bottom-left
+// is accessed.
+using DirectionalIntraPredictorZone3Func = void (*)(void* dst, ptrdiff_t stride,
+ const void* left, int width,
+ int height, int ystep,
+ bool upsampled_left);
+
+// Filter intra predictor function signature. Section 7.11.2.3.
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
+// the row above |dst|. |left| is an aligned vector of the column to the left
+// of |dst|. |width| and |height| are the size of the block in pixels.
+using FilterIntraPredictorFunc = void (*)(void* dst, ptrdiff_t stride,
+ const void* top, const void* left,
+ FilterIntraPredictor pred, int width,
+ int height);
+
+//------------------------------------------------------------------------------
+// Chroma from Luma (Cfl) prediction. Section 7.11.5.
+
+// Chroma from Luma (Cfl) intra prediction function signature. |dst| is an
+// unaligned pointer to the output block. Pixel size is determined by bitdepth
+// with |stride| given in bytes. |luma| contains subsampled luma pixels with 3
+// fractional bits of precision. |alpha| is the signed Cfl alpha value for the
+// appropriate plane.
+using CflIntraPredictorFunc = void (*)(
+ void* dst, ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], int alpha);
+using CflIntraPredictorFuncs = CflIntraPredictorFunc[kNumTransformSizes];
+
+// Chroma from Luma (Cfl) subsampler function signature. |luma| is an unaligned
+// pointer to the output block. |src| is an unaligned pointer to the input
+// block. Pixel size is determined by bitdepth with |stride| given in bytes.
+using CflSubsamplerFunc =
+ void (*)(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ int max_luma_width, int max_luma_height, const void* source,
+ ptrdiff_t stride);
+using CflSubsamplerFuncs =
+ CflSubsamplerFunc[kNumTransformSizes][kNumSubsamplingTypes];
+
+//------------------------------------------------------------------------------
+// Intra Edge Filtering and Upsampling. Step 4 in section 7.11.2.4.
+
+// Intra edge filter function signature. |buffer| is a pointer to the top_row or
+// left_column that needs to be filtered. Typically the -1'th index of |top_row|
+// and |left_column| need to be filtered as well, so the caller can merely pass
+// the |buffer| as top_row[-1] or left_column[-1]. Pixel size is determined by
+// bitdepth. |size| is the number of pixels to be filtered. |strength| is the
+// filter strength. Section 7.11.2.12 in the spec.
+using IntraEdgeFilterFunc = void (*)(void* buffer, int size, int strength);
+
+// Intra edge upsampler function signature. |buffer| is a pointer to the top_row
+// or left_column that needs to be upsampled. Pixel size is determined by
+// bitdepth. |size| is the number of pixels to be upsampled; valid values are:
+// 4, 8, 12, 16. This function needs access to negative indices -1 and -2 of
+// the |buffer|. Section 7.11.2.11 in the spec.
+using IntraEdgeUpsamplerFunc = void (*)(void* buffer, int size);
+
+//------------------------------------------------------------------------------
+// Inverse transform add function signature.
+//
+// Steps 2 and 3 of section 7.12.3 (contains the implementation of section
+// 7.13.3).
+// Apply the inverse transforms and add the residual to the destination frame
+// for the transform type and block size |tx_size| starting at position
+// |start_x| and |start_y|. |dst_frame| is a pointer to an Array2D.
+// |adjusted_tx_height| is the number of rows to process based on the non-zero
+// coefficient count in the block. It will be 1 (non-zero coefficient count ==
+// 1), 4 or a multiple of 8 up to 32 or the original transform height,
+// whichever is less.
+using InverseTransformAddFunc = void (*)(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame);
+// The final dimension holds row and column transforms indexed with kRow and
+// kColumn.
+using InverseTransformAddFuncs =
+ InverseTransformAddFunc[kNum1DTransforms][kNum1DTransformSizes][2];
+
+//------------------------------------------------------------------------------
+// Post processing.
+
+// Loop filter function signature. Section 7.14.
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes.
+using LoopFilterFunc = void (*)(void* dst, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+using LoopFilterFuncs =
+ LoopFilterFunc[kNumLoopFilterSizes][kNumLoopFilterTypes];
+
+// Cdef direction function signature. Section 7.15.2.
+// |src| is a pointer to the source block. Pixel size is determined by bitdepth
+// with |stride| given in bytes. |direction| and |variance| are output
+// parameters and must not be nullptr.
+using CdefDirectionFunc = void (*)(const void* src, ptrdiff_t stride,
+ uint8_t* direction, int* variance);
+
+// Cdef filtering function signature. Section 7.15.3.
+// |source| is a pointer to the input block padded with kCdefLargeValue if at a
+// frame border. |source_stride| is given in units of uint16_t.
+// |block_height| is the height of the input block; the block width is selected
+// via the index into CdefFilteringFuncs below.
+// |primary_strength|, |secondary_strength|, and |damping| are Cdef filtering
+// parameters.
+// |direction| is the filtering direction.
+// |dest| is the output buffer. |dest_stride| is given in bytes.
+using CdefFilteringFunc = void (*)(const uint16_t* source,
+ ptrdiff_t source_stride, int block_height,
+ int primary_strength, int secondary_strength,
+ int damping, int direction, void* dest,
+ ptrdiff_t dest_stride);
+
+// The first index is block width: [0]: 4, [1]: 8. The second is based on
+// non-zero strengths: [0]: |primary_strength| and |secondary_strength|, [1]:
+// |primary_strength| only, [2]: |secondary_strength| only.
+using CdefFilteringFuncs = CdefFilteringFunc[2][3];
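+
+// Selection sketch (the condition names are illustrative; the actual choice is
+// made by the caller during post filtering):
+//   const int width_index = (block_width == 8) ? 1 : 0;
+//   const int strength_index =
+//       (secondary_strength == 0) ? 1 : (primary_strength == 0) ? 2 : 0;
+//   dsp->cdef_filters[width_index][strength_index](
+//       source, source_stride, block_height, primary_strength,
+//       secondary_strength, damping, direction, dest, dest_stride);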
+
+// Upscaling coefficients function signature. Section 7.16.
+// This is an auxiliary function for SIMD optimizations and has no corresponding
+// C function. Different SIMD versions may have different outputs. So it must
+// pair with the corresponding version of SuperResFunc.
+// |upscaled_width| is the width of the output frame.
+// |step| is the number of subpixels to move the kernel for the next destination
+// pixel.
+// |initial_subpixel_x| is a base offset from which |step| increments.
+// |coefficients| is the upscale filter used by each pixel in a row.
+using SuperResCoefficientsFunc = void (*)(int upscaled_width,
+ int initial_subpixel_x, int step,
+ void* coefficients);
+
+// Upscaling process function signature. Section 7.16.
+// |coefficients| is the upscale filter used by each pixel in a row. It is not
+// used by the C function.
+// |source| is the input frame buffer. It will be line extended.
+// |dest| is the output buffer.
+// |stride| is given in pixels, and shared by |source| and |dest|.
+// |height| is the height of the block to be processed.
+// |downscaled_width| is the width of the input frame.
+// |upscaled_width| is the width of the output frame.
+// |step| is the number of subpixels to move the kernel for the next destination
+// pixel.
+// |initial_subpixel_x| is a base offset from which |step| increments.
+using SuperResFunc = void (*)(const void* coefficients, void* source,
+ ptrdiff_t stride, int height,
+ int downscaled_width, int upscaled_width,
+ int initial_subpixel_x, int step, void* dest);
+
+// Loop restoration function signature. Sections 7.16, 7.17.
+// |restoration_info| contains loop restoration information, such as filter
+// type, strength.
+// |source| is the input frame buffer, which is deblocked and cdef filtered.
+// |top_border| and |bottom_border| are the top and bottom borders.
+// |dest| is the output.
+// |stride| is given in pixels, and shared by |source|, |top_border|,
+// |bottom_border| and |dest|.
+// |restoration_buffer| contains buffers required for self guided filter and
+// wiener filter. They must be initialized before calling.
+using LoopRestorationFunc = void (*)(
+ const RestorationUnitInfo& restoration_info, const void* source,
+ const void* top_border, const void* bottom_border, ptrdiff_t stride,
+ int width, int height, RestorationBuffer* restoration_buffer, void* dest);
+
+// Index 0 is Wiener Filter.
+// Index 1 is Self Guided Restoration Filter.
+// This can be accessed as LoopRestorationType - 2.
+using LoopRestorationFuncs = LoopRestorationFunc[2];
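+
+// Indexing sketch (the |type| field name is assumed for illustration): since
+// the first two LoopRestorationType values have no entry here, a Wiener unit
+// selects index 0 and a self guided unit selects index 1:
+//   const LoopRestorationFunc func =
+//       dsp->loop_restorations[restoration_info.type - 2];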
+
+// Convolve function signature. Section 7.11.3.4.
+// This function applies a horizontal filter followed by a vertical filter.
+// |reference| is the input block (reference frame buffer). |reference_stride|
+// is the corresponding frame stride.
+// |vertical_filter_index|/|horizontal_filter_index| is the index to
+// retrieve the type of filter to be applied for vertical/horizontal direction
+// from the filter lookup table 'kSubPixelFilters'.
+// |horizontal_filter_id| and |vertical_filter_id| are the filter ids.
+// |width| and |height| are width and height of the block to be filtered.
+// |ref_last_x| and |ref_last_y| are the last pixel of the reference frame in
+// x/y direction.
+// |prediction| is the output block (output frame buffer).
+// Rounding precision is derived from the function being called. For horizontal
+// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
+// used. For compound vertical filtering kInterRoundBitsCompoundVertical will be
+// used. Otherwise kInterRoundBitsVertical & kInterRoundBitsVertical12bpp will
+// be used.
+using ConvolveFunc = void (*)(const void* reference, ptrdiff_t reference_stride,
+ int horizontal_filter_index,
+ int vertical_filter_index,
+ int horizontal_filter_id, int vertical_filter_id,
+ int width, int height, void* prediction,
+ ptrdiff_t pred_stride);
+
+// Convolve functions signature. Each points to one convolve function with
+// a specific setting:
+// ConvolveFunc[is_intra_block_copy][is_compound][has_vertical_filter]
+// [has_horizontal_filter].
+// If is_compound is false, the prediction is clipped to Pixel.
+// If is_compound is true, the range of prediction is:
+// 8bpp: [-5132, 9212] (int16_t)
+// 10bpp: [ 3988, 61532] (uint16_t)
+// 12bpp: [ 3974, 61559] (uint16_t)
+// See src/dsp/convolve.cc
+using ConvolveFuncs = ConvolveFunc[2][2][2][2];
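+
+// Indexing sketch (the boolean names are illustrative; the actual selection is
+// made during tile reconstruction): each dimension answers one question about
+// the prediction, e.g. whether a sub-pixel filter is applied in a direction:
+//   const ConvolveFunc func =
+//       dsp->convolve[is_intra_block_copy][is_compound]
+//                    [has_vertical_filter][has_horizontal_filter];
+// The Init functions leave the intra block copy + compound entries as nullptr,
+// so that combination is never dispatched.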
+
+// Convolve + scale function signature. Section 7.11.3.4.
+// This function applies a horizontal filter followed by a vertical filter.
+// |reference| is the input block (reference frame buffer). |reference_stride|
+// is the corresponding frame stride.
+// |vertical_filter_index|/|horizontal_filter_index| is the index to
+// retrieve the type of filter to be applied for vertical/horizontal direction
+// from the filter lookup table 'kSubPixelFilters'.
+// |subpixel_x| and |subpixel_y| are starting positions in units of 1/1024.
+// |step_x| and |step_y| are step sizes in units of 1/1024 of a pixel.
+// |width| and |height| are width and height of the block to be filtered.
+// |ref_last_x| and |ref_last_y| are the last pixel of the reference frame in
+// x/y direction.
+// |prediction| is the output block (output frame buffer).
+// Rounding precision is derived from the function being called. For horizontal
+// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
+// used. For compound vertical filtering kInterRoundBitsCompoundVertical will be
+// used. Otherwise kInterRoundBitsVertical & kInterRoundBitsVertical12bpp will
+// be used.
+using ConvolveScaleFunc = void (*)(const void* reference,
+ ptrdiff_t reference_stride,
+ int horizontal_filter_index,
+ int vertical_filter_index, int subpixel_x,
+ int subpixel_y, int step_x, int step_y,
+ int width, int height, void* prediction,
+ ptrdiff_t pred_stride);
+
+// Convolve functions signature for scaling version.
+// 0: single predictor. 1: compound predictor.
+using ConvolveScaleFuncs = ConvolveScaleFunc[2];
+
+// Weight mask function signature. Section 7.11.3.12.
+// |prediction_0| is the first input block.
+// |prediction_1| is the second input block. Both blocks are int16_t* when
+// bitdepth == 8 and uint16_t* otherwise.
+// |width| and |height| are the prediction width and height.
+// The stride for the input buffers is equal to |width|.
+// The valid range of block size is [8x8, 128x128] for the luma plane.
+// |mask| is the output buffer. |mask_stride| is the output buffer stride.
+using WeightMaskFunc = void (*)(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride);
+
+// Weight mask functions signature. The dimensions (in order) are:
+// * Width index (4 => 0, 8 => 1, 16 => 2 and so on).
+// * Height index (4 => 0, 8 => 1, 16 => 2 and so on).
+// * mask_is_inverse.
+using WeightMaskFuncs = WeightMaskFunc[6][6][2];
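+
+// Index computation sketch (assuming a FloorLog2() helper and the power-of-two
+// mapping described above):
+//   const int width_index = FloorLog2(width) - 2;    // 8 -> 1, 128 -> 5.
+//   const int height_index = FloorLog2(height) - 2;
+//   dsp->weight_mask[width_index][height_index][mask_is_inverse](
+//       prediction_0, prediction_1, mask, mask_stride);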
+
+// Average blending function signature.
+// Two predictors are averaged to generate the output.
+// Input predictor values are int16_t when bitdepth == 8 and uint16_t
+// otherwise. The output is written in the actual range of Pixel values.
+// Average blending is in the bottom of Section 7.11.3.1 (COMPOUND_AVERAGE).
+// |prediction_0| is the first input block.
+// |prediction_1| is the second input block. Both blocks are int16_t* when
+// bitdepth == 8 and uint16_t* otherwise.
+// |width| and |height| are the same for the first and second input blocks.
+// The stride for the input buffers is equal to |width|.
+// The valid range of block size is [8x8, 128x128] for the luma plane.
+// |dest| is the output buffer. |dest_stride| is the output buffer stride.
+using AverageBlendFunc = void (*)(const void* prediction_0,
+ const void* prediction_1, int width,
+ int height, void* dest,
+ ptrdiff_t dest_stride);
+
+// Distance weighted blending function signature.
+// Weights are generated in Section 7.11.3.15.
+// Weighted blending is in the bottom of Section 7.11.3.1 (COMPOUND_DISTANCE).
+// This function takes two blocks (inter frame prediction) and produces a
+// weighted output.
+// |prediction_0| is the first input block.
+// |prediction_1| is the second input block. Both blocks are int16_t* when
+// bitdepth == 8 and uint16_t* otherwise.
+// |weight_0| is the weight for the first block. It is derived from the relative
+// distance of the first reference frame and the current frame.
+// |weight_1| is the weight for the second block. It is derived from the
+// relative distance of the second reference frame and the current frame.
+// |width| and |height| are the same for the first and second input blocks.
+// The stride for the input buffers is equal to |width|.
+// The valid range of block size is [8x8, 128x128] for the luma plane.
+// |dest| is the output buffer. |dest_stride| is the output buffer stride.
+using DistanceWeightedBlendFunc = void (*)(const void* prediction_0,
+ const void* prediction_1,
+ uint8_t weight_0, uint8_t weight_1,
+ int width, int height, void* dest,
+ ptrdiff_t dest_stride);
+
+// Mask blending function signature. Section 7.11.3.14.
+// This function takes two blocks and produces a blended output stored into the
+// output block |dest|. The blending is a weighted average process, controlled
+// by values of the mask.
+// |prediction_0| is the first input block. When prediction mode is inter_intra
+// (or wedge_inter_intra), this refers to the inter frame prediction. It is
+// int16_t* when bitdepth == 8 and uint16_t* otherwise.
+// The stride for |prediction_0| is equal to |width|.
+// |prediction_1| is the second input block. When prediction mode is inter_intra
+// (or wedge_inter_intra), this refers to the intra frame prediction and uses
+// Pixel values. It is only used for intra frame prediction when bitdepth >= 10.
+// It is int16_t* when bitdepth == 8 and uint16_t* otherwise.
+// |prediction_stride_1| is the stride, given in units of [u]int16_t. When
+// |is_inter_intra| is false (compound prediction) then |prediction_stride_1| is
+// equal to |width|.
+// |mask| is an integer array, whose value indicates the weight of the blending.
+// |mask_stride| is corresponding stride.
+// |width|, |height| are the same for both input blocks.
+// If it's inter_intra (or wedge_inter_intra), the valid range of block size is
+// [8x8, 32x32]. Otherwise (including difference weighted prediction and
+// compound average prediction), the valid range is [8x8, 128x128].
+// If there's subsampling, the corresponding width and height are halved for
+// chroma planes.
+// |subsampling_x|, |subsampling_y| are the subsampling factors.
+// |is_inter_intra| stands for the prediction mode. If it is true, one of the
+// prediction blocks is from intra prediction of current frame. Otherwise, two
+// prediction blocks are both inter frame predictions.
+// |is_wedge_inter_intra| indicates if the mask is for the wedge prediction.
+// |dest| is the output block.
+// |dest_stride| is the corresponding stride for dest.
+using MaskBlendFunc = void (*)(const void* prediction_0,
+ const void* prediction_1,
+ ptrdiff_t prediction_stride_1,
+ const uint8_t* mask, ptrdiff_t mask_stride,
+ int width, int height, void* dest,
+ ptrdiff_t dest_stride);
+
+// Mask blending functions signature. Each points to one function with
+// a specific setting:
+// MaskBlendFunc[subsampling_x + subsampling_y][is_inter_intra].
+using MaskBlendFuncs = MaskBlendFunc[3][2];
+
+// This function is similar to the MaskBlendFunc. It is only used when
+// |is_inter_intra| is true and |bitdepth| == 8.
+// |prediction_[01]| are Pixel values (uint8_t).
+// |prediction_1| is also the output buffer.
+using InterIntraMaskBlendFunc8bpp = void (*)(const uint8_t* prediction_0,
+ uint8_t* prediction_1,
+ ptrdiff_t prediction_stride_1,
+ const uint8_t* mask,
+ ptrdiff_t mask_stride, int width,
+ int height);
+
+// InterIntra8bpp mask blending functions signature. When is_wedge_inter_intra
+// is false, the function at index 0 must be used. Otherwise, the function at
+// index subsampling_x + subsampling_y must be used.
+using InterIntraMaskBlendFuncs8bpp = InterIntraMaskBlendFunc8bpp[3];
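+
+// Selection sketch, restating the rule above in code form:
+//   const int function_index =
+//       is_wedge_inter_intra ? subsampling_x + subsampling_y : 0;
+//   dsp->inter_intra_mask_blend_8bpp[function_index](
+//       prediction_0, prediction_1, prediction_stride_1, mask, mask_stride,
+//       width, height);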
+
+// Obmc (overlapped block motion compensation) blending function signature.
+// Section 7.11.3.10.
+// This function takes two blocks and produces a blended output stored into the
+// first input block. The blending is a weighted average process, controlled by
+// values of the mask.
+// Obmc is not a compound mode. It is different from other compound blending,
+// in terms of precision. The current block is computed using convolution with
+// clipping to the range of pixel values. Its above and left blocks are also
+// clipped. Therefore obmc blending process doesn't need to clip the output.
+// |prediction| is the first input block, which will be overwritten.
+// |prediction_stride| is the stride, given in bytes.
+// |width|, |height| are the same for both input blocks.
+// |obmc_prediction| is the second input block.
+// |obmc_prediction_stride| is its stride, given in bytes.
+using ObmcBlendFunc = void (*)(void* prediction, ptrdiff_t prediction_stride,
+ int width, int height,
+ const void* obmc_prediction,
+ ptrdiff_t obmc_prediction_stride);
+using ObmcBlendFuncs = ObmcBlendFunc[kNumObmcDirections];
+
+// Warp function signature. Section 7.11.3.5.
+// This function applies warp filtering for each 8x8 block inside the current
+// coding block. The filtering process is similar to 2d convolve filtering.
+// The horizontal filter is applied followed by the vertical filter.
+// The function has to calculate corresponding pixel positions before and
+// after warping.
+// |source| is the input reference frame buffer.
+// |source_stride|, |source_width|, |source_height| are corresponding frame
+// stride, width, and height. |source_stride| is given in bytes.
+// |warp_params| is the matrix of warp motion: warp_params[i] = mN.
+//        [x'     (m2 m3 m0   [x
+//    z .  y'  =   m4 m5 m1 *  y
+//        1]       m6 m7 1)    1]
+// |subsampling_x/y| is the current frame's plane subsampling factor.
+// |block_start_x| and |block_start_y| are the starting position of the current
+// coding block.
+// |block_width| and |block_height| are width and height of the current coding
+// block. |block_width| and |block_height| are at least 8.
+// |alpha|, |beta|, |gamma|, |delta| are valid warp parameters. See the
+// comments in the definition of struct GlobalMotion for the range of their
+// values.
+// |dest| is the output buffer of type Pixel. The output values are clipped to
+// Pixel values.
+// |dest_stride| is the stride, in units of bytes.
+// Rounding precision is derived from the function being called. For horizontal
+// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
+// used. For vertical filtering kInterRoundBitsVertical &
+// kInterRoundBitsVertical12bpp will be used.
+//
+// NOTE: WarpFunc assumes the source frame has left, right, top, and bottom
+// borders that extend the frame boundary pixels.
+// * The left and right borders must be at least 13 pixels wide. In addition,
+// Warp_NEON() may read up to 14 bytes after a row in the |source| buffer.
+// Therefore, there must be at least one extra padding byte after the right
+// border of the last row in the source buffer.
+// * The top and bottom borders must be at least 13 pixels high.
+using WarpFunc = void (*)(const void* source, ptrdiff_t source_stride,
+ int source_width, int source_height,
+ const int* warp_params, int subsampling_x,
+ int subsampling_y, int block_start_x,
+ int block_start_y, int block_width, int block_height,
+ int16_t alpha, int16_t beta, int16_t gamma,
+ int16_t delta, void* dest, ptrdiff_t dest_stride);
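+
+// Restating the pictorial matrix equation above in scalar form (expository
+// only; no new behavior):
+//   z  = m6 * x + m7 * y + 1
+//   x' = (m2 * x + m3 * y + m0) / z
+//   y' = (m4 * x + m5 * y + m1) / z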
+
+// Warp for compound predictions. Section 7.11.3.5.
+// Similar to WarpFunc, but |dest| is a uint16_t predictor buffer,
+// |dest_stride| is given in units of uint16_t and |inter_round_bits_vertical|
+// is always 7 (kInterRoundBitsCompoundVertical).
+// Rounding precision is derived from the function being called. For horizontal
+// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
+// used. For vertical filtering kInterRoundBitsCompoundVertical will be used.
+using WarpCompoundFunc = WarpFunc;
+
+constexpr int kNumAutoRegressionLags = 4;
+// Applies an auto-regressive filter to the white noise in |luma_grain_buffer|.
+// Section 7.18.3.3, second code block
+// |params| are parameters read from frame header, mainly providing
+// auto_regression_coeff_y for the filter and auto_regression_shift to right
+// shift the filter sum by. Note: This method assumes
+// params.auto_regression_coeff_lag is not 0. Do not call this method if
+// params.auto_regression_coeff_lag is 0.
+using LumaAutoRegressionFunc = void (*)(const FilmGrainParams& params,
+ void* luma_grain_buffer);
+// Function index is auto_regression_coeff_lag - 1.
+using LumaAutoRegressionFuncs =
+ LumaAutoRegressionFunc[kNumAutoRegressionLags - 1];
+
+// Applies an auto-regressive filter to the white noise in u_grain and v_grain.
+// Section 7.18.3.3, third code block
+// The |luma_grain_buffer| provides samples that are added to the autoregressive
+// sum when num_y_points > 0.
+// |u_grain_buffer| and |v_grain_buffer| point to the buffers of chroma noise
+// that were generated from the stored Gaussian sequence, and are overwritten
+// with the results of the autoregressive filter. |params| are parameters read
+// from frame header, mainly providing auto_regression_coeff_u and
+// auto_regression_coeff_v for each chroma plane's filter, and
+// auto_regression_shift to right shift the filter sums by.
+using ChromaAutoRegressionFunc = void (*)(const FilmGrainParams& params,
+ const void* luma_grain_buffer,
+ int subsampling_x, int subsampling_y,
+ void* u_grain_buffer,
+ void* v_grain_buffer);
+using ChromaAutoRegressionFuncs =
+ ChromaAutoRegressionFunc[/*use_luma*/ 2][kNumAutoRegressionLags];
+
+// Build an image-wide "stripe" of grain noise for every 32 rows in the image.
+// Section 7.18.3.5, first code block.
+// Each 32x32 luma block is copied at a random offset specified via
+// |grain_seed| from the grain template produced by autoregression, and the same
+// is done for chroma grains, subject to subsampling.
+// |width| and |height| are the dimensions of the overall image.
+// |noise_stripes_buffer| points to an Array2DView with one row for each stripe.
+// Because this function treats all planes identically and independently, it is
+// simplified to take one grain buffer at a time. This means duplicating some
+// random number generations, but that work can be reduced in other ways.
+using ConstructNoiseStripesFunc = void (*)(const void* grain_buffer,
+ int grain_seed, int width,
+ int height, int subsampling_x,
+ int subsampling_y,
+ void* noise_stripes_buffer);
+using ConstructNoiseStripesFuncs =
+ ConstructNoiseStripesFunc[/*overlap_flag*/ 2];
+
+// Compute the one or two overlap rows for each stripe copied to the noise
+// image.
+// Section 7.18.3.5, second code block. |width| and |height| are the
+// dimensions of the overall image. |noise_stripes_buffer| points to an
+// Array2DView with one row for each stripe. |noise_image_buffer| points to an
+// Array2D containing the allocated plane for this frame. Because this function
+// treats all planes identically and independently, it is simplified to take one
+// grain buffer at a time.
+using ConstructNoiseImageOverlapFunc =
+ void (*)(const void* noise_stripes_buffer, int width, int height,
+ int subsampling_x, int subsampling_y, void* noise_image_buffer);
+
+// Populate a scaling lookup table with interpolated values of a piecewise
+// linear function where values in |point_value| are mapped to the values in
+// |point_scaling|.
+// |num_points| can be between 0 and 15. When 0, the lookup table is set to
+// zero.
+// |point_value| and |point_scaling| have |num_points| valid elements.
+using InitializeScalingLutFunc = void (*)(
+ int num_points, const uint8_t point_value[], const uint8_t point_scaling[],
+ uint8_t scaling_lut[kScalingLookupTableSize]);
+
+// Blend noise with image. Section 7.18.3.5, third code block.
+// |width| is the width of each row, while |height| is how many rows to compute.
+// |start_height| is an offset for the noise image, to support multithreading.
+// |min_value|, |max_luma|, and |max_chroma| are computed by the caller of these
+// functions, according to the code in the spec.
+// |source_plane_y| and |source_plane_uv| are the plane buffers of the decoded
+// frame. They are blended with the film grain noise and written to
+// |dest_plane_y| and |dest_plane_uv| as final output for display.
+// source_plane_* and dest_plane_* may point to the same buffer, in which case
+// the film grain noise is added in place.
+// |scaling_lut_y| and |scaling_lut| represent a piecewise linear mapping from
+// the frame's raw pixel value to a scaling factor for the noise sample.
+// |scaling_shift| is applied as a right shift after scaling, so that scaling
+// down is possible. It is found in FilmGrainParams, but supplied directly to
+// BlendNoiseWithImageLumaFunc because it's the only member used.
+using BlendNoiseWithImageLumaFunc =
+ void (*)(const void* noise_image_ptr, int min_value, int max_value,
+ int scaling_shift, int width, int height, int start_height,
+ const uint8_t scaling_lut_y[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y,
+ void* dest_plane_y, ptrdiff_t dest_stride_y);
+
+using BlendNoiseWithImageChromaFunc = void (*)(
+ Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+ int min_value, int max_value, int width, int height, int start_height,
+ int subsampling_x, int subsampling_y,
+ const uint8_t scaling_lut[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv);
+
+using BlendNoiseWithImageChromaFuncs =
+ BlendNoiseWithImageChromaFunc[/*chroma_scaling_from_luma*/ 2];
+
+//------------------------------------------------------------------------------
+
+struct FilmGrainFuncs {
+ LumaAutoRegressionFuncs luma_auto_regression;
+ ChromaAutoRegressionFuncs chroma_auto_regression;
+ ConstructNoiseStripesFuncs construct_noise_stripes;
+ ConstructNoiseImageOverlapFunc construct_noise_image_overlap;
+ InitializeScalingLutFunc initialize_scaling_lut;
+ BlendNoiseWithImageLumaFunc blend_noise_luma;
+ BlendNoiseWithImageChromaFuncs blend_noise_chroma;
+};
+
+// Motion field projection function signature. Section 7.9.
+// |reference_info| provides reference information for motion field projection.
+// |reference_to_current_with_sign| is the precalculated reference frame id
+// distance from current frame.
+// |dst_sign| is -1 for LAST_FRAME and LAST2_FRAME, or 0 (1 in spec) for others.
+// |y8_start| and |y8_end| are the start and end 8x8 rows of the current tile.
+// |x8_start| and |x8_end| are the start and end 8x8 columns of the current
+// tile.
+// |motion_field| is the output which saves the projected motion field
+// information.
+using MotionFieldProjectionKernelFunc = void (*)(
+ const ReferenceInfo& reference_info, int reference_to_current_with_sign,
+ int dst_sign, int y8_start, int y8_end, int x8_start, int x8_end,
+ TemporalMotionField* motion_field);
+
+// Compound temporal motion vector projection function signature.
+// Section 7.9.3 and 7.10.2.10.
+// |temporal_mvs| is the set of temporal reference motion vectors.
+// |temporal_reference_offsets| specifies the number of frames covered by the
+// original motion vector.
+// |reference_offsets| specifies the number of frames to be covered by the
+// projected motion vector.
+// |count| is the number of the temporal motion vectors.
+// |candidate_mvs| is the set of projected motion vectors.
+using MvProjectionCompoundFunc = void (*)(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], int count,
+ CompoundMotionVector* candidate_mvs);
+
+// Single temporal motion vector projection function signature.
+// Section 7.9.3 and 7.10.2.10.
+// |temporal_mvs| is the set of temporal reference motion vectors.
+// |temporal_reference_offsets| specifies the number of frames covered by the
+// original motion vector.
+// |reference_offset| specifies the number of frames to be covered by the
+// projected motion vector.
+// |count| is the number of the temporal motion vectors.
+// |candidate_mvs| is the set of projected motion vectors.
+using MvProjectionSingleFunc = void (*)(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ int reference_offset, int count, MotionVector* candidate_mvs);
+
+struct Dsp {
+ AverageBlendFunc average_blend;
+ CdefDirectionFunc cdef_direction;
+ CdefFilteringFuncs cdef_filters;
+ CflIntraPredictorFuncs cfl_intra_predictors;
+ CflSubsamplerFuncs cfl_subsamplers;
+ ConvolveFuncs convolve;
+ ConvolveScaleFuncs convolve_scale;
+ DirectionalIntraPredictorZone1Func directional_intra_predictor_zone1;
+ DirectionalIntraPredictorZone2Func directional_intra_predictor_zone2;
+ DirectionalIntraPredictorZone3Func directional_intra_predictor_zone3;
+ DistanceWeightedBlendFunc distance_weighted_blend;
+ FilmGrainFuncs film_grain;
+ FilterIntraPredictorFunc filter_intra_predictor;
+ InterIntraMaskBlendFuncs8bpp inter_intra_mask_blend_8bpp;
+ IntraEdgeFilterFunc intra_edge_filter;
+ IntraEdgeUpsamplerFunc intra_edge_upsampler;
+ IntraPredictorFuncs intra_predictors;
+ InverseTransformAddFuncs inverse_transforms;
+ LoopFilterFuncs loop_filters;
+ LoopRestorationFuncs loop_restorations;
+ MaskBlendFuncs mask_blend;
+ MotionFieldProjectionKernelFunc motion_field_projection_kernel;
+ MvProjectionCompoundFunc mv_projection_compound[3];
+ MvProjectionSingleFunc mv_projection_single[3];
+ ObmcBlendFuncs obmc_blend;
+ SuperResCoefficientsFunc super_res_coefficients;
+ SuperResFunc super_res;
+ WarpCompoundFunc warp_compound;
+ WarpFunc warp;
+ WeightMaskFuncs weight_mask;
+};
+
+// Initializes function pointers based on build config and runtime
+// environment. Must be called once before first use. This function is
+// thread-safe.
+void DspInit();
+
+// Returns the appropriate Dsp table for |bitdepth| or nullptr if one doesn't
+// exist.
+const Dsp* GetDspTable(int bitdepth);
+
+} // namespace dsp
+
+namespace dsp_internal {
+
+// Visual Studio builds don't have a way to detect SSE4_1. Only exclude the C
+// functions if /arch:AVX2 is used across all sources.
+#if !LIBGAV1_TARGETING_AVX2 && \
+ (defined(_MSC_VER) || (defined(_M_IX86) || defined(_M_X64)))
+#undef LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS 1
+#endif
+
+// Returns true if a more highly optimized version of |func| is not defined for
+// the associated bitdepth or if it is forcibly enabled with
+// LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS. The define checked for |func| corresponds
+// to the LIBGAV1_Dsp<bitdepth>bpp_|func| define in the header file associated
+// with the module.
+// |func| is one of:
+// - FunctionName, e.g., SelfGuidedFilter.
+// - [sub-table-index1][...-indexN] e.g.,
+// TransformSize4x4_IntraPredictorDc. The indices correspond to enum values
+// used as lookups with leading 'k' removed.
+//
+// NEON support is the only extension available for ARM and it is always
+// required. Because of this restriction DSP_ENABLED_8BPP_NEON(func) is always
+// true and can be omitted.
+#define DSP_ENABLED_8BPP_AVX2(func) \
+ (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_AVX2)
+#define DSP_ENABLED_10BPP_AVX2(func) \
+ (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_AVX2)
+#define DSP_ENABLED_8BPP_SSE4_1(func) \
+ (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_SSE4_1)
+#define DSP_ENABLED_10BPP_SSE4_1(func) \
+ (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_SSE4_1)
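+
+// Usage sketch (the function and table entry shown are illustrative): a SIMD
+// Init function guards each assignment so it is compiled only when this SIMD
+// level is the most optimized version defined for that function, or when all
+// DSP functions are force-enabled:
+//   #if DSP_ENABLED_8BPP_SSE4_1(ConvolveHorizontal)
+//   dsp->convolve[0][0][0][1] = ConvolveHorizontal_SSE4_1;
+//   #endif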
+
+// Returns the appropriate Dsp table for |bitdepth| or nullptr if one doesn't
+// exist. This version is meant for use by test or dsp/*Init() functions only.
+dsp::Dsp* GetWritableDspTable(int bitdepth);
+
+} // namespace dsp_internal
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_DSP_H_
diff --git a/src/dsp/film_grain.cc b/src/dsp/film_grain.cc
new file mode 100644
index 0000000..41d1dd0
--- /dev/null
+++ b/src/dsp/film_grain.cc
@@ -0,0 +1,870 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/film_grain.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <new>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace film_grain {
+namespace {
+
+// Making this a template function prevents it from adding to code size when it
+// is not placed in the DSP table. Most functions in the dsp directory change
+// behavior by bitdepth, but because this one doesn't, it receives a dummy
+// parameter with one enforced value, ensuring only one copy is made.
+template <int singleton>
+void InitializeScalingLookupTable_C(
+ int num_points, const uint8_t point_value[], const uint8_t point_scaling[],
+ uint8_t scaling_lut[kScalingLookupTableSize]) {
+ static_assert(singleton == 0,
+ "Improper instantiation of InitializeScalingLookupTable_C. "
+ "There should be only one copy of this function.");
+ if (num_points == 0) {
+ memset(scaling_lut, 0, sizeof(scaling_lut[0]) * kScalingLookupTableSize);
+ return;
+ }
+ static_assert(sizeof(scaling_lut[0]) == 1, "");
+ memset(scaling_lut, point_scaling[0], point_value[0]);
+ for (int i = 0; i < num_points - 1; ++i) {
+ const int delta_y = point_scaling[i + 1] - point_scaling[i];
+ const int delta_x = point_value[i + 1] - point_value[i];
+ const int delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
+ for (int x = 0; x < delta_x; ++x) {
+ const int v = point_scaling[i] + ((x * delta + 32768) >> 16);
+ assert(v >= 0 && v <= UINT8_MAX);
+ scaling_lut[point_value[i] + x] = v;
+ }
+ }
+ const uint8_t last_point_value = point_value[num_points - 1];
+ memset(&scaling_lut[last_point_value], point_scaling[num_points - 1],
+ kScalingLookupTableSize - last_point_value);
+}
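+
+// In the loop above, |delta| is the segment slope delta_y / delta_x in 16.16
+// fixed point (65536 / delta_x rounded to nearest). As an illustration, the
+// two points (value 0, scaling 0) and (value 64, scaling 32) give
+// delta = 32 * 1024 = 32768, so entry x receives (x * 32768 + 32768) >> 16,
+// roughly x / 2, and entries 64 through 256 are filled with 32 by the final
+// memset.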
+
+// Section 7.18.3.5.
+// Performs a piecewise linear interpolation into the scaling table.
+template <int bitdepth>
+int ScaleLut(const uint8_t scaling_lut[kScalingLookupTableSize], int index) {
+ const int shift = bitdepth - 8;
+ const int quotient = index >> shift;
+ const int remainder = index - (quotient << shift);
+ if (bitdepth == 8) {
+ assert(quotient < kScalingLookupTableSize);
+ return scaling_lut[quotient];
+ }
+ assert(quotient + 1 < kScalingLookupTableSize);
+ const int start = scaling_lut[quotient];
+ const int end = scaling_lut[quotient + 1];
+ return start + RightShiftWithRounding((end - start) * remainder, shift);
+}
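+
+// With 10-bit input the shift above is 2, so index 1023 yields quotient 255
+// and remainder 3: the result is scaling_lut[255] moved 3/4 of the way toward
+// scaling_lut[256] (the 257th table entry exists for exactly this overflow
+// lookup). With 8-bit input the shift is 0 and the entry is returned directly.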
+
+// Applies an auto-regressive filter to the white noise in luma_grain.
+template <int bitdepth, typename GrainType>
+void ApplyAutoRegressiveFilterToLumaGrain_C(const FilmGrainParams& params,
+ void* luma_grain_buffer) {
+ auto* luma_grain = static_cast<GrainType*>(luma_grain_buffer);
+ const int grain_min = GetGrainMin<bitdepth>();
+ const int grain_max = GetGrainMax<bitdepth>();
+ const int auto_regression_coeff_lag = params.auto_regression_coeff_lag;
+ assert(auto_regression_coeff_lag > 0 && auto_regression_coeff_lag <= 3);
+ // A pictorial representation of the auto-regressive filter for various values
+ // of auto_regression_coeff_lag. The letter 'O' represents the current sample.
+ // (The filter always operates on the current sample with filter
+ // coefficient 1.) The letters 'X' represent the neighboring samples that the
+ // filter operates on.
+ //
+ // auto_regression_coeff_lag == 3:
+ // X X X X X X X
+ // X X X X X X X
+ // X X X X X X X
+ // X X X O
+ // auto_regression_coeff_lag == 2:
+ // X X X X X
+ // X X X X X
+ // X X O
+ // auto_regression_coeff_lag == 1:
+ // X X X
+ // X O
+ // auto_regression_coeff_lag == 0:
+ // O
+ //
+ // Note that if auto_regression_coeff_lag is 0, the filter is the identity
+ // filter and therefore can be skipped. This implementation assumes it is not
+ // called in that case.
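+  //
+  // The loops below walk these neighborhoods row by row, so a lag of L reads
+  // 2 * L * (L + 1) coefficients from auto_regression_coeff_y: 24 for lag 3,
+  // 12 for lag 2 and 4 for lag 1, matching the 'X' counts in the diagrams.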
+ const int shift = params.auto_regression_shift;
+ for (int y = kAutoRegressionBorder; y < kLumaHeight; ++y) {
+ for (int x = kAutoRegressionBorder; x < kLumaWidth - kAutoRegressionBorder;
+ ++x) {
+ int sum = 0;
+ int pos = 0;
+ int delta_row = -auto_regression_coeff_lag;
+ // The last iteration (delta_row == 0) is shorter and is handled
+ // separately.
+ do {
+ int delta_column = -auto_regression_coeff_lag;
+ do {
+ const int coeff = params.auto_regression_coeff_y[pos];
+ sum += luma_grain[(y + delta_row) * kLumaWidth + (x + delta_column)] *
+ coeff;
+ ++pos;
+ } while (++delta_column <= auto_regression_coeff_lag);
+ } while (++delta_row < 0);
+ // Last iteration: delta_row == 0.
+ {
+ int delta_column = -auto_regression_coeff_lag;
+ do {
+ const int coeff = params.auto_regression_coeff_y[pos];
+ sum += luma_grain[y * kLumaWidth + (x + delta_column)] * coeff;
+ ++pos;
+ } while (++delta_column < 0);
+ }
+ luma_grain[y * kLumaWidth + x] = Clip3(
+ luma_grain[y * kLumaWidth + x] + RightShiftWithRounding(sum, shift),
+ grain_min, grain_max);
+ }
+ }
+}
+
+template <int bitdepth, typename GrainType, int auto_regression_coeff_lag,
+ bool use_luma>
+void ApplyAutoRegressiveFilterToChromaGrains_C(const FilmGrainParams& params,
+ const void* luma_grain_buffer,
+ int subsampling_x,
+ int subsampling_y,
+ void* u_grain_buffer,
+ void* v_grain_buffer) {
+ static_assert(
+ auto_regression_coeff_lag >= 0 && auto_regression_coeff_lag <= 3,
+ "Unsupported autoregression lag for chroma.");
+ const auto* luma_grain = static_cast<const GrainType*>(luma_grain_buffer);
+ const int grain_min = GetGrainMin<bitdepth>();
+ const int grain_max = GetGrainMax<bitdepth>();
+ auto* u_grain = static_cast<GrainType*>(u_grain_buffer);
+ auto* v_grain = static_cast<GrainType*>(v_grain_buffer);
+ const int shift = params.auto_regression_shift;
+ const int chroma_height =
+ (subsampling_y == 0) ? kMaxChromaHeight : kMinChromaHeight;
+ const int chroma_width =
+ (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+ for (int y = kAutoRegressionBorder; y < chroma_height; ++y) {
+ const int luma_y =
+ ((y - kAutoRegressionBorder) << subsampling_y) + kAutoRegressionBorder;
+ for (int x = kAutoRegressionBorder;
+ x < chroma_width - kAutoRegressionBorder; ++x) {
+ int sum_u = 0;
+ int sum_v = 0;
+ int pos = 0;
+ int delta_row = -auto_regression_coeff_lag;
+ do {
+ int delta_column = -auto_regression_coeff_lag;
+ do {
+ if (delta_row == 0 && delta_column == 0) {
+ break;
+ }
+ const int coeff_u = params.auto_regression_coeff_u[pos];
+ const int coeff_v = params.auto_regression_coeff_v[pos];
+ sum_u +=
+ u_grain[(y + delta_row) * chroma_width + (x + delta_column)] *
+ coeff_u;
+ sum_v +=
+ v_grain[(y + delta_row) * chroma_width + (x + delta_column)] *
+ coeff_v;
+ ++pos;
+ } while (++delta_column <= auto_regression_coeff_lag);
+ } while (++delta_row <= 0);
+ if (use_luma) {
+ int luma = 0;
+ const int luma_x = ((x - kAutoRegressionBorder) << subsampling_x) +
+ kAutoRegressionBorder;
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ luma += luma_grain[(luma_y + i) * kLumaWidth + (luma_x + j)];
+ } while (++j <= subsampling_x);
+ } while (++i <= subsampling_y);
+ luma = SubsampledValue(luma, subsampling_x + subsampling_y);
+ const int coeff_u = params.auto_regression_coeff_u[pos];
+ const int coeff_v = params.auto_regression_coeff_v[pos];
+ sum_u += luma * coeff_u;
+ sum_v += luma * coeff_v;
+ }
+ u_grain[y * chroma_width + x] = Clip3(
+ u_grain[y * chroma_width + x] + RightShiftWithRounding(sum_u, shift),
+ grain_min, grain_max);
+ v_grain[y * chroma_width + x] = Clip3(
+ v_grain[y * chroma_width + x] + RightShiftWithRounding(sum_v, shift),
+ grain_min, grain_max);
+ }
+ }
+}
+
+// This implementation is for the condition overlap_flag == false.
+template <int bitdepth, typename GrainType>
+void ConstructNoiseStripes_C(const void* grain_buffer, int grain_seed,
+ int width, int height, int subsampling_x,
+ int subsampling_y, void* noise_stripes_buffer) {
+ auto* noise_stripes =
+ static_cast<Array2DView<GrainType>*>(noise_stripes_buffer);
+ const auto* grain = static_cast<const GrainType*>(grain_buffer);
+ const int half_width = DivideBy2(width + 1);
+ const int half_height = DivideBy2(height + 1);
+ assert(half_width > 0);
+ assert(half_height > 0);
+  static_assert(kLumaWidth == kMaxChromaWidth,
+                "kLumaWidth should be equal to kMaxChromaWidth");
+ const int grain_width =
+ (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+ const int plane_width = (width + subsampling_x) >> subsampling_x;
+ constexpr int kNoiseStripeHeight = 34;
+ int luma_num = 0;
+ int y = 0;
+ do {
+ GrainType* const noise_stripe = (*noise_stripes)[luma_num];
+ uint16_t seed = grain_seed;
+ seed ^= ((luma_num * 37 + 178) & 255) << 8;
+ seed ^= ((luma_num * 173 + 105) & 255);
+ int x = 0;
+ do {
+ const int rand = GetFilmGrainRandomNumber(8, &seed);
+ const int offset_x = rand >> 4;
+ const int offset_y = rand & 15;
+ const int plane_offset_x =
+ (subsampling_x != 0) ? 6 + offset_x : 9 + offset_x * 2;
+ const int plane_offset_y =
+ (subsampling_y != 0) ? 6 + offset_y : 9 + offset_y * 2;
+ int i = 0;
+ do {
+ // Section 7.18.3.5 says:
+ // noiseStripe[ lumaNum ][ 0 ] is 34 samples high and w samples
+ // wide (a few additional samples across are actually written to
+ // the array, but these are never read) ...
+ //
+ // Note: The warning in the parentheses also applies to
+ // noiseStripe[ lumaNum ][ 1 ] and noiseStripe[ lumaNum ][ 2 ].
+ //
+ // Writes beyond the width of each row could happen below. To
+ // prevent those writes, we clip the number of pixels to copy against
+ // the remaining width.
+ // TODO(petersonab): Allocate aligned stripes with extra width to cover
+ // the size of the final stripe block, then remove this call to min.
+ const int copy_size =
+ std::min(kNoiseStripeHeight >> subsampling_x,
+ plane_width - (x << (1 - subsampling_x)));
+ memcpy(&noise_stripe[i * plane_width + (x << (1 - subsampling_x))],
+ &grain[(plane_offset_y + i) * grain_width + plane_offset_x],
+ copy_size * sizeof(noise_stripe[0]));
+ } while (++i < (kNoiseStripeHeight >> subsampling_y));
+ x += 16;
+ } while (x < half_width);
+
+ ++luma_num;
+ y += 16;
+ } while (y < half_height);
+}
+
+// This implementation is for the condition overlap_flag == true.
+template <int bitdepth, typename GrainType>
+void ConstructNoiseStripesWithOverlap_C(const void* grain_buffer,
+ int grain_seed, int width, int height,
+ int subsampling_x, int subsampling_y,
+ void* noise_stripes_buffer) {
+ auto* noise_stripes =
+ static_cast<Array2DView<GrainType>*>(noise_stripes_buffer);
+ const auto* grain = static_cast<const GrainType*>(grain_buffer);
+ const int half_width = DivideBy2(width + 1);
+ const int half_height = DivideBy2(height + 1);
+ assert(half_width > 0);
+ assert(half_height > 0);
+  static_assert(kLumaWidth == kMaxChromaWidth,
+                "kLumaWidth should be equal to kMaxChromaWidth");
+ const int grain_width =
+ (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+ const int plane_width = (width + subsampling_x) >> subsampling_x;
+ constexpr int kNoiseStripeHeight = 34;
+ int luma_num = 0;
+ int y = 0;
+ do {
+ GrainType* const noise_stripe = (*noise_stripes)[luma_num];
+ uint16_t seed = grain_seed;
+ seed ^= ((luma_num * 37 + 178) & 255) << 8;
+ seed ^= ((luma_num * 173 + 105) & 255);
+ // Begin special iteration for x == 0.
+ const int rand = GetFilmGrainRandomNumber(8, &seed);
+ const int offset_x = rand >> 4;
+ const int offset_y = rand & 15;
+ const int plane_offset_x =
+ (subsampling_x != 0) ? 6 + offset_x : 9 + offset_x * 2;
+ const int plane_offset_y =
+ (subsampling_y != 0) ? 6 + offset_y : 9 + offset_y * 2;
+ // The overlap computation only occurs when x > 0, so it is omitted here.
+ int i = 0;
+ do {
+ // TODO(petersonab): Allocate aligned stripes with extra width to cover
+ // the size of the final stripe block, then remove this call to min.
+ const int copy_size =
+ std::min(kNoiseStripeHeight >> subsampling_x, plane_width);
+ memcpy(&noise_stripe[i * plane_width],
+ &grain[(plane_offset_y + i) * grain_width + plane_offset_x],
+ copy_size * sizeof(noise_stripe[0]));
+ } while (++i < (kNoiseStripeHeight >> subsampling_y));
+ // End special iteration for x == 0.
+ for (int x = 16; x < half_width; x += 16) {
+ const int rand = GetFilmGrainRandomNumber(8, &seed);
+ const int offset_x = rand >> 4;
+ const int offset_y = rand & 15;
+ const int plane_offset_x =
+ (subsampling_x != 0) ? 6 + offset_x : 9 + offset_x * 2;
+ const int plane_offset_y =
+ (subsampling_y != 0) ? 6 + offset_y : 9 + offset_y * 2;
+ int i = 0;
+ do {
+ int j = 0;
+ int grain_sample =
+ grain[(plane_offset_y + i) * grain_width + plane_offset_x];
+ // The first pixel(s) of each segment of the noise_stripe are subject to
+ // the "overlap" computation.
+ if (subsampling_x == 0) {
+ // Corresponds to the line in the spec:
+ // if (j < 2 && x > 0)
+ // j = 0
+ int old = noise_stripe[i * plane_width + x * 2];
+ grain_sample = old * 27 + grain_sample * 17;
+ grain_sample =
+ Clip3(RightShiftWithRounding(grain_sample, 5),
+ GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+ noise_stripe[i * plane_width + x * 2] = grain_sample;
+
+ // This check prevents overwriting for the iteration j = 1. The
+ // continue applies to the i-loop.
+ if (x * 2 + 1 >= plane_width) continue;
+ // j = 1
+ grain_sample =
+ grain[(plane_offset_y + i) * grain_width + plane_offset_x + 1];
+ old = noise_stripe[i * plane_width + x * 2 + 1];
+ grain_sample = old * 17 + grain_sample * 27;
+ grain_sample =
+ Clip3(RightShiftWithRounding(grain_sample, 5),
+ GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+ noise_stripe[i * plane_width + x * 2 + 1] = grain_sample;
+ j = 2;
+ } else {
+ // Corresponds to the line in the spec:
+ // if (j == 0 && x > 0)
+ const int old = noise_stripe[i * plane_width + x];
+ grain_sample = old * 23 + grain_sample * 22;
+ grain_sample =
+ Clip3(RightShiftWithRounding(grain_sample, 5),
+ GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+ noise_stripe[i * plane_width + x] = grain_sample;
+ j = 1;
+ }
+ // The following covers the rest of the loop over j as described in the
+ // spec.
+ //
+ // Section 7.18.3.5 says:
+ // noiseStripe[ lumaNum ][ 0 ] is 34 samples high and w samples
+ // wide (a few additional samples across are actually written to
+ // the array, but these are never read) ...
+ //
+ // Note: The warning in the parentheses also applies to
+ // noiseStripe[ lumaNum ][ 1 ] and noiseStripe[ lumaNum ][ 2 ].
+ //
+ // Writes beyond the width of each row could happen below. To
+ // prevent those writes, we clip the number of pixels to copy against
+ // the remaining width.
+ // TODO(petersonab): Allocate aligned stripes with extra width to cover
+ // the size of the final stripe block, then remove this call to min.
+ const int copy_size =
+ std::min(kNoiseStripeHeight >> subsampling_x,
+ plane_width - (x << (1 - subsampling_x))) -
+ j;
+ memcpy(&noise_stripe[i * plane_width + (x << (1 - subsampling_x)) + j],
+ &grain[(plane_offset_y + i) * grain_width + plane_offset_x + j],
+ copy_size * sizeof(noise_stripe[0]));
+ } while (++i < (kNoiseStripeHeight >> subsampling_y));
+ }
+
+ ++luma_num;
+ y += 16;
+ } while (y < half_height);
+}
+
+template <int bitdepth, typename GrainType>
+inline void WriteOverlapLine_C(const GrainType* noise_stripe_row,
+ const GrainType* noise_stripe_row_prev,
+ int plane_width, int grain_coeff, int old_coeff,
+ GrainType* noise_image_row) {
+ int x = 0;
+ do {
+ int grain = noise_stripe_row[x];
+ const int old = noise_stripe_row_prev[x];
+ grain = old * old_coeff + grain * grain_coeff;
+ grain = Clip3(RightShiftWithRounding(grain, 5), GetGrainMin<bitdepth>(),
+ GetGrainMax<bitdepth>());
+ noise_image_row[x] = grain;
+ } while (++x < plane_width);
+}
+
+template <int bitdepth, typename GrainType>
+void ConstructNoiseImageOverlap_C(const void* noise_stripes_buffer, int width,
+ int height, int subsampling_x,
+ int subsampling_y, void* noise_image_buffer) {
+ const auto* noise_stripes =
+ static_cast<const Array2DView<GrainType>*>(noise_stripes_buffer);
+ auto* noise_image = static_cast<Array2D<GrainType>*>(noise_image_buffer);
+ const int plane_width = (width + subsampling_x) >> subsampling_x;
+ const int plane_height = (height + subsampling_y) >> subsampling_y;
+ const int stripe_height = 32 >> subsampling_y;
+ const int stripe_mask = stripe_height - 1;
+ int y = stripe_height;
+ int luma_num = 1;
+ if (subsampling_y == 0) {
+ // Begin complete stripes section. This is when we are guaranteed to have
+ // two overlap rows in each stripe.
+ for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
+ const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+ const GrainType* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ // First overlap row.
+ WriteOverlapLine_C<bitdepth>(noise_stripe,
+ &noise_stripe_prev[32 * plane_width],
+ plane_width, 17, 27, (*noise_image)[y]);
+ // Second overlap row.
+ WriteOverlapLine_C<bitdepth>(&noise_stripe[plane_width],
+ &noise_stripe_prev[(32 + 1) * plane_width],
+ plane_width, 27, 17, (*noise_image)[y + 1]);
+ }
+ // End complete stripes section.
+
+ const int remaining_height = plane_height - y;
+ // Either one partial stripe remains (remaining_height > 0),
+    // OR the image is less than one stripe high (remaining_height < 0),
+ // OR all stripes are completed (remaining_height == 0).
+ if (remaining_height <= 0) {
+ return;
+ }
+ const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+ const GrainType* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ WriteOverlapLine_C<bitdepth>(noise_stripe,
+ &noise_stripe_prev[32 * plane_width],
+ plane_width, 17, 27, (*noise_image)[y]);
+
+ // Check if second overlap row is in the image.
+ if (remaining_height > 1) {
+ WriteOverlapLine_C<bitdepth>(&noise_stripe[plane_width],
+ &noise_stripe_prev[(32 + 1) * plane_width],
+ plane_width, 27, 17, (*noise_image)[y + 1]);
+ }
+ } else { // |subsampling_y| == 1
+ // No special checks needed for partial stripes, because if one exists, the
+ // first and only overlap row is guaranteed to exist.
+ for (; y < plane_height; ++luma_num, y += stripe_height) {
+ const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+ const GrainType* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ WriteOverlapLine_C<bitdepth>(noise_stripe,
+ &noise_stripe_prev[16 * plane_width],
+ plane_width, 22, 23, (*noise_image)[y]);
+ }
+ }
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageLuma_C(
+ const void* noise_image_ptr, int min_value, int max_luma, int scaling_shift,
+ int width, int height, int start_height,
+ const uint8_t scaling_lut_y[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y,
+ ptrdiff_t dest_stride_y) {
+ const auto* noise_image =
+ static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+ const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+ source_stride_y /= sizeof(Pixel);
+ auto* out_y = static_cast<Pixel*>(dest_plane_y);
+ dest_stride_y /= sizeof(Pixel);
+
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const int orig = in_y[y * source_stride_y + x];
+ int noise = noise_image[kPlaneY][y + start_height][x];
+ noise = RightShiftWithRounding(
+ ScaleLut<bitdepth>(scaling_lut_y, orig) * noise, scaling_shift);
+ out_y[y * dest_stride_y + x] = Clip3(orig + noise, min_value, max_luma);
+ } while (++x < width);
+ } while (++y < height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == false.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChroma_C(
+ Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+ int min_value, int max_chroma, int width, int height, int start_height,
+ int subsampling_x, int subsampling_y,
+ const uint8_t scaling_lut_uv[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+ const auto* noise_image =
+ static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+
+ const int chroma_width = (width + subsampling_x) >> subsampling_x;
+ const int chroma_height = (height + subsampling_y) >> subsampling_y;
+
+ const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+ source_stride_y /= sizeof(Pixel);
+ const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+ source_stride_uv /= sizeof(Pixel);
+ auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+ dest_stride_uv /= sizeof(Pixel);
+
+ const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
+ const int luma_multiplier =
+ (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
+ const int multiplier =
+ (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
+
+ const int scaling_shift = params.chroma_scaling;
+ start_height >>= subsampling_y;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const int luma_x = x << subsampling_x;
+ const int luma_y = y << subsampling_y;
+ const int luma_next_x = std::min(luma_x + 1, width - 1);
+ int average_luma;
+ if (subsampling_x != 0) {
+ average_luma = RightShiftWithRounding(
+ in_y[luma_y * source_stride_y + luma_x] +
+ in_y[luma_y * source_stride_y + luma_next_x],
+ 1);
+ } else {
+ average_luma = in_y[luma_y * source_stride_y + luma_x];
+ }
+ const int orig = in_uv[y * source_stride_uv + x];
+ const int combined = average_luma * luma_multiplier + orig * multiplier;
+ const int merged =
+ Clip3((combined >> 6) + LeftShift(offset, bitdepth - 8), 0,
+ (1 << bitdepth) - 1);
+ int noise = noise_image[plane][y + start_height][x];
+ noise = RightShiftWithRounding(
+ ScaleLut<bitdepth>(scaling_lut_uv, merged) * noise, scaling_shift);
+ out_uv[y * dest_stride_uv + x] =
+ Clip3(orig + noise, min_value, max_chroma);
+ } while (++x < chroma_width);
+ } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == true.
+// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChromaWithCfl_C(
+ Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+ int min_value, int max_chroma, int width, int height, int start_height,
+ int subsampling_x, int subsampling_y,
+ const uint8_t scaling_lut[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+ const auto* noise_image =
+ static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+ const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+ source_stride_y /= sizeof(Pixel);
+ const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+ source_stride_uv /= sizeof(Pixel);
+ auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+ dest_stride_uv /= sizeof(Pixel);
+
+ const int chroma_width = (width + subsampling_x) >> subsampling_x;
+ const int chroma_height = (height + subsampling_y) >> subsampling_y;
+ const int scaling_shift = params.chroma_scaling;
+ start_height >>= subsampling_y;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const int luma_x = x << subsampling_x;
+ const int luma_y = y << subsampling_y;
+ const int luma_next_x = std::min(luma_x + 1, width - 1);
+ int average_luma;
+ if (subsampling_x != 0) {
+ average_luma = RightShiftWithRounding(
+ in_y[luma_y * source_stride_y + luma_x] +
+ in_y[luma_y * source_stride_y + luma_next_x],
+ 1);
+ } else {
+ average_luma = in_y[luma_y * source_stride_y + luma_x];
+ }
+ const int orig_uv = in_uv[y * source_stride_uv + x];
+ int noise_uv = noise_image[plane][y + start_height][x];
+ noise_uv = RightShiftWithRounding(
+ ScaleLut<bitdepth>(scaling_lut, average_luma) * noise_uv,
+ scaling_shift);
+ out_uv[y * dest_stride_uv + x] =
+ Clip3(orig_uv + noise_uv, min_value, max_chroma);
+ } while (++x < chroma_width);
+ } while (++y < chroma_height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ // LumaAutoRegressionFunc
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<8, int8_t>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<8, int8_t>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<8, int8_t>;
+
+ // ChromaAutoRegressionFunc
+ // Chroma autoregression should never be called when lag is 0 and use_luma is
+ // false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 1, false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 2, false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 3, false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 0, true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 1, true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 2, true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 3, true>;
+
+ // ConstructNoiseStripesFunc
+ dsp->film_grain.construct_noise_stripes[0] =
+ ConstructNoiseStripes_C<8, int8_t>;
+ dsp->film_grain.construct_noise_stripes[1] =
+ ConstructNoiseStripesWithOverlap_C<8, int8_t>;
+
+ // ConstructNoiseImageOverlapFunc
+ dsp->film_grain.construct_noise_image_overlap =
+ ConstructNoiseImageOverlap_C<8, int8_t>;
+
+ // InitializeScalingLutFunc
+ dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_C<0>;
+
+ // BlendNoiseWithImageLumaFunc
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_C<8, int8_t, uint8_t>;
+
+ // BlendNoiseWithImageChromaFunc
+ dsp->film_grain.blend_noise_chroma[0] =
+ BlendNoiseWithImageChroma_C<8, int8_t, uint8_t>;
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_C<8, int8_t, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainAutoregressionLuma
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<8, int8_t>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<8, int8_t>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<8, int8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainAutoregressionChroma
+ // Chroma autoregression should never be called when lag is 0 and use_luma is
+ // false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 1, false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 2, false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 3, false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 0, true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 1, true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 2, true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 3, true>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainConstructNoiseStripes
+ dsp->film_grain.construct_noise_stripes[0] =
+ ConstructNoiseStripes_C<8, int8_t>;
+ dsp->film_grain.construct_noise_stripes[1] =
+ ConstructNoiseStripesWithOverlap_C<8, int8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainConstructNoiseImageOverlap
+ dsp->film_grain.construct_noise_image_overlap =
+ ConstructNoiseImageOverlap_C<8, int8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainInitializeScalingLutFunc
+ dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_C<0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_C<8, int8_t, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma
+ dsp->film_grain.blend_noise_chroma[0] =
+ BlendNoiseWithImageChroma_C<8, int8_t, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_C<8, int8_t, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+
+ // LumaAutoRegressionFunc
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<10, int16_t>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<10, int16_t>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<10, int16_t>;
+
+ // ChromaAutoRegressionFunc
+ // Chroma autoregression should never be called when lag is 0 and use_luma is
+ // false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 1, false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 2, false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 3, false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 0, true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 1, true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 2, true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 3, true>;
+
+ // ConstructNoiseStripesFunc
+ dsp->film_grain.construct_noise_stripes[0] =
+ ConstructNoiseStripes_C<10, int16_t>;
+ dsp->film_grain.construct_noise_stripes[1] =
+ ConstructNoiseStripesWithOverlap_C<10, int16_t>;
+
+ // ConstructNoiseImageOverlapFunc
+ dsp->film_grain.construct_noise_image_overlap =
+ ConstructNoiseImageOverlap_C<10, int16_t>;
+
+ // InitializeScalingLutFunc
+ dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_C<0>;
+
+ // BlendNoiseWithImageLumaFunc
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_C<10, int16_t, uint16_t>;
+
+ // BlendNoiseWithImageChromaFunc
+ dsp->film_grain.blend_noise_chroma[0] =
+ BlendNoiseWithImageChroma_C<10, int16_t, uint16_t>;
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_C<10, int16_t, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainAutoregressionLuma
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<10, int16_t>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<10, int16_t>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<10, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainAutoregressionChroma
+ // Chroma autoregression should never be called when lag is 0 and use_luma is
+ // false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 1, false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 2, false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 3, false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 0, true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 1, true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 2, true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 3, true>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainConstructNoiseStripes
+ dsp->film_grain.construct_noise_stripes[0] =
+ ConstructNoiseStripes_C<10, int16_t>;
+ dsp->film_grain.construct_noise_stripes[1] =
+ ConstructNoiseStripesWithOverlap_C<10, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainConstructNoiseImageOverlap
+ dsp->film_grain.construct_noise_image_overlap =
+ ConstructNoiseImageOverlap_C<10, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainInitializeScalingLutFunc
+ dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_C<0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_C<10, int16_t, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChroma
+ dsp->film_grain.blend_noise_chroma[0] =
+ BlendNoiseWithImageChroma_C<10, int16_t, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChromaWithCfl
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_C<10, int16_t, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+} // namespace film_grain
+
+void FilmGrainInit_C() {
+ film_grain::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ film_grain::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/film_grain.h b/src/dsp/film_grain.h
new file mode 100644
index 0000000..fe93270
--- /dev/null
+++ b/src/dsp/film_grain.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_FILM_GRAIN_H_
+#define LIBGAV1_SRC_DSP_FILM_GRAIN_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/film_grain_neon.h"
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::film_grain. This function is not thread-safe.
+void FilmGrainInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_FILM_GRAIN_H_
diff --git a/src/dsp/film_grain_common.h b/src/dsp/film_grain_common.h
new file mode 100644
index 0000000..64e3e8e
--- /dev/null
+++ b/src/dsp/film_grain_common.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
+#define LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "src/dsp/common.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+
+template <int bitdepth>
+int GetGrainMax() {
+ return (1 << (bitdepth - 1)) - 1;
+}
+
+template <int bitdepth>
+int GetGrainMin() {
+ return -(1 << (bitdepth - 1));
+}
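+
+// For example, GetGrainMax<8>() is 127 and GetGrainMin<8>() is -128, while the
+// 10-bit versions return 511 and -512: grain values cover the range of a
+// signed |bitdepth|-bit integer.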
+
+inline int GetFilmGrainRandomNumber(int bits, uint16_t* seed) {
+ uint16_t s = *seed;
+ uint16_t bit = (s ^ (s >> 1) ^ (s >> 3) ^ (s >> 12)) & 1;
+ s = (s >> 1) | (bit << 15);
+ *seed = s;
+ return s >> (16 - bits);
+}
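+
+// GetFilmGrainRandomNumber() updates a 16-bit linear-feedback shift register:
+// the feedback bit is the XOR of bits 0, 1, 3 and 12 of |seed|, the state is
+// shifted right with the feedback bit inserted at bit 15, and the top |bits|
+// bits of the new state are returned (e.g. bits == 8 returns the updated state
+// shifted right by 8).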
+
+enum {
+ kAutoRegressionBorder = 3,
+ // The width of the luma noise array.
+ kLumaWidth = 82,
+ // The height of the luma noise array.
+ kLumaHeight = 73,
+ // The two possible widths of the chroma noise array.
+ kMinChromaWidth = 44,
+ kMaxChromaWidth = 82,
+ // The two possible heights of the chroma noise array.
+ kMinChromaHeight = 38,
+ kMaxChromaHeight = 73,
+  // The scaling lookup table maps bytes to bytes, so it only uses 256
+  // elements, plus one to allow for overflow in 10-bit lookups.
+ kScalingLookupTableSize = 257,
+ // Padding is added to the scaling lookup table to permit overwrites by
+ // InitializeScalingLookupTable_NEON.
+ kScalingLookupTablePadding = 6,
+ // Padding is added to each row of the noise image to permit overreads by
+ // BlendNoiseWithImageLuma_NEON and overwrites by WriteOverlapLine8bpp_NEON.
+ kNoiseImagePadding = 7,
+ // Padding is added to the end of the |noise_stripes_| buffer to permit
+ // overreads by WriteOverlapLine8bpp_NEON.
+ kNoiseStripePadding = 7,
+}; // anonymous enum
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
diff --git a/src/dsp/intra_edge.cc b/src/dsp/intra_edge.cc
new file mode 100644
index 0000000..fe66db2
--- /dev/null
+++ b/src/dsp/intra_edge.cc
@@ -0,0 +1,115 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kKernelTaps = 5;
+constexpr int kKernels[3][kKernelTaps] = {
+ {0, 4, 8, 4, 0}, {0, 5, 6, 5, 0}, {2, 4, 4, 4, 2}};
+constexpr int kMaxUpsampleSize = 16;
+
+template <typename Pixel>
+void IntraEdgeFilter_C(void* buffer, int size, int strength) {
+ assert(strength > 0);
+ Pixel edge[129];
+ memcpy(edge, buffer, sizeof(edge[0]) * size);
+ auto* const dst_buffer = static_cast<Pixel*>(buffer);
+ const int kernel_index = strength - 1;
+ for (int i = 1; i < size; ++i) {
+ int sum = 0;
+ for (int j = 0; j < kKernelTaps; ++j) {
+ const int k = Clip3(i + j - 2, 0, size - 1);
+ sum += kKernels[kernel_index][j] * edge[k];
+ }
+ dst_buffer[i] = RightShiftWithRounding(sum, 4);
+ }
+}
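+
+// Each filter kernel above sums to 16, so the shift by 4 in IntraEdgeFilter_C
+// keeps the filtered edge in the original pixel range; strength 1
+// ({0, 4, 8, 4, 0}) reduces to a [1 2 1] / 4 smoothing, while strength 3
+// ({2, 4, 4, 4, 2}) spreads weight across all five taps.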
+
+template <int bitdepth, typename Pixel>
+void IntraEdgeUpsampler_C(void* buffer, int size) {
+ assert(size % 4 == 0 && size <= kMaxUpsampleSize);
+ auto* const pixel_buffer = static_cast<Pixel*>(buffer);
+ Pixel temp[kMaxUpsampleSize + 3];
+ temp[0] = temp[1] = pixel_buffer[-1];
+ memcpy(temp + 2, pixel_buffer, sizeof(temp[0]) * size);
+ temp[size + 2] = pixel_buffer[size - 1];
+
+ pixel_buffer[-2] = temp[0];
+ for (int i = 0; i < size; ++i) {
+ const int sum =
+ -temp[i] + (9 * temp[i + 1]) + (9 * temp[i + 2]) - temp[i + 3];
+ pixel_buffer[2 * i - 1] =
+ Clip3(RightShiftWithRounding(sum, 4), 0, (1 << bitdepth) - 1);
+ pixel_buffer[2 * i] = temp[i + 2];
+ }
+}
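+
+// The upsampler above doubles the edge resolution with the 4-tap
+// (-1, 9, 9, -1) / 16 kernel: original samples are kept at even positions and
+// each new sample between them is interpolated from its four nearest
+// neighbors. For example, neighbors 10, 10, 20, 30 produce
+// (-10 + 90 + 180 - 30 + 8) >> 4 = 14, clipped to the pixel range.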
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->intra_edge_filter = IntraEdgeFilter_C<uint8_t>;
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<8, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeFilter
+ dsp->intra_edge_filter = IntraEdgeFilter_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeUpsampler
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>;
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<10, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_IntraEdgeFilter
+ dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_IntraEdgeUpsampler
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+} // namespace
+
+void IntraEdgeInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/intra_edge.h b/src/dsp/intra_edge.h
new file mode 100644
index 0000000..172ecbb
--- /dev/null
+++ b/src/dsp/intra_edge.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRA_EDGE_H_
+#define LIBGAV1_SRC_DSP_INTRA_EDGE_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intra_edge_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc.
+// The order of the includes is important, as each one tests for a superior
+// version before setting the base.
+// clang-format off
+#include "src/dsp/x86/intra_edge_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_edge_filter and Dsp::intra_edge_upsampler. This
+// function is not thread-safe.
+void IntraEdgeInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_INTRA_EDGE_H_
diff --git a/src/dsp/intrapred.cc b/src/dsp/intrapred.cc
new file mode 100644
index 0000000..4bcb580
--- /dev/null
+++ b/src/dsp/intrapred.cc
@@ -0,0 +1,2911 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring> // memset
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr TransformSize kTransformSizesLargerThan32x32[] = {
+ kTransformSize16x64, kTransformSize32x64, kTransformSize64x16,
+ kTransformSize64x32, kTransformSize64x64};
+
+template <int block_width, int block_height, typename Pixel>
+struct IntraPredFuncs_C {
+ IntraPredFuncs_C() = delete;
+
+ static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Vertical(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Horizontal(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Paeth(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Smooth(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void SmoothVertical(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void SmoothHorizontal(void* dest, ptrdiff_t stride,
+ const void* top_row, const void* left_column);
+};
+
+// Intra-predictors that require bitdepth.
+template <int block_width, int block_height, int bitdepth, typename Pixel>
+struct IntraPredBppFuncs_C {
+ IntraPredBppFuncs_C() = delete;
+
+ static void DcFill(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+};
+
+//------------------------------------------------------------------------------
+// IntraPredFuncs_C::DcPred
+
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::DcTop(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* /*left_column*/) {
+ int sum = block_width >> 1; // rounder
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ for (int x = 0; x < block_width; ++x) sum += top[x];
+ const int dc = sum >> FloorLog2(block_width);
+
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int y = 0; y < block_height; ++y) {
+ Memset(dst, dc, block_width);
+ dst += stride;
+ }
+}
+
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::DcLeft(
+ void* const dest, ptrdiff_t stride, const void* /*top_row*/,
+ const void* const left_column) {
+ int sum = block_height >> 1; // rounder
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ for (int y = 0; y < block_height; ++y) sum += left[y];
+ const int dc = sum >> FloorLog2(block_height);
+
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int y = 0; y < block_height; ++y) {
+ Memset(dst, dc, block_width);
+ dst += stride;
+ }
+}
+
+// Note for square blocks the divide in the Dc() function reduces to a shift.
+// For rectangular block sizes the following multipliers can be used with the
+// corresponding shifts.
+// 8-bit
+//  1:2 (e.g., 4x8): scale = 0x5556
+// 1:4 (e.g., 4x16): scale = 0x3334
+// final_descale = 16
+// 10/12-bit
+// 1:2: scale = 0xaaab
+// 1:4: scale = 0x6667
+// final_descale = 17
+// Note these may be halved to the values used in 8-bit in all cases except
+// when bitdepth == 12 and block_width + block_height is divisible by 5 (as
+// opposed to 3).
+//
+// The calculation becomes:
+// (dc_sum >> intermediate_descale) * scale) >> final_descale
+// where intermediate_descale is:
+// sum = block_width + block_height
+// intermediate_descale =
+// (sum <= 20) ? 2 : (sum <= 40) ? 3 : (sum <= 80) ? 4 : 5
+//
+// The constants (multiplier and shifts) for a given block size are obtained
+// as follows:
+// - Let sum = block width + block height
+// - Shift 'sum' right until we reach an odd number
+// - Let the number of shifts for that block size be called
+//   'intermediate_descale' and let the odd number be 'd' (d has only 2
+//   possible values: d = 3 for a 1:2 rectangular block and d = 5 for a 1:4
+//   rectangular block).
+// - Find multipliers by dividing by 'd' using "Algorithm 1" in:
+// http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632
+// by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd
+// shift will be 16, regardless of the block size.
+// TODO(jzern): the base implementation could be updated to use this method.
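+//
+// Worked example for an 8-bit 4x8 block: sum = 12 becomes the odd number 3
+// after 2 shifts, so intermediate_descale = 2, d = 3 and scale = 0x5556
+// (roughly 2^16 / 3). The division then becomes
+//   dc = ((dc_sum >> 2) * 0x5556) >> 16
+// which reproduces dc_sum / 12 for every sum that can occur at this bitdepth.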
+
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Dc(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* const left_column) {
+ const int divisor = block_width + block_height;
+ int sum = divisor >> 1; // rounder
+
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ for (int x = 0; x < block_width; ++x) sum += top[x];
+ for (int y = 0; y < block_height; ++y) sum += left[y];
+
+ const int dc = sum / divisor;
+
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int y = 0; y < block_height; ++y) {
+ Memset(dst, dc, block_width);
+ dst += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// IntraPredFuncs_C directional predictors
+
+// IntraPredFuncs_C::Vertical -- apply top row vertically
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Vertical(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* /*left_column*/) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < block_height; ++y) {
+ memcpy(dst, top_row, block_width * sizeof(Pixel));
+ dst += stride;
+ }
+}
+
+// IntraPredFuncs_C::Horizontal -- apply left column horizontally
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Horizontal(
+ void* const dest, ptrdiff_t stride, const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int y = 0; y < block_height; ++y) {
+ Memset(dst, left[y], block_width);
+ dst += stride;
+ }
+}
+
+template <typename Pixel>
+inline Pixel Average(Pixel a, Pixel b) {
+ return static_cast<Pixel>((a + b + 1) >> 1);
+}
+
+template <typename Pixel>
+inline Pixel Average(Pixel a, Pixel b, Pixel c) {
+ return static_cast<Pixel>((a + 2 * b + c + 2) >> 2);
+}
+
+// IntraPredFuncs_C::Paeth
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Paeth(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ const Pixel top_left = top[-1];
+ const int top_left_x2 = top_left + top_left;
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ for (int y = 0; y < block_height; ++y) {
+ const int left_pixel = left[y];
+ for (int x = 0; x < block_width; ++x) {
+ // The Paeth filter selects the value closest to:
+ // top[x] + left[y] - top_left
+ // To calculate the absolute distance for the left value this would be:
+ // abs((top[x] + left[y] - top_left) - left[y])
+ // or, because left[y] cancels out:
+ // abs(top[x] - top_left)
+ const int left_dist = std::abs(top[x] - top_left);
+ const int top_dist = std::abs(left_pixel - top_left);
+ const int top_left_dist = std::abs(top[x] + left_pixel - top_left_x2);
+
+ // Select the closest value to the initial estimate of 'T + L - TL'.
+ if (left_dist <= top_dist && left_dist <= top_left_dist) {
+ dst[x] = left_pixel;
+ } else if (top_dist <= top_left_dist) {
+ dst[x] = top[x];
+ } else {
+ dst[x] = top_left;
+ }
+ }
+ dst += stride;
+ }
+}
+
+constexpr uint8_t kSmoothWeights[] = {
+ // block dimension = 4
+ 255, 149, 85, 64,
+ // block dimension = 8
+ 255, 197, 146, 105, 73, 50, 37, 32,
+ // block dimension = 16
+ 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+ // block dimension = 32
+ 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+ 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+ // block dimension = 64
+ 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+ 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
+ 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
+ 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4};
+
+// IntraPredFuncs_C::Smooth
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Smooth(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ const Pixel top_right = top[block_width - 1];
+ const Pixel bottom_left = left[block_height - 1];
+ static_assert(
+ block_width >= 4 && block_height >= 4,
+ "Weights for smooth predictor undefined for block width/height < 4");
+ const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
+ const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
+ const uint16_t scale_value = (1 << kSmoothWeightScale);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ assert(scale_value >= weights_y[y] && scale_value >= weights_x[x]);
+ uint32_t pred = weights_y[y] * top[x];
+ pred += weights_x[x] * left[y];
+ pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
+ pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
+ // The maximum value of pred with the rounder is 2^9 * (2^bitdepth - 1)
+ // + 256. With the descale there's no need for saturation.
+ dst[x] = static_cast<Pixel>(
+ RightShiftWithRounding(pred, kSmoothWeightScale + 1));
+ }
+ dst += stride;
+ }
+}
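+
+// In Smooth() the four weights applied to top[x], left[y], bottom_left and
+// top_right always sum to 2 << kSmoothWeightScale, so the shift by
+// kSmoothWeightScale + 1 returns the prediction to the pixel range. Assuming
+// kSmoothWeightScale is 8 (consistent with the 2^9 bound noted above), the
+// top-left pixel of a 4x4 block weights top[0] and left[0] by 255 each and
+// bottom_left and top_right by 1 each, out of a total of 512.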
+
+// IntraPredFuncs_C::SmoothVertical
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::SmoothVertical(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ const Pixel bottom_left = left[block_height - 1];
+ static_assert(block_height >= 4,
+ "Weights for smooth predictor undefined for block height < 4");
+ const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
+ const uint16_t scale_value = (1 << kSmoothWeightScale);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ assert(scale_value >= weights_y[y]);
+ uint32_t pred = weights_y[y] * top[x];
+ pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
+ dst[x] =
+ static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
+ }
+ dst += stride;
+ }
+}
+
+// IntraPredFuncs_C::SmoothHorizontal
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::SmoothHorizontal(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ const Pixel top_right = top[block_width - 1];
+ static_assert(block_width >= 4,
+ "Weights for smooth predictor undefined for block width < 4");
+ const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
+ const uint16_t scale_value = (1 << kSmoothWeightScale);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ assert(scale_value >= weights_x[x]);
+ uint32_t pred = weights_x[x] * left[y];
+ pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
+ dst[x] =
+ static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
+ }
+ dst += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// IntraPredBppFuncs_C
+template <int fill, typename Pixel>
+inline void DcFill_C(void* const dest, ptrdiff_t stride, const int block_width,
+ const int block_height) {
+ static_assert(sizeof(Pixel) == 1 || sizeof(Pixel) == 2,
+ "Only 1 & 2 byte pixels are supported");
+
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int y = 0; y < block_height; ++y) {
+ Memset(dst, fill, block_width);
+ dst += stride;
+ }
+}
+
+template <int block_width, int block_height, int bitdepth, typename Pixel>
+void IntraPredBppFuncs_C<block_width, block_height, bitdepth, Pixel>::DcFill(
+ void* const dest, ptrdiff_t stride, const void* /*top_row*/,
+ const void* /*left_column*/) {
+ DcFill_C<0x80 << (bitdepth - 8), Pixel>(dest, stride, block_width,
+ block_height);
+}
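+
+// DcFill writes the mid-range value for the bit depth, 0x80 << (bitdepth - 8):
+// 128 at 8bpp and 512 at 10bpp, filled one row at a time with Memset().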
+
+//------------------------------------------------------------------------------
+// FilterIntraPredictor_C
+
+template <int bitdepth, typename Pixel>
+void FilterIntraPredictor_C(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column,
+ const FilterIntraPredictor pred, const int width,
+ const int height) {
+ const int kMaxPixel = (1 << bitdepth) - 1;
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+
+ assert(width <= 32 && height <= 32);
+
+ Pixel buffer[3][33]; // Caches the two output rows plus the top/left edges.
+ memcpy(buffer[0], &top[-1], (width + 1) * sizeof(top[0]));
+
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ int row0 = 0, row2 = 2;
+ int ystep = 1;
+ int y = 0;
+ do {
+ buffer[1][0] = left[y];
+ buffer[row2][0] = left[y + 1];
+ int x = 1;
+ do {
+ const Pixel p0 = buffer[row0][x - 1]; // top-left
+ const Pixel p1 = buffer[row0][x + 0]; // top 0
+ const Pixel p2 = buffer[row0][x + 1]; // top 1
+ const Pixel p3 = buffer[row0][x + 2]; // top 2
+ const Pixel p4 = buffer[row0][x + 3]; // top 3
+ const Pixel p5 = buffer[1][x - 1]; // left 0
+ const Pixel p6 = buffer[row2][x - 1]; // left 1
+ for (int i = 0; i < 8; ++i) {
+ const int xoffset = i & 0x03;
+ const int yoffset = (i >> 2) * ystep;
+ const int value = kFilterIntraTaps[pred][i][0] * p0 +
+ kFilterIntraTaps[pred][i][1] * p1 +
+ kFilterIntraTaps[pred][i][2] * p2 +
+ kFilterIntraTaps[pred][i][3] * p3 +
+ kFilterIntraTaps[pred][i][4] * p4 +
+ kFilterIntraTaps[pred][i][5] * p5 +
+ kFilterIntraTaps[pred][i][6] * p6;
+ buffer[1 + yoffset][x + xoffset] = static_cast<Pixel>(
+ Clip3(RightShiftWithRounding(value, 4), 0, kMaxPixel));
+ }
+ x += 4;
+ } while (x < width);
+ memcpy(dst, &buffer[1][1], width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, &buffer[row2][1], width * sizeof(dst[0]));
+ dst += stride;
+
+ // The second output row becomes the top row for the next pass.
+ row0 ^= 2;
+ row2 ^= 2;
+ ystep = -ystep;
+ y += 2;
+ } while (y < height);
+}
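+
+// Flow of the recursive filter above: each inner step produces a 4x2 patch
+// from seven neighbors (p0..p6) using the eight tap sets in
+// kFilterIntraTaps[pred]. The three-row |buffer| works as a ping-pong: once a
+// pair of rows has been copied out, row0/row2 (and the sign of ystep) are
+// toggled so the lower of the two computed rows becomes the top reference for
+// the next pair.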
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor_C
+
+// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive.
+// |alpha| can be -16 to 16 (inclusive).
+template <int block_width, int block_height, int bitdepth, typename Pixel>
+void CflIntraPredictor_C(
+ void* const dest, ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<Pixel*>(dest);
+ const int dc = dst[0];
+ stride /= sizeof(Pixel);
+ const int max_value = (1 << bitdepth) - 1;
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ assert(luma[y][x] >= -(((1 << bitdepth) - 1) << 3));
+ assert(luma[y][x] <= ((1 << bitdepth) - 1) << 3);
+ dst[x] = Clip3(dc + RightShiftWithRoundingSigned(alpha * luma[y][x], 6),
+ 0, max_value);
+ }
+ dst += stride;
+ }
+}
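+
+// A worked example of the CfL blend above with hypothetical 8bpp values:
+// dc = dst[0] = 100, alpha = 8 and luma[y][x] = 96 give
+// RightShiftWithRoundingSigned(8 * 96, 6) == 12, so
+// dst[x] = Clip3(112, 0, 255) == 112. A negative |alpha| or |luma| pulls the
+// result below the DC value instead.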
+
+//------------------------------------------------------------------------------
+// CflSubsampler_C
+
+template <int block_width, int block_height, int bitdepth, typename Pixel,
+ int subsampling_x, int subsampling_y>
+void CflSubsampler_C(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const auto* src = static_cast<const Pixel*>(source);
+ stride /= sizeof(Pixel);
+ int sum = 0;
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ const ptrdiff_t luma_x =
+ std::min(x << subsampling_x, max_luma_width - (1 << subsampling_x));
+ const ptrdiff_t luma_x_next = luma_x + stride;
+ luma[y][x] =
+ (src[luma_x] + ((subsampling_x != 0) ? src[luma_x + 1] : 0) +
+ ((subsampling_y != 0) ? (src[luma_x_next] + src[luma_x_next + 1])
+ : 0))
+ << (3 - subsampling_x - subsampling_y);
+ sum += luma[y][x];
+ }
+ if ((y << subsampling_y) < (max_luma_height - (1 << subsampling_y))) {
+ src += stride << subsampling_y;
+ }
+ }
+ const int average = RightShiftWithRounding(
+ sum, FloorLog2(block_width) + FloorLog2(block_height));
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ luma[y][x] -= average;
+ }
+ }
+}
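+
+// Note on the scaling above: each entry is shifted left by
+// (3 - subsampling_x - subsampling_y), so 4:2:0 (4 source pixels, shift 1),
+// 4:2:2 (2 pixels, shift 2) and 4:4:4 (1 pixel, shift 3) all end up on the
+// same "luma << 3" scale before the block average is subtracted.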
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone1_C(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const int width, const int height,
+ const int xstep,
+ const bool upsampled_top) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ assert(xstep > 0);
+
+ // If xstep == 64 then |shift| always evaluates to 0 which sets |val| to
+ // |top[top_base_x]|. This corresponds to a 45 degree prediction.
+ if (xstep == 64) {
+ // 7.11.2.10. Intra edge upsample selection process
+ // if ( d <= 0 || d >= 40 ) useUpsample = 0
+ // For |upsampled_top| the delta is |predictor_angle - 90|. Since the
+ // |predictor_angle| is 45 the delta is also 45.
+ assert(!upsampled_top);
+ const Pixel* top_ptr = top + 1;
+ for (int y = 0; y < height; ++y, dst += stride, ++top_ptr) {
+ memcpy(dst, top_ptr, sizeof(*top_ptr) * width);
+ }
+ return;
+ }
+
+ const int upsample_shift = static_cast<int>(upsampled_top);
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+ const int scale_bits = 6 - upsample_shift;
+ const int base_step = 1 << upsample_shift;
+ int top_x = xstep;
+ int y = 0;
+ do {
+ int top_base_x = top_x >> scale_bits;
+
+ if (top_base_x >= max_base_x) {
+ for (int i = y; i < height; ++i) {
+ Memset(dst, top[max_base_x], width);
+ dst += stride;
+ }
+ return;
+ }
+
+ const int shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+ int x = 0;
+ do {
+ if (top_base_x >= max_base_x) {
+ Memset(dst + x, top[max_base_x], width - x);
+ break;
+ }
+
+ const int val =
+ top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
+ dst[x] = RightShiftWithRounding(val, 5);
+ top_base_x += base_step;
+ } while (++x < width);
+
+ dst += stride;
+ top_x += xstep;
+ } while (++y < height);
+}
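+
+// Fixed-point walk in Zone1 above, assuming no upsampling and a hypothetical
+// xstep of 80: for row y, top_x = 80 * (y + 1), top_base_x = top_x >> 6, and
+// shift = (top_x & 0x3F) >> 1 is a 0..31 weight used to interpolate between
+// top[top_base_x] and top[top_base_x + 1] before the >> 5 descale.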
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone2_C(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep,
+ const bool upsampled_top,
+ const bool upsampled_left) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ assert(xstep > 0);
+ assert(ystep > 0);
+
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int scale_bits_x = 6 - upsample_top_shift;
+ const int scale_bits_y = 6 - upsample_left_shift;
+ const int min_base_x = -(1 << upsample_top_shift);
+ const int base_step_x = 1 << upsample_top_shift;
+ int y = 0;
+ int top_x = -xstep;
+ do {
+ int top_base_x = top_x >> scale_bits_x;
+ int left_y = (y << 6) - ystep;
+ int x = 0;
+ do {
+ int val;
+ if (top_base_x >= min_base_x) {
+ const int shift = ((top_x * (1 << upsample_top_shift)) & 0x3F) >> 1;
+ val = top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
+ } else {
+ // Note this assumes an arithmetic shift to handle negative values.
+ const int left_base_y = left_y >> scale_bits_y;
+ const int shift = ((left_y * (1 << upsample_left_shift)) & 0x3F) >> 1;
+ assert(left_base_y >= -(1 << upsample_left_shift));
+ val = left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
+ }
+ dst[x] = RightShiftWithRounding(val, 5);
+ top_base_x += base_step_x;
+ left_y -= ystep;
+ } while (++x < width);
+
+ top_x -= xstep;
+ dst += stride;
+ } while (++y < height);
+}
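+
+// Zone2 above selects the reference per pixel: while the projected column
+// top_base_x is still >= min_base_x the top row is interpolated; once it
+// falls below, the same 2-tap scheme is applied down the left column via
+// left_y / left_base_y, which is why an arithmetic right shift is assumed for
+// the negative values.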
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone3_C(void* const dest, ptrdiff_t stride,
+ const void* const left_column,
+ const int width, const int height,
+ const int ystep,
+ const bool upsampled_left) {
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ stride /= sizeof(Pixel);
+
+ assert(ystep > 0);
+
+ const int upsample_shift = static_cast<int>(upsampled_left);
+ const int scale_bits = 6 - upsample_shift;
+ const int base_step = 1 << upsample_shift;
+ // Zone3 never runs out of left_column values.
+ assert((width + height - 1) << upsample_shift > // max_base_y
+ ((ystep * width) >> scale_bits) +
+ base_step * (height - 1)); // left_base_y
+
+ int left_y = ystep;
+ int x = 0;
+ do {
+ auto* dst = static_cast<Pixel*>(dest);
+
+ int left_base_y = left_y >> scale_bits;
+ int y = 0;
+ do {
+ const int shift = ((left_y << upsample_shift) & 0x3F) >> 1;
+ const int val =
+ left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
+ dst[x] = RightShiftWithRounding(val, 5);
+ dst += stride;
+ left_base_y += base_step;
+ } while (++y < height);
+
+ left_y += ystep;
+ } while (++x < width);
+}
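+
+// Zone3 above walks column-major: the outer loop fixes x, and each step down
+// a column advances left_base_y by base_step and blends left[left_base_y]
+// with left[left_base_y + 1]. The assert before the loops guarantees the left
+// column never runs out, so no clamping path is needed (unlike Zone1).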
+
+//------------------------------------------------------------------------------
+
+template <typename Pixel>
+struct IntraPredDefs {
+ IntraPredDefs() = delete;
+
+ using _4x4 = IntraPredFuncs_C<4, 4, Pixel>;
+ using _4x8 = IntraPredFuncs_C<4, 8, Pixel>;
+ using _4x16 = IntraPredFuncs_C<4, 16, Pixel>;
+ using _8x4 = IntraPredFuncs_C<8, 4, Pixel>;
+ using _8x8 = IntraPredFuncs_C<8, 8, Pixel>;
+ using _8x16 = IntraPredFuncs_C<8, 16, Pixel>;
+ using _8x32 = IntraPredFuncs_C<8, 32, Pixel>;
+ using _16x4 = IntraPredFuncs_C<16, 4, Pixel>;
+ using _16x8 = IntraPredFuncs_C<16, 8, Pixel>;
+ using _16x16 = IntraPredFuncs_C<16, 16, Pixel>;
+ using _16x32 = IntraPredFuncs_C<16, 32, Pixel>;
+ using _16x64 = IntraPredFuncs_C<16, 64, Pixel>;
+ using _32x8 = IntraPredFuncs_C<32, 8, Pixel>;
+ using _32x16 = IntraPredFuncs_C<32, 16, Pixel>;
+ using _32x32 = IntraPredFuncs_C<32, 32, Pixel>;
+ using _32x64 = IntraPredFuncs_C<32, 64, Pixel>;
+ using _64x16 = IntraPredFuncs_C<64, 16, Pixel>;
+ using _64x32 = IntraPredFuncs_C<64, 32, Pixel>;
+ using _64x64 = IntraPredFuncs_C<64, 64, Pixel>;
+};
+
+template <int bitdepth, typename Pixel>
+struct IntraPredBppDefs {
+ IntraPredBppDefs() = delete;
+
+ using _4x4 = IntraPredBppFuncs_C<4, 4, bitdepth, Pixel>;
+ using _4x8 = IntraPredBppFuncs_C<4, 8, bitdepth, Pixel>;
+ using _4x16 = IntraPredBppFuncs_C<4, 16, bitdepth, Pixel>;
+ using _8x4 = IntraPredBppFuncs_C<8, 4, bitdepth, Pixel>;
+ using _8x8 = IntraPredBppFuncs_C<8, 8, bitdepth, Pixel>;
+ using _8x16 = IntraPredBppFuncs_C<8, 16, bitdepth, Pixel>;
+ using _8x32 = IntraPredBppFuncs_C<8, 32, bitdepth, Pixel>;
+ using _16x4 = IntraPredBppFuncs_C<16, 4, bitdepth, Pixel>;
+ using _16x8 = IntraPredBppFuncs_C<16, 8, bitdepth, Pixel>;
+ using _16x16 = IntraPredBppFuncs_C<16, 16, bitdepth, Pixel>;
+ using _16x32 = IntraPredBppFuncs_C<16, 32, bitdepth, Pixel>;
+ using _16x64 = IntraPredBppFuncs_C<16, 64, bitdepth, Pixel>;
+ using _32x8 = IntraPredBppFuncs_C<32, 8, bitdepth, Pixel>;
+ using _32x16 = IntraPredBppFuncs_C<32, 16, bitdepth, Pixel>;
+ using _32x32 = IntraPredBppFuncs_C<32, 32, bitdepth, Pixel>;
+ using _32x64 = IntraPredBppFuncs_C<32, 64, bitdepth, Pixel>;
+ using _64x16 = IntraPredBppFuncs_C<64, 16, bitdepth, Pixel>;
+ using _64x32 = IntraPredBppFuncs_C<64, 32, bitdepth, Pixel>;
+ using _64x64 = IntraPredBppFuncs_C<64, 64, bitdepth, Pixel>;
+};
+
+using Defs = IntraPredDefs<uint8_t>;
+using Defs8bpp = IntraPredBppDefs<8, uint8_t>;
+
+// Initializes dsp entries for kTransformSize|W|x|H| from |DEFS|/|DEFSBPP| of
+// the same size.
+#define INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, W, H) \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDcFill] = \
+ DEFSBPP::_##W##x##H::DcFill; \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDcTop] = \
+ DEFS::_##W##x##H::DcTop; \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDcLeft] = \
+ DEFS::_##W##x##H::DcLeft; \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDc] = \
+ DEFS::_##W##x##H::Dc; \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorVertical] = \
+ DEFS::_##W##x##H::Vertical; \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorHorizontal] = \
+ DEFS::_##W##x##H::Horizontal; \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorPaeth] = \
+ DEFS::_##W##x##H::Paeth; \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorSmooth] = \
+ DEFS::_##W##x##H::Smooth; \
+ dsp->intra_predictors[kTransformSize##W##x##H] \
+ [kIntraPredictorSmoothVertical] = \
+ DEFS::_##W##x##H::SmoothVertical; \
+ dsp->intra_predictors[kTransformSize##W##x##H] \
+ [kIntraPredictorSmoothHorizontal] = \
+ DEFS::_##W##x##H::SmoothHorizontal
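+
+// For reference, INIT_INTRAPREDICTORS_WxH(Defs, Defs8bpp, 4, 4) expands to
+// assignments of the form
+//   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+//       Defs::_4x4::Dc;
+// with DcFill taken from Defs8bpp, the only bitdepth-dependent entry.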
+
+#define INIT_INTRAPREDICTORS(DEFS, DEFSBPP) \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 4); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 8); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 16); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 4); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 8); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 16); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 32); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 4); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 8); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 16); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 32); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 64); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 8); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 16); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 32); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 64); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 16); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 32); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 64)
+
+#define INIT_CFL_INTRAPREDICTOR_WxH(W, H, BITDEPTH, PIXEL) \
+ dsp->cfl_intra_predictors[kTransformSize##W##x##H] = \
+ CflIntraPredictor_C<W, H, BITDEPTH, PIXEL>; \
+ dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType444] = \
+ CflSubsampler_C<W, H, BITDEPTH, PIXEL, 0, 0>; \
+ dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType422] = \
+ CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 0>; \
+ dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType420] = \
+ CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 1>
+
+#define INIT_CFL_INTRAPREDICTORS(BITDEPTH, PIXEL) \
+ INIT_CFL_INTRAPREDICTOR_WxH(4, 4, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(4, 8, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(4, 16, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(8, 4, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(8, 8, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(8, 16, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(8, 32, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(16, 4, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(16, 8, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(16, 16, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(16, 32, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(32, 8, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(32, 16, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(32, 32, BITDEPTH, PIXEL)
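+
+// INIT_CFL_INTRAPREDICTORS stops at 32x32 because CfL is only defined for
+// transform sizes whose larger dimension is <= 32; the remaining entries are
+// set to nullptr at the end of Init8bpp() below.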
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_INTRAPREDICTORS(Defs, Defs8bpp);
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint8_t>;
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint8_t>;
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint8_t>;
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>;
+ INIT_CFL_INTRAPREDICTORS(8, uint8_t);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
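+// In this configuration a C function is registered only when the matching
+// LIBGAV1_Dsp8bpp_* macro is not defined; a defined macro indicates that a
+// specialized (typically SIMD) version will be installed instead, so the C
+// fallback can be left out of the binary.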
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
+ Defs8bpp::_4x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ Defs::_4x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ Defs::_4x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] = Defs::_4x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+ Defs::_4x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+ Defs::_4x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+ Defs::_4x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+ Defs::_4x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+ Defs::_4x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+ Defs::_4x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] =
+ Defs8bpp::_4x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+ Defs::_4x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+ Defs::_4x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] = Defs::_4x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+ Defs::_4x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ Defs::_4x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+ Defs::_4x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+ Defs::_4x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+ Defs::_4x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+ Defs::_4x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] =
+ Defs8bpp::_4x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+ Defs::_4x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+ Defs::_4x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+ Defs::_4x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
+ Defs::_4x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ Defs::_4x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+ Defs::_4x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+ Defs::_4x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+ Defs::_4x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_4x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] =
+ Defs8bpp::_8x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+ Defs::_8x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+ Defs::_8x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] = Defs::_8x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
+ Defs::_8x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+ Defs::_8x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+ Defs::_8x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+ Defs::_8x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+ Defs::_8x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+ Defs::_8x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] =
+ Defs8bpp::_8x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+ Defs::_8x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+ Defs::_8x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] = Defs::_8x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
+ Defs::_8x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ Defs::_8x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+ Defs::_8x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+ Defs::_8x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+ Defs::_8x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+ Defs::_8x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] =
+ Defs8bpp::_8x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+ Defs::_8x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+ Defs::_8x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+ Defs::_8x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
+ Defs::_8x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+ Defs::_8x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+ Defs::_8x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+ Defs::_8x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+ Defs::_8x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_8x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] =
+ Defs8bpp::_8x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+ Defs::_8x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+ Defs::_8x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+ Defs::_8x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
+ Defs::_8x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ Defs::_8x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+ Defs::_8x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+ Defs::_8x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+ Defs::_8x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+ Defs::_8x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] =
+ Defs8bpp::_16x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+ Defs::_16x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+ Defs::_16x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+ Defs::_16x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
+ Defs::_16x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+ Defs::_16x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+ Defs::_16x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+ Defs::_16x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+ Defs::_16x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] =
+ Defs8bpp::_16x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+ Defs::_16x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+ Defs::_16x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+ Defs::_16x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
+ Defs::_16x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ Defs::_16x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+ Defs::_16x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+ Defs::_16x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+ Defs::_16x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] =
+ Defs8bpp::_16x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+ Defs::_16x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+ Defs::_16x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+ Defs::_16x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
+ Defs::_16x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+ Defs::_16x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+ Defs::_16x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+ Defs::_16x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+ Defs::_16x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] =
+ Defs8bpp::_16x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+ Defs::_16x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+ Defs::_16x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+ Defs::_16x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
+ Defs::_16x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+ Defs::_16x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+ Defs::_16x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+ Defs::_16x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+ Defs::_16x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] =
+ Defs8bpp::_16x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+ Defs::_16x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+ Defs::_16x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+ Defs::_16x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
+ Defs::_16x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+ Defs::_16x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+ Defs::_16x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+ Defs::_16x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+ Defs::_16x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] =
+ Defs8bpp::_32x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+ Defs::_32x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+ Defs::_32x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+ Defs::_32x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
+ Defs::_32x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+ Defs::_32x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+ Defs::_32x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+ Defs::_32x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+ Defs::_32x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+ Defs::_32x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] =
+ Defs8bpp::_32x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+ Defs::_32x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+ Defs::_32x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+ Defs::_32x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
+ Defs::_32x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+ Defs::_32x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+ Defs::_32x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+ Defs::_32x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+ Defs::_32x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_32x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] =
+ Defs8bpp::_32x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+ Defs::_32x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+ Defs::_32x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+ Defs::_32x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
+ Defs::_32x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+ Defs::_32x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+ Defs::_32x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+ Defs::_32x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+ Defs::_32x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+ Defs::_32x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] =
+ Defs8bpp::_32x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+ Defs::_32x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+ Defs::_32x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+ Defs::_32x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
+ Defs::_32x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ Defs::_32x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+ Defs::_32x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+ Defs::_32x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+ Defs::_32x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+ Defs::_32x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] =
+ Defs8bpp::_64x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+ Defs::_64x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+ Defs::_64x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+ Defs::_64x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
+ Defs::_64x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+ Defs::_64x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+ Defs::_64x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+ Defs::_64x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+ Defs::_64x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_64x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] =
+ Defs8bpp::_64x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+ Defs::_64x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+ Defs::_64x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+ Defs::_64x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
+ Defs::_64x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+ Defs::_64x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+ Defs::_64x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+ Defs::_64x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+ Defs::_64x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+ Defs::_64x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] =
+ Defs8bpp::_64x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+ Defs::_64x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+ Defs::_64x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+ Defs::_64x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
+ Defs::_64x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+ Defs::_64x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+ Defs::_64x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+ Defs::_64x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+ Defs::_64x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+ Defs::_64x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint8_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x4] =
+ CflIntraPredictor_C<4, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler_C<4, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
+ CflSubsampler_C<4, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler_C<4, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x8] =
+ CflIntraPredictor_C<4, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler_C<4, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
+ CflSubsampler_C<4, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler_C<4, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x16] =
+ CflIntraPredictor_C<4, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler_C<4, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
+ CflSubsampler_C<4, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler_C<4, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x4] =
+ CflIntraPredictor_C<8, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler_C<8, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
+ CflSubsampler_C<8, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler_C<8, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x8] =
+ CflIntraPredictor_C<8, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler_C<8, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
+ CflSubsampler_C<8, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler_C<8, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x16] =
+ CflIntraPredictor_C<8, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler_C<8, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
+ CflSubsampler_C<8, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler_C<8, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x32] =
+ CflIntraPredictor_C<8, 32, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler_C<8, 32, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
+ CflSubsampler_C<8, 32, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler_C<8, 32, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x4] =
+ CflIntraPredictor_C<16, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler_C<16, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
+ CflSubsampler_C<16, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler_C<16, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x8] =
+ CflIntraPredictor_C<16, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler_C<16, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
+ CflSubsampler_C<16, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler_C<16, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor_C<16, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler_C<16, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
+ CflSubsampler_C<16, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler_C<16, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor_C<16, 32, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler_C<16, 32, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
+ CflSubsampler_C<16, 32, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler_C<16, 32, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x8] =
+ CflIntraPredictor_C<32, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler_C<32, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
+ CflSubsampler_C<32, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler_C<32, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor_C<32, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler_C<32, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
+ CflSubsampler_C<32, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler_C<32, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor_C<32, 32, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler_C<32, 32, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
+ CflSubsampler_C<32, 32, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler_C<32, 32, 8, uint8_t, 1, 1>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ // Cfl predictors are available only for transform sizes with max(width,
+ // height) <= 32. Set all others to nullptr.
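+  // (Editorial note, not part of the upstream comment: for AV1 these are the
+  // sizes with a 64-sample dimension, i.e. 16x64, 64x16, 32x64, 64x32 and
+  // 64x64.)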
+ for (const auto i : kTransformSizesLargerThan32x32) {
+ dsp->cfl_intra_predictors[i] = nullptr;
+ for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+ dsp->cfl_subsamplers[i][j] = nullptr;
+ }
+ }
+} // NOLINT(readability/fn_size)
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using DefsHbd = IntraPredDefs<uint16_t>;
+using Defs10bpp = IntraPredBppDefs<10, uint16_t>;
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_INTRAPREDICTORS(DefsHbd, Defs10bpp);
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint16_t>;
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint16_t>;
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint16_t>;
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>;
+ INIT_CFL_INTRAPREDICTORS(10, uint16_t);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
+ Defs10bpp::_4x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ DefsHbd::_4x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ DefsHbd::_4x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+ DefsHbd::_4x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+ DefsHbd::_4x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+ DefsHbd::_4x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+ DefsHbd::_4x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+ DefsHbd::_4x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+ DefsHbd::_4x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_4x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] =
+ Defs10bpp::_4x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+ DefsHbd::_4x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+ DefsHbd::_4x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+ DefsHbd::_4x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+ DefsHbd::_4x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ DefsHbd::_4x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+ DefsHbd::_4x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+ DefsHbd::_4x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_4x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_4x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] =
+ Defs10bpp::_4x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+ DefsHbd::_4x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+ DefsHbd::_4x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+ DefsHbd::_4x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
+ DefsHbd::_4x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ DefsHbd::_4x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+ DefsHbd::_4x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+ DefsHbd::_4x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_4x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_4x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] =
+ Defs10bpp::_8x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+ DefsHbd::_8x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+ DefsHbd::_8x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+ DefsHbd::_8x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
+ DefsHbd::_8x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+ DefsHbd::_8x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+ DefsHbd::_8x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+ DefsHbd::_8x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] =
+ Defs10bpp::_8x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+ DefsHbd::_8x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+ DefsHbd::_8x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+ DefsHbd::_8x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
+ DefsHbd::_8x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ DefsHbd::_8x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+ DefsHbd::_8x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+ DefsHbd::_8x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] =
+ Defs10bpp::_8x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+ DefsHbd::_8x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+ DefsHbd::_8x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+ DefsHbd::_8x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
+ DefsHbd::_8x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+ DefsHbd::_8x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+ DefsHbd::_8x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+ DefsHbd::_8x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] =
+ Defs10bpp::_8x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+ DefsHbd::_8x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+ DefsHbd::_8x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+ DefsHbd::_8x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
+ DefsHbd::_8x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ DefsHbd::_8x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+ DefsHbd::_8x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+ DefsHbd::_8x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] =
+ Defs10bpp::_16x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+ DefsHbd::_16x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+ DefsHbd::_16x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+ DefsHbd::_16x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
+ DefsHbd::_16x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+ DefsHbd::_16x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+ DefsHbd::_16x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+ DefsHbd::_16x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] =
+ Defs10bpp::_16x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+ DefsHbd::_16x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+ DefsHbd::_16x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+ DefsHbd::_16x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
+ DefsHbd::_16x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ DefsHbd::_16x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+ DefsHbd::_16x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+ DefsHbd::_16x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] =
+ Defs10bpp::_16x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+ DefsHbd::_16x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+ DefsHbd::_16x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+ DefsHbd::_16x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
+ DefsHbd::_16x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+ DefsHbd::_16x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+ DefsHbd::_16x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+ DefsHbd::_16x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] =
+ Defs10bpp::_16x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+ DefsHbd::_16x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+ DefsHbd::_16x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+ DefsHbd::_16x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
+ DefsHbd::_16x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+ DefsHbd::_16x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+ DefsHbd::_16x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+ DefsHbd::_16x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] =
+ Defs10bpp::_16x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+ DefsHbd::_16x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+ DefsHbd::_16x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+ DefsHbd::_16x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
+ DefsHbd::_16x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+ DefsHbd::_16x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+ DefsHbd::_16x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+ DefsHbd::_16x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] =
+ Defs10bpp::_32x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+ DefsHbd::_32x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+ DefsHbd::_32x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+ DefsHbd::_32x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
+ DefsHbd::_32x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+ DefsHbd::_32x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+ DefsHbd::_32x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+ DefsHbd::_32x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] =
+ Defs10bpp::_32x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+ DefsHbd::_32x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+ DefsHbd::_32x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+ DefsHbd::_32x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
+ DefsHbd::_32x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+ DefsHbd::_32x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+ DefsHbd::_32x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+ DefsHbd::_32x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] =
+ Defs10bpp::_32x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+ DefsHbd::_32x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+ DefsHbd::_32x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+ DefsHbd::_32x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
+ DefsHbd::_32x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+ DefsHbd::_32x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+ DefsHbd::_32x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+ DefsHbd::_32x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] =
+ Defs10bpp::_32x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+ DefsHbd::_32x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+ DefsHbd::_32x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+ DefsHbd::_32x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
+ DefsHbd::_32x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ DefsHbd::_32x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+ DefsHbd::_32x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+ DefsHbd::_32x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] =
+ Defs10bpp::_64x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+ DefsHbd::_64x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+ DefsHbd::_64x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+ DefsHbd::_64x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
+ DefsHbd::_64x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+ DefsHbd::_64x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+ DefsHbd::_64x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+ DefsHbd::_64x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_64x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_64x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] =
+ Defs10bpp::_64x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+ DefsHbd::_64x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+ DefsHbd::_64x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+ DefsHbd::_64x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
+ DefsHbd::_64x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+ DefsHbd::_64x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+ DefsHbd::_64x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+ DefsHbd::_64x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_64x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_64x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] =
+ Defs10bpp::_64x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+ DefsHbd::_64x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+ DefsHbd::_64x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+ DefsHbd::_64x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
+ DefsHbd::_64x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+ DefsHbd::_64x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+ DefsHbd::_64x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+ DefsHbd::_64x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+ DefsHbd::_64x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_64x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_FilterIntraPredictor
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x4] =
+ CflIntraPredictor_C<4, 4, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler_C<4, 4, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
+ CflSubsampler_C<4, 4, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler_C<4, 4, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x8] =
+ CflIntraPredictor_C<4, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler_C<4, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
+ CflSubsampler_C<4, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler_C<4, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x16] =
+ CflIntraPredictor_C<4, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler_C<4, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
+ CflSubsampler_C<4, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler_C<4, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x4] =
+ CflIntraPredictor_C<8, 4, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler_C<8, 4, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
+ CflSubsampler_C<8, 4, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler_C<8, 4, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x8] =
+ CflIntraPredictor_C<8, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler_C<8, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
+ CflSubsampler_C<8, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler_C<8, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x16] =
+ CflIntraPredictor_C<8, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler_C<8, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
+ CflSubsampler_C<8, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler_C<8, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x32] =
+ CflIntraPredictor_C<8, 32, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler_C<8, 32, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
+ CflSubsampler_C<8, 32, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler_C<8, 32, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x4] =
+ CflIntraPredictor_C<16, 4, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler_C<16, 4, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
+ CflSubsampler_C<16, 4, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler_C<16, 4, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x8] =
+ CflIntraPredictor_C<16, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler_C<16, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
+ CflSubsampler_C<16, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler_C<16, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor_C<16, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler_C<16, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
+ CflSubsampler_C<16, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler_C<16, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor_C<16, 32, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler_C<16, 32, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
+ CflSubsampler_C<16, 32, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler_C<16, 32, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x8] =
+ CflIntraPredictor_C<32, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler_C<32, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
+ CflSubsampler_C<32, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler_C<32, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor_C<32, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler_C<32, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
+ CflSubsampler_C<32, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler_C<32, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor_C<32, 32, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler_C<32, 32, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
+ CflSubsampler_C<32, 32, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler_C<32, 32, 10, uint16_t, 1, 1>;
+#endif
+
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ // Cfl predictors are available only for transform sizes with max(width,
+ // height) <= 32. Set all others to nullptr.
+ for (const auto i : kTransformSizesLargerThan32x32) {
+ dsp->cfl_intra_predictors[i] = nullptr;
+ for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+ dsp->cfl_subsamplers[i][j] = nullptr;
+ }
+ }
+} // NOLINT(readability/fn_size)
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#undef INIT_CFL_INTRAPREDICTOR_WxH
+#undef INIT_CFL_INTRAPREDICTORS
+#undef INIT_INTRAPREDICTORS_WxH
+#undef INIT_INTRAPREDICTORS
+
+} // namespace
+
+void IntraPredInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/intrapred.h b/src/dsp/intrapred.h
new file mode 100644
index 0000000..c5286ef
--- /dev/null
+++ b/src/dsp/intrapred.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important because each header tests for a superior
+// version before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
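+
+// Editorial sketch (not part of the upstream source): when one of the headers
+// above provides an optimized implementation, it defines the corresponding
+// LIBGAV1_DspXXX macro, and the C fallback in intrapred.cc is then skipped:
+//   #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc
+//   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] = ...;  // C
+//   #endif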
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*,
+// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and
+// Dsp::filter_intra_predictor. This function is not thread-safe.
+void IntraPredInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_INTRAPRED_H_
diff --git a/src/dsp/inverse_transform.cc b/src/dsp/inverse_transform.cc
new file mode 100644
index 0000000..a03fad2
--- /dev/null
+++ b/src/dsp/inverse_transform.cc
@@ -0,0 +1,1636 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+constexpr uint8_t kTransformColumnShift = 4;
+
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#undef LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+#endif
+
+int32_t RangeCheckValue(int32_t value, int8_t range) {
+#if defined(LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK) && \
+ LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+ assert(range <= 32);
+ const int32_t min = -(1 << (range - 1));
+ const int32_t max = (1 << (range - 1)) - 1;
+ if (min > value || value > max) {
+ LIBGAV1_DLOG(ERROR, "coeff out of bit range, value: %d bit range %d\n",
+ value, range);
+ assert(min <= value && value <= max);
+ }
+#endif // LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+ static_cast<void>(range);
+ return value;
+}
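+
+// Editorial example (not part of the upstream source): with |range| = 16 the
+// check above accepts values in [-32768, 32767]. It compiles to a plain
+// pass-through unless LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK is defined to a
+// nonzero value.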
+
+template <typename Residual>
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_C(Residual* const dst, int a,
+ int b, int angle, bool flip,
+ int8_t range) {
+ // Note that we multiply in 32 bits and then add/subtract the products in 64
+ // bits. The 32-bit multiplications do not overflow. Please see the comment
+ // and assert() in Cos128().
+ const int64_t x = static_cast<int64_t>(dst[a] * Cos128(angle)) -
+ static_cast<int64_t>(dst[b] * Sin128(angle));
+ const int64_t y = static_cast<int64_t>(dst[a] * Sin128(angle)) +
+ static_cast<int64_t>(dst[b] * Cos128(angle));
+ // Section 7.13.2.1: It is a requirement of bitstream conformance that the
+ // values saved into the array T by this function are representable by a
+ // signed integer using |range| bits of precision.
+ dst[a] = RangeCheckValue(RightShiftWithRounding(flip ? y : x, 12), range);
+ dst[b] = RangeCheckValue(RightShiftWithRounding(flip ? x : y, 12), range);
+}
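+
+// Editorial note (not part of the upstream source): the butterfly above is the
+// planar rotation
+//   x = a * cos(angle * pi / 128) - b * sin(angle * pi / 128)
+//   y = a * sin(angle * pi / 128) + b * cos(angle * pi / 128)
+// where Cos128()/Sin128() are assumed to return the trigonometric values
+// scaled by 2^12, hence the rounding right-shift by 12 afterwards.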
+
+template <typename Residual>
+void ButterflyRotationFirstIsZero_C(Residual* const dst, int a, int b,
+ int angle, bool flip, int8_t range) {
+ // Note that we multiply in 32 bits and then add/subtract the products in 64
+ // bits. The 32-bit multiplications do not overflow. Please see the comment
+ // and assert() in Cos128().
+ const auto x = static_cast<int64_t>(dst[b] * -Sin128(angle));
+ const auto y = static_cast<int64_t>(dst[b] * Cos128(angle));
+ // Section 7.13.2.1: It is a requirement of bitstream conformance that the
+ // values saved into the array T by this function are representable by a
+ // signed integer using |range| bits of precision.
+ dst[a] = RangeCheckValue(RightShiftWithRounding(flip ? y : x, 12), range);
+ dst[b] = RangeCheckValue(RightShiftWithRounding(flip ? x : y, 12), range);
+}
+
+template <typename Residual>
+void ButterflyRotationSecondIsZero_C(Residual* const dst, int a, int b,
+ int angle, bool flip, int8_t range) {
+ // Note that we multiply in 32 bits and then add/subtract the products in 64
+ // bits. The 32-bit multiplications do not overflow. Please see the comment
+ // and assert() in Cos128().
+ const auto x = static_cast<int64_t>(dst[a] * Cos128(angle));
+ const auto y = static_cast<int64_t>(dst[a] * Sin128(angle));
+
+ // Section 7.13.2.1: It is a requirement of bitstream conformance that the
+ // values saved into the array T by this function are representable by a
+ // signed integer using |range| bits of precision.
+ dst[a] = RangeCheckValue(RightShiftWithRounding(flip ? y : x, 12), range);
+ dst[b] = RangeCheckValue(RightShiftWithRounding(flip ? x : y, 12), range);
+}
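+
+// Editorial note (not part of the upstream source): the two specializations
+// above are the same rotation with dst[a] (respectively dst[b]) known to be
+// zero, which removes two of the four multiplications.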
+
+template <typename Residual>
+void HadamardRotation_C(Residual* const dst, int a, int b, bool flip,
+ int8_t range) {
+ if (flip) std::swap(a, b);
+ --range;
+ // For Adst and Dct, the maximum possible value for range is 20. So min and
+ // max should always fit into int32_t.
+ const int32_t min = -(1 << range);
+ const int32_t max = (1 << range) - 1;
+ const int32_t x = dst[a] + dst[b];
+ const int32_t y = dst[a] - dst[b];
+ dst[a] = Clip3(x, min, max);
+ dst[b] = Clip3(y, min, max);
+}
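+
+// Editorial example (not part of the upstream source): with |range| = 8 the
+// sum/difference pair dst[a] + dst[b] and dst[a] - dst[b] is clamped to
+// [-128, 127] before being written back.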
+
+template <int bitdepth, typename Residual>
+void ClampIntermediate(Residual* const dst, int size) {
+ // If Residual is int16_t (which implies bitdepth is 8), we don't need to
+ // clip residual[i][j] to 16 bits.
+ if (sizeof(Residual) > 2) {
+ const Residual intermediate_clamp_max =
+ (1 << (std::max(bitdepth + 6, 16) - 1)) - 1;
+ const Residual intermediate_clamp_min = -intermediate_clamp_max - 1;
+ for (int j = 0; j < size; ++j) {
+ dst[j] = Clip3(dst[j], intermediate_clamp_min, intermediate_clamp_max);
+ }
+ }
+}
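+
+// Editorial example (not part of the upstream source): for bitdepth 10 the
+// clamp above is to [-32768, 32767] (max(10 + 6, 16) = 16 bits); for bitdepth
+// 12 it widens to [-131072, 131071] (18 bits).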
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
+// The value for index (i, j) is computed by bit-reversing j and interpreting
+// the result as an integer with bit-length i + 2.
+// For example, index (2, 3) is computed as follows:
+// * bitreverse(3) = bitreverse(..000011) = 110000...
+// * interpreted as an integer with bit-length 2 + 2 = 4, this is 1100 = 12.
+constexpr uint8_t kBitReverseLookup[kNum1DTransformSizes][64] = {
+ {0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2,
+ 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3,
+ 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3},
+ {0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5,
+ 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6,
+ 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7},
+ {0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+ 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+ 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+ 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15},
+ {0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+ 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+ 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+ 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31},
+ {0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+ 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+ 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+ 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63}};
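+
+// A few compile-time spot checks of the bit-reversal rule described above
+// (illustrative only). For instance, entry (2, 3) reverses the 4 low bits of 3
+// (0011 -> 1100), giving 12.
+static_assert(kBitReverseLookup[0][3] == 3, "");
+static_assert(kBitReverseLookup[2][3] == 12, "");
+static_assert(kBitReverseLookup[3][1] == 16, "");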
+
+template <typename Residual, int size_log2>
+void Dct_C(void* dest, int8_t range) {
+ static_assert(size_log2 >= 2 && size_log2 <= 6, "");
+ auto* const dst = static_cast<Residual*>(dest);
+ // stage 1.
+ const int size = 1 << size_log2;
+ Residual temp[size];
+ memcpy(temp, dst, sizeof(temp));
+ for (int i = 0; i < size; ++i) {
+ dst[i] = temp[kBitReverseLookup[size_log2 - 2][i]];
+ }
+ // stages 2-32 are dependent on the value of size_log2.
+ // stage 2.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 16; ++i) {
+ ButterflyRotation_C(dst, i + 32, 63 - i,
+ 63 - MultiplyBy4(kBitReverseLookup[2][i]), false,
+ range);
+ }
+ }
+ // stage 3
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 8; ++i) {
+ ButterflyRotation_C(dst, i + 16, 31 - i,
+ 6 + MultiplyBy8(kBitReverseLookup[1][7 - i]), false,
+ range);
+ }
+ }
+ // stage 4.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 16; ++i) {
+ HadamardRotation_C(dst, MultiplyBy2(i) + 32, MultiplyBy2(i) + 33,
+ static_cast<bool>(i & 1), range);
+ }
+ }
+ // stage 5.
+ if (size_log2 >= 4) {
+ for (int i = 0; i < 4; ++i) {
+ ButterflyRotation_C(dst, i + 8, 15 - i,
+ 12 + MultiplyBy16(kBitReverseLookup[0][3 - i]), false,
+ range);
+ }
+ }
+ // stage 6.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 8; ++i) {
+ HadamardRotation_C(dst, MultiplyBy2(i) + 16, MultiplyBy2(i) + 17,
+ static_cast<bool>(i & 1), range);
+ }
+ }
+ // stage 7.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ ButterflyRotation_C(
+ dst, 62 - MultiplyBy4(i) - j, MultiplyBy4(i) + j + 33,
+ 60 - MultiplyBy16(kBitReverseLookup[0][i]) + MultiplyBy64(j), true,
+ range);
+ }
+ }
+ }
+ // stage 8.
+ if (size_log2 >= 3) {
+ for (int i = 0; i < 2; ++i) {
+ ButterflyRotation_C(dst, i + 4, 7 - i, 56 - 32 * i, false, range);
+ }
+ }
+ // stage 9.
+ if (size_log2 >= 4) {
+ for (int i = 0; i < 4; ++i) {
+ HadamardRotation_C(dst, MultiplyBy2(i) + 8, MultiplyBy2(i) + 9,
+ static_cast<bool>(i & 1), range);
+ }
+ }
+ // stage 10.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ ButterflyRotation_C(
+ dst, 30 - MultiplyBy4(i) - j, MultiplyBy4(i) + j + 17,
+ 24 + MultiplyBy64(j) + MultiplyBy32(1 - i), true, range);
+ }
+ }
+ }
+ // stage 11.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 8; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ HadamardRotation_C(dst, MultiplyBy4(i) + j + 32,
+ MultiplyBy4(i) - j + 35, static_cast<bool>(i & 1),
+ range);
+ }
+ }
+ }
+ // stage 12.
+ for (int i = 0; i < 2; ++i) {
+ ButterflyRotation_C(dst, MultiplyBy2(i), MultiplyBy2(i) + 1, 32 + 16 * i,
+ i == 0, range);
+ }
+ // stage 13.
+ if (size_log2 >= 3) {
+ for (int i = 0; i < 2; ++i) {
+ HadamardRotation_C(dst, MultiplyBy2(i) + 4, MultiplyBy2(i) + 5,
+ /*flip=*/i != 0, range);
+ }
+ }
+ // stage 14.
+ if (size_log2 >= 4) {
+ for (int i = 0; i < 2; ++i) {
+ ButterflyRotation_C(dst, 14 - i, i + 9, 48 + 64 * i, true, range);
+ }
+ }
+ // stage 15.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ HadamardRotation_C(dst, MultiplyBy4(i) + j + 16,
+ MultiplyBy4(i) - j + 19, static_cast<bool>(i & 1),
+ range);
+ }
+ }
+ }
+ // stage 16.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ ButterflyRotation_C(
+ dst, 61 - MultiplyBy8(i) - j, MultiplyBy8(i) + j + 34,
+ 56 - MultiplyBy32(i) + MultiplyBy64(DivideBy2(j)), true, range);
+ }
+ }
+ }
+ // stage 17.
+ for (int i = 0; i < 2; ++i) {
+ HadamardRotation_C(dst, i, 3 - i, false, range);
+ }
+ // stage 18.
+ if (size_log2 >= 3) {
+ ButterflyRotation_C(dst, 6, 5, 32, true, range);
+ }
+ // stage 19.
+ if (size_log2 >= 4) {
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ HadamardRotation_C(dst, MultiplyBy4(i) + j + 8, MultiplyBy4(i) - j + 11,
+ /*flip=*/i != 0, range);
+ }
+ }
+ }
+ // stage 20.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 4; ++i) {
+ ButterflyRotation_C(dst, 29 - i, i + 18, 48 + 64 * DivideBy2(i), true,
+ range);
+ }
+ }
+ // stage 21.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ HadamardRotation_C(dst, MultiplyBy8(i) + j + 32,
+ MultiplyBy8(i) - j + 39, static_cast<bool>(i & 1),
+ range);
+ }
+ }
+ }
+ // stage 22.
+ if (size_log2 >= 3) {
+ for (int i = 0; i < 4; ++i) {
+ HadamardRotation_C(dst, i, 7 - i, false, range);
+ }
+ }
+ // stage 23.
+ if (size_log2 >= 4) {
+ for (int i = 0; i < 2; ++i) {
+ ButterflyRotation_C(dst, 13 - i, i + 10, 32, true, range);
+ }
+ }
+ // stage 24.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ HadamardRotation_C(dst, MultiplyBy8(i) + j + 16,
+ MultiplyBy8(i) - j + 23, i == 1, range);
+ }
+ }
+ }
+ // stage 25.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 8; ++i) {
+ ButterflyRotation_C(dst, 59 - i, i + 36, (i < 4) ? 48 : 112, true, range);
+ }
+ }
+ // stage 26.
+ if (size_log2 >= 4) {
+ for (int i = 0; i < 8; ++i) {
+ HadamardRotation_C(dst, i, 15 - i, false, range);
+ }
+ }
+ // stage 27.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 4; ++i) {
+ ButterflyRotation_C(dst, 27 - i, i + 20, 32, true, range);
+ }
+ }
+ // stage 28.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 8; ++i) {
+ HadamardRotation_C(dst, i + 32, 47 - i, false, range);
+ HadamardRotation_C(dst, i + 48, 63 - i, true, range);
+ }
+ }
+ // stage 29.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 16; ++i) {
+ HadamardRotation_C(dst, i, 31 - i, false, range);
+ }
+ }
+ // stage 30.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 8; ++i) {
+ ButterflyRotation_C(dst, 55 - i, i + 40, 32, true, range);
+ }
+ }
+ // stage 31.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 32; ++i) {
+ HadamardRotation_C(dst, i, 63 - i, false, range);
+ }
+ }
+}
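+
+// Note on the guards above: for size_log2 == 2 (the 4-point DCT) only stage 1
+// (the bit-reverse permutation) and stages 12 and 17 run; each larger
+// size_log2 enables the additional conditional stages.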
+
+template <int bitdepth, typename Residual, int size_log2>
+void DctDcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+ bool is_row) {
+ auto* const dst = static_cast<Residual*>(dest);
+
+ if (is_row && should_round) {
+ dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+ }
+
+ ButterflyRotationSecondIsZero_C(dst, 0, 1, 32, true, range);
+
+ if (is_row && row_shift > 0) {
+ dst[0] = RightShiftWithRounding(dst[0], row_shift);
+ }
+
+ ClampIntermediate<bitdepth, Residual>(dst, 1);
+
+ const int size = 1 << size_log2;
+ for (int i = 1; i < size; ++i) {
+ dst[i] = dst[0];
+ }
+}
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
+
+/*
+ * Row transform max range in bits for bitdepths 8/10/12: 28/30/32.
+ * Column transform max range in bits for bitdepths 8/10/12: 28/28/30.
+ */
+template <typename Residual>
+void Adst4_C(void* dest, int8_t range) {
+ auto* const dst = static_cast<Residual*>(dest);
+ if ((dst[0] | dst[1] | dst[2] | dst[3]) == 0) {
+ return;
+ }
+
+ // stage 1.
+ // Section 7.13.2.6: It is a requirement of bitstream conformance that all
+ // values stored in the s and x arrays by this process are representable by
+ // a signed integer using range + 12 bits of precision.
+ int32_t s[7];
+ s[0] = RangeCheckValue(kAdst4Multiplier[0] * dst[0], range + 12);
+ s[1] = RangeCheckValue(kAdst4Multiplier[1] * dst[0], range + 12);
+ s[2] = RangeCheckValue(kAdst4Multiplier[2] * dst[1], range + 12);
+ s[3] = RangeCheckValue(kAdst4Multiplier[3] * dst[2], range + 12);
+ s[4] = RangeCheckValue(kAdst4Multiplier[0] * dst[2], range + 12);
+ s[5] = RangeCheckValue(kAdst4Multiplier[1] * dst[3], range + 12);
+ s[6] = RangeCheckValue(kAdst4Multiplier[3] * dst[3], range + 12);
+ // stage 2.
+ // Section 7.13.2.6: It is a requirement of bitstream conformance that
+ // values stored in the variable a7 by this process are representable by a
+ // signed integer using range + 1 bits of precision.
+ const int32_t a7 = RangeCheckValue(dst[0] - dst[2], range + 1);
+ // Section 7.13.2.6: It is a requirement of bitstream conformance that
+ // values stored in the variable b7 by this process are representable by a
+ // signed integer using |range| bits of precision.
+ const int32_t b7 = RangeCheckValue(a7 + dst[3], range);
+ // stage 3.
+ s[0] = RangeCheckValue(s[0] + s[3], range + 12);
+ s[1] = RangeCheckValue(s[1] - s[4], range + 12);
+ s[3] = s[2];
+ s[2] = RangeCheckValue(kAdst4Multiplier[2] * b7, range + 12);
+ // stage 4.
+ s[0] = RangeCheckValue(s[0] + s[5], range + 12);
+ s[1] = RangeCheckValue(s[1] - s[6], range + 12);
+ // stages 5 and 6.
+ const int32_t x0 = RangeCheckValue(s[0] + s[3], range + 12);
+ const int32_t x1 = RangeCheckValue(s[1] + s[3], range + 12);
+ int32_t x3 = RangeCheckValue(s[0] + s[1], range + 12);
+ x3 = RangeCheckValue(x3 - s[3], range + 12);
+ int32_t dst_0 = RightShiftWithRounding(x0, 12);
+ int32_t dst_1 = RightShiftWithRounding(x1, 12);
+ int32_t dst_2 = RightShiftWithRounding(s[2], 12);
+ int32_t dst_3 = RightShiftWithRounding(x3, 12);
+ if (sizeof(Residual) == 2) {
+ // If the first argument to RightShiftWithRounding(..., 12) is only
+ // slightly smaller than 2^27 - 1 (e.g., 0x7fffe4e), adding 2^11 to it
+ // in RightShiftWithRounding(..., 12) will cause the function to return
+ // 0x8000, which cannot be represented as an int16_t. Change it to 0x7fff.
+ dst_0 -= (dst_0 == 0x8000);
+ dst_1 -= (dst_1 == 0x8000);
+ dst_3 -= (dst_3 == 0x8000);
+ }
+ dst[0] = dst_0;
+ dst[1] = dst_1;
+ dst[2] = dst_2;
+ dst[3] = dst_3;
+}
+
+template <int bitdepth, typename Residual>
+void Adst4DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+ bool is_row) {
+ auto* const dst = static_cast<Residual*>(dest);
+
+ if (is_row && should_round) {
+ dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+ }
+
+ // stage 1.
+ // Section 7.13.2.6: It is a requirement of bitstream conformance that all
+ // values stored in the s and x arrays by this process are representable by
+ // a signed integer using range + 12 bits of precision.
+ int32_t s[3];
+ s[0] = RangeCheckValue(kAdst4Multiplier[0] * dst[0], range + 12);
+ s[1] = RangeCheckValue(kAdst4Multiplier[1] * dst[0], range + 12);
+ s[2] = RangeCheckValue(kAdst4Multiplier[2] * dst[0], range + 12);
+ // stage 3.
+ // stage 4.
+ // stages 5 and 6.
+ int32_t dst_0 = RightShiftWithRounding(s[0], 12);
+ int32_t dst_1 = RightShiftWithRounding(s[1], 12);
+ int32_t dst_2 = RightShiftWithRounding(s[2], 12);
+ int32_t dst_3 =
+ RightShiftWithRounding(RangeCheckValue(s[0] + s[1], range + 12), 12);
+ if (sizeof(Residual) == 2) {
+ // If the first argument to RightShiftWithRounding(..., 12) is only
+ // slightly smaller than 2^27 - 1 (e.g., 0x7fffe4e), adding 2^11 to it
+ // in RightShiftWithRounding(..., 12) will cause the function to return
+ // 0x8000, which cannot be represented as an int16_t. Change it to 0x7fff.
+ dst_0 -= (dst_0 == 0x8000);
+ dst_1 -= (dst_1 == 0x8000);
+ dst_3 -= (dst_3 == 0x8000);
+ }
+ dst[0] = dst_0;
+ dst[1] = dst_1;
+ dst[2] = dst_2;
+ dst[3] = dst_3;
+
+ const int size = 4;
+ if (is_row && row_shift > 0) {
+ for (int j = 0; j < size; ++j) {
+ dst[j] = RightShiftWithRounding(dst[j], row_shift);
+ }
+ }
+
+ ClampIntermediate<bitdepth, Residual>(dst, 4);
+}
+
+template <typename Residual>
+void AdstInputPermutation(int32_t* const dst, const Residual* const src,
+ int n) {
+ assert(n == 8 || n == 16);
+ for (int i = 0; i < n; ++i) {
+ dst[i] = src[((i & 1) == 0) ? n - i - 1 : i - 1];
+ }
+}
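+
+// Note that the permutation above maps src[0] to dst[1] (the i == 1 case), so
+// after AdstInputPermutation() the DC coefficient is at index 1; the DC-only
+// paths below rely on this.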
+
+constexpr int8_t kAdstOutputPermutationLookup[16] = {
+ 0, 8, 12, 4, 6, 14, 10, 2, 3, 11, 15, 7, 5, 13, 9, 1};
+
+template <typename Residual>
+void AdstOutputPermutation(Residual* const dst, const int32_t* const src,
+ int n) {
+ assert(n == 8 || n == 16);
+ const auto shift = static_cast<int8_t>(n == 8);
+ for (int i = 0; i < n; ++i) {
+ const int8_t index = kAdstOutputPermutationLookup[i] >> shift;
+ int32_t dst_i = ((i & 1) == 0) ? src[index] : -src[index];
+ if (sizeof(Residual) == 2) {
+ // If i is odd and src[index] is -32768, dst_i will be 32768, which
+ // cannot be represented as an int16_t.
+ dst_i -= (dst_i == 0x8000);
+ }
+ dst[i] = dst_i;
+ }
+}
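+
+// For n == 16 the lookup above is used as is; for n == 8 each entry is halved
+// (|shift| is 1), so the first 8 entries yield the order 0, 4, 6, 2, 3, 7, 5,
+// 1.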
+
+template <typename Residual>
+void Adst8_C(void* dest, int8_t range) {
+ auto* const dst = static_cast<Residual*>(dest);
+ // stage 1.
+ int32_t temp[8];
+ AdstInputPermutation(temp, dst, 8);
+ // stage 2.
+ for (int i = 0; i < 4; ++i) {
+ ButterflyRotation_C(temp, MultiplyBy2(i), MultiplyBy2(i) + 1, 60 - 16 * i,
+ true, range);
+ }
+ // stage 3.
+ for (int i = 0; i < 4; ++i) {
+ HadamardRotation_C(temp, i, i + 4, false, range);
+ }
+ // stage 4.
+ for (int i = 0; i < 2; ++i) {
+ ButterflyRotation_C(temp, i * 3 + 4, i + 5, 48 - 32 * i, true, range);
+ }
+ // stage 5.
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ HadamardRotation_C(temp, i + MultiplyBy4(j), i + MultiplyBy4(j) + 2,
+ false, range);
+ }
+ }
+ // stage 6.
+ for (int i = 0; i < 2; ++i) {
+ ButterflyRotation_C(temp, MultiplyBy4(i) + 2, MultiplyBy4(i) + 3, 32, true,
+ range);
+ }
+ // stage 7.
+ AdstOutputPermutation(dst, temp, 8);
+}
+
+template <int bitdepth, typename Residual>
+void Adst8DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+ bool is_row) {
+ auto* const dst = static_cast<Residual*>(dest);
+
+ // stage 1.
+ int32_t temp[8];
+ // After the permutation, the dc value is in temp[1]. The rest are zero.
+ AdstInputPermutation(temp, dst, 8);
+
+ if (is_row && should_round) {
+ temp[1] = RightShiftWithRounding(temp[1] * kTransformRowMultiplier, 12);
+ }
+
+ // stage 2.
+ ButterflyRotationFirstIsZero_C(temp, 0, 1, 60, true, range);
+
+ // stage 3.
+ temp[4] = temp[0];
+ temp[5] = temp[1];
+
+ // stage 4.
+ ButterflyRotation_C(temp, 4, 5, 48, true, range);
+
+ // stage 5.
+ temp[2] = temp[0];
+ temp[3] = temp[1];
+ temp[6] = temp[4];
+ temp[7] = temp[5];
+
+ // stage 6.
+ ButterflyRotation_C(temp, 2, 3, 32, true, range);
+ ButterflyRotation_C(temp, 6, 7, 32, true, range);
+
+ // stage 7.
+ AdstOutputPermutation(dst, temp, 8);
+
+ const int size = 8;
+ if (is_row && row_shift > 0) {
+ for (int j = 0; j < size; ++j) {
+ dst[j] = RightShiftWithRounding(dst[j], row_shift);
+ }
+ }
+
+ ClampIntermediate<bitdepth, Residual>(dst, 8);
+}
+
+template <typename Residual>
+void Adst16_C(void* dest, int8_t range) {
+ auto* const dst = static_cast<Residual*>(dest);
+ // stage 1.
+ int32_t temp[16];
+ AdstInputPermutation(temp, dst, 16);
+ // stage 2.
+ for (int i = 0; i < 8; ++i) {
+ ButterflyRotation_C(temp, MultiplyBy2(i), MultiplyBy2(i) + 1, 62 - 8 * i,
+ true, range);
+ }
+ // stage 3.
+ for (int i = 0; i < 8; ++i) {
+ HadamardRotation_C(temp, i, i + 8, false, range);
+ }
+ // stage 4.
+ for (int i = 0; i < 2; ++i) {
+ ButterflyRotation_C(temp, MultiplyBy2(i) + 8, MultiplyBy2(i) + 9,
+ 56 - 32 * i, true, range);
+ ButterflyRotation_C(temp, MultiplyBy2(i) + 13, MultiplyBy2(i) + 12,
+ 8 + 32 * i, true, range);
+ }
+ // stage 5.
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ HadamardRotation_C(temp, i + MultiplyBy8(j), i + MultiplyBy8(j) + 4,
+ false, range);
+ }
+ }
+ // stage 6.
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ ButterflyRotation_C(temp, i * 3 + MultiplyBy8(j) + 4,
+ i + MultiplyBy8(j) + 5, 48 - 32 * i, true, range);
+ }
+ }
+ // stage 7.
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ HadamardRotation_C(temp, i + MultiplyBy4(j), i + MultiplyBy4(j) + 2,
+ false, range);
+ }
+ }
+ // stage 8.
+ for (int i = 0; i < 4; ++i) {
+ ButterflyRotation_C(temp, MultiplyBy4(i) + 2, MultiplyBy4(i) + 3, 32, true,
+ range);
+ }
+ // stage 9.
+ AdstOutputPermutation(dst, temp, 16);
+}
+
+template <int bitdepth, typename Residual>
+void Adst16DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+ bool is_row) {
+ auto* const dst = static_cast<Residual*>(dest);
+
+ // stage 1.
+ int32_t temp[16];
+ // After the permutation, the dc value is in temp[1]. The rest are zero.
+ AdstInputPermutation(temp, dst, 16);
+
+ if (is_row && should_round) {
+ temp[1] = RightShiftWithRounding(temp[1] * kTransformRowMultiplier, 12);
+ }
+
+ // stage 2.
+ ButterflyRotationFirstIsZero_C(temp, 0, 1, 62, true, range);
+
+ // stage 3.
+ temp[8] = temp[0];
+ temp[9] = temp[1];
+
+ // stage 4.
+ ButterflyRotation_C(temp, 8, 9, 56, true, range);
+
+ // stage 5.
+ temp[4] = temp[0];
+ temp[5] = temp[1];
+ temp[12] = temp[8];
+ temp[13] = temp[9];
+
+ // stage 6.
+ ButterflyRotation_C(temp, 4, 5, 48, true, range);
+ ButterflyRotation_C(temp, 12, 13, 48, true, range);
+
+ // stage 7.
+ temp[2] = temp[0];
+ temp[3] = temp[1];
+ temp[10] = temp[8];
+ temp[11] = temp[9];
+
+ temp[6] = temp[4];
+ temp[7] = temp[5];
+ temp[14] = temp[12];
+ temp[15] = temp[13];
+
+ // stage 8.
+ for (int i = 0; i < 4; ++i) {
+ ButterflyRotation_C(temp, MultiplyBy4(i) + 2, MultiplyBy4(i) + 3, 32, true,
+ range);
+ }
+
+ // stage 9.
+ AdstOutputPermutation(dst, temp, 16);
+
+ const int size = 16;
+ if (is_row && row_shift > 0) {
+ for (int j = 0; j < size; ++j) {
+ dst[j] = RightShiftWithRounding(dst[j], row_shift);
+ }
+ }
+
+ ClampIntermediate<bitdepth, Residual>(dst, 16);
+}
+
+//------------------------------------------------------------------------------
+// Identity Transforms.
+//
+// In the spec, the inverse identity transform is followed by a Round2() call:
+// The row transforms with i = 0..(h-1) are applied as follows:
+// ...
+// * Otherwise, invoke the inverse identity transform process specified in
+// section 7.13.2.15 with the input variable n equal to log2W.
+// * Set Residual[ i ][ j ] equal to Round2( T[ j ], rowShift )
+// for j = 0..(w-1).
+// ...
+// The column transforms with j = 0..(w-1) are applied as follows:
+// ...
+// * Otherwise, invoke the inverse identity transform process specified in
+// section 7.13.2.15 with the input variable n equal to log2H.
+// * Residual[ i ][ j ] is set equal to Round2( T[ i ], colShift )
+// for i = 0..(h-1).
+//
+// Therefore, we define the identity transform functions to perform both the
+// inverse identity transform and the Round2() call. This has two advantages:
+// 1. The outputs of the inverse identity transform do not need to be stored
+// in the Residual array. They can be stored in int32_t local variables,
+// which have a larger range if Residual is an int16_t array.
+// 2. The inverse identity transform and the Round2() call can be jointly
+// optimized.
+//
+// The identity transform functions have the following prototype:
+// void Identity_C(void* dest, int8_t shift);
+//
+// The |shift| parameter is the amount of shift for the Round2() call. For row
+// transforms, |shift| is 0, 1, or 2. For column transforms, |shift| is always
+// 4. Therefore, an identity transform function can detect whether it is being
+// invoked as a row transform or a column transform by checking whether |shift|
+// is equal to 4.
+//
+// Input Range
+//
+// The inputs of row transforms, stored in the 2D array Dequant, are
+// representable by a signed integer using 8 + BitDepth bits of precision:
+// f. Dequant[ i ][ j ] is set equal to
+// Clip3( - ( 1 << ( 7 + BitDepth ) ), ( 1 << ( 7 + BitDepth ) ) - 1, dq2 ).
+//
+// The inputs of column transforms are representable by a signed integer using
+// Max( BitDepth + 6, 16 ) bits of precision:
+// Set the variable colClampRange equal to Max( BitDepth + 6, 16 ).
+// ...
+// Between the row and column transforms, Residual[ i ][ j ] is set equal to
+// Clip3( - ( 1 << ( colClampRange - 1 ) ),
+// ( 1 << (colClampRange - 1 ) ) - 1,
+// Residual[ i ][ j ] )
+// for i = 0..(h-1), for j = 0..(w-1).
+//
+// Output Range
+//
+// The outputs of row transforms are representable by a signed integer using
+// 8 + BitDepth + 1 = 9 + BitDepth bits of precision, because the net effect
+// of the multiplicative factor of inverse identity transforms minus the
+// smallest row shift is an increase of at most one bit.
+//
+// Transform | Multiplicative factor | Smallest row | Net increase
+// width | (in bits) | shift | in bits
+// ---------------------------------------------------------------
+// 4 | sqrt(2) (0.5 bits) | 0 | +0.5
+// 8 | 2 (1 bit) | 0 | +1
+// 16 | 2*sqrt(2) (1.5 bits) | 1 | +0.5
+// 32 | 4 (2 bits) | 1 | +1
+//
+// If BitDepth is 8 and Residual is an int16_t array, to avoid truncation we
+// clip the outputs (which have 17 bits of precision) to the range of int16_t
+// before storing them in the Residual array. This clipping happens to be the
+// same as the required clipping after the row transform (see the spec quoted
+// above), so we remain compliant with the spec. (In this case,
+// TransformLoop_C() skips clipping the outputs of row transforms to avoid
+// duplication of effort.)
+//
+// The outputs of column transforms are representable by a signed integer using
+// Max( BitDepth + 6, 16 ) + 2 - 4 = Max( BitDepth + 4, 14 ) bits of precision,
+// because the multiplicative factor of inverse identity transforms is at most
+// 4 (2 bits) and |shift| is always 4.
+
+template <typename Residual>
+void Identity4Row_C(void* dest, int8_t shift) {
+ assert(shift == 0 || shift == 1);
+ auto* const dst = static_cast<Residual*>(dest);
+ // If |shift| is 0, |rounding| should be 1 << 11. If |shift| is 1, |rounding|
+ // should be (1 + (1 << 1)) << 11. The following expression works for both
+ // values of |shift|.
+ const int32_t rounding = (1 + (shift << 1)) << 11;
+ for (int i = 0; i < 4; ++i) {
+ // The intermediate value here will have to fit into an int32_t for it to be
+ // bitstream conformant. The multiplication is promoted to int32_t by
+ // defining kIdentity4Multiplier as int32_t.
+ int32_t dst_i = (dst[i] * kIdentity4Multiplier + rounding) >> (12 + shift);
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[i] = static_cast<Residual>(dst_i);
+ }
+}
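+
+// A worked example of the rounding above (illustrative only): with |shift| of
+// 0, an input of 1000 becomes (1000 * 5793 + 2048) >> 12 = 1414, roughly
+// 1000 * sqrt(2), matching the multiplicative factor listed in the table
+// further up.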
+
+template <typename Residual>
+void Identity4Column_C(void* dest, int8_t /*shift*/) {
+ auto* const dst = static_cast<Residual*>(dest);
+ const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
+ for (int i = 0; i < 4; ++i) {
+ // The intermediate value here will have to fit into an int32_t for it to be
+ // bitstream conformant. The multiplication is promoted to int32_t by
+ // defining kIdentity4Multiplier as int32_t.
+ dst[i] = static_cast<Residual>((dst[i] * kIdentity4Multiplier + rounding) >>
+ (12 + kTransformColumnShift));
+ }
+}
+
+template <int bitdepth, typename Residual>
+void Identity4DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+ int row_shift, bool is_row) {
+ auto* const dst = static_cast<Residual*>(dest);
+
+ if (is_row) {
+ if (should_round) {
+ dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+ }
+
+ const int32_t rounding = (1 + (row_shift << 1)) << 11;
+ int32_t dst_i =
+ (dst[0] * kIdentity4Multiplier + rounding) >> (12 + row_shift);
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[0] = static_cast<Residual>(dst_i);
+
+ ClampIntermediate<bitdepth, Residual>(dst, 1);
+ return;
+ }
+
+ const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
+ dst[0] = static_cast<Residual>((dst[0] * kIdentity4Multiplier + rounding) >>
+ (12 + kTransformColumnShift));
+}
+
+template <typename Residual>
+void Identity8Row_C(void* dest, int8_t shift) {
+ assert(shift == 0 || shift == 1 || shift == 2);
+ auto* const dst = static_cast<Residual*>(dest);
+ for (int i = 0; i < 8; ++i) {
+ int32_t dst_i = RightShiftWithRounding(MultiplyBy2(dst[i]), shift);
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[i] = static_cast<Residual>(dst_i);
+ }
+}
+
+template <typename Residual>
+void Identity8Column_C(void* dest, int8_t /*shift*/) {
+ auto* const dst = static_cast<Residual*>(dest);
+ for (int i = 0; i < 8; ++i) {
+ dst[i] = static_cast<Residual>(
+ RightShiftWithRounding(dst[i], kTransformColumnShift - 1));
+ }
+}
+
+template <int bitdepth, typename Residual>
+void Identity8DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+ int row_shift, bool is_row) {
+ auto* const dst = static_cast<Residual*>(dest);
+
+ if (is_row) {
+ if (should_round) {
+ dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+ }
+
+ int32_t dst_i = RightShiftWithRounding(MultiplyBy2(dst[0]), row_shift);
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[0] = static_cast<Residual>(dst_i);
+
+ // If Residual is int16_t (which implies bitdepth is 8), we don't need to
+ // clip residual[i][j] to 16 bits.
+ if (sizeof(Residual) > 2) {
+ const Residual intermediate_clamp_max =
+ (1 << (std::max(bitdepth + 6, 16) - 1)) - 1;
+ const Residual intermediate_clamp_min = -intermediate_clamp_max - 1;
+ dst[0] = Clip3(dst[0], intermediate_clamp_min, intermediate_clamp_max);
+ }
+ return;
+ }
+
+ dst[0] = static_cast<Residual>(
+ RightShiftWithRounding(dst[0], kTransformColumnShift - 1));
+}
+
+template <typename Residual>
+void Identity16Row_C(void* dest, int8_t shift) {
+ assert(shift == 1 || shift == 2);
+ auto* const dst = static_cast<Residual*>(dest);
+ const int32_t rounding = (1 + (1 << shift)) << 11;
+ for (int i = 0; i < 16; ++i) {
+ // The intermediate value here will have to fit into an int32_t for it to be
+ // bitstream conformant. The multiplication is promoted to int32_t by
+ // defining kIdentity16Multiplier as int32_t.
+ int32_t dst_i = (dst[i] * kIdentity16Multiplier + rounding) >> (12 + shift);
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[i] = static_cast<Residual>(dst_i);
+ }
+}
+
+template <typename Residual>
+void Identity16Column_C(void* dest, int8_t /*shift*/) {
+ auto* const dst = static_cast<Residual*>(dest);
+ const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
+ for (int i = 0; i < 16; ++i) {
+ // The intermediate value here will have to fit into an int32_t for it to be
+ // bitstream conformant. The multiplication is promoted to int32_t by
+ // defining kIdentity16Multiplier as int32_t.
+ dst[i] =
+ static_cast<Residual>((dst[i] * kIdentity16Multiplier + rounding) >>
+ (12 + kTransformColumnShift));
+ }
+}
+
+template <int bitdepth, typename Residual>
+void Identity16DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+ int row_shift, bool is_row) {
+ auto* const dst = static_cast<Residual*>(dest);
+
+ if (is_row) {
+ if (should_round) {
+ dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+ }
+
+ const int32_t rounding = (1 + (1 << row_shift)) << 11;
+ int32_t dst_i =
+ (dst[0] * kIdentity16Multiplier + rounding) >> (12 + row_shift);
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[0] = static_cast<Residual>(dst_i);
+
+ ClampIntermediate<bitdepth, Residual>(dst, 1);
+ return;
+ }
+
+ const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
+ dst[0] = static_cast<Residual>((dst[0] * kIdentity16Multiplier + rounding) >>
+ (12 + kTransformColumnShift));
+}
+
+template <typename Residual>
+void Identity32Row_C(void* dest, int8_t shift) {
+ assert(shift == 1 || shift == 2);
+ auto* const dst = static_cast<Residual*>(dest);
+ for (int i = 0; i < 32; ++i) {
+ int32_t dst_i = RightShiftWithRounding(MultiplyBy4(dst[i]), shift);
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[i] = static_cast<Residual>(dst_i);
+ }
+}
+
+template <typename Residual>
+void Identity32Column_C(void* dest, int8_t /*shift*/) {
+ auto* const dst = static_cast<Residual*>(dest);
+ for (int i = 0; i < 32; ++i) {
+ dst[i] = static_cast<Residual>(
+ RightShiftWithRounding(dst[i], kTransformColumnShift - 2));
+ }
+}
+
+template <int bitdepth, typename Residual>
+void Identity32DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+ int row_shift, bool is_row) {
+ auto* const dst = static_cast<Residual*>(dest);
+
+ if (is_row) {
+ if (should_round) {
+ dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+ }
+
+ int32_t dst_i = RightShiftWithRounding(MultiplyBy4(dst[0]), row_shift);
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[0] = static_cast<Residual>(dst_i);
+
+ ClampIntermediate<bitdepth, Residual>(dst, 1);
+ return;
+ }
+
+ dst[0] = static_cast<Residual>(
+ RightShiftWithRounding(dst[0], kTransformColumnShift - 2));
+}
+
+//------------------------------------------------------------------------------
+// Walsh Hadamard Transform.
+
+template <typename Residual>
+void Wht4_C(void* dest, int8_t shift) {
+ auto* const dst = static_cast<Residual*>(dest);
+ Residual temp[4];
+ temp[0] = dst[0] >> shift;
+ temp[2] = dst[1] >> shift;
+ temp[3] = dst[2] >> shift;
+ temp[1] = dst[3] >> shift;
+ temp[0] += temp[2];
+ temp[3] -= temp[1];
+ // This signed right shift must be an arithmetic shift.
+ Residual e = (temp[0] - temp[3]) >> 1;
+ dst[1] = e - temp[1];
+ dst[2] = e - temp[2];
+ dst[0] = temp[0] - dst[1];
+ dst[3] = temp[3] + dst[2];
+}
+
+template <int bitdepth, typename Residual>
+void Wht4DcOnly_C(void* dest, int8_t range, bool /*should_round*/,
+ int /*row_shift*/, bool /*is_row*/) {
+ auto* const dst = static_cast<Residual*>(dest);
+ const int shift = range;
+
+ Residual temp = dst[0] >> shift;
+ // This signed right shift must be an arithmetic shift.
+ Residual e = temp >> 1;
+ dst[0] = temp - e;
+ dst[1] = e;
+ dst[2] = e;
+ dst[3] = e;
+
+ ClampIntermediate<bitdepth, Residual>(dst, 4);
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loop
+
+using InverseTransform1DFunc = void (*)(void* dst, int8_t range);
+using InverseTransformDcOnlyFunc = void (*)(void* dest, int8_t range,
+ bool should_round, int row_shift,
+ bool is_row);
+
+template <int bitdepth, typename Residual, typename Pixel,
+ Transform1D transform1d_type,
+ InverseTransformDcOnlyFunc dconly_transform1d,
+ InverseTransform1DFunc transform1d_func, bool is_row>
+void TransformLoop_C(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ constexpr bool lossless = transform1d_type == k1DTransformWht;
+ constexpr bool is_identity = transform1d_type == k1DTransformIdentity;
+ // The transform size of the WHT is always 4x4. Setting tx_width and
+ // tx_height to the constant 4 for the WHT speeds the code up.
+ assert(!lossless || tx_size == kTransformSize4x4);
+ const int tx_width = lossless ? 4 : kTransformWidth[tx_size];
+ const int tx_height = lossless ? 4 : kTransformHeight[tx_size];
+ const int tx_width_log2 = kTransformWidthLog2[tx_size];
+ const int tx_height_log2 = kTransformHeightLog2[tx_size];
+ auto* frame = static_cast<Array2DView<Pixel>*>(dst_frame);
+
+ // Initially this points to the dequantized values. After the transforms are
+ // applied, this buffer contains the residual.
+ Array2DView<Residual> residual(tx_height, tx_width,
+ static_cast<Residual*>(src_buffer));
+
+ if (is_row) {
+ // Row transform.
+ const uint8_t row_shift = lossless ? 0 : kTransformRowShift[tx_size];
+ // This is the |range| parameter of the InverseTransform1DFunc. For lossy
+ // transforms, this will be equal to the clamping range.
+ const int8_t row_clamp_range = lossless ? 2 : (bitdepth + 8);
+ // If the width:height ratio of the transform size is 2:1 or 1:2, multiply
+ // the input to the row transform by 1 / sqrt(2), which is approximated by
+ // the fraction 2896 / 2^12.
+ const bool should_round = std::abs(tx_width_log2 - tx_height_log2) == 1;
+
+ if (adjusted_tx_height == 1) {
+ dconly_transform1d(residual[0], row_clamp_range, should_round, row_shift,
+ true);
+ return;
+ }
+
+ // Row transforms need to be done only up to 32 because the rest of the rows
+ // are always all zero if |tx_height| is 64. Otherwise, only process the rows
+ // that have non-zero coefficients.
+ for (int i = 0; i < adjusted_tx_height; ++i) {
+ // If lossless, the transform size is 4x4, so should_round is false.
+ if (!lossless && should_round) {
+ // The last 32 values of every row are always zero if |tx_width| is 64.
+ for (int j = 0; j < std::min(tx_width, 32); ++j) {
+ residual[i][j] = RightShiftWithRounding(
+ residual[i][j] * kTransformRowMultiplier, 12);
+ }
+ }
+ // For identity transform, |transform1d_func| also performs the
+ // Round2(T[j], rowShift) call in the spec.
+ transform1d_func(residual[i], is_identity ? row_shift : row_clamp_range);
+ if (!lossless && !is_identity && row_shift > 0) {
+ for (int j = 0; j < tx_width; ++j) {
+ residual[i][j] = RightShiftWithRounding(residual[i][j], row_shift);
+ }
+ }
+
+ ClampIntermediate<bitdepth, Residual>(residual[i], tx_width);
+ }
+ return;
+ }
+
+ assert(!is_row);
+ constexpr uint8_t column_shift = lossless ? 0 : kTransformColumnShift;
+ // This is the |range| parameter of the InverseTransform1DFunc. For lossy
+ // transforms, this will be equal to the clamping range.
+ const int8_t column_clamp_range = lossless ? 0 : std::max(bitdepth + 6, 16);
+ const bool flip_rows = transform1d_type == k1DTransformAdst &&
+ kTransformFlipRowsMask.Contains(tx_type);
+ const bool flip_columns =
+ !lossless && kTransformFlipColumnsMask.Contains(tx_type);
+ const int min_value = 0;
+ const int max_value = (1 << bitdepth) - 1;
+ // Note: 64 is the maximum size of a 1D transform buffer (the largest
+ // transform size is kTransformSize64x64).
+ Residual tx_buffer[64];
+ for (int j = 0; j < tx_width; ++j) {
+ const int flipped_j = flip_columns ? tx_width - j - 1 : j;
+ for (int i = 0; i < tx_height; ++i) {
+ tx_buffer[i] = residual[i][flipped_j];
+ }
+ if (adjusted_tx_height == 1) {
+ dconly_transform1d(tx_buffer, column_clamp_range, false, 0, false);
+ } else {
+ // For identity transform, |transform1d_func| also performs the
+ // Round2(T[i], colShift) call in the spec.
+ transform1d_func(tx_buffer,
+ is_identity ? column_shift : column_clamp_range);
+ }
+ const int x = start_x + j;
+ for (int i = 0; i < tx_height; ++i) {
+ const int y = start_y + i;
+ const int index = flip_rows ? tx_height - i - 1 : i;
+ Residual residual_value = tx_buffer[index];
+ if (!lossless && !is_identity) {
+ residual_value = RightShiftWithRounding(residual_value, column_shift);
+ }
+ (*frame)[y][x] =
+ Clip3((*frame)[y][x] + residual_value, min_value, max_value);
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+
+template <int bitdepth, typename Residual, typename Pixel>
+void InitAll(Dsp* const dsp) {
+ // Maximum transform size for Dct is 64.
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 2>, Dct_C<Residual, 2>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 2>, Dct_C<Residual, 2>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 3>, Dct_C<Residual, 3>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 3>, Dct_C<Residual, 3>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 4>, Dct_C<Residual, 4>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 4>, Dct_C<Residual, 4>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 5>, Dct_C<Residual, 5>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 5>, Dct_C<Residual, 5>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 6>, Dct_C<Residual, 6>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 6>, Dct_C<Residual, 6>,
+ /*is_row=*/false>;
+
+ // Maximum transform size for Adst is 16.
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
+ Adst4DcOnly_C<bitdepth, Residual>, Adst4_C<Residual>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
+ Adst4DcOnly_C<bitdepth, Residual>, Adst4_C<Residual>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
+ Adst8DcOnly_C<bitdepth, Residual>, Adst8_C<Residual>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
+ Adst8DcOnly_C<bitdepth, Residual>, Adst8_C<Residual>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
+ Adst16DcOnly_C<bitdepth, Residual>, Adst16_C<Residual>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
+ Adst16DcOnly_C<bitdepth, Residual>, Adst16_C<Residual>,
+ /*is_row=*/false>;
+
+ // Maximum transform size for Identity transform is 32.
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+ Identity4DcOnly_C<bitdepth, Residual>,
+ Identity4Row_C<Residual>, /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+ Identity4DcOnly_C<bitdepth, Residual>,
+ Identity4Column_C<Residual>, /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+ Identity8DcOnly_C<bitdepth, Residual>,
+ Identity8Row_C<Residual>, /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+ Identity8DcOnly_C<bitdepth, Residual>,
+ Identity8Column_C<Residual>, /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+ Identity16DcOnly_C<bitdepth, Residual>,
+ Identity16Row_C<Residual>, /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+ Identity16DcOnly_C<bitdepth, Residual>,
+ Identity16Column_C<Residual>, /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+ Identity32DcOnly_C<bitdepth, Residual>,
+ Identity32Row_C<Residual>, /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+ Identity32DcOnly_C<bitdepth, Residual>,
+ Identity32Column_C<Residual>, /*is_row=*/false>;
+
+ // Maximum transform size for Wht is 4.
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformWht,
+ Wht4DcOnly_C<bitdepth, Residual>, Wht4_C<Residual>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformWht,
+ Wht4DcOnly_C<bitdepth, Residual>, Wht4_C<Residual>,
+ /*is_row=*/false>;
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+ for (auto& inverse_transform_by_size : dsp->inverse_transforms) {
+ for (auto& inverse_transform : inverse_transform_by_size) {
+ inverse_transform[kRow] = nullptr;
+ inverse_transform[kColumn] = nullptr;
+ }
+ }
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ InitAll<8, int16_t, uint8_t>(dsp);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 2>, Dct_C<int16_t, 2>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 2>, Dct_C<int16_t, 2>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformDct
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 3>, Dct_C<int16_t, 3>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 3>, Dct_C<int16_t, 3>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformDct
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 4>, Dct_C<int16_t, 4>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 4>, Dct_C<int16_t, 4>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformDct
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 5>, Dct_C<int16_t, 5>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 5>, Dct_C<int16_t, 5>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize64_1DTransformDct
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 6>, Dct_C<int16_t, 6>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 6>, Dct_C<int16_t, 6>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformAdst
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
+ Adst4DcOnly_C<8, int16_t>, Adst4_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
+ Adst4DcOnly_C<8, int16_t>, Adst4_C<int16_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformAdst
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
+ Adst8DcOnly_C<8, int16_t>, Adst8_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
+ Adst8DcOnly_C<8, int16_t>, Adst8_C<int16_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformAdst
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
+ Adst16DcOnly_C<8, int16_t>, Adst16_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
+ Adst16DcOnly_C<8, int16_t>, Adst16_C<int16_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformIdentity
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+ Identity4DcOnly_C<8, int16_t>, Identity4Row_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+ Identity4DcOnly_C<8, int16_t>, Identity4Column_C<int16_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformIdentity
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+ Identity8DcOnly_C<8, int16_t>, Identity8Row_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+ Identity8DcOnly_C<8, int16_t>, Identity8Column_C<int16_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformIdentity
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+ Identity16DcOnly_C<8, int16_t>, Identity16Row_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+ Identity16DcOnly_C<8, int16_t>,
+ Identity16Column_C<int16_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+ Identity32DcOnly_C<8, int16_t>, Identity32Row_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+ Identity32DcOnly_C<8, int16_t>,
+ Identity32Column_C<int16_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformWht,
+ Wht4DcOnly_C<8, int16_t>, Wht4_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformWht,
+ Wht4DcOnly_C<8, int16_t>, Wht4_C<int16_t>,
+ /*is_row=*/false>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+ for (auto& inverse_transform_by_size : dsp->inverse_transforms) {
+ for (auto& inverse_transform : inverse_transform_by_size) {
+ inverse_transform[kRow] = nullptr;
+ inverse_transform[kColumn] = nullptr;
+ }
+ }
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ InitAll<10, int32_t, uint16_t>(dsp);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformDct
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 2>, Dct_C<int32_t, 2>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 2>, Dct_C<int32_t, 2>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformDct
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 3>, Dct_C<int32_t, 3>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 3>, Dct_C<int32_t, 3>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformDct
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 4>, Dct_C<int32_t, 4>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 4>, Dct_C<int32_t, 4>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize32_1DTransformDct
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 5>, Dct_C<int32_t, 5>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 5>, Dct_C<int32_t, 5>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize64_1DTransformDct
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 6>, Dct_C<int32_t, 6>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 6>, Dct_C<int32_t, 6>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformAdst
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+ Adst4DcOnly_C<10, int32_t>, Adst4_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+ Adst4DcOnly_C<10, int32_t>, Adst4_C<int32_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformAdst
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+ Adst8DcOnly_C<10, int32_t>, Adst8_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+ Adst8DcOnly_C<10, int32_t>, Adst8_C<int32_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformAdst
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+ Adst16DcOnly_C<10, int32_t>, Adst16_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+ Adst16DcOnly_C<10, int32_t>, Adst16_C<int32_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformIdentity
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+ Identity4DcOnly_C<10, int32_t>, Identity4Row_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+ Identity4DcOnly_C<10, int32_t>,
+ Identity4Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformIdentity
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+ Identity8DcOnly_C<10, int32_t>, Identity8Row_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+ Identity8DcOnly_C<10, int32_t>,
+ Identity8Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformIdentity
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+ Identity16DcOnly_C<10, int32_t>, Identity16Row_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+ Identity16DcOnly_C<10, int32_t>,
+ Identity16Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize32_1DTransformIdentity
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+ Identity32DcOnly_C<10, int32_t>, Identity32Row_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+ Identity32DcOnly_C<10, int32_t>,
+ Identity32Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformWht
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformWht,
+ Wht4DcOnly_C<10, int32_t>, Wht4_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformWht,
+ Wht4DcOnly_C<10, int32_t>, Wht4_C<int32_t>,
+ /*is_row=*/false>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+
+void InverseTransformInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+
+ // Local functions that may be unused depending on the optimizations
+ // available.
+ static_cast<void>(RangeCheckValue);
+ static_cast<void>(kBitReverseLookup);
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/inverse_transform.h b/src/dsp/inverse_transform.h
new file mode 100644
index 0000000..0916665
--- /dev/null
+++ b/src/dsp/inverse_transform.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INVERSE_TRANSFORM_H_
+#define LIBGAV1_SRC_DSP_INVERSE_TRANSFORM_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/inverse_transform_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/inverse_transform_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::inverse_transforms. This function is not thread-safe.
+void InverseTransformInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_INVERSE_TRANSFORM_H_
diff --git a/src/dsp/inverse_transform.inc b/src/dsp/inverse_transform.inc
new file mode 100644
index 0000000..55e68b6
--- /dev/null
+++ b/src/dsp/inverse_transform.inc
@@ -0,0 +1,64 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants and utility functions used for inverse transform implementations.
+// This will be included inside an anonymous namespace in the files where these
+// are necessary.
+
+// The value at index i is derived as: round(cos(pi * i / 128) * (1 << 12)).
+constexpr int16_t kCos128[65] = {
+ 4096, 4095, 4091, 4085, 4076, 4065, 4052, 4036, 4017, 3996, 3973,
+ 3948, 3920, 3889, 3857, 3822, 3784, 3745, 3703, 3659, 3612, 3564,
+ 3513, 3461, 3406, 3349, 3290, 3229, 3166, 3102, 3035, 2967, 2896,
+ 2824, 2751, 2675, 2598, 2520, 2440, 2359, 2276, 2191, 2106, 2019,
+ 1931, 1842, 1751, 1660, 1567, 1474, 1380, 1285, 1189, 1092, 995,
+ 897, 799, 700, 601, 501, 401, 301, 201, 101, 0};
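+// For example, kCos128[32] = round(cos(pi * 32 / 128) * 4096) =
+// round(0.7071... * 4096) = 2896, and kCos128[64] = round(cos(pi / 2) * 4096)
+// = 0; the table covers the first quadrant of a 12-bit fixed-point cosine.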
+
+inline int16_t Cos128(int angle) {
+ angle &= 0xff;
+
+ // If |angle| is 128, this function returns -4096 (= -2^12), which will
+ // cause the 32-bit multiplications in ButterflyRotation() to overflow if
+ // dst[a] or dst[b] is -2^19 (a possible corner case when |range| is 20):
+ //
+ // (-2^12) * (-2^19) = 2^31, which cannot be represented as an int32_t.
+ //
+ // Note: |range| is 20 when bitdepth is 12 and a row transform is performed.
+ //
+ // Assert that this angle is never used by DCT or ADST.
+ assert(angle != 128);
+ if (angle <= 64) return kCos128[angle];
+ if (angle <= 128) return -kCos128[128 - angle];
+ if (angle <= 192) return -kCos128[angle - 128];
+ return kCos128[256 - angle];
+}
+
+inline int16_t Sin128(int angle) { return Cos128(angle - 64); }
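+// Illustration only (a sketch, not part of the upstream transforms): the
+// Cos128()/Sin128() pair provides 12-bit fixed-point cos/sin values, so a
+// plain rotation built on them would look roughly like
+//
+//   void RotateSketch(int32_t x, int32_t y, int angle, int32_t* u,
+//                     int32_t* v) {
+//     // Assumes |x| and |y| are small enough for the products to fit in
+//     // 32 bits.
+//     const int32_t cos128 = Cos128(angle);
+//     const int32_t sin128 = Sin128(angle);
+//     *u = RightShiftWithRounding(x * cos128 - y * sin128, 12);
+//     *v = RightShiftWithRounding(x * sin128 + y * cos128, 12);
+//   }
+//
+// The ButterflyRotation() mentioned in the Cos128() comment above applies the
+// same kind of fixed-point rotation to pairs of transform coefficients.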
+
+// The value for index i is derived as:
+// round(sqrt(2) * sin(i * pi / 9) * 2 / 3 * (1 << 12)).
+constexpr int16_t kAdst4Multiplier[4] = {1321, 2482, 3344, 3803};
+
+constexpr uint8_t kTransformRowShift[kNumTransformSizes] = {
+ 0, 0, 1, 0, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2};
+
+constexpr bool kShouldRound[kNumTransformSizes] = {
+ false, true, false, true, false, true, false, false, true, false,
+ true, false, false, true, false, true, false, true, false};
+
+constexpr int16_t kIdentity4Multiplier /* round(2^12 * sqrt(2)) */ = 0x16A1;
+constexpr int16_t kIdentity4MultiplierFraction /* round(2^12 * (sqrt(2) - 1))*/
+ = 0x6A1;
+constexpr int16_t kIdentity16Multiplier /* 2 * round(2^12 * sqrt(2)) */ = 11586;
+constexpr int16_t kTransformRowMultiplier /* round(2^12 / sqrt(2)) */ = 2896;
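+// For reference: 0x16A1 = 5793 = round(4096 * sqrt(2)) and 0x6A1 = 1697 =
+// 5793 - 4096, i.e. kIdentity4MultiplierFraction is the fractional part of
+// kIdentity4Multiplier in 12-bit fixed point. Scaling by sqrt(2) can thus be
+// written either as round(x * 5793 / 4096) or as x + round(x * 1697 / 4096);
+// the two are identical for integer x.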
diff --git a/src/dsp/libgav1_dsp.cmake b/src/dsp/libgav1_dsp.cmake
new file mode 100644
index 0000000..960d5a7
--- /dev/null
+++ b/src/dsp/libgav1_dsp.cmake
@@ -0,0 +1,176 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_SRC_DSP_LIBGAV1_DSP_CMAKE_)
+ return()
+endif() # LIBGAV1_SRC_DSP_LIBGAV1_DSP_CMAKE_
+set(LIBGAV1_SRC_DSP_LIBGAV1_DSP_CMAKE_ 1)
+
+include("${libgav1_root}/cmake/libgav1_targets.cmake")
+
+list(APPEND libgav1_dsp_sources
+ "${libgav1_source}/dsp/average_blend.cc"
+ "${libgav1_source}/dsp/average_blend.h"
+ "${libgav1_source}/dsp/cdef.cc"
+ "${libgav1_source}/dsp/cdef.h"
+ "${libgav1_source}/dsp/cdef.inc"
+ "${libgav1_source}/dsp/common.h"
+ "${libgav1_source}/dsp/constants.cc"
+ "${libgav1_source}/dsp/constants.h"
+ "${libgav1_source}/dsp/convolve.cc"
+ "${libgav1_source}/dsp/convolve.h"
+ "${libgav1_source}/dsp/convolve.inc"
+ "${libgav1_source}/dsp/distance_weighted_blend.cc"
+ "${libgav1_source}/dsp/distance_weighted_blend.h"
+ "${libgav1_source}/dsp/dsp.cc"
+ "${libgav1_source}/dsp/dsp.h"
+ "${libgav1_source}/dsp/film_grain.cc"
+ "${libgav1_source}/dsp/film_grain.h"
+ "${libgav1_source}/dsp/film_grain_common.h"
+ "${libgav1_source}/dsp/intra_edge.cc"
+ "${libgav1_source}/dsp/intra_edge.h"
+ "${libgav1_source}/dsp/intrapred.cc"
+ "${libgav1_source}/dsp/intrapred.h"
+ "${libgav1_source}/dsp/inverse_transform.cc"
+ "${libgav1_source}/dsp/inverse_transform.h"
+ "${libgav1_source}/dsp/inverse_transform.inc"
+ "${libgav1_source}/dsp/loop_filter.cc"
+ "${libgav1_source}/dsp/loop_filter.h"
+ "${libgav1_source}/dsp/loop_restoration.cc"
+ "${libgav1_source}/dsp/loop_restoration.h"
+ "${libgav1_source}/dsp/mask_blend.cc"
+ "${libgav1_source}/dsp/mask_blend.h"
+ "${libgav1_source}/dsp/motion_field_projection.cc"
+ "${libgav1_source}/dsp/motion_field_projection.h"
+ "${libgav1_source}/dsp/motion_vector_search.cc"
+ "${libgav1_source}/dsp/motion_vector_search.h"
+ "${libgav1_source}/dsp/obmc.cc"
+ "${libgav1_source}/dsp/obmc.h"
+ "${libgav1_source}/dsp/obmc.inc"
+ "${libgav1_source}/dsp/super_res.cc"
+ "${libgav1_source}/dsp/super_res.h"
+ "${libgav1_source}/dsp/warp.cc"
+ "${libgav1_source}/dsp/warp.h"
+ "${libgav1_source}/dsp/weight_mask.cc"
+ "${libgav1_source}/dsp/weight_mask.h")
+
+list(APPEND libgav1_dsp_sources_avx2
+ ${libgav1_dsp_sources_avx2}
+ "${libgav1_source}/dsp/x86/convolve_avx2.cc"
+ "${libgav1_source}/dsp/x86/convolve_avx2.h"
+ "${libgav1_source}/dsp/x86/loop_restoration_10bit_avx2.cc"
+ "${libgav1_source}/dsp/x86/loop_restoration_avx2.cc"
+ "${libgav1_source}/dsp/x86/loop_restoration_avx2.h")
+
+list(APPEND libgav1_dsp_sources_neon
+ ${libgav1_dsp_sources_neon}
+ "${libgav1_source}/dsp/arm/average_blend_neon.cc"
+ "${libgav1_source}/dsp/arm/average_blend_neon.h"
+ "${libgav1_source}/dsp/arm/cdef_neon.cc"
+ "${libgav1_source}/dsp/arm/cdef_neon.h"
+ "${libgav1_source}/dsp/arm/common_neon.h"
+ "${libgav1_source}/dsp/arm/convolve_neon.cc"
+ "${libgav1_source}/dsp/arm/convolve_neon.h"
+ "${libgav1_source}/dsp/arm/distance_weighted_blend_neon.cc"
+ "${libgav1_source}/dsp/arm/distance_weighted_blend_neon.h"
+ "${libgav1_source}/dsp/arm/film_grain_neon.cc"
+ "${libgav1_source}/dsp/arm/film_grain_neon.h"
+ "${libgav1_source}/dsp/arm/intra_edge_neon.cc"
+ "${libgav1_source}/dsp/arm/intra_edge_neon.h"
+ "${libgav1_source}/dsp/arm/intrapred_cfl_neon.cc"
+ "${libgav1_source}/dsp/arm/intrapred_directional_neon.cc"
+ "${libgav1_source}/dsp/arm/intrapred_filter_intra_neon.cc"
+ "${libgav1_source}/dsp/arm/intrapred_neon.cc"
+ "${libgav1_source}/dsp/arm/intrapred_neon.h"
+ "${libgav1_source}/dsp/arm/intrapred_smooth_neon.cc"
+ "${libgav1_source}/dsp/arm/inverse_transform_neon.cc"
+ "${libgav1_source}/dsp/arm/inverse_transform_neon.h"
+ "${libgav1_source}/dsp/arm/loop_filter_neon.cc"
+ "${libgav1_source}/dsp/arm/loop_filter_neon.h"
+ "${libgav1_source}/dsp/arm/loop_restoration_neon.cc"
+ "${libgav1_source}/dsp/arm/loop_restoration_neon.h"
+ "${libgav1_source}/dsp/arm/mask_blend_neon.cc"
+ "${libgav1_source}/dsp/arm/mask_blend_neon.h"
+ "${libgav1_source}/dsp/arm/motion_field_projection_neon.cc"
+ "${libgav1_source}/dsp/arm/motion_field_projection_neon.h"
+ "${libgav1_source}/dsp/arm/motion_vector_search_neon.cc"
+ "${libgav1_source}/dsp/arm/motion_vector_search_neon.h"
+ "${libgav1_source}/dsp/arm/obmc_neon.cc"
+ "${libgav1_source}/dsp/arm/obmc_neon.h"
+ "${libgav1_source}/dsp/arm/super_res_neon.cc"
+ "${libgav1_source}/dsp/arm/super_res_neon.h"
+ "${libgav1_source}/dsp/arm/warp_neon.cc"
+ "${libgav1_source}/dsp/arm/warp_neon.h"
+ "${libgav1_source}/dsp/arm/weight_mask_neon.cc"
+ "${libgav1_source}/dsp/arm/weight_mask_neon.h")
+
+list(APPEND libgav1_dsp_sources_sse4
+ ${libgav1_dsp_sources_sse4}
+ "${libgav1_source}/dsp/x86/average_blend_sse4.cc"
+ "${libgav1_source}/dsp/x86/average_blend_sse4.h"
+ "${libgav1_source}/dsp/x86/common_sse4.h"
+ "${libgav1_source}/dsp/x86/cdef_sse4.cc"
+ "${libgav1_source}/dsp/x86/cdef_sse4.h"
+ "${libgav1_source}/dsp/x86/convolve_sse4.cc"
+ "${libgav1_source}/dsp/x86/convolve_sse4.h"
+ "${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.cc"
+ "${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.h"
+ "${libgav1_source}/dsp/x86/intra_edge_sse4.cc"
+ "${libgav1_source}/dsp/x86/intra_edge_sse4.h"
+ "${libgav1_source}/dsp/x86/intrapred_sse4.cc"
+ "${libgav1_source}/dsp/x86/intrapred_sse4.h"
+ "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.cc"
+ "${libgav1_source}/dsp/x86/intrapred_smooth_sse4.cc"
+ "${libgav1_source}/dsp/x86/inverse_transform_sse4.cc"
+ "${libgav1_source}/dsp/x86/inverse_transform_sse4.h"
+ "${libgav1_source}/dsp/x86/loop_filter_sse4.cc"
+ "${libgav1_source}/dsp/x86/loop_filter_sse4.h"
+ "${libgav1_source}/dsp/x86/loop_restoration_10bit_sse4.cc"
+ "${libgav1_source}/dsp/x86/loop_restoration_sse4.cc"
+ "${libgav1_source}/dsp/x86/loop_restoration_sse4.h"
+ "${libgav1_source}/dsp/x86/mask_blend_sse4.cc"
+ "${libgav1_source}/dsp/x86/mask_blend_sse4.h"
+ "${libgav1_source}/dsp/x86/motion_field_projection_sse4.cc"
+ "${libgav1_source}/dsp/x86/motion_field_projection_sse4.h"
+ "${libgav1_source}/dsp/x86/motion_vector_search_sse4.cc"
+ "${libgav1_source}/dsp/x86/motion_vector_search_sse4.h"
+ "${libgav1_source}/dsp/x86/obmc_sse4.cc"
+ "${libgav1_source}/dsp/x86/obmc_sse4.h"
+ "${libgav1_source}/dsp/x86/super_res_sse4.cc"
+ "${libgav1_source}/dsp/x86/super_res_sse4.h"
+ "${libgav1_source}/dsp/x86/transpose_sse4.h"
+ "${libgav1_source}/dsp/x86/warp_sse4.cc"
+ "${libgav1_source}/dsp/x86/warp_sse4.h"
+ "${libgav1_source}/dsp/x86/weight_mask_sse4.cc"
+ "${libgav1_source}/dsp/x86/weight_mask_sse4.h")
+
+macro(libgav1_add_dsp_targets)
+ unset(dsp_sources)
+ list(APPEND dsp_sources ${libgav1_dsp_sources}
+ ${libgav1_dsp_sources_neon}
+ ${libgav1_dsp_sources_avx2}
+ ${libgav1_dsp_sources_sse4})
+
+ libgav1_add_library(NAME
+ libgav1_dsp
+ TYPE
+ OBJECT
+ SOURCES
+ ${dsp_sources}
+ DEFINES
+ ${libgav1_defines}
+ $<$<CONFIG:Debug>:LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS>
+ INCLUDES
+ ${libgav1_include_paths})
+endmacro()
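+# Usage sketch (the actual call site lives outside this file): a consumer is
+# expected to include this file and invoke the macro, e.g.
+#
+#   include("${libgav1_source}/dsp/libgav1_dsp.cmake")
+#   libgav1_add_dsp_targets()
+#
+# which creates the libgav1_dsp OBJECT library from the C, NEON, SSE4 and AVX2
+# source lists above. The $<$<CONFIG:Debug>:...> generator expression makes
+# Debug builds define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS, so the Init*()
+# functions in the dsp sources register every C implementation unconditionally
+# (the #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS branches).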
diff --git a/src/dsp/loop_filter.cc b/src/dsp/loop_filter.cc
new file mode 100644
index 0000000..6cad97d
--- /dev/null
+++ b/src/dsp/loop_filter.cc
@@ -0,0 +1,616 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// 7.14.6.1.
+template <int bitdepth, typename Pixel>
+struct LoopFilterFuncs_C {
+ LoopFilterFuncs_C() = delete;
+
+ static constexpr int kMaxPixel = (1 << bitdepth) - 1;
+ static constexpr int kMinSignedPixel = -(1 << (bitdepth - 1));
+ static constexpr int kMaxSignedPixel = (1 << (bitdepth - 1)) - 1;
+ static constexpr int kFlatThresh = 1 << (bitdepth - 8);
+
+ static void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+};
+
+inline void AdjustThresholds(const int bitdepth, int* const outer_thresh,
+ int* const inner_thresh, int* const hev_thresh) {
+ *outer_thresh <<= bitdepth - 8;
+ *inner_thresh <<= bitdepth - 8;
+ *hev_thresh <<= bitdepth - 8;
+}
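+// For example, at bitdepth 10 the thresholds are shifted left by 2, so an
+// 8-bit-scale outer_thresh of 7 becomes 28; the thresholds are specified at
+// 8-bit scale and scaled up here to match the wider pixel range.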
+
+//------------------------------------------------------------------------------
+// 4-tap filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool NeedsFilter4(const Pixel* p, ptrdiff_t step, int outer_thresh,
+ int inner_thresh) {
+ const int p1 = p[-2 * step], p0 = p[-step];
+ const int q0 = p[0], q1 = p[step];
+ return std::abs(p1 - p0) <= inner_thresh &&
+ std::abs(q1 - q0) <= inner_thresh &&
+ std::abs(p0 - q0) * 2 + std::abs(p1 - q1) / 2 <= outer_thresh;
+}
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool Hev(const Pixel* p, ptrdiff_t step, int thresh) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ return (std::abs(p1 - p0) > thresh) || (std::abs(q1 - q0) > thresh);
+}
+
+// 7.14.6.3.
+// 4 pixels in, 2 pixels out.
+template <int bitdepth, typename Pixel>
+inline void Filter2_C(Pixel* p, ptrdiff_t step) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ const int min_signed_val =
+ LoopFilterFuncs_C<bitdepth, Pixel>::kMinSignedPixel;
+ const int max_signed_val =
+ LoopFilterFuncs_C<bitdepth, Pixel>::kMaxSignedPixel;
+ // 8bpp: [-893,892], 10bpp: [-3581,3580], 12bpp [-14333,14332]
+ const int a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+ // 8bpp: [-16,15], 10bpp: [-64,63], 12bpp: [-256,255]
+ const int a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+ const int a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+ const int max_unsigned_val = LoopFilterFuncs_C<bitdepth, Pixel>::kMaxPixel;
+ p[-step] = Clip3(p0 + a2, 0, max_unsigned_val);
+ p[0] = Clip3(q0 - a1, 0, max_unsigned_val);
+}
+
+// 7.14.6.3.
+// 4 pixels in, 4 pixels out.
+template <int bitdepth, typename Pixel>
+inline void Filter4_C(Pixel* p, ptrdiff_t step) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ const int a = 3 * (q0 - p0);
+ const int min_signed_val =
+ LoopFilterFuncs_C<bitdepth, Pixel>::kMinSignedPixel;
+ const int max_signed_val =
+ LoopFilterFuncs_C<bitdepth, Pixel>::kMaxSignedPixel;
+ const int a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+ const int a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+ const int a3 = (a1 + 1) >> 1;
+ const int max_unsigned_val = LoopFilterFuncs_C<bitdepth, Pixel>::kMaxPixel;
+ p[-2 * step] = Clip3(p1 + a3, 0, max_unsigned_val);
+ p[-1 * step] = Clip3(p0 + a2, 0, max_unsigned_val);
+ p[0 * step] = Clip3(q0 - a1, 0, max_unsigned_val);
+ p[1 * step] = Clip3(q1 - a3, 0, max_unsigned_val);
+}
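+// In Filter4_C above, a3 = (a1 + 1) >> 1 is roughly half of a1, so the outer
+// pixels p1/q1 receive about half the adjustment applied to the inner pair
+// p0/q0.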
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical4(void* dest, ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter4(dst, 1, outer_thresh, inner_thresh)) {
+ if (Hev(dst, 1, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, 1);
+ } else {
+ Filter4_C<bitdepth>(dst, 1);
+ }
+ }
+ dst += stride;
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal4(void* dest,
+ ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter4(dst, stride, outer_thresh, inner_thresh)) {
+ if (Hev(dst, stride, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, stride);
+ } else {
+ Filter4_C<bitdepth>(dst, stride);
+ }
+ }
+ ++dst;
+ }
+}
+
+//------------------------------------------------------------------------------
+// 5-tap (chroma) filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool NeedsFilter6(const Pixel* p, ptrdiff_t step, int outer_thresh,
+ int inner_thresh) {
+ const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+ return std::abs(p2 - p1) <= inner_thresh &&
+ std::abs(p1 - p0) <= inner_thresh &&
+ std::abs(q1 - q0) <= inner_thresh &&
+ std::abs(q2 - q1) <= inner_thresh &&
+ std::abs(p0 - q0) * 2 + std::abs(p1 - q1) / 2 <= outer_thresh;
+}
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool IsFlat3(const Pixel* p, ptrdiff_t step, int flat_thresh) {
+ const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+ return std::abs(p1 - p0) <= flat_thresh && std::abs(q1 - q0) <= flat_thresh &&
+ std::abs(p2 - p0) <= flat_thresh && std::abs(q2 - q0) <= flat_thresh;
+}
+
+template <typename Pixel>
+inline Pixel ApplyFilter6(int filter_value) {
+ return static_cast<Pixel>(RightShiftWithRounding(filter_value, 3));
+}
+
+// 7.14.6.4.
+// 6 pixels in, 4 pixels out.
+template <typename Pixel>
+inline void Filter6_C(Pixel* p, ptrdiff_t step) {
+ const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+ const int a1 = 2 * p1;
+ const int a0 = 2 * p0;
+ const int b0 = 2 * q0;
+ const int b1 = 2 * q1;
+ // The max is 8 * max_pixel + 4 for the rounder.
+ // 8bpp: 2044 (11 bits), 10bpp: 8188 (13 bits), 12bpp: 32764 (15 bits)
+ p[-2 * step] = ApplyFilter6<Pixel>(3 * p2 + a1 + a0 + q0);
+ p[-1 * step] = ApplyFilter6<Pixel>(p2 + a1 + a0 + b0 + q1);
+ p[0 * step] = ApplyFilter6<Pixel>(p1 + a0 + b0 + b1 + q2);
+ p[1 * step] = ApplyFilter6<Pixel>(p0 + b0 + b1 + 3 * q2);
+}
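+// In Filter6_C every output line's tap weights sum to 8, matching the shift
+// of 3 inside ApplyFilter6(), so each output is a weighted average of the six
+// input pixels. The same holds for Filter8_C below (weights also sum to 8)
+// and for Filter14_C (weights sum to 16, removed by ApplyFilter14()'s shift
+// of 4).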
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical6(void* dest, ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter6(dst, 1, outer_thresh, inner_thresh)) {
+ if (IsFlat3(dst, 1, flat_thresh)) {
+ Filter6_C(dst, 1);
+ } else if (Hev(dst, 1, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, 1);
+ } else {
+ Filter4_C<bitdepth>(dst, 1);
+ }
+ }
+ dst += stride;
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal6(void* dest,
+ ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter6(dst, stride, outer_thresh, inner_thresh)) {
+ if (IsFlat3(dst, stride, flat_thresh)) {
+ Filter6_C(dst, stride);
+ } else if (Hev(dst, stride, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, stride);
+ } else {
+ Filter4_C<bitdepth>(dst, stride);
+ }
+ }
+ ++dst;
+ }
+}
+
+//------------------------------------------------------------------------------
+// 7-tap filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool NeedsFilter8(const Pixel* p, ptrdiff_t step, int outer_thresh,
+ int inner_thresh) {
+ const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+ p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+ return std::abs(p3 - p2) <= inner_thresh &&
+ std::abs(p2 - p1) <= inner_thresh &&
+ std::abs(p1 - p0) <= inner_thresh &&
+ std::abs(q1 - q0) <= inner_thresh &&
+ std::abs(q2 - q1) <= inner_thresh &&
+ std::abs(q3 - q2) <= inner_thresh &&
+ std::abs(p0 - q0) * 2 + std::abs(p1 - q1) / 2 <= outer_thresh;
+}
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool IsFlat4(const Pixel* p, ptrdiff_t step, int flat_thresh) {
+ const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+ p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+ return std::abs(p1 - p0) <= flat_thresh && std::abs(q1 - q0) <= flat_thresh &&
+ std::abs(p2 - p0) <= flat_thresh && std::abs(q2 - q0) <= flat_thresh &&
+ std::abs(p3 - p0) <= flat_thresh && std::abs(q3 - q0) <= flat_thresh;
+}
+
+template <typename Pixel>
+inline Pixel ApplyFilter8(int filter_value) {
+ return static_cast<Pixel>(RightShiftWithRounding(filter_value, 3));
+}
+
+// 7.14.6.4.
+// 8 pixels in, 6 pixels out.
+template <typename Pixel>
+inline void Filter8_C(Pixel* p, ptrdiff_t step) {
+ const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+ p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+ // The max is 8 * max_pixel + 4 for the rounder.
+ // 8bpp: 2044 (11 bits), 10bpp: 8188 (13 bits), 12bpp: 32764 (15 bits)
+ p[-3 * step] = ApplyFilter8<Pixel>(3 * p3 + 2 * p2 + p1 + p0 + q0);
+ p[-2 * step] = ApplyFilter8<Pixel>(2 * p3 + p2 + 2 * p1 + p0 + q0 + q1);
+ p[-1 * step] = ApplyFilter8<Pixel>(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2);
+ p[0 * step] = ApplyFilter8<Pixel>(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3);
+ p[1 * step] = ApplyFilter8<Pixel>(p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3);
+ p[2 * step] = ApplyFilter8<Pixel>(p0 + q0 + q1 + 2 * q2 + 3 * q3);
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical8(void* dest, ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter8(dst, 1, outer_thresh, inner_thresh)) {
+ if (IsFlat4(dst, 1, flat_thresh)) {
+ Filter8_C(dst, 1);
+ } else if (Hev(dst, 1, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, 1);
+ } else {
+ Filter4_C<bitdepth>(dst, 1);
+ }
+ }
+ dst += stride;
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal8(void* dest,
+ ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter8(dst, stride, outer_thresh, inner_thresh)) {
+ if (IsFlat4(dst, stride, flat_thresh)) {
+ Filter8_C(dst, stride);
+ } else if (Hev(dst, stride, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, stride);
+ } else {
+ Filter4_C<bitdepth>(dst, stride);
+ }
+ }
+ ++dst;
+ }
+}
+
+//------------------------------------------------------------------------------
+// 13-tap filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool IsFlatOuter4(const Pixel* p, ptrdiff_t step, int flat_thresh) {
+ const int p6 = p[-7 * step], p5 = p[-6 * step], p4 = p[-5 * step],
+ p0 = p[-step];
+ const int q0 = p[0], q4 = p[4 * step], q5 = p[5 * step], q6 = p[6 * step];
+ return std::abs(p4 - p0) <= flat_thresh && std::abs(q4 - q0) <= flat_thresh &&
+ std::abs(p5 - p0) <= flat_thresh && std::abs(q5 - q0) <= flat_thresh &&
+ std::abs(p6 - p0) <= flat_thresh && std::abs(q6 - q0) <= flat_thresh;
+}
+
+template <typename Pixel>
+inline Pixel ApplyFilter14(int filter_value) {
+ return static_cast<Pixel>(RightShiftWithRounding(filter_value, 4));
+}
+
+// 7.14.6.4.
+// 14 pixels in, 12 pixels out.
+template <typename Pixel>
+inline void Filter14_C(Pixel* p, ptrdiff_t step) {
+ const int p6 = p[-7 * step], p5 = p[-6 * step], p4 = p[-5 * step],
+ p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+ p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step],
+ q4 = p[4 * step], q5 = p[5 * step], q6 = p[6 * step];
+ // The max is 16 * max_pixel + 8 for the rounder.
+ // 8bpp: 4088 (12 bits), 10bpp: 16376 (14 bits), 12bpp: 65528 (16 bits)
+ p[-6 * step] =
+ ApplyFilter14<Pixel>(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0);
+ p[-5 * step] = ApplyFilter14<Pixel>(p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 +
+ p1 + p0 + q0 + q1);
+ p[-4 * step] = ApplyFilter14<Pixel>(p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 +
+ p1 + p0 + q0 + q1 + q2);
+ p[-3 * step] = ApplyFilter14<Pixel>(p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 +
+ p1 * 2 + p0 + q0 + q1 + q2 + q3);
+ p[-2 * step] = ApplyFilter14<Pixel>(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 +
+ p0 * 2 + q0 + q1 + q2 + q3 + q4);
+ p[-1 * step] = ApplyFilter14<Pixel>(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
+ q0 * 2 + q1 + q2 + q3 + q4 + q5);
+ p[0 * step] = ApplyFilter14<Pixel>(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
+ q1 * 2 + q2 + q3 + q4 + q5 + q6);
+ p[1 * step] = ApplyFilter14<Pixel>(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
+ q2 * 2 + q3 + q4 + q5 + q6 * 2);
+ p[2 * step] = ApplyFilter14<Pixel>(p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 +
+ q3 * 2 + q4 + q5 + q6 * 3);
+ p[3 * step] = ApplyFilter14<Pixel>(p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 +
+ q4 * 2 + q5 + q6 * 4);
+ p[4 * step] = ApplyFilter14<Pixel>(p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 +
+ q5 * 2 + q6 * 5);
+ p[5 * step] =
+ ApplyFilter14<Pixel>(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7);
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical14(void* dest,
+ ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter8(dst, 1, outer_thresh, inner_thresh)) {
+ if (IsFlat4(dst, 1, flat_thresh)) {
+ if (IsFlatOuter4(dst, 1, flat_thresh)) {
+ Filter14_C(dst, 1);
+ } else {
+ Filter8_C(dst, 1);
+ }
+ } else if (Hev(dst, 1, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, 1);
+ } else {
+ Filter4_C<bitdepth>(dst, 1);
+ }
+ }
+ dst += stride;
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal14(void* dest,
+ ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter8(dst, stride, outer_thresh, inner_thresh)) {
+ if (IsFlat4(dst, stride, flat_thresh)) {
+ if (IsFlatOuter4(dst, stride, flat_thresh)) {
+ Filter14_C(dst, stride);
+ } else {
+ Filter8_C(dst, stride);
+ }
+ } else if (Hev(dst, stride, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, stride);
+ } else {
+ Filter4_C<bitdepth>(dst, stride);
+ }
+ }
+ ++dst;
+ }
+}
+
+using Defs8bpp = LoopFilterFuncs_C<8, uint8_t>;
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal4;
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical4;
+
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal6;
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical6;
+
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal8;
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical8;
+
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal14;
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical14;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal4;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical4;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal6;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical6;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal8;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical8;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal14;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical14;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using Defs10bpp = LoopFilterFuncs_C<10, uint16_t>;
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal4;
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical4;
+
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal6;
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical6;
+
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal8;
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical8;
+
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal14;
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical14;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal4;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical4;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal6;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical6;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal8;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical8;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal14;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical14;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+
+void LoopFilterInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+ // Local functions that may be unused depending on the optimizations
+ // available.
+ static_cast<void>(AdjustThresholds);
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/loop_filter.h b/src/dsp/loop_filter.h
new file mode 100644
index 0000000..1ddad71
--- /dev/null
+++ b/src/dsp/loop_filter.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_LOOP_FILTER_H_
+#define LIBGAV1_SRC_DSP_LOOP_FILTER_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/loop_filter_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/loop_filter_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_filters. This function is not thread-safe.
+void LoopFilterInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_LOOP_FILTER_H_
diff --git a/src/dsp/loop_restoration.cc b/src/dsp/loop_restoration.cc
new file mode 100644
index 0000000..0909df0
--- /dev/null
+++ b/src/dsp/loop_restoration.cc
@@ -0,0 +1,936 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Section 7.17.3.
+// a2: range [1, 256].
+// if (z >= 255)
+// a2 = 256;
+// else if (z == 0)
+// a2 = 1;
+// else
+// a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1);
+// ma = 256 - a2;
+alignas(16) const uint8_t kSgrMaLookup[256] = {
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, 15, 14,
+ 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7,
+ 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 0};
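+// Worked example: for z = 2 the formula above gives
+// a2 = ((2 << kSgrProjSgrBits) + 1) / 3 = 171 (with kSgrProjSgrBits == 8),
+// so ma = 256 - 171 = 85, which is entry 2 of the table.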
+
+namespace {
+
+template <int bitdepth, typename Pixel>
+inline void WienerHorizontal(const Pixel* source, const ptrdiff_t source_stride,
+ const int width, const int height,
+ const int16_t* const filter,
+ const int number_zero_coefficients,
+ int16_t** wiener_buffer) {
+ constexpr int kCenterTap = kWienerFilterTaps / 2;
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ constexpr int offset =
+ 1 << (bitdepth + kWienerFilterBits - kRoundBitsHorizontal - 1);
+ constexpr int limit = (offset << 2) - 1;
+ for (int y = 0; y < height; ++y) {
+ int x = 0;
+ do {
+ // sum fits into 16 bits only when bitdepth = 8.
+ int sum = 0;
+ for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
+ sum +=
+ filter[k] * (source[x + k] + source[x + kWienerFilterTaps - 1 - k]);
+ }
+ sum += filter[kCenterTap] * source[x + kCenterTap];
+ const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsHorizontal);
+ (*wiener_buffer)[x] = Clip3(rounded_sum, -offset, limit - offset);
+ } while (++x != width);
+ source += source_stride;
+ *wiener_buffer += width;
+ }
+}
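+// Because the Wiener filter is symmetric (filter[k] == filter[6 - k]; see the
+// coefficient-range note before WienerFilter_C() below), WienerHorizontal()
+// folds each mirrored tap pair into one multiply. For example, with
+// number_zero_coefficients == 1 only filter[1], filter[2] and the center tap
+// filter[3] are evaluated. WienerVertical() below applies the same folding
+// across rows.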
+
+template <int bitdepth, typename Pixel>
+inline void WienerVertical(const int16_t* wiener_buffer, const int width,
+ const int height, const int16_t* const filter,
+ const int number_zero_coefficients, void* const dest,
+ const ptrdiff_t dest_stride) {
+ constexpr int kCenterTap = kWienerFilterTaps / 2;
+ constexpr int kRoundBitsVertical =
+ (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
+ auto* dst = static_cast<Pixel*>(dest);
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ // sum needs 32 bits.
+ int sum = 0;
+ for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
+ sum += filter[k] *
+ (wiener_buffer[k * width + x] +
+ wiener_buffer[(kWienerFilterTaps - 1 - k) * width + x]);
+ }
+ sum += filter[kCenterTap] * wiener_buffer[kCenterTap * width + x];
+ const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical);
+ dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1));
+ } while (++x != width);
+ wiener_buffer += width;
+ dst += dest_stride;
+ } while (--y != 0);
+}
+
+// Note: bit range for wiener filter.
+// The Wiener filter process first applies horizontal filtering to the input
+// pixels, followed by rounding with predefined bits (dependent on bitdepth).
+// Then vertical filtering is applied, followed by rounding (dependent on
+// bitdepth).
+// The process is the same as convolution:
+// <input> --> <horizontal filter> --> <rounding 0> --> <vertical filter>
+// --> <rounding 1>
+// By design:
+// (a). horizontal/vertical filtering adds 7 bits to input.
+// (b). The output of first rounding fits into 16 bits.
+// (c). The output of second rounding fits into 16 bits.
+// If input bitdepth > 8, the accumulator of the horizontal filter is larger
+// than 16 bits and smaller than 32 bits.
+// The accumulator of the vertical filter is larger than 16 bits and smaller
+// than 32 bits.
+// Note: range of wiener filter coefficients.
+// Wiener filter coefficients are symmetric, and their sum is 1 (128).
+// The range of each coefficient:
+// filter[0] = filter[6], 4 bits, min = -5, max = 10.
+// filter[1] = filter[5], 5 bits, min = -23, max = 8.
+// filter[2] = filter[4], 6 bits, min = -17, max = 46.
+// filter[3] = 128 - 2 * (filter[0] + filter[1] + filter[2]).
+// The difference from libaom is that in libaom:
+// filter[3] = 0 - 2 * (filter[0] + filter[1] + filter[2]).
+// Thus in libaom's computation, an offset of 128 is needed for filter[3].
+template <int bitdepth, typename Pixel>
+void WienerFilter_C(const RestorationUnitInfo& restoration_info,
+ const void* const source, const void* const top_border,
+ const void* const bottom_border, const ptrdiff_t stride,
+ const int width, const int height,
+ RestorationBuffer* const restoration_buffer,
+ void* const dest) {
+ constexpr int kCenterTap = kWienerFilterTaps / 2;
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ int16_t* const wiener_buffer_org = restoration_buffer->wiener_buffer;
+
+ // horizontal filtering.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const int16_t* const filter_horizontal =
+ restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
+ const auto* src = static_cast<const Pixel*>(source) - kCenterTap;
+ const auto* top = static_cast<const Pixel*>(top_border) - kCenterTap;
+ const auto* bottom = static_cast<const Pixel*>(bottom_border) - kCenterTap;
+ auto* wiener_buffer = wiener_buffer_org + number_rows_to_skip * width;
+
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontal<bitdepth, Pixel>(top + (2 - height_extra) * stride, stride,
+ width, height_extra, filter_horizontal, 0,
+ &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+ filter_horizontal, 0, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(bottom, stride, width, height_extra,
+ filter_horizontal, 0, &wiener_buffer);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontal<bitdepth, Pixel>(top + (2 - height_extra) * stride, stride,
+ width, height_extra, filter_horizontal, 1,
+ &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+ filter_horizontal, 1, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(bottom, stride, width, height_extra,
+ filter_horizontal, 1, &wiener_buffer);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ WienerHorizontal<bitdepth, Pixel>(top + (2 - height_extra) * stride, stride,
+ width, height_extra, filter_horizontal, 2,
+ &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+ filter_horizontal, 2, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(bottom, stride, width, height_extra,
+ filter_horizontal, 2, &wiener_buffer);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontal<bitdepth, Pixel>(top + (2 - height_extra) * stride, stride,
+ width, height_extra, filter_horizontal, 3,
+ &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+ filter_horizontal, 3, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(bottom, stride, width, height_extra,
+ filter_horizontal, 3, &wiener_buffer);
+ }
+
+ // vertical filtering.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+ // Because the top row of |source| is a duplicate of the second row, and the
+ // bottom row of |source| is a duplicate of its above row, we can duplicate
+ // the top and bottom row of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer, wiener_buffer - width,
+ sizeof(*wiener_buffer) * width);
+ memcpy(wiener_buffer_org, wiener_buffer_org + width,
+ sizeof(*wiener_buffer) * width);
+ WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+ filter_vertical, 0, dest, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+ filter_vertical, 1, dest, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+ filter_vertical, 2, dest, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+ filter_vertical, 3, dest, stride);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+// When |height| is 1, |src_stride| may be set to an arbitrary value.
+template <typename Pixel, int size>
+LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
+ const int height, const int width,
+ uint16_t* const* sums,
+ uint32_t* const* square_sums) {
+ int y = height;
+ do {
+ uint32_t sum = 0;
+ uint32_t square_sum = 0;
+ for (int dx = 0; dx < size; ++dx) {
+ const Pixel source = src[dx];
+ sum += source;
+ square_sum += source * source;
+ }
+ (*sums)[0] = sum;
+ (*square_sums)[0] = square_sum;
+ int x = 1;
+ do {
+ const Pixel source0 = src[x - 1];
+ const Pixel source1 = src[x - 1 + size];
+ sum -= source0;
+ sum += source1;
+ square_sum -= source0 * source0;
+ square_sum += source1 * source1;
+ (*sums)[x] = sum;
+ (*square_sums)[x] = square_sum;
+ } while (++x != width);
+ src += src_stride;
+ ++sums;
+ ++square_sums;
+ } while (--y != 0);
+}
+
+// When |height| is 1, |src_stride| may be set to an arbitrary value.
+template <typename Pixel>
+LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
+ const int height, const int width,
+ uint16_t* const* sum3, uint16_t* const* sum5,
+ uint32_t* const* square_sum3,
+ uint32_t* const* square_sum5) {
+ int y = height;
+ do {
+ uint32_t sum = 0;
+ uint32_t square_sum = 0;
+ for (int dx = 0; dx < 4; ++dx) {
+ const Pixel source = src[dx];
+ sum += source;
+ square_sum += source * source;
+ }
+ int x = 0;
+ do {
+ const Pixel source0 = src[x];
+ const Pixel source1 = src[x + 4];
+ sum -= source0;
+ square_sum -= source0 * source0;
+ (*sum3)[x] = sum;
+ (*square_sum3)[x] = square_sum;
+ sum += source1;
+ square_sum += source1 * source1;
+ (*sum5)[x] = sum + source0;
+ (*square_sum5)[x] = square_sum + source0 * source0;
+ } while (++x != width);
+ src += src_stride;
+ ++sum3;
+ ++sum5;
+ ++square_sum3;
+ ++square_sum5;
+ } while (--y != 0);
+}
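+// The combined BoxSum() above keeps a running window of four pixels:
+// dropping the leftmost pixel gives the 3-wide sums (sum3/square_sum3), and
+// adding back both edge pixels gives the 5-wide sums (sum5/square_sum5), so
+// a single pass fills both sets of buffers.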
+
+template <int bitdepth, int n>
+inline void CalculateIntermediate(const uint32_t s, uint32_t a,
+ const uint32_t b, uint8_t* const ma_ptr,
+ uint32_t* const b_ptr) {
+  // a: before the shift, max is 25 * (2^(bitdepth) - 1) * (2^(bitdepth) - 1).
+  // Since max bitdepth = 12, max < 2^31. After the shift,
+  // a < 2^16 * n < 2^22 regardless of bitdepth.
+ a = RightShiftWithRounding(a, (bitdepth - 8) << 1);
+ // b: max is 25 * (2^(bitdepth) - 1). If bitdepth = 12, max < 2^19.
+ // d < 2^8 * n < 2^14 regardless of bitdepth
+ const uint32_t d = RightShiftWithRounding(b, bitdepth - 8);
+ // p: Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
+ // and p itself satisfies p < 2^14 * n^2 < 2^26.
+ // This bound on p is due to:
+ // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
+ // Note: Sometimes, in high bitdepth, we can end up with a*n < b*b.
+ // This is an artifact of rounding, and can only happen if all pixels
+ // are (almost) identical, so in this case we saturate to p=0.
+ const uint32_t p = (a * n < d * d) ? 0 : a * n - d * d;
+ // p * s < (2^14 * n^2) * round(2^20 / (n^2 * scale)) < 2^34 / scale <
+ // 2^32 as long as scale >= 4. So p * s fits into a uint32_t, and z < 2^12
+ // (this holds even after accounting for the rounding in s)
+ const uint32_t z = RightShiftWithRounding(p * s, kSgrProjScaleBits);
+ // ma: range [0, 255].
+ const uint32_t ma = kSgrMaLookup[std::min(z, 255u)];
+ const uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+ // ma < 2^8, b < 2^(bitdepth) * n,
+ // one_over_n = round(2^12 / n)
+ // => the product here is < 2^(20 + bitdepth) <= 2^32,
+ // and b is set to a value < 2^(8 + bitdepth).
+ // This holds even with the rounding in one_over_n and in the overall result,
+ // as long as ma is strictly less than 2^8.
+ const uint32_t b2 = ma * b * one_over_n;
+ *ma_ptr = ma;
+ *b_ptr = RightShiftWithRounding(b2, kSgrProjReciprocalBits);
+}
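+// Example of the one_over_n reciprocal above: the 5x5 pass uses n == 25, so
+// one_over_n = (4096 + 12) / 25 = 164 = round(2^12 / 25); the 3x3 pass uses
+// n == 9, giving (4096 + 4) / 9 = 455.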
+
+template <typename T>
+inline uint32_t Sum343(const T* const src) {
+ return 3 * (src[0] + src[2]) + 4 * src[1];
+}
+
+template <typename T>
+inline uint32_t Sum444(const T* const src) {
+ return 4 * (src[0] + src[1] + src[2]);
+}
+
+template <typename T>
+inline uint32_t Sum565(const T* const src) {
+ return 5 * (src[0] + src[2]) + 6 * src[1];
+}
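+// The 5-6-5 / 3-4-3 / 4-4-4 row weights are chosen so the total weight per
+// pixel is a power of two: pass 1 adds two 565 rows (2 * 16 = 32) or uses a
+// single row (16), matching the shift arguments of 5 and 4 passed to
+// CalculateFilteredOutput() below; pass 2 adds 343 + 444 + 343 rows
+// (10 + 12 + 10 = 32), matching a shift of 5.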
+
+template <int bitdepth>
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+ const int width, const uint32_t s, SgrBuffer* const sgr_buffer,
+ uint16_t* const ma565, uint32_t* const b565) {
+ int x = 0;
+ do {
+ uint32_t a = 0;
+ uint32_t b = 0;
+ for (int dy = 0; dy < 5; ++dy) {
+ a += square_sum5[dy][x];
+ b += sum5[dy][x];
+ }
+ CalculateIntermediate<bitdepth, 25>(s, a, b, sgr_buffer->ma + x,
+ sgr_buffer->b + x);
+ } while (++x != width + 2);
+ x = 0;
+ do {
+ ma565[x] = Sum565(sgr_buffer->ma + x);
+ b565[x] = Sum565(sgr_buffer->b + x);
+ } while (++x != width);
+}
+
+template <int bitdepth>
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const uint16_t* const sum3[3], const uint32_t* const square_sum3[3],
+ const int width, const uint32_t s, const bool calculate444,
+ SgrBuffer* const sgr_buffer, uint16_t* const ma343, uint32_t* const b343,
+ uint16_t* const ma444, uint32_t* const b444) {
+ int x = 0;
+ do {
+ uint32_t a = 0;
+ uint32_t b = 0;
+ for (int dy = 0; dy < 3; ++dy) {
+ a += square_sum3[dy][x];
+ b += sum3[dy][x];
+ }
+ CalculateIntermediate<bitdepth, 9>(s, a, b, sgr_buffer->ma + x,
+ sgr_buffer->b + x);
+ } while (++x != width + 2);
+ x = 0;
+ do {
+ ma343[x] = Sum343(sgr_buffer->ma + x);
+ b343[x] = Sum343(sgr_buffer->b + x);
+ } while (++x != width);
+ if (calculate444) {
+ x = 0;
+ do {
+ ma444[x] = Sum444(sgr_buffer->ma + x);
+ b444[x] = Sum444(sgr_buffer->b + x);
+ } while (++x != width);
+ }
+}
+
+template <typename Pixel>
+inline int CalculateFilteredOutput(const Pixel src, const uint32_t ma,
+ const uint32_t b, const int shift) {
+ const int32_t v = b - ma * src;
+ return RightShiftWithRounding(v,
+ kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <typename Pixel>
+inline void BoxFilterPass1Kernel(const Pixel src0, const Pixel src1,
+ const uint16_t* const ma565[2],
+ const uint32_t* const b565[2],
+ const ptrdiff_t x, int p[2]) {
+ p[0] = CalculateFilteredOutput<Pixel>(src0, ma565[0][x] + ma565[1][x],
+ b565[0][x] + b565[1][x], 5);
+ p[1] = CalculateFilteredOutput<Pixel>(src1, ma565[1][x], b565[1][x], 4);
+}
+
+template <typename Pixel>
+inline int BoxFilterPass2Kernel(const Pixel src, const uint16_t* const ma343[3],
+ const uint16_t* const ma444,
+ const uint32_t* const b343[3],
+ const uint32_t* const b444, const ptrdiff_t x) {
+ const uint32_t ma = ma343[0][x] + ma444[x] + ma343[2][x];
+ const uint32_t b = b343[0][x] + b444[x] + b343[2][x];
+ return CalculateFilteredOutput<Pixel>(src, ma, b, 5);
+}
+
+template <int bitdepth, typename Pixel>
+inline Pixel SelfGuidedFinal(const int src, const int v) {
+ // if radius_pass_0 == 0 and radius_pass_1 == 0, the range of v is:
+ // bits(u) + bits(w0/w1/w2) + 2 = bitdepth + 13.
+ // Then, range of s is bitdepth + 2. This is a rough estimation, taking the
+ // maximum value of each element.
+ const int s = src + RightShiftWithRounding(
+ v, kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ return static_cast<Pixel>(Clip3(s, 0, (1 << bitdepth) - 1));
+}
+
+template <int bitdepth, typename Pixel>
+inline Pixel SelfGuidedDoubleMultiplier(const int src, const int filter0,
+ const int filter1, const int16_t w0,
+ const int16_t w2) {
+ const int v = w0 * filter0 + w2 * filter1;
+ return SelfGuidedFinal<bitdepth, Pixel>(src, v);
+}
+
+template <int bitdepth, typename Pixel>
+inline Pixel SelfGuidedSingleMultiplier(const int src, const int filter,
+ const int16_t w0) {
+ const int v = w0 * filter;
+ return SelfGuidedFinal<bitdepth, Pixel>(src, v);
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterPass1(const Pixel* const src, const ptrdiff_t stride,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width,
+ const uint32_t scale, const int16_t w0,
+ SgrBuffer* const sgr_buffer,
+ uint16_t* const ma565[2], uint32_t* const b565[2],
+ Pixel* dst) {
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
+ ma565[1], b565[1]);
+ int x = 0;
+ do {
+ int p[2];
+ BoxFilterPass1Kernel<Pixel>(src[x], src[stride + x], ma565, b565, x, p);
+ dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p[0], w0);
+ dst[stride + x] =
+ SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[stride + x], p[1], w0);
+ } while (++x != width);
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterPass2(const Pixel* const src, const Pixel* const src0,
+ const int width, const uint16_t scale,
+ const int16_t w0, uint16_t* const sum3[4],
+ uint32_t* const square_sum3[4],
+ SgrBuffer* const sgr_buffer,
+ uint16_t* const ma343[4], uint16_t* const ma444[3],
+ uint32_t* const b343[4], uint32_t* const b444[3],
+ Pixel* dst) {
+ BoxSum<Pixel, 3>(src0, 0, 1, width + 2, sum3 + 2, square_sum3 + 2);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true,
+ sgr_buffer, ma343[2], b343[2], ma444[1],
+ b444[1]);
+ int x = 0;
+ do {
+ const int p =
+ BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
+ dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0);
+ } while (++x != width);
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilter(const Pixel* const src, const ptrdiff_t stride,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], const int width,
+ const uint16_t scales[2], const int16_t w0,
+ const int16_t w2, SgrBuffer* const sgr_buffer,
+ uint16_t* const ma343[4], uint16_t* const ma444[3],
+ uint16_t* const ma565[2], uint32_t* const b343[4],
+ uint32_t* const b444[3], uint32_t* const b565[2],
+ Pixel* dst) {
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
+ sgr_buffer, ma565[1], b565[1]);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], true,
+ sgr_buffer, ma343[2], b343[2], ma444[1],
+ b444[1]);
+ BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
+ true, sgr_buffer, ma343[3], b343[3], ma444[2],
+ b444[2]);
+ int x = 0;
+ do {
+ int p[2][2];
+ BoxFilterPass1Kernel<Pixel>(src[x], src[stride + x], ma565, b565, x, p[0]);
+ p[1][0] =
+ BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
+ p[1][1] = BoxFilterPass2Kernel<Pixel>(src[stride + x], ma343 + 1, ma444[1],
+ b343 + 1, b444[1], x);
+ dst[x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p[0][0],
+ p[1][0], w0, w2);
+ dst[stride + x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(
+ src[stride + x], p[0][1], p[1][1], w0, w2);
+ } while (++x != width);
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info,
+ const Pixel* src, const Pixel* const top_border,
+ const Pixel* bottom_border, const ptrdiff_t stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, Pixel* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const ptrdiff_t sum_stride = temp_stride + 8;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum<Pixel>(top_border, stride, 2, width + 2, sum3, sum5 + 1, square_sum3,
+ square_sum5 + 1);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ BoxSum<Pixel>(src, stride, 1, width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2,
+ square_sum5 + 3);
+ const Pixel* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSum<Pixel>(s, 0, 1, width + 2, sum3 + 3, sum5 + 4, square_sum3 + 3,
+ square_sum5 + 4);
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
+ sgr_buffer, ma565[0], b565[0]);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false,
+ sgr_buffer, ma343[0], b343[0], nullptr,
+ nullptr);
+ BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
+ true, sgr_buffer, ma343[1], b343[1], ma444[0],
+ b444[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxSum<Pixel>(src + 2 * stride, stride, 2, width + 2, sum3 + 2, sum5 + 3,
+ square_sum3 + 2, square_sum5 + 3);
+ BoxFilter<bitdepth, Pixel>(src + 3, stride, sum3, sum5, square_sum3,
+ square_sum5, width, scales, w0, w2, sgr_buffer,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const Pixel* sr;
+ ptrdiff_t s_stride;
+ if ((height & 1) == 0) {
+ sr = bottom_border;
+ s_stride = stride;
+ } else {
+ sr = src + 2 * stride;
+ s_stride = bottom_border - (src + 2 * stride);
+ }
+ BoxSum<Pixel>(sr, s_stride, 2, width + 2, sum3 + 2, sum5 + 3,
+ square_sum3 + 2, square_sum5 + 3);
+ BoxFilter<bitdepth, Pixel>(src + 3, stride, sum3, sum5, square_sum3,
+ square_sum5, width, scales, w0, w2, sgr_buffer,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxSum<Pixel>(bottom_border + stride, stride, 1, width + 2, sum3 + 2,
+ sum5 + 3, square_sum3 + 2, square_sum5 + 3);
+ sum5[4] = sum5[3];
+ square_sum5[4] = square_sum5[3];
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
+ sgr_buffer, ma565[1], b565[1]);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false,
+ sgr_buffer, ma343[2], b343[2], nullptr,
+ nullptr);
+ int x = 0;
+ do {
+ const int p0 = CalculateFilteredOutput<Pixel>(
+ src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5);
+ const int p1 = BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343,
+ b444[0], x);
+ dst[x] =
+ SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p0, p1, w0, w2);
+ } while (++x != width);
+ }
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const Pixel* src,
+ const Pixel* const top_border,
+ const Pixel* bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height, SgrBuffer* const sgr_buffer,
+ Pixel* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const ptrdiff_t sum_stride = temp_stride + 8;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<Pixel, 5>(top_border, stride, 2, width + 2, sum5 + 1, square_sum5 + 1);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ BoxSum<Pixel, 5>(src, stride, 1, width + 2, sum5 + 3, square_sum5 + 3);
+ const Pixel* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSum<Pixel, 5>(s, 0, 1, width + 2, sum5 + 4, square_sum5 + 4);
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
+ ma565[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxSum<Pixel, 5>(src + 2 * stride, stride, 2, width + 2, sum5 + 3,
+ square_sum5 + 3);
+ BoxFilterPass1<bitdepth, Pixel>(src + 3, stride, sum5, square_sum5, width,
+ scale, w0, sgr_buffer, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const Pixel* sr;
+ ptrdiff_t s_stride;
+ if ((height & 1) == 0) {
+ sr = bottom_border;
+ s_stride = stride;
+ } else {
+ sr = src + 2 * stride;
+ s_stride = bottom_border - (src + 2 * stride);
+ }
+ BoxSum<Pixel, 5>(sr, s_stride, 2, width + 2, sum5 + 3, square_sum5 + 3);
+ BoxFilterPass1<bitdepth, Pixel>(src + 3, stride, sum5, square_sum5, width,
+ scale, w0, sgr_buffer, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxSum<Pixel, 5>(bottom_border + stride, stride, 1, width + 2, sum5 + 3,
+ square_sum5 + 3);
+ sum5[4] = sum5[3];
+ square_sum5[4] = square_sum5[3];
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
+ ma565[1], b565[1]);
+ int x = 0;
+ do {
+ const int p = CalculateFilteredOutput<Pixel>(
+ src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5);
+ dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0);
+ } while (++x != width);
+ }
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const Pixel* src,
+ const Pixel* const top_border,
+ const Pixel* bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height, SgrBuffer* const sgr_buffer,
+ Pixel* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const ptrdiff_t sum_stride = temp_stride + 8;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<Pixel, 3>(top_border, stride, 2, width + 2, sum3, square_sum3);
+ BoxSum<Pixel, 3>(src, stride, 1, width + 2, sum3 + 2, square_sum3 + 2);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, false,
+ sgr_buffer, ma343[0], b343[0], nullptr,
+ nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const Pixel* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += stride;
+ }
+ BoxSum<Pixel, 3>(s, 0, 1, width + 2, sum3 + 2, square_sum3 + 2);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true,
+ sgr_buffer, ma343[1], b343[1], ma444[0],
+ b444[0]);
+
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2<bitdepth, Pixel>(src + 2, src + 2 * stride, width, scale, w0,
+ sum3, square_sum3, sgr_buffer, ma343, ma444,
+ b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ src += 2;
+ int y = std::min(height, 2);
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2<bitdepth, Pixel>(src, bottom_border, width, scale, w0, sum3,
+ square_sum3, sgr_buffer, ma343, ma444, b343,
+ b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
+
+template <int bitdepth, typename Pixel>
+void SelfGuidedFilter_C(const RestorationUnitInfo& restoration_info,
+ const void* const source, const void* const top_border,
+ const void* const bottom_border, const ptrdiff_t stride,
+ const int width, const int height,
+ RestorationBuffer* const restoration_buffer,
+ void* const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* src = static_cast<const Pixel*>(source);
+ const auto* top = static_cast<const Pixel*>(top_border);
+ const auto* bottom = static_cast<const Pixel*>(bottom_border);
+ auto* dst = static_cast<Pixel*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1<bitdepth, Pixel>(restoration_info, src - 3, top - 3,
+ bottom - 3, stride, width, height,
+ sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2<bitdepth, Pixel>(restoration_info, src - 2, top - 2,
+ bottom - 2, stride, width, height,
+ sgr_buffer, dst);
+ } else {
+ BoxFilterProcess<bitdepth, Pixel>(restoration_info, src - 3, top - 3,
+ bottom - 3, stride, width, height,
+ sgr_buffer, dst);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->loop_restorations[0] = WienerFilter_C<8, uint8_t>;
+ dsp->loop_restorations[1] = SelfGuidedFilter_C<8, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_WienerFilter
+ dsp->loop_restorations[0] = WienerFilter_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
+ dsp->loop_restorations[1] = SelfGuidedFilter_C<8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->loop_restorations[0] = WienerFilter_C<10, uint16_t>;
+ dsp->loop_restorations[1] = SelfGuidedFilter_C<10, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+ dsp->loop_restorations[0] = WienerFilter_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter
+ dsp->loop_restorations[1] = SelfGuidedFilter_C<10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+} // namespace
+
+void LoopRestorationInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/loop_restoration.h b/src/dsp/loop_restoration.h
new file mode 100644
index 0000000..de80926
--- /dev/null
+++ b/src/dsp/loop_restoration.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_LOOP_RESTORATION_H_
+#define LIBGAV1_SRC_DSP_LOOP_RESTORATION_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/loop_restoration_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/loop_restoration_avx2.h"
+#include "src/dsp/x86/loop_restoration_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+enum {
+ // Precision of a division table (mtable)
+ kSgrProjScaleBits = 20,
+ kSgrProjReciprocalBits = 12,
+ // Core self-guided restoration precision bits.
+ kSgrProjSgrBits = 8,
+  // Number of extra precision bits, relative to the source, in the generated
+  // values before projection.
+ kSgrProjRestoreBits = 4
+}; // anonymous enum
+
+extern const uint8_t kSgrMaLookup[256];
+
+// Initializes Dsp::loop_restorations. This function is not thread-safe.
+void LoopRestorationInit_C();
+
+template <typename T>
+void Circulate3PointersBy1(T* p[3]) {
+ T* const p0 = p[0];
+ p[0] = p[1];
+ p[1] = p[2];
+ p[2] = p0;
+}
+
+template <typename T>
+void Circulate4PointersBy2(T* p[4]) {
+ std::swap(p[0], p[2]);
+ std::swap(p[1], p[3]);
+}
+
+template <typename T>
+void Circulate5PointersBy2(T* p[5]) {
+ T* const p0 = p[0];
+ T* const p1 = p[1];
+ p[0] = p[2];
+ p[1] = p[3];
+ p[2] = p[4];
+ p[3] = p0;
+ p[4] = p1;
+}
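+
+// For example, starting from {p0, p1, p2, p3, p4}, Circulate5PointersBy2()
+// leaves the array as {p2, p3, p4, p0, p1}, i.e. a rotation by two positions
+// matching the two-row advance per iteration of the box filter loops.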
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_LOOP_RESTORATION_H_
diff --git a/src/dsp/mask_blend.cc b/src/dsp/mask_blend.cc
new file mode 100644
index 0000000..101c410
--- /dev/null
+++ b/src/dsp/mask_blend.cc
@@ -0,0 +1,207 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/mask_blend.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int subsampling_x, int subsampling_y>
+uint8_t GetMaskValue(const uint8_t* mask, const uint8_t* mask_next_row, int x) {
+ if ((subsampling_x | subsampling_y) == 0) {
+ return mask[x];
+ }
+ if (subsampling_x == 1 && subsampling_y == 0) {
+ return static_cast<uint8_t>(RightShiftWithRounding(
+ mask[MultiplyBy2(x)] + mask[MultiplyBy2(x) + 1], 1));
+ }
+ assert(subsampling_x == 1 && subsampling_y == 1);
+ return static_cast<uint8_t>(RightShiftWithRounding(
+ mask[MultiplyBy2(x)] + mask[MultiplyBy2(x) + 1] +
+ mask_next_row[MultiplyBy2(x)] + mask_next_row[MultiplyBy2(x) + 1],
+ 2));
+}
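+
+// For example, with 4:2:0 subsampling a 2x2 mask block of {20, 24, 28, 32}
+// yields RightShiftWithRounding(104, 2) = 26.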
+
+template <int bitdepth, typename Pixel, bool is_inter_intra, int subsampling_x,
+ int subsampling_y>
+void MaskBlend_C(const void* prediction_0, const void* prediction_1,
+ const ptrdiff_t prediction_stride_1, const uint8_t* mask,
+ const ptrdiff_t mask_stride, const int width, const int height,
+ void* dest, const ptrdiff_t dest_stride) {
+ static_assert(!(bitdepth == 8 && is_inter_intra), "");
+ assert(mask != nullptr);
+ using PredType =
+ typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+ const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+ const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+ auto* dst = static_cast<Pixel*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
+ constexpr int step_y = subsampling_y ? 2 : 1;
+ const uint8_t* mask_next_row = mask + mask_stride;
+ // 7.11.3.2 Rounding variables derivation process
+ // 2 * FILTER_BITS(7) - (InterRound0(3|5) + InterRound1(7))
+ constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4;
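+  // That is, 14 - (5 + 7) = 2 for 12bpp and 14 - (3 + 7) = 4 otherwise.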
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ const uint8_t mask_value =
+ GetMaskValue<subsampling_x, subsampling_y>(mask, mask_next_row, x);
+ if (is_inter_intra) {
+ dst[x] = static_cast<Pixel>(RightShiftWithRounding(
+ mask_value * pred_1[x] + (64 - mask_value) * pred_0[x], 6));
+ } else {
+ assert(prediction_stride_1 == width);
+ int res = (mask_value * pred_0[x] + (64 - mask_value) * pred_1[x]) >> 6;
+ res -= (bitdepth == 8) ? 0 : kCompoundOffset;
+ dst[x] = static_cast<Pixel>(
+ Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+ (1 << bitdepth) - 1));
+ }
+ }
+ dst += dst_stride;
+ mask += mask_stride * step_y;
+ mask_next_row += mask_stride * step_y;
+ pred_0 += width;
+ pred_1 += prediction_stride_1;
+ }
+}
+
+template <int subsampling_x, int subsampling_y>
+void InterIntraMaskBlend8bpp_C(const uint8_t* prediction_0,
+ uint8_t* prediction_1,
+ const ptrdiff_t prediction_stride_1,
+ const uint8_t* mask, const ptrdiff_t mask_stride,
+ const int width, const int height) {
+ assert(mask != nullptr);
+ constexpr int step_y = subsampling_y ? 2 : 1;
+ const uint8_t* mask_next_row = mask + mask_stride;
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ const uint8_t mask_value =
+ GetMaskValue<subsampling_x, subsampling_y>(mask, mask_next_row, x);
+ prediction_1[x] = static_cast<uint8_t>(RightShiftWithRounding(
+ mask_value * prediction_1[x] + (64 - mask_value) * prediction_0[x],
+ 6));
+ }
+ mask += mask_stride * step_y;
+ mask_next_row += mask_stride * step_y;
+ prediction_0 += width;
+ prediction_1 += prediction_stride_1;
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->mask_blend[0][0] = MaskBlend_C<8, uint8_t, false, 0, 0>;
+ dsp->mask_blend[1][0] = MaskBlend_C<8, uint8_t, false, 1, 0>;
+ dsp->mask_blend[2][0] = MaskBlend_C<8, uint8_t, false, 1, 1>;
+ // The is_inter_intra index of mask_blend[][] is replaced by
+ // inter_intra_mask_blend_8bpp[] in 8-bit.
+ dsp->mask_blend[0][1] = nullptr;
+ dsp->mask_blend[1][1] = nullptr;
+ dsp->mask_blend[2][1] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_C<0, 0>;
+ dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_C<1, 0>;
+ dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_C<1, 1>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend444
+ dsp->mask_blend[0][0] = MaskBlend_C<8, uint8_t, false, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend422
+ dsp->mask_blend[1][0] = MaskBlend_C<8, uint8_t, false, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend420
+ dsp->mask_blend[2][0] = MaskBlend_C<8, uint8_t, false, 1, 1>;
+#endif
+ // The is_inter_intra index of mask_blend[][] is replaced by
+ // inter_intra_mask_blend_8bpp[] in 8-bit.
+ dsp->mask_blend[0][1] = nullptr;
+ dsp->mask_blend[1][1] = nullptr;
+ dsp->mask_blend[2][1] = nullptr;
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444
+ dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_C<0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422
+ dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_C<1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420
+ dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_C<1, 1>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->mask_blend[0][0] = MaskBlend_C<10, uint16_t, false, 0, 0>;
+ dsp->mask_blend[1][0] = MaskBlend_C<10, uint16_t, false, 1, 0>;
+ dsp->mask_blend[2][0] = MaskBlend_C<10, uint16_t, false, 1, 1>;
+ dsp->mask_blend[0][1] = MaskBlend_C<10, uint16_t, true, 0, 0>;
+ dsp->mask_blend[1][1] = MaskBlend_C<10, uint16_t, true, 1, 0>;
+ dsp->mask_blend[2][1] = MaskBlend_C<10, uint16_t, true, 1, 1>;
+ // These are only used with 8-bit.
+ dsp->inter_intra_mask_blend_8bpp[0] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[1] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[2] = nullptr;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend444
+ dsp->mask_blend[0][0] = MaskBlend_C<10, uint16_t, false, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend422
+ dsp->mask_blend[1][0] = MaskBlend_C<10, uint16_t, false, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend420
+ dsp->mask_blend[2][0] = MaskBlend_C<10, uint16_t, false, 1, 1>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra444
+ dsp->mask_blend[0][1] = MaskBlend_C<10, uint16_t, true, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra422
+ dsp->mask_blend[1][1] = MaskBlend_C<10, uint16_t, true, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra420
+ dsp->mask_blend[2][1] = MaskBlend_C<10, uint16_t, true, 1, 1>;
+#endif
+ // These are only used with 8-bit.
+ dsp->inter_intra_mask_blend_8bpp[0] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[1] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[2] = nullptr;
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+} // namespace
+
+void MaskBlendInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/mask_blend.h b/src/dsp/mask_blend.h
new file mode 100644
index 0000000..41f5e5b
--- /dev/null
+++ b/src/dsp/mask_blend.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_MASK_BLEND_H_
+#define LIBGAV1_SRC_DSP_MASK_BLEND_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/mask_blend_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+// SSE4_1
+#include "src/dsp/x86/mask_blend_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mask_blend and Dsp::inter_intra_mask_blend_8bpp. This
+// function is not thread-safe.
+void MaskBlendInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_MASK_BLEND_H_
diff --git a/src/dsp/motion_field_projection.cc b/src/dsp/motion_field_projection.cc
new file mode 100644
index 0000000..b51ec8f
--- /dev/null
+++ b/src/dsp/motion_field_projection.cc
@@ -0,0 +1,138 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/reference_info.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Silence unused function warnings when MotionFieldProjectionKernel_C is
+// not used.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel) || \
+ (LIBGAV1_MAX_BITDEPTH >= 10 && \
+ !defined(LIBGAV1_Dsp10bpp_MotionFieldProjectionKernel))
+
+// 7.9.2.
+void MotionFieldProjectionKernel_C(const ReferenceInfo& reference_info,
+ int reference_to_current_with_sign,
+ int dst_sign, int y8_start, int y8_end,
+ int x8_start, int x8_end,
+ TemporalMotionField* motion_field) {
+ const ptrdiff_t stride = motion_field->mv.columns();
+  // The column range has to be extended by kProjectionMvMaxHorizontalOffset on
+  // either side, since coordinates in that extra range can still project to a
+  // position_x8 within [x8_start, x8_end).
+ const int adjusted_x8_start =
+ std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
+ const int adjusted_x8_end = std::min(
+ x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
+ const int8_t* const reference_offsets =
+ reference_info.relative_distance_to.data();
+ const bool* const skip_references = reference_info.skip_references.data();
+ const int16_t* const projection_divisions =
+ reference_info.projection_divisions.data();
+ const ReferenceFrameType* source_reference_types =
+ &reference_info.motion_field_reference_frame[y8_start][0];
+ const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
+ int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
+ MotionVector* dst_mv = motion_field->mv[y8_start];
+ assert(stride == motion_field->reference_offset.columns());
+ assert((y8_start & 7) == 0);
+
+ int y8 = y8_start;
+ do {
+ const int y8_floor = (y8 & ~7) - y8;
+ const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8);
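+    // |y8_floor| and |y8_ceiling| are relative to |y8| and confine the
+    // projected |position_y8| to the current group of 8 rows (64 luma rows),
+    // mirroring the x8 clamping below.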
+ int x8 = adjusted_x8_start;
+ do {
+ const int source_reference_type = source_reference_types[x8];
+ if (skip_references[source_reference_type]) continue;
+ MotionVector projection_mv;
+ // reference_to_current_with_sign could be 0.
+ GetMvProjection(mv[x8], reference_to_current_with_sign,
+ projection_divisions[source_reference_type],
+ &projection_mv);
+ // Do not update the motion vector if the block position is not valid or
+ // if position_x8 is outside the current range of x8_start and x8_end.
+ // Note that position_y8 will always be within the range of y8_start and
+ // y8_end.
+ const int position_y8 = Project(0, projection_mv.mv[0], dst_sign);
+ if (position_y8 < y8_floor || position_y8 >= y8_ceiling) continue;
+ const int x8_base = x8 & ~7;
+ const int x8_floor =
+ std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset);
+ const int x8_ceiling =
+ std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset);
+ const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign);
+ if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue;
+ dst_mv[position_y8 * stride + position_x8] = mv[x8];
+ dst_reference_offset[position_y8 * stride + position_x8] =
+ reference_offsets[source_reference_type];
+ } while (++x8 < adjusted_x8_end);
+ source_reference_types += stride;
+ mv += stride;
+ dst_reference_offset += stride;
+ dst_mv += stride;
+ } while (++y8 < y8_end);
+}
+
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
+ // !defined(LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel) ||
+ // (LIBGAV1_MAX_BITDEPTH >= 10 &&
+ // !defined(LIBGAV1_Dsp10bpp_MotionFieldProjectionKernel))
+
+void Init8bpp() {
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel)
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+ dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_C;
+#endif
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp10bpp_MotionFieldProjectionKernel)
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+ dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_C;
+#endif
+}
+#endif
+
+} // namespace
+
+void MotionFieldProjectionInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/motion_field_projection.h b/src/dsp/motion_field_projection.h
new file mode 100644
index 0000000..36de459
--- /dev/null
+++ b/src/dsp/motion_field_projection.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_MOTION_FIELD_PROJECTION_H_
+#define LIBGAV1_SRC_DSP_MOTION_FIELD_PROJECTION_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/motion_field_projection_neon.h"
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+// SSE4_1
+#include "src/dsp/x86/motion_field_projection_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::motion_field_projection_kernel. This function is not
+// thread-safe.
+void MotionFieldProjectionInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_MOTION_FIELD_PROJECTION_H_
diff --git a/src/dsp/motion_vector_search.cc b/src/dsp/motion_vector_search.cc
new file mode 100644
index 0000000..9402302
--- /dev/null
+++ b/src/dsp/motion_vector_search.cc
@@ -0,0 +1,211 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Silence unused function warnings when the C functions are not used.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp8bpp_MotionVectorSearch) || \
+ (LIBGAV1_MAX_BITDEPTH >= 10 && \
+ !defined(LIBGAV1_Dsp10bpp_MotionVectorSearch))
+
+void MvProjectionCompoundLowPrecision_C(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* const candidate_mvs) {
+  // To help the compiler, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ int index = 0;
+ do {
+ candidate_mvs[index].mv64 = 0;
+ for (int i = 0; i < 2; ++i) {
+      // The |offsets| non-zero check is usually true and could be skipped.
+ if (offsets[i] != 0) {
+ GetMvProjection(
+ temporal_mvs[index], offsets[i],
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index].mv[i]);
+ for (auto& mv : candidate_mvs[index].mv[i].mv) {
+ // The next line is equivalent to:
+ // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;
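+          // For example, mv = 5 -> 4 and mv = -5 -> -4.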
+ mv = (mv - (mv >> 15)) & ~1;
+ }
+ }
+ }
+ } while (++index < count);
+}
+
+void MvProjectionCompoundForceInteger_C(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* const candidate_mvs) {
+  // To help the compiler, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ int index = 0;
+ do {
+ candidate_mvs[index].mv64 = 0;
+ for (int i = 0; i < 2; ++i) {
+      // The |offsets| non-zero check is usually true and could be skipped.
+ if (offsets[i] != 0) {
+ GetMvProjection(
+ temporal_mvs[index], offsets[i],
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index].mv[i]);
+ for (auto& mv : candidate_mvs[index].mv[i].mv) {
+ // The next line is equivalent to:
+ // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7;
+ // const int sign = mv >> 15;
+ // mv = ApplySign(value, sign);
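+          // For example, mv = 5 -> 8, mv = -5 -> -8 and mv = 4 -> 0.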
+ mv = (mv + 3 - (mv >> 15)) & ~7;
+ }
+ }
+ }
+ } while (++index < count);
+}
+
+void MvProjectionCompoundHighPrecision_C(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* const candidate_mvs) {
+  // To help the compiler, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ int index = 0;
+ do {
+ candidate_mvs[index].mv64 = 0;
+ for (int i = 0; i < 2; ++i) {
+      // The |offsets| non-zero check is usually true and could be skipped.
+ if (offsets[i] != 0) {
+ GetMvProjection(
+ temporal_mvs[index], offsets[i],
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index].mv[i]);
+ }
+ }
+ } while (++index < count);
+}
+
+void MvProjectionSingleLowPrecision_C(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets, const int reference_offset,
+ const int count, MotionVector* const candidate_mvs) {
+ int index = 0;
+ do {
+ GetMvProjection(
+ temporal_mvs[index], reference_offset,
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index]);
+ for (auto& mv : candidate_mvs[index].mv) {
+ // The next line is equivalent to:
+ // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;
+ mv = (mv - (mv >> 15)) & ~1;
+ }
+ } while (++index < count);
+}
+
+void MvProjectionSingleForceInteger_C(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets, const int reference_offset,
+ const int count, MotionVector* const candidate_mvs) {
+ int index = 0;
+ do {
+ GetMvProjection(
+ temporal_mvs[index], reference_offset,
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index]);
+ for (auto& mv : candidate_mvs[index].mv) {
+ // The next line is equivalent to:
+ // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7;
+ // const int sign = mv >> 15;
+ // mv = ApplySign(value, sign);
+ mv = (mv + 3 - (mv >> 15)) & ~7;
+ }
+ } while (++index < count);
+}
+
+void MvProjectionSingleHighPrecision_C(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets, const int reference_offset,
+ const int count, MotionVector* const candidate_mvs) {
+ int index = 0;
+ do {
+ GetMvProjection(
+ temporal_mvs[index], reference_offset,
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index]);
+ } while (++index < count);
+}
+
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
+ // !defined(LIBGAV1_Dsp8bpp_MotionVectorSearch) ||
+ // (LIBGAV1_MAX_BITDEPTH >= 10 &&
+ // !defined(LIBGAV1_Dsp10bpp_MotionVectorSearch))
+
+void Init8bpp() {
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp8bpp_MotionVectorSearch)
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+ dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_C;
+ dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_C;
+ dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_C;
+ dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_C;
+ dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_C;
+ dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_C;
+#endif
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp10bpp_MotionVectorSearch)
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+ dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_C;
+ dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_C;
+ dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_C;
+ dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_C;
+ dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_C;
+ dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_C;
+#endif
+}
+#endif
+
+} // namespace
+
+void MotionVectorSearchInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/motion_vector_search.h b/src/dsp/motion_vector_search.h
new file mode 100644
index 0000000..ae16726
--- /dev/null
+++ b/src/dsp/motion_vector_search.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_MOTION_VECTOR_SEARCH_H_
+#define LIBGAV1_SRC_DSP_MOTION_VECTOR_SEARCH_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/motion_vector_search_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+// SSE4_1
+#include "src/dsp/x86/motion_vector_search_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This
+// function is not thread-safe.
+void MotionVectorSearchInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_MOTION_VECTOR_SEARCH_H_
diff --git a/src/dsp/obmc.cc b/src/dsp/obmc.cc
new file mode 100644
index 0000000..46d1b5b
--- /dev/null
+++ b/src/dsp/obmc.cc
@@ -0,0 +1,125 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
+// 7.11.3.10 (from top samples).
+template <typename Pixel>
+void OverlapBlendVertical_C(void* const prediction,
+ const ptrdiff_t prediction_stride, const int width,
+ const int height, const void* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<Pixel*>(prediction);
+ const ptrdiff_t pred_stride = prediction_stride / sizeof(Pixel);
+ const auto* obmc_pred = static_cast<const Pixel*>(obmc_prediction);
+ const ptrdiff_t obmc_pred_stride = obmc_prediction_stride / sizeof(Pixel);
+ const uint8_t* const mask = kObmcMask + height - 2;
+
+ for (int y = 0; y < height; ++y) {
+ const uint8_t mask_value = mask[y];
+ for (int x = 0; x < width; ++x) {
+ pred[x] = static_cast<Pixel>(RightShiftWithRounding(
+ mask_value * pred[x] + (64 - mask_value) * obmc_pred[x], 6));
+ }
+ pred += pred_stride;
+ obmc_pred += obmc_pred_stride;
+ }
+}
+
+// 7.11.3.10 (from left samples).
+template <typename Pixel>
+void OverlapBlendHorizontal_C(void* const prediction,
+ const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<Pixel*>(prediction);
+ const ptrdiff_t pred_stride = prediction_stride / sizeof(Pixel);
+ const auto* obmc_pred = static_cast<const Pixel*>(obmc_prediction);
+ const ptrdiff_t obmc_pred_stride = obmc_prediction_stride / sizeof(Pixel);
+ const uint8_t* const mask = kObmcMask + width - 2;
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ const uint8_t mask_value = mask[x];
+ pred[x] = static_cast<Pixel>(RightShiftWithRounding(
+ mask_value * pred[x] + (64 - mask_value) * obmc_pred[x], 6));
+ }
+ pred += pred_stride;
+ obmc_pred += obmc_pred_stride;
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint8_t>;
+ dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendHorizontal_C<uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_ObmcVertical
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ObmcHorizontal
+ dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendHorizontal_C<uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>;
+ dsp->obmc_blend[kObmcDirectionHorizontal] =
+ OverlapBlendHorizontal_C<uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_ObmcVertical
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ObmcHorizontal
+ dsp->obmc_blend[kObmcDirectionHorizontal] =
+ OverlapBlendHorizontal_C<uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+} // namespace
+
+void ObmcInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/obmc.h b/src/dsp/obmc.h
new file mode 100644
index 0000000..3b826c7
--- /dev/null
+++ b/src/dsp/obmc.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_OBMC_H_
+#define LIBGAV1_SRC_DSP_OBMC_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/obmc_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/obmc_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::obmc_blend. This function is not thread-safe.
+void ObmcInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_OBMC_H_
diff --git a/src/dsp/obmc.inc b/src/dsp/obmc.inc
new file mode 100644
index 0000000..001c6ee
--- /dev/null
+++ b/src/dsp/obmc.inc
@@ -0,0 +1,32 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants and utility functions used for overlap blend implementations.
+// This will be included inside an anonymous namespace on files where these are
+// necessary.
+
+// This is a flat array of masks for each block dimension from 2 to 32. The
+// starting index for each length is length-2.
+constexpr uint8_t kObmcMask[62] = {
+ // Obmc Mask 2
+ 45, 64,
+ // Obmc Mask 4
+ 39, 50, 59, 64,
+ // Obmc Mask 8
+ 36, 42, 48, 53, 57, 61, 64, 64,
+ // Obmc Mask 16
+ 34, 37, 40, 43, 46, 49, 52, 54, 56, 58, 60, 61, 64, 64, 64, 64,
+ // Obmc Mask 32
+ 33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58,
+ 59, 60, 60, 61, 62, 64, 64, 64, 64, 64, 64, 64, 64};
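+
+// For example, a blend over 8 rows or columns starts at kObmcMask + 8 - 2 and
+// reads the "Obmc Mask 8" entries {36, 42, 48, 53, 57, 61, 64, 64} above.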
diff --git a/src/dsp/super_res.cc b/src/dsp/super_res.cc
new file mode 100644
index 0000000..d041bd1
--- /dev/null
+++ b/src/dsp/super_res.cc
@@ -0,0 +1,109 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/super_res.h"
+
+#include <cassert>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int bitdepth, typename Pixel>
+void SuperRes_C(const void* /*coefficients*/, void* const source,
+ const ptrdiff_t stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const dest) {
+ assert(step <= 1 << kSuperResScaleBits);
+ auto* src = static_cast<Pixel*>(source) - DivideBy2(kSuperResFilterTaps);
+ auto* dst = static_cast<Pixel*>(dest);
+ int y = height;
+ do {
+ ExtendLine<Pixel>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+    // If the (original) upscaled_width is <= 9, the downscaled_width may be
+    // upscaled_width - 1 (i.e. 8, 9), and the two may become the same (i.e. 4)
+    // when subsampled via RightShiftWithRounding. This leads to an edge case
+    // where |step| == 1 << 14.
+ int subpixel_x = initial_subpixel_x;
+ int x = 0;
+ do {
+ int sum = 0;
+ const Pixel* const src_x = &src[subpixel_x >> kSuperResScaleBits];
+ const int src_x_subpixel =
+ (subpixel_x & kSuperResScaleMask) >> kSuperResExtraBits;
+ // The sign of each tap is: - + - + + - + -
+ sum -= src_x[0] * kUpscaleFilterUnsigned[src_x_subpixel][0];
+ sum += src_x[1] * kUpscaleFilterUnsigned[src_x_subpixel][1];
+ sum -= src_x[2] * kUpscaleFilterUnsigned[src_x_subpixel][2];
+ sum += src_x[3] * kUpscaleFilterUnsigned[src_x_subpixel][3];
+ sum += src_x[4] * kUpscaleFilterUnsigned[src_x_subpixel][4];
+ sum -= src_x[5] * kUpscaleFilterUnsigned[src_x_subpixel][5];
+ sum += src_x[6] * kUpscaleFilterUnsigned[src_x_subpixel][6];
+ sum -= src_x[7] * kUpscaleFilterUnsigned[src_x_subpixel][7];
+ dst[x] = Clip3(RightShiftWithRounding(sum, kFilterBits), 0,
+ (1 << bitdepth) - 1);
+ subpixel_x += step;
+ } while (++x < upscaled_width);
+ src += stride;
+ dst += stride;
+ } while (--y != 0);
+}
+
+void Init8bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+ dsp->super_res_coefficients = nullptr;
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->super_res = SuperRes_C<8, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_SuperRes
+ dsp->super_res = SuperRes_C<8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+ dsp->super_res_coefficients = nullptr;
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->super_res = SuperRes_C<10, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_SuperRes
+ dsp->super_res = SuperRes_C<10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+} // namespace
+
+void SuperResInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/super_res.h b/src/dsp/super_res.h
new file mode 100644
index 0000000..2ca9d2b
--- /dev/null
+++ b/src/dsp/super_res.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_SUPER_RES_H_
+#define LIBGAV1_SRC_DSP_SUPER_RES_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/super_res_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/super_res_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::super_res. This function is not thread-safe.
+void SuperResInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_SUPER_RES_H_
diff --git a/src/dsp/warp.cc b/src/dsp/warp.cc
new file mode 100644
index 0000000..fbde65a
--- /dev/null
+++ b/src/dsp/warp.cc
@@ -0,0 +1,475 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/warp.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <type_traits>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Number of extra bits of precision in warped filtering.
+constexpr int kWarpedDiffPrecisionBits = 10;
+
+// Warp prediction output ranges from WarpTest.ShowRange.
+// Bitdepth: 8 Input range: [ 0, 255]
+// 8bpp intermediate offset: 16384.
+// intermediate range: [ 4399, 61009]
+// first pass output range: [ 550, 7626]
+// 8bpp intermediate offset removal: 262144.
+// intermediate range: [ -620566, 1072406]
+// second pass output range: [ 0, 255]
+// compound second pass output range: [ -4848, 8378]
+//
+// Bitdepth: 10 Input range: [ 0, 1023]
+// intermediate range: [ -48081, 179025]
+// first pass output range: [ -6010, 22378]
+// intermediate range: [-2103516, 4198620]
+// second pass output range: [ 0, 1023]
+// compound second pass output range: [ 8142, 57378]
+//
+// Bitdepth: 12 Input range: [ 0, 4095]
+// intermediate range: [ -192465, 716625]
+// first pass output range: [ -6015, 22395]
+// intermediate range: [-2105190, 4201830]
+// second pass output range: [ 0, 4095]
+// compound second pass output range: [ 8129, 57403]
+
+template <bool is_compound, int bitdepth, typename Pixel>
+void Warp_C(const void* const source, ptrdiff_t source_stride,
+ const int source_width, const int source_height,
+ const int* const warp_params, const int subsampling_x,
+ const int subsampling_y, const int block_start_x,
+ const int block_start_y, const int block_width,
+ const int block_height, const int16_t alpha, const int16_t beta,
+ const int16_t gamma, const int16_t delta, void* dest,
+ ptrdiff_t dest_stride) {
+ assert(block_width >= 8 && block_height >= 8);
+ if (is_compound) {
+ assert(dest_stride == block_width);
+ }
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ constexpr int kRoundBitsVertical =
+ is_compound ? kInterRoundBitsCompoundVertical
+ : (bitdepth == 12) ? kInterRoundBitsVertical12bpp
+ : kInterRoundBitsVertical;
+
+  // Only used for 8bpp. Allows the first pass intermediates to be kept within
+  // uint16_t. With 10/12bpp the intermediate value always requires int32_t.
+ constexpr int first_pass_offset = (bitdepth == 8) ? 1 << 14 : 0;
+ constexpr int offset_removal =
+ (first_pass_offset >> kRoundBitsHorizontal) * 128;
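+  // Worked example (assuming kInterRoundBitsHorizontal == 3 for 8bpp): the
+  // offset is 1 << 14 = 16384, so offset_removal is (16384 >> 3) * 128 =
+  // 262144, matching the "8bpp intermediate offset removal" entry in the
+  // range table above.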
+
+ constexpr int kMaxPixel = (1 << bitdepth) - 1;
+ union {
+ // |intermediate_result| is the output of the horizontal filtering and
+ // rounding. The range is within int16_t.
+ int16_t intermediate_result[15][8]; // 15 rows, 8 columns.
+ // In the simple special cases where the samples in each row are all the
+ // same, store one sample per row in a column vector.
+ int16_t intermediate_result_column[15];
+ };
+ const auto* const src = static_cast<const Pixel*>(source);
+ source_stride /= sizeof(Pixel);
+ using DestType =
+ typename std::conditional<is_compound, uint16_t, Pixel>::type;
+ auto* dst = static_cast<DestType*>(dest);
+ if (!is_compound) dest_stride /= sizeof(dst[0]);
+
+ assert(block_width >= 8);
+ assert(block_height >= 8);
+
+ // Warp process applies for each 8x8 block (or smaller).
+ for (int start_y = block_start_y; start_y < block_start_y + block_height;
+ start_y += 8) {
+ for (int start_x = block_start_x; start_x < block_start_x + block_width;
+ start_x += 8) {
+ const int src_x = (start_x + 4) << subsampling_x;
+ const int src_y = (start_y + 4) << subsampling_y;
+ const int dst_x =
+ src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
+ const int dst_y =
+ src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
+ const int x4 = dst_x >> subsampling_x;
+ const int y4 = dst_y >> subsampling_y;
+ const int ix4 = x4 >> kWarpedModelPrecisionBits;
+ const int iy4 = y4 >> kWarpedModelPrecisionBits;
+
+ // A prediction block may fall outside the frame's boundaries. If a
+ // prediction block is calculated using only samples outside the frame's
+ // boundary, the filtering can be simplified. We can divide the plane
+ // into several regions and handle them differently.
+ //
+ // | |
+ // 1 | 3 | 1
+ // | |
+ // -------+-----------+-------
+ // |***********|
+ // 2 |*****4*****| 2
+ // |***********|
+ // -------+-----------+-------
+ // | |
+ // 1 | 3 | 1
+ // | |
+ //
+ // At the center, region 4 represents the frame and is the general case.
+ //
+ // In regions 1 and 2, the prediction block is outside the frame's
+ // boundary horizontally. Therefore the horizontal filtering can be
+ // simplified. Furthermore, in the region 1 (at the four corners), the
+ // prediction is outside the frame's boundary both horizontally and
+ // vertically, so we get a constant prediction block.
+ //
+ // In region 3, the prediction block is outside the frame's boundary
+      // vertically. Unfortunately, because we apply the horizontal filters
+      // first, by the time we apply the vertical filters, they no longer see
+ // simple inputs. So the only simplification is that all the rows are
+ // the same, but we still need to apply all the horizontal and vertical
+ // filters.
+
+ // Check for two simple special cases, where the horizontal filter can
+ // be significantly simplified.
+ //
+ // In general, for each row, the horizontal filter is calculated as
+ // follows:
+ // for (int x = -4; x < 4; ++x) {
+ // const int offset = ...;
+ // int sum = first_pass_offset;
+ // for (int k = 0; k < 8; ++k) {
+ // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
+ // sum += kWarpedFilters[offset][k] * src_row[column];
+ // }
+ // ...
+ // }
+ // The column index before clipping, ix4 + x + k - 3, varies in the range
+ // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
+ // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
+ // border index (source_width - 1 or 0, respectively). Then for each x,
+ // the inner for loop of the horizontal filter is reduced to multiplying
+ // the border pixel by the sum of the filter coefficients.
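+      // Sketch of that reduction (assuming each kWarpedFilters row sums to
+      // 1 << kFilterBits): the inner loop collapses to
+      //   sum = first_pass_offset + (border_pixel << kFilterBits);
+      // so, after rounding, the horizontal pass is just a shift of the border
+      // sample, as done in region 2 below.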
+ if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
+ // Regions 1 and 2.
+ // Points to the left or right border of the first row of |src|.
+ const Pixel* first_row_border =
+ (ix4 + 7 <= 0) ? src : src + source_width - 1;
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+ // Region 1.
+ // Every sample used to calculate the prediction block has the same
+ // value. So the whole prediction block has the same value.
+ const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const Pixel row_border_pixel = first_row_border[row * source_stride];
+ DestType* dst_row = dst + start_x - block_start_x;
+ if (is_compound) {
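+            // A constant input passes through both filter passes; each pass
+            // is assumed to scale by the tap sum (1 << kFilterBits) and then
+            // round, so the net result is a shift by
+            // 2 * kFilterBits - kRoundBitsHorizontal - kRoundBitsVertical,
+            // which is the expression below.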
+ int sum = row_border_pixel
+ << ((14 - kRoundBitsHorizontal) - kRoundBitsVertical);
+ sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+ Memset(dst_row, sum, 8);
+ } else {
+ Memset(dst_row, row_border_pixel, 8);
+ }
+ const DestType* const first_dst_row = dst_row;
+ dst_row += dest_stride;
+ for (int y = 1; y < 8; ++y) {
+ memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row));
+ dst_row += dest_stride;
+ }
+ // End of region 1. Continue the |start_x| for loop.
+ continue;
+ }
+
+ // Region 2.
+ // Horizontal filter.
+        // The input values in this region are generated by extending the
+        // border, which makes them identical in the horizontal direction. This
+ // computation could be inlined in the vertical pass but most
+ // implementations will need a transpose of some sort.
+ // It is not necessary to use the offset values here because the
+ // horizontal pass is a simple shift and the vertical pass will always
+ // require using 32 bits.
+ for (int y = -7; y < 8; ++y) {
+ // We may over-read up to 13 pixels above the top source row, or up
+ // to 13 pixels below the bottom source row. This is proved below.
+ const int row = iy4 + y;
+ int sum = first_row_border[row * source_stride];
+ sum <<= kFilterBits - kRoundBitsHorizontal;
+ intermediate_result_column[y + 7] = sum;
+ }
+ // Vertical filter.
+ DestType* dst_row = dst + start_x - block_start_x;
+ int sy4 =
+ (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+ for (int x = 0; x < 8; ++x) {
+ const int offset =
+ RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ assert(offset >= 0);
+ assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
+ int sum = 0;
+ for (int k = 0; k < 8; ++k) {
+ sum +=
+ kWarpedFilters[offset][k] * intermediate_result_column[y + k];
+ }
+ sum = RightShiftWithRounding(sum, kRoundBitsVertical);
+ if (is_compound) {
+ sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+ dst_row[x] = static_cast<DestType>(sum);
+ } else {
+ dst_row[x] = static_cast<DestType>(Clip3(sum, 0, kMaxPixel));
+ }
+ sy += gamma;
+ }
+ dst_row += dest_stride;
+ sy4 += delta;
+ }
+ // End of region 2. Continue the |start_x| for loop.
+ continue;
+ }
+
+ // Regions 3 and 4.
+ // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+ // It follows that -6 <= ix4 <= source_width + 5. This inequality is
+ // used below.
+
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+ // Region 3.
+ // Horizontal filter.
+ const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const Pixel* const src_row = src + row * source_stride;
+ int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ int sx = sx4 - MultiplyBy4(alpha);
+ for (int x = -4; x < 4; ++x) {
+ const int offset =
+ RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ // Since alpha and beta have been validated by SetupShear(), one
+ // can prove that 0 <= offset <= 3 * 2^6.
+ assert(offset >= 0);
+ assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
+ // For SIMD optimization:
+ // |first_pass_offset| guarantees the sum fits in uint16_t for 8bpp.
+ // For 10/12 bit, the range of sum requires 32 bits.
+ int sum = first_pass_offset;
+ for (int k = 0; k < 8; ++k) {
+ // We assume the source frame has left and right borders of at
+ // least 13 pixels that extend the frame boundary pixels.
+ //
+ // Since -4 <= x <= 3 and 0 <= k <= 7, using the inequality on
+ // ix4 above, we have
+ // -13 <= ix4 + x + k - 3 <= source_width + 12,
+ // or
+ // -13 <= column <= (source_width - 1) + 13.
+ // Therefore we may over-read up to 13 pixels before the source
+ // row, or up to 13 pixels after the source row.
+ const int column = ix4 + x + k - 3;
+ sum += kWarpedFilters[offset][k] * src_row[column];
+ }
+ intermediate_result[y + 7][x + 4] =
+ RightShiftWithRounding(sum, kRoundBitsHorizontal);
+ sx += alpha;
+ }
+ sx4 += beta;
+ }
+ } else {
+ // Region 4.
+ // Horizontal filter.
+ // At this point, we know iy4 - 7 < source_height - 1 and iy4 + 7 > 0.
+ // It follows that -6 <= iy4 <= source_height + 5. This inequality is
+ // used below.
+ int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ // We assume the source frame has top and bottom borders of at least
+ // 13 pixels that extend the frame boundary pixels.
+ //
+ // Since -7 <= y <= 7, using the inequality on iy4 above, we have
+ // -13 <= iy4 + y <= source_height + 12,
+ // or
+ // -13 <= row <= (source_height - 1) + 13.
+ // Therefore we may over-read up to 13 pixels above the top source
+ // row, or up to 13 pixels below the bottom source row.
+ const int row = iy4 + y;
+ const Pixel* const src_row = src + row * source_stride;
+ int sx = sx4 - MultiplyBy4(alpha);
+ for (int x = -4; x < 4; ++x) {
+ const int offset =
+ RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ // Since alpha and beta have been validated by SetupShear(), one
+ // can prove that 0 <= offset <= 3 * 2^6.
+ assert(offset >= 0);
+ assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
+ // For SIMD optimization:
+ // |first_pass_offset| guarantees the sum fits in uint16_t for 8bpp.
+ // For 10/12 bit, the range of sum requires 32 bits.
+ int sum = first_pass_offset;
+ for (int k = 0; k < 8; ++k) {
+ // We assume the source frame has left and right borders of at
+ // least 13 pixels that extend the frame boundary pixels.
+ //
+ // Since -4 <= x <= 3 and 0 <= k <= 7, using the inequality on
+ // ix4 above, we have
+ // -13 <= ix4 + x + k - 3 <= source_width + 12,
+ // or
+ // -13 <= column <= (source_width - 1) + 13.
+ // Therefore we may over-read up to 13 pixels before the source
+ // row, or up to 13 pixels after the source row.
+ const int column = ix4 + x + k - 3;
+ sum += kWarpedFilters[offset][k] * src_row[column];
+ }
+ intermediate_result[y + 7][x + 4] =
+ RightShiftWithRounding(sum, kRoundBitsHorizontal) -
+ offset_removal;
+ sx += alpha;
+ }
+ sx4 += beta;
+ }
+ }
+
+ // Regions 3 and 4.
+ // Vertical filter.
+ DestType* dst_row = dst + start_x - block_start_x;
+ int sy4 =
+ (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ // The spec says we should use the following loop condition:
+ // y < std::min(4, block_start_y + block_height - start_y - 4);
+ // We can prove that block_start_y + block_height - start_y >= 8, which
+ // implies std::min(4, block_start_y + block_height - start_y - 4) = 4.
+ // So the loop condition is simply y < 4.
+ //
+ // Proof:
+ // start_y < block_start_y + block_height
+ // => block_start_y + block_height - start_y > 0
+ // => block_height - (start_y - block_start_y) > 0
+ //
+ // Since block_height >= 8 and is a power of 2, it follows that
+ // block_height is a multiple of 8. start_y - block_start_y is also a
+ // multiple of 8. Therefore their difference is a multiple of 8. Since
+ // their difference is > 0, their difference must be >= 8.
+ //
+ // We then add an offset of 4 to y so that the loop starts with y = 0
+ // and continues if y < 8.
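+      // Concrete instance: with block_height = 8 and start_y = block_start_y,
+      // block_start_y + block_height - start_y = 8, so
+      // std::min(4, 8 - 4) = 4 and the simplified bound holds.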
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+ // The spec says we should use the following loop condition:
+ // x < std::min(4, block_start_x + block_width - start_x - 4);
+ // Similar to the above, we can prove that the loop condition can be
+ // simplified to x < 4.
+ //
+ // We then add an offset of 4 to x so that the loop starts with x = 0
+ // and continues if x < 8.
+ for (int x = 0; x < 8; ++x) {
+ const int offset =
+ RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ // Since gamma and delta have been validated by SetupShear(), one can
+ // prove that 0 <= offset <= 3 * 2^6.
+ assert(offset >= 0);
+ assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
+ int sum = 0;
+ for (int k = 0; k < 8; ++k) {
+ sum += kWarpedFilters[offset][k] * intermediate_result[y + k][x];
+ }
+ sum -= offset_removal;
+ sum = RightShiftWithRounding(sum, kRoundBitsVertical);
+ if (is_compound) {
+ sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+ dst_row[x] = static_cast<DestType>(sum);
+ } else {
+ dst_row[x] = static_cast<DestType>(Clip3(sum, 0, kMaxPixel));
+ }
+ sy += gamma;
+ }
+ dst_row += dest_stride;
+ sy4 += delta;
+ }
+ }
+ dst += 8 * dest_stride;
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->warp = Warp_C</*is_compound=*/false, 8, uint8_t>;
+ dsp->warp_compound = Warp_C</*is_compound=*/true, 8, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_Warp
+ dsp->warp = Warp_C</*is_compound=*/false, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WarpCompound
+ dsp->warp_compound = Warp_C</*is_compound=*/true, 8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->warp = Warp_C</*is_compound=*/false, 10, uint16_t>;
+ dsp->warp_compound = Warp_C</*is_compound=*/true, 10, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_Warp
+ dsp->warp = Warp_C</*is_compound=*/false, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WarpCompound
+ dsp->warp_compound = Warp_C</*is_compound=*/true, 10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+} // namespace
+
+void WarpInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/warp.h b/src/dsp/warp.h
new file mode 100644
index 0000000..7367a9b
--- /dev/null
+++ b/src/dsp/warp.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_WARP_H_
+#define LIBGAV1_SRC_DSP_WARP_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/warp_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/warp_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::warp. This function is not thread-safe.
+void WarpInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_WARP_H_
diff --git a/src/dsp/weight_mask.cc b/src/dsp/weight_mask.cc
new file mode 100644
index 0000000..15d6bc6
--- /dev/null
+++ b/src/dsp/weight_mask.cc
@@ -0,0 +1,227 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/weight_mask.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <type_traits>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int width, int height, int bitdepth, bool mask_is_inverse>
+void WeightMask_C(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ using PredType =
+ typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+ const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+ const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+ static_assert(width >= 8, "");
+ static_assert(height >= 8, "");
+ constexpr int rounding_bits = bitdepth - 8 + ((bitdepth == 12) ? 2 : 4);
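+  // For instance, rounding_bits is 0 + 4 = 4 for 8bpp, 2 + 4 = 6 for 10bpp,
+  // and 4 + 2 = 6 for 12bpp.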
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ const int difference = RightShiftWithRounding(
+ std::abs(pred_0[x] - pred_1[x]), rounding_bits);
+ const auto mask_value =
+ static_cast<uint8_t>(std::min(DivideBy16(difference) + 38, 64));
+ mask[x] = mask_is_inverse ? 64 - mask_value : mask_value;
+ }
+ pred_0 += width;
+ pred_1 += width;
+ mask += mask_stride;
+ }
+}
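+
+// The mask values produced above lie in [38, 64] (or [0, 26] when
+// mask_is_inverse). They feed the 64-scale mask blend, which, roughly,
+// computes something like (mask * pred_0 + (64 - mask) * pred_1 + 32) >> 6
+// per pixel (a sketch, not a quote of the MaskBlend implementation).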
+
+#define INIT_WEIGHT_MASK(width, height, bitdepth, w_index, h_index) \
+ dsp->weight_mask[w_index][h_index][0] = \
+ WeightMask_C<width, height, bitdepth, 0>; \
+ dsp->weight_mask[w_index][h_index][1] = \
+ WeightMask_C<width, height, bitdepth, 1>
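+// For example, INIT_WEIGHT_MASK(8, 16, 8, 0, 1) assigns
+// dsp->weight_mask[0][1][0] = WeightMask_C<8, 16, 8, 0> and
+// dsp->weight_mask[0][1][1] = WeightMask_C<8, 16, 8, 1>.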
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_WEIGHT_MASK(8, 8, 8, 0, 0);
+ INIT_WEIGHT_MASK(8, 16, 8, 0, 1);
+ INIT_WEIGHT_MASK(8, 32, 8, 0, 2);
+ INIT_WEIGHT_MASK(16, 8, 8, 1, 0);
+ INIT_WEIGHT_MASK(16, 16, 8, 1, 1);
+ INIT_WEIGHT_MASK(16, 32, 8, 1, 2);
+ INIT_WEIGHT_MASK(16, 64, 8, 1, 3);
+ INIT_WEIGHT_MASK(32, 8, 8, 2, 0);
+ INIT_WEIGHT_MASK(32, 16, 8, 2, 1);
+ INIT_WEIGHT_MASK(32, 32, 8, 2, 2);
+ INIT_WEIGHT_MASK(32, 64, 8, 2, 3);
+ INIT_WEIGHT_MASK(64, 16, 8, 3, 1);
+ INIT_WEIGHT_MASK(64, 32, 8, 3, 2);
+ INIT_WEIGHT_MASK(64, 64, 8, 3, 3);
+ INIT_WEIGHT_MASK(64, 128, 8, 3, 4);
+ INIT_WEIGHT_MASK(128, 64, 8, 4, 3);
+ INIT_WEIGHT_MASK(128, 128, 8, 4, 4);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x8
+ INIT_WEIGHT_MASK(8, 8, 8, 0, 0);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x16
+ INIT_WEIGHT_MASK(8, 16, 8, 0, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x32
+ INIT_WEIGHT_MASK(8, 32, 8, 0, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x8
+ INIT_WEIGHT_MASK(16, 8, 8, 1, 0);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x16
+ INIT_WEIGHT_MASK(16, 16, 8, 1, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x32
+ INIT_WEIGHT_MASK(16, 32, 8, 1, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x64
+ INIT_WEIGHT_MASK(16, 64, 8, 1, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x8
+ INIT_WEIGHT_MASK(32, 8, 8, 2, 0);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x16
+ INIT_WEIGHT_MASK(32, 16, 8, 2, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x32
+ INIT_WEIGHT_MASK(32, 32, 8, 2, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x64
+ INIT_WEIGHT_MASK(32, 64, 8, 2, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x16
+ INIT_WEIGHT_MASK(64, 16, 8, 3, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x32
+ INIT_WEIGHT_MASK(64, 32, 8, 3, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x64
+ INIT_WEIGHT_MASK(64, 64, 8, 3, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x128
+ INIT_WEIGHT_MASK(64, 128, 8, 3, 4);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x64
+ INIT_WEIGHT_MASK(128, 64, 8, 4, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x128
+ INIT_WEIGHT_MASK(128, 128, 8, 4, 4);
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_WEIGHT_MASK(8, 8, 10, 0, 0);
+ INIT_WEIGHT_MASK(8, 16, 10, 0, 1);
+ INIT_WEIGHT_MASK(8, 32, 10, 0, 2);
+ INIT_WEIGHT_MASK(16, 8, 10, 1, 0);
+ INIT_WEIGHT_MASK(16, 16, 10, 1, 1);
+ INIT_WEIGHT_MASK(16, 32, 10, 1, 2);
+ INIT_WEIGHT_MASK(16, 64, 10, 1, 3);
+ INIT_WEIGHT_MASK(32, 8, 10, 2, 0);
+ INIT_WEIGHT_MASK(32, 16, 10, 2, 1);
+ INIT_WEIGHT_MASK(32, 32, 10, 2, 2);
+ INIT_WEIGHT_MASK(32, 64, 10, 2, 3);
+ INIT_WEIGHT_MASK(64, 16, 10, 3, 1);
+ INIT_WEIGHT_MASK(64, 32, 10, 3, 2);
+ INIT_WEIGHT_MASK(64, 64, 10, 3, 3);
+ INIT_WEIGHT_MASK(64, 128, 10, 3, 4);
+ INIT_WEIGHT_MASK(128, 64, 10, 4, 3);
+ INIT_WEIGHT_MASK(128, 128, 10, 4, 4);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x8
+ INIT_WEIGHT_MASK(8, 8, 10, 0, 0);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x16
+ INIT_WEIGHT_MASK(8, 16, 10, 0, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x32
+ INIT_WEIGHT_MASK(8, 32, 10, 0, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x8
+ INIT_WEIGHT_MASK(16, 8, 10, 1, 0);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x16
+ INIT_WEIGHT_MASK(16, 16, 10, 1, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x32
+ INIT_WEIGHT_MASK(16, 32, 10, 1, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x64
+ INIT_WEIGHT_MASK(16, 64, 10, 1, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x8
+ INIT_WEIGHT_MASK(32, 8, 10, 2, 0);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x16
+ INIT_WEIGHT_MASK(32, 16, 10, 2, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x32
+ INIT_WEIGHT_MASK(32, 32, 10, 2, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x64
+ INIT_WEIGHT_MASK(32, 64, 10, 2, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x16
+ INIT_WEIGHT_MASK(64, 16, 10, 3, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x32
+ INIT_WEIGHT_MASK(64, 32, 10, 3, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x64
+ INIT_WEIGHT_MASK(64, 64, 10, 3, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x128
+ INIT_WEIGHT_MASK(64, 128, 10, 3, 4);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x64
+ INIT_WEIGHT_MASK(128, 64, 10, 4, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x128
+ INIT_WEIGHT_MASK(128, 128, 10, 4, 4);
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+} // namespace
+
+void WeightMaskInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/weight_mask.h b/src/dsp/weight_mask.h
new file mode 100644
index 0000000..43bef05
--- /dev/null
+++ b/src/dsp/weight_mask.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_WEIGHT_MASK_H_
+#define LIBGAV1_SRC_DSP_WEIGHT_MASK_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/weight_mask_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/weight_mask_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::weight_mask. This function is not thread-safe.
+void WeightMaskInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_WEIGHT_MASK_H_
diff --git a/src/dsp/x86/average_blend_sse4.cc b/src/dsp/x86/average_blend_sse4.cc
new file mode 100644
index 0000000..8e008d1
--- /dev/null
+++ b/src/dsp/x86/average_blend_sse4.cc
@@ -0,0 +1,156 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/average_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <xmmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kInterPostRoundBit = 4;
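+
+// Sketch of the math below: the two compound predictions are summed and
+// RightShiftWithRounding_S16(sum, kInterPostRoundBit + 1) computes
+// (pred_0 + pred_1 + 16) >> 5, i.e. their average with the inter-post
+// rounding removed, before packing to uint8_t.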
+
+inline void AverageBlend4Row(const int16_t* prediction_0,
+ const int16_t* prediction_1, uint8_t* dest) {
+ const __m128i pred_0 = LoadLo8(prediction_0);
+ const __m128i pred_1 = LoadLo8(prediction_1);
+ __m128i res = _mm_add_epi16(pred_0, pred_1);
+ res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1);
+ Store4(dest, _mm_packus_epi16(res, res));
+}
+
+inline void AverageBlend8Row(const int16_t* prediction_0,
+ const int16_t* prediction_1, uint8_t* dest) {
+ const __m128i pred_0 = LoadAligned16(prediction_0);
+ const __m128i pred_1 = LoadAligned16(prediction_1);
+ __m128i res = _mm_add_epi16(pred_0, pred_1);
+ res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1);
+ StoreLo8(dest, _mm_packus_epi16(res, res));
+}
+
+inline void AverageBlendLargeRow(const int16_t* prediction_0,
+ const int16_t* prediction_1, const int width,
+ uint8_t* dest) {
+ int x = 0;
+ do {
+ const __m128i pred_00 = LoadAligned16(&prediction_0[x]);
+ const __m128i pred_01 = LoadAligned16(&prediction_1[x]);
+ __m128i res0 = _mm_add_epi16(pred_00, pred_01);
+ res0 = RightShiftWithRounding_S16(res0, kInterPostRoundBit + 1);
+ const __m128i pred_10 = LoadAligned16(&prediction_0[x + 8]);
+ const __m128i pred_11 = LoadAligned16(&prediction_1[x + 8]);
+ __m128i res1 = _mm_add_epi16(pred_10, pred_11);
+ res1 = RightShiftWithRounding_S16(res1, kInterPostRoundBit + 1);
+ StoreUnaligned16(dest + x, _mm_packus_epi16(res0, res1));
+ x += 16;
+ } while (x < width);
+}
+
+void AverageBlend_SSE4_1(const void* prediction_0, const void* prediction_1,
+ const int width, const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y = height;
+
+ if (width == 4) {
+ do {
+ // TODO(b/150326556): |prediction_[01]| values are packed. It is possible
+ // to load 8 values at a time.
+ AverageBlend4Row(pred_0, pred_1, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ AverageBlend4Row(pred_0, pred_1, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ if (width == 8) {
+ do {
+ AverageBlend8Row(pred_0, pred_1, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ AverageBlend8Row(pred_0, pred_1, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ do {
+ AverageBlendLargeRow(pred_0, pred_1, width, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ AverageBlendLargeRow(pred_0, pred_1, width, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ y -= 2;
+ } while (y != 0);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(AverageBlend)
+ dsp->average_blend = AverageBlend_SSE4_1;
+#endif
+}
+
+} // namespace
+
+void AverageBlendInit_SSE4_1() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void AverageBlendInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/average_blend_sse4.h b/src/dsp/x86/average_blend_sse4.h
new file mode 100644
index 0000000..937e8e2
--- /dev/null
+++ b/src/dsp/x86/average_blend_sse4.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::average_blend. This function is not thread-safe.
+void AverageBlendInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_AverageBlend
+#define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_
diff --git a/src/dsp/x86/cdef_sse4.cc b/src/dsp/x86/cdef_sse4.cc
new file mode 100644
index 0000000..3211a2d
--- /dev/null
+++ b/src/dsp/x86/cdef_sse4.cc
@@ -0,0 +1,728 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// Used when calculating odd |cost[x]| values.
+// Holds elements 1 3 5 7 7 7 7 7
+alignas(16) constexpr uint32_t kCdefDivisionTableOddPadded[] = {
+ 420, 210, 140, 105, 105, 105, 105, 105};
+
+// ----------------------------------------------------------------------------
+// Refer to CdefDirection_C().
+//
+// int32_t partial[8][15] = {};
+// for (int i = 0; i < 8; ++i) {
+// for (int j = 0; j < 8; ++j) {
+// const int x = 1;
+// partial[0][i + j] += x;
+// partial[1][i + j / 2] += x;
+// partial[2][i] += x;
+// partial[3][3 + i - j / 2] += x;
+// partial[4][7 + i - j] += x;
+// partial[5][3 - i / 2 + j] += x;
+// partial[6][j] += x;
+// partial[7][i / 2 + j] += x;
+// }
+// }
+//
+// Using the code above, generate the position count for partial[8][15].
+//
+// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+//
+// The SIMD code shifts the input horizontally, then adds vertically to get the
+// correct partial value for the given position.
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// partial[0][i + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 00 10 11 12 13 14 15 16 17 00 00 00 00 00 00
+// 00 00 20 21 22 23 24 25 26 27 00 00 00 00 00
+// 00 00 00 30 31 32 33 34 35 36 37 00 00 00 00
+// 00 00 00 00 40 41 42 43 44 45 46 47 00 00 00
+// 00 00 00 00 00 50 51 52 53 54 55 56 57 00 00
+// 00 00 00 00 00 00 60 61 62 63 64 65 66 67 00
+// 00 00 00 00 00 00 00 70 71 72 73 74 75 76 77
+//
+// partial[4] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(__m128i* v_src_16,
+ __m128i* partial_lo,
+ __m128i* partial_hi) {
+ // 00 01 02 03 04 05 06 07
+ *partial_lo = v_src_16[0];
+ // 00 00 00 00 00 00 00 00
+ *partial_hi = _mm_setzero_si128();
+
+ // 00 10 11 12 13 14 15 16
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[1], 2));
+ // 17 00 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[1], 14));
+
+ // 00 00 20 21 22 23 24 25
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[2], 4));
+ // 26 27 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[2], 12));
+
+ // 00 00 00 30 31 32 33 34
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[3], 6));
+ // 35 36 37 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[3], 10));
+
+ // 00 00 00 00 40 41 42 43
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[4], 8));
+ // 44 45 46 47 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[4], 8));
+
+ // 00 00 00 00 00 50 51 52
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[5], 10));
+ // 53 54 55 56 57 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[5], 6));
+
+ // 00 00 00 00 00 00 60 61
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[6], 12));
+ // 62 63 64 65 66 67 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[6], 4));
+
+ // 00 00 00 00 00 00 00 70
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[7], 14));
+ // 71 72 73 74 75 76 77 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[7], 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[1][i + j / 2] += x;
+//
+// A0 = src[0] + src[1], A1 = src[2] + src[3], ...
+//
+// A0 A1 A2 A3 00 00 00 00 00 00 00 00 00 00 00
+// 00 B0 B1 B2 B3 00 00 00 00 00 00 00 00 00 00
+// 00 00 C0 C1 C2 C3 00 00 00 00 00 00 00 00 00
+// 00 00 00 D0 D1 D2 D3 00 00 00 00 00 00 00 00
+// 00 00 00 00 E0 E1 E2 E3 00 00 00 00 00 00 00
+// 00 00 00 00 00 F0 F1 F2 F3 00 00 00 00 00 00
+// 00 00 00 00 00 00 G0 G1 G2 G3 00 00 00 00 00
+// 00 00 00 00 00 00 00 H0 H1 H2 H3 00 00 00 00
+//
+// partial[3] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(__m128i* v_src_16,
+ __m128i* partial_lo,
+ __m128i* partial_hi) {
+ __m128i v_d1_temp[8];
+ const __m128i v_zero = _mm_setzero_si128();
+
+ for (int i = 0; i < 8; ++i) {
+ v_d1_temp[i] = _mm_hadd_epi16(v_src_16[i], v_zero);
+ }
+
+ *partial_lo = *partial_hi = v_zero;
+ // A0 A1 A2 A3 00 00 00 00
+ *partial_lo = _mm_add_epi16(*partial_lo, v_d1_temp[0]);
+
+ // 00 B0 B1 B2 B3 00 00 00
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[1], 2));
+
+ // 00 00 C0 C1 C2 C3 00 00
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[2], 4));
+ // 00 00 00 D0 D1 D2 D3 00
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[3], 6));
+ // 00 00 00 00 E0 E1 E2 E3
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[4], 8));
+
+ // 00 00 00 00 00 F0 F1 F2
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[5], 10));
+ // F3 00 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[5], 6));
+
+ // 00 00 00 00 00 00 G0 G1
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[6], 12));
+ // G2 G3 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[6], 4));
+
+ // 00 00 00 00 00 00 00 H0
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[7], 14));
+ // H1 H2 H3 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[7], 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[7][i / 2 + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00
+// 00 20 21 22 23 24 25 26 27 00 00 00 00 00 00
+// 00 30 31 32 33 34 35 36 37 00 00 00 00 00 00
+// 00 00 40 41 42 43 44 45 46 47 00 00 00 00 00
+// 00 00 50 51 52 53 54 55 56 57 00 00 00 00 00
+// 00 00 00 60 61 62 63 64 65 66 67 00 00 00 00
+// 00 00 00 70 71 72 73 74 75 76 77 00 00 00 00
+//
+// partial[5] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(__m128i* v_src, __m128i* partial_lo,
+ __m128i* partial_hi) {
+ __m128i v_pair_add[4];
+ // Add vertical source pairs.
+ v_pair_add[0] = _mm_add_epi16(v_src[0], v_src[1]);
+ v_pair_add[1] = _mm_add_epi16(v_src[2], v_src[3]);
+ v_pair_add[2] = _mm_add_epi16(v_src[4], v_src[5]);
+ v_pair_add[3] = _mm_add_epi16(v_src[6], v_src[7]);
+
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ *partial_lo = v_pair_add[0];
+ // 00 00 00 00 00 00 00 00
+ // 00 00 00 00 00 00 00 00
+ *partial_hi = _mm_setzero_si128();
+
+ // 00 20 21 22 23 24 25 26
+ // 00 30 31 32 33 34 35 36
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[1], 2));
+ // 27 00 00 00 00 00 00 00
+ // 37 00 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[1], 14));
+
+ // 00 00 40 41 42 43 44 45
+ // 00 00 50 51 52 53 54 55
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[2], 4));
+ // 46 47 00 00 00 00 00 00
+ // 56 57 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[2], 12));
+
+ // 00 00 00 60 61 62 63 64
+ // 00 00 00 70 71 72 73 74
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[3], 6));
+ // 65 66 67 00 00 00 00 00
+ // 75 76 77 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[3], 10));
+}
+
+LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* src, ptrdiff_t stride,
+ __m128i* partial_lo,
+ __m128i* partial_hi) {
+ // 8x8 input
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ __m128i v_src[8];
+ for (auto& i : v_src) {
+ i = LoadLo8(src);
+ src += stride;
+ }
+
+ const __m128i v_zero = _mm_setzero_si128();
+ // partial for direction 2
+ // --------------------------------------------------------------------------
+ // partial[2][i] += x;
+ // 00 10 20 30 40 50 60 70 00 00 00 00 00 00 00 00
+  // 01 11 21 31 41 51 61 71 00 00 00 00 00 00 00 00
+  // 02 12 22 32 42 52 62 72 00 00 00 00 00 00 00 00
+ // 03 13 23 33 43 53 63 73 00 00 00 00 00 00 00 00
+ // 04 14 24 34 44 54 64 74 00 00 00 00 00 00 00 00
+ // 05 15 25 35 45 55 65 75 00 00 00 00 00 00 00 00
+ // 06 16 26 36 46 56 66 76 00 00 00 00 00 00 00 00
+ // 07 17 27 37 47 57 67 77 00 00 00 00 00 00 00 00
+ const __m128i v_src_4_0 = _mm_unpacklo_epi64(v_src[0], v_src[4]);
+ const __m128i v_src_5_1 = _mm_unpacklo_epi64(v_src[1], v_src[5]);
+ const __m128i v_src_6_2 = _mm_unpacklo_epi64(v_src[2], v_src[6]);
+ const __m128i v_src_7_3 = _mm_unpacklo_epi64(v_src[3], v_src[7]);
+ const __m128i v_hsum_4_0 = _mm_sad_epu8(v_src_4_0, v_zero);
+ const __m128i v_hsum_5_1 = _mm_sad_epu8(v_src_5_1, v_zero);
+ const __m128i v_hsum_6_2 = _mm_sad_epu8(v_src_6_2, v_zero);
+ const __m128i v_hsum_7_3 = _mm_sad_epu8(v_src_7_3, v_zero);
+ const __m128i v_hsum_1_0 = _mm_unpacklo_epi16(v_hsum_4_0, v_hsum_5_1);
+ const __m128i v_hsum_3_2 = _mm_unpacklo_epi16(v_hsum_6_2, v_hsum_7_3);
+ const __m128i v_hsum_5_4 = _mm_unpackhi_epi16(v_hsum_4_0, v_hsum_5_1);
+ const __m128i v_hsum_7_6 = _mm_unpackhi_epi16(v_hsum_6_2, v_hsum_7_3);
+ partial_lo[2] =
+ _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_hsum_1_0, v_hsum_3_2),
+ _mm_unpacklo_epi32(v_hsum_5_4, v_hsum_7_6));
+
+ __m128i v_src_16[8];
+ for (int i = 0; i < 8; ++i) {
+ v_src_16[i] = _mm_cvtepu8_epi16(v_src[i]);
+ }
+
+ // partial for direction 6
+ // --------------------------------------------------------------------------
+ // partial[6][j] += x;
+ // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 00
+ // 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00 00
+ // 20 21 22 23 24 25 26 27 00 00 00 00 00 00 00 00
+ // 30 31 32 33 34 35 36 37 00 00 00 00 00 00 00 00
+ // 40 41 42 43 44 45 46 47 00 00 00 00 00 00 00 00
+ // 50 51 52 53 54 55 56 57 00 00 00 00 00 00 00 00
+ // 60 61 62 63 64 65 66 67 00 00 00 00 00 00 00 00
+ // 70 71 72 73 74 75 76 77 00 00 00 00 00 00 00 00
+ partial_lo[6] = v_src_16[0];
+ for (int i = 1; i < 8; ++i) {
+ partial_lo[6] = _mm_add_epi16(partial_lo[6], v_src_16[i]);
+ }
+
+ // partial for direction 0
+ AddPartial_D0_D4(v_src_16, &partial_lo[0], &partial_hi[0]);
+
+ // partial for direction 1
+ AddPartial_D1_D3(v_src_16, &partial_lo[1], &partial_hi[1]);
+
+ // partial for direction 7
+ AddPartial_D5_D7(v_src_16, &partial_lo[7], &partial_hi[7]);
+
+ __m128i v_src_reverse[8];
+ const __m128i reverser =
+ _mm_set_epi32(0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
+ for (int i = 0; i < 8; ++i) {
+ v_src_reverse[i] = _mm_shuffle_epi8(v_src_16[i], reverser);
+ }
+
+ // partial for direction 4
+ AddPartial_D0_D4(v_src_reverse, &partial_lo[4], &partial_hi[4]);
+
+ // partial for direction 3
+ AddPartial_D1_D3(v_src_reverse, &partial_lo[3], &partial_hi[3]);
+
+ // partial for direction 5
+ AddPartial_D5_D7(v_src_reverse, &partial_lo[5], &partial_hi[5]);
+}
+
+inline uint32_t SumVector_S32(__m128i a) {
+ a = _mm_hadd_epi32(a, a);
+ a = _mm_add_epi32(a, _mm_srli_si128(a, 4));
+ return _mm_cvtsi128_si32(a);
+}
+
+// |cost[0]| and |cost[4]| square the input, add the square of the
+// corresponding element from the other end of the vector, and scale by the
+// matching |kCdefDivisionTable[]| element:
+// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+// kCdefDivisionTable[i + 1];
+// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
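+// In the SIMD version below, |a| holds partial[0..7] and |b_reversed| holds
+// partial[14..8]; after interleaving, a single _mm_madd_epi16(x, x) produces
+// Square(partial[i]) + Square(partial[14 - i]) for each pair in one step.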
+inline uint32_t Cost0Or4(const __m128i a, const __m128i b,
+ const __m128i division_table[2]) {
+ // Reverse and clear upper 2 bytes.
+ const __m128i reverser =
+ _mm_set_epi32(0x80800100, 0x03020504, 0x07060908, 0x0b0a0d0c);
+ // 14 13 12 11 10 09 08 ZZ
+ const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
+ // 00 14 01 13 02 12 03 11
+ const __m128i ab_lo = _mm_unpacklo_epi16(a, b_reversed);
+ // 04 10 05 09 06 08 07 ZZ
+ const __m128i ab_hi = _mm_unpackhi_epi16(a, b_reversed);
+
+ // Square(partial[0][i]) + Square(partial[0][14 - i])
+ const __m128i square_lo = _mm_madd_epi16(ab_lo, ab_lo);
+ const __m128i square_hi = _mm_madd_epi16(ab_hi, ab_hi);
+
+ const __m128i c = _mm_mullo_epi32(square_lo, division_table[0]);
+ const __m128i d = _mm_mullo_epi32(square_hi, division_table[1]);
+ return SumVector_S32(_mm_add_epi32(c, d));
+}
+
+inline uint32_t CostOdd(const __m128i a, const __m128i b,
+ const __m128i division_table[2]) {
+ // Reverse and clear upper 10 bytes.
+ const __m128i reverser =
+ _mm_set_epi32(0x80808080, 0x80808080, 0x80800100, 0x03020504);
+ // 10 09 08 ZZ ZZ ZZ ZZ ZZ
+ const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
+ // 00 10 01 09 02 08 03 ZZ
+ const __m128i ab_lo = _mm_unpacklo_epi16(a, b_reversed);
+ // 04 ZZ 05 ZZ 06 ZZ 07 ZZ
+ const __m128i ab_hi = _mm_unpackhi_epi16(a, b_reversed);
+
+ // Square(partial[0][i]) + Square(partial[0][10 - i])
+ const __m128i square_lo = _mm_madd_epi16(ab_lo, ab_lo);
+ const __m128i square_hi = _mm_madd_epi16(ab_hi, ab_hi);
+
+ const __m128i c = _mm_mullo_epi32(square_lo, division_table[0]);
+ const __m128i d = _mm_mullo_epi32(square_hi, division_table[1]);
+ return SumVector_S32(_mm_add_epi32(c, d));
+}
+
+// Sum of squared elements.
+inline uint32_t SquareSum_S16(const __m128i a) {
+ const __m128i square = _mm_madd_epi16(a, a);
+ return SumVector_S32(square);
+}
+
+void CdefDirection_SSE4_1(const void* const source, ptrdiff_t stride,
+ uint8_t* const direction, int* const variance) {
+ assert(direction != nullptr);
+ assert(variance != nullptr);
+ const auto* src = static_cast<const uint8_t*>(source);
+ uint32_t cost[8];
+ __m128i partial_lo[8], partial_hi[8];
+
+ AddPartial(src, stride, partial_lo, partial_hi);
+
+ cost[2] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[2]);
+ cost[6] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[6]);
+
+ const __m128i division_table[2] = {LoadUnaligned16(kCdefDivisionTable),
+ LoadUnaligned16(kCdefDivisionTable + 4)};
+
+ cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table);
+ cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table);
+
+ const __m128i division_table_odd[2] = {
+ LoadAligned16(kCdefDivisionTableOddPadded),
+ LoadAligned16(kCdefDivisionTableOddPadded + 4)};
+
+ cost[1] = CostOdd(partial_lo[1], partial_hi[1], division_table_odd);
+ cost[3] = CostOdd(partial_lo[3], partial_hi[3], division_table_odd);
+ cost[5] = CostOdd(partial_lo[5], partial_hi[5], division_table_odd);
+ cost[7] = CostOdd(partial_lo[7], partial_hi[7], division_table_odd);
+
+ uint32_t best_cost = 0;
+ *direction = 0;
+ for (int i = 0; i < 8; ++i) {
+ if (cost[i] > best_cost) {
+ best_cost = cost[i];
+ *direction = i;
+ }
+ }
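+  // The reported variance is the gap between the winning direction and the
+  // orthogonal one ((*direction + 4) & 7), scaled down by 2^10, matching the
+  // scalar CdefDirection_C().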
+ *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+
+// -------------------------------------------------------------------------
+// CdefFilter
+
+// Load 4 vectors based on the given |direction|.
+inline void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
+ __m128i* output, const int direction) {
+  // Each |direction| describes a different set of source values. Expand this
+  // set by negating each offset. For |direction| == 0 this gives a diagonal
+  // line from top right to bottom left. The first value is y, the second is
+  // x. Negative y values move up.
+ // a b c d
+ // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
+ // c
+ // a
+ // 0
+ // b
+ // d
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = LoadUnaligned16(src - y_0 * stride - x_0);
+ output[1] = LoadUnaligned16(src + y_0 * stride + x_0);
+ output[2] = LoadUnaligned16(src - y_1 * stride - x_1);
+ output[3] = LoadUnaligned16(src + y_1 * stride + x_1);
+}
+
+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
+// do 2 rows at a time.
+void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
+ __m128i* output, const int direction) {
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = LoadHi8(LoadLo8(src - y_0 * stride - x_0),
+ src - y_0 * stride + stride - x_0);
+ output[1] = LoadHi8(LoadLo8(src + y_0 * stride + x_0),
+ src + y_0 * stride + stride + x_0);
+ output[2] = LoadHi8(LoadLo8(src - y_1 * stride - x_1),
+ src - y_1 * stride + stride - x_1);
+ output[3] = LoadHi8(LoadLo8(src + y_1 * stride + x_1),
+ src + y_1 * stride + stride + x_1);
+}
+
+inline __m128i Constrain(const __m128i& pixel, const __m128i& reference,
+ const __m128i& damping, const __m128i& threshold) {
+ const __m128i diff = _mm_sub_epi16(pixel, reference);
+ const __m128i abs_diff = _mm_abs_epi16(diff);
+ // sign(diff) * Clip3(threshold - (std::abs(diff) >> damping),
+ // 0, std::abs(diff))
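+  // Worked example with hypothetical values: diff = 20, damping shift = 3 and
+  // threshold = 4 give shifted_diff = 2, 4 - 2 = 2, min(2, 20) = 2, and the
+  // sign of diff is restored, yielding +2.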
+ const __m128i shifted_diff = _mm_srl_epi16(abs_diff, damping);
+  // For bitdepth == 8, the threshold range is [0, 15] and the damping range is
+  // [3, 6]. If pixel == kCdefLargeValue (0x4000), shifted_diff will always be
+  // larger than threshold. Subtracting with saturation will return 0 when
+  // pixel == kCdefLargeValue.
+ static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
+ const __m128i thresh_minus_shifted_diff =
+ _mm_subs_epu16(threshold, shifted_diff);
+ const __m128i clamp_abs_diff =
+ _mm_min_epi16(thresh_minus_shifted_diff, abs_diff);
+ // Restore the sign.
+ return _mm_sign_epi16(clamp_abs_diff, diff);
+}
+
+inline __m128i ApplyConstrainAndTap(const __m128i& pixel, const __m128i& val,
+ const __m128i& tap, const __m128i& damping,
+ const __m128i& threshold) {
+ const __m128i constrained = Constrain(val, pixel, damping, threshold);
+ return _mm_mullo_epi16(constrained, tap);
+}
+
+template <int width, bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_SSE4_1(const uint16_t* src, const ptrdiff_t src_stride,
+ const int height, const int primary_strength,
+ const int secondary_strength, const int damping,
+ const int direction, void* dest,
+ const ptrdiff_t dst_stride) {
+ static_assert(width == 8 || width == 4, "Invalid CDEF width.");
+ static_assert(enable_primary || enable_secondary, "");
+ constexpr bool clipping_required = enable_primary && enable_secondary;
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i primary_damping_shift, secondary_damping_shift;
+
+ // FloorLog2() requires input to be > 0.
+ // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+ if (enable_primary) {
+ // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
+ // for UV filtering.
+ primary_damping_shift =
+ _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(primary_strength)));
+ }
+ if (enable_secondary) {
+ // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+ // necessary.
+ assert(damping - FloorLog2(secondary_strength) >= 0);
+ secondary_damping_shift =
+ _mm_cvtsi32_si128(damping - FloorLog2(secondary_strength));
+ }
+
+ const __m128i primary_tap_0 =
+ _mm_set1_epi16(kCdefPrimaryTaps[primary_strength & 1][0]);
+ const __m128i primary_tap_1 =
+ _mm_set1_epi16(kCdefPrimaryTaps[primary_strength & 1][1]);
+ const __m128i secondary_tap_0 = _mm_set1_epi16(kCdefSecondaryTap0);
+ const __m128i secondary_tap_1 = _mm_set1_epi16(kCdefSecondaryTap1);
+ const __m128i cdef_large_value_mask =
+ _mm_set1_epi16(static_cast<int16_t>(~kCdefLargeValue));
+ const __m128i primary_threshold = _mm_set1_epi16(primary_strength);
+ const __m128i secondary_threshold = _mm_set1_epi16(secondary_strength);
+
+ int y = height;
+ do {
+ __m128i pixel;
+ if (width == 8) {
+ pixel = LoadUnaligned16(src);
+ } else {
+ pixel = LoadHi8(LoadLo8(src), src + src_stride);
+ }
+
+ __m128i min = pixel;
+ __m128i max = pixel;
+ __m128i sum;
+
+ if (enable_primary) {
+ // Primary |direction|.
+ __m128i primary_val[4];
+ if (width == 8) {
+ LoadDirection(src, src_stride, primary_val, direction);
+ } else {
+ LoadDirection4(src, src_stride, primary_val, direction);
+ }
+
+ if (clipping_required) {
+ min = _mm_min_epu16(min, primary_val[0]);
+ min = _mm_min_epu16(min, primary_val[1]);
+ min = _mm_min_epu16(min, primary_val[2]);
+ min = _mm_min_epu16(min, primary_val[3]);
+
+        // The source is 16 bits; however, we only really care about the lower
+        // 8 bits. The upper 8 bits contain the "large" flag. After the final
+        // primary max has been calculated, zero out the upper 8 bits. Use this
+        // to find the "16 bit" max.
+ const __m128i max_p01 = _mm_max_epu8(primary_val[0], primary_val[1]);
+ const __m128i max_p23 = _mm_max_epu8(primary_val[2], primary_val[3]);
+ const __m128i max_p = _mm_max_epu8(max_p01, max_p23);
+ max = _mm_max_epu16(max, _mm_and_si128(max_p, cdef_large_value_mask));
+ }
+
+ sum = ApplyConstrainAndTap(pixel, primary_val[0], primary_tap_0,
+ primary_damping_shift, primary_threshold);
+ sum = _mm_add_epi16(
+ sum, ApplyConstrainAndTap(pixel, primary_val[1], primary_tap_0,
+ primary_damping_shift, primary_threshold));
+ sum = _mm_add_epi16(
+ sum, ApplyConstrainAndTap(pixel, primary_val[2], primary_tap_1,
+ primary_damping_shift, primary_threshold));
+ sum = _mm_add_epi16(
+ sum, ApplyConstrainAndTap(pixel, primary_val[3], primary_tap_1,
+ primary_damping_shift, primary_threshold));
+ } else {
+ sum = _mm_setzero_si128();
+ }
+
+ if (enable_secondary) {
+ // Secondary |direction| values (+/- 2). Clamp |direction|.
+ __m128i secondary_val[8];
+ if (width == 8) {
+ LoadDirection(src, src_stride, secondary_val, direction + 2);
+ LoadDirection(src, src_stride, secondary_val + 4, direction - 2);
+ } else {
+ LoadDirection4(src, src_stride, secondary_val, direction + 2);
+ LoadDirection4(src, src_stride, secondary_val + 4, direction - 2);
+ }
+
+ if (clipping_required) {
+ min = _mm_min_epu16(min, secondary_val[0]);
+ min = _mm_min_epu16(min, secondary_val[1]);
+ min = _mm_min_epu16(min, secondary_val[2]);
+ min = _mm_min_epu16(min, secondary_val[3]);
+ min = _mm_min_epu16(min, secondary_val[4]);
+ min = _mm_min_epu16(min, secondary_val[5]);
+ min = _mm_min_epu16(min, secondary_val[6]);
+ min = _mm_min_epu16(min, secondary_val[7]);
+
+ const __m128i max_s01 =
+ _mm_max_epu8(secondary_val[0], secondary_val[1]);
+ const __m128i max_s23 =
+ _mm_max_epu8(secondary_val[2], secondary_val[3]);
+ const __m128i max_s45 =
+ _mm_max_epu8(secondary_val[4], secondary_val[5]);
+ const __m128i max_s67 =
+ _mm_max_epu8(secondary_val[6], secondary_val[7]);
+ const __m128i max_s = _mm_max_epu8(_mm_max_epu8(max_s01, max_s23),
+ _mm_max_epu8(max_s45, max_s67));
+ max = _mm_max_epu16(max, _mm_and_si128(max_s, cdef_large_value_mask));
+ }
+
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[0], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[1], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[2], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[3], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[4], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[5], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[6], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[7], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ }
+ // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)
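+ // The (sum < 0) term is realized by adding |sum_lt_0| (an arithmetic shift
+ // by 15, which is -1 for negative sums and 0 otherwise). For example,
+ // sum = -3 gives (8 - 3 - 1) >> 4 = 0 and sum = 9 gives (8 + 9) >> 4 = 1.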
+ const __m128i sum_lt_0 = _mm_srai_epi16(sum, 15);
+ // 8 + sum
+ sum = _mm_add_epi16(sum, _mm_set1_epi16(8));
+ // (... - (sum < 0)) >> 4
+ sum = _mm_add_epi16(sum, sum_lt_0);
+ sum = _mm_srai_epi16(sum, 4);
+ // pixel + ...
+ sum = _mm_add_epi16(sum, pixel);
+ if (clipping_required) {
+ // Clip3
+ sum = _mm_min_epi16(sum, max);
+ sum = _mm_max_epi16(sum, min);
+ }
+
+ const __m128i result = _mm_packus_epi16(sum, sum);
+ if (width == 8) {
+ src += src_stride;
+ StoreLo8(dst, result);
+ dst += dst_stride;
+ --y;
+ } else {
+ src += src_stride << 1;
+ Store4(dst, result);
+ dst += dst_stride;
+ Store4(dst, _mm_srli_si128(result, 4));
+ dst += dst_stride;
+ y -= 2;
+ }
+ } while (y != 0);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+ dsp->cdef_direction = CdefDirection_SSE4_1;
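+ // cdef_filters[0][*] covers 4-pixel-wide blocks and cdef_filters[1][*]
+ // covers 8-pixel-wide blocks. Within each width, index 0 applies both
+ // primary and secondary taps (the template defaults), index 1 applies
+ // primary taps only, and index 2 applies secondary taps only, as the
+ // template arguments below indicate.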
+ dsp->cdef_filters[0][0] = CdefFilter_SSE4_1<4>;
+ dsp->cdef_filters[0][1] =
+ CdefFilter_SSE4_1<4, /*enable_primary=*/true, /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] = CdefFilter_SSE4_1<4, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_SSE4_1<8>;
+ dsp->cdef_filters[1][1] =
+ CdefFilter_SSE4_1<8, /*enable_primary=*/true, /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] = CdefFilter_SSE4_1<8, /*enable_primary=*/false>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void CdefInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void CdefInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/cdef_sse4.h b/src/dsp/x86/cdef_sse4.h
new file mode 100644
index 0000000..6631eb7
--- /dev/null
+++ b/src/dsp/x86/cdef_sse4.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
+void CdefInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_CdefDirection
+#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_CdefFilters
+#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_
diff --git a/src/dsp/x86/common_avx2.h b/src/dsp/x86/common_avx2.h
new file mode 100644
index 0000000..4ce7de2
--- /dev/null
+++ b/src/dsp/x86/common_avx2.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+
+#include <immintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+namespace libgav1 {
+namespace dsp {
+
+//------------------------------------------------------------------------------
+// Compatibility functions.
+
+inline __m256i SetrM128i(const __m128i lo, const __m128i hi) {
+ // For compatibility with older gcc toolchains (< 8) use
+ // _mm256_inserti128_si256 rather than _mm256_setr_m128i. Newer gcc versions
+ // implement _mm256_setr_m128i similarly to the following; clang uses a
+ // different method, but no differences in the generated assembly have been
+ // observed.
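+ // In other words, SetrM128i(lo, hi) returns a __m256i whose lower 128 bits
+ // are |lo| and whose upper 128 bits are |hi|, matching _mm256_setr_m128i.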
+ return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+inline __m256i LoadAligned32(const void* a) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ return _mm256_load_si256(static_cast<const __m256i*>(a));
+}
+
+inline void LoadAligned64(const void* a, __m256i dst[2]) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ dst[0] = _mm256_load_si256(static_cast<const __m256i*>(a) + 0);
+ dst[1] = _mm256_load_si256(static_cast<const __m256i*>(a) + 1);
+}
+
+inline __m256i LoadUnaligned32(const void* a) {
+ return _mm256_loadu_si256(static_cast<const __m256i*>(a));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline __m256i MaskOverreads(const __m256i source,
+ const ptrdiff_t over_read_in_bytes) {
+ __m256i dst = source;
+#if LIBGAV1_MSAN
+ if (over_read_in_bytes >= 32) return _mm256_setzero_si256();
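+ // For example, |over_read_in_bytes| == 5 keeps the low 27 bytes of the
+ // 32-byte vector and 20 keeps only the low 12 bytes.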
+ if (over_read_in_bytes > 0) {
+ __m128i m = _mm_set1_epi8(-1);
+ for (ptrdiff_t i = 0; i < over_read_in_bytes % 16; ++i) {
+ m = _mm_srli_si128(m, 1);
+ }
+ const __m256i mask = (over_read_in_bytes < 16)
+ ? SetrM128i(_mm_set1_epi8(-1), m)
+ : SetrM128i(m, _mm_setzero_si128());
+ dst = _mm256_and_si256(dst, mask);
+ }
+#else
+ static_cast<void>(over_read_in_bytes);
+#endif
+ return dst;
+}
+
+inline __m256i LoadAligned32Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadAligned32(source), over_read_in_bytes);
+}
+
+inline void LoadAligned64Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i dst[2]) {
+ dst[0] = MaskOverreads(LoadAligned32(source), over_read_in_bytes);
+ dst[1] = MaskOverreads(LoadAligned32(static_cast<const __m256i*>(source) + 1),
+ over_read_in_bytes);
+}
+
+inline __m256i LoadUnaligned32Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadUnaligned32(source), over_read_in_bytes);
+}
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+inline void StoreAligned32(void* a, const __m256i v) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ _mm256_store_si256(static_cast<__m256i*>(a), v);
+}
+
+inline void StoreAligned64(void* a, const __m256i v[2]) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ _mm256_store_si256(static_cast<__m256i*>(a) + 0, v[0]);
+ _mm256_store_si256(static_cast<__m256i*>(a) + 1, v[1]);
+}
+
+inline void StoreUnaligned32(void* a, const __m256i v) {
+ _mm256_storeu_si256(static_cast<__m256i*>(a), v);
+}
+
+//------------------------------------------------------------------------------
+// Arithmetic utilities.
+
+inline __m256i RightShiftWithRounding_S16(const __m256i v_val_d, int bits) {
+ assert(bits <= 16);
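+ // For example, with bits == 4 the bias is 8, so 7 rounds to 0 and 9 rounds
+ // to 1 (halfway values round toward +infinity).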
+ const __m256i v_bias_d =
+ _mm256_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
+ const __m256i v_tmp_d = _mm256_add_epi16(v_val_d, v_bias_d);
+ return _mm256_srai_epi16(v_tmp_d, bits);
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_AVX2
+#endif // LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
diff --git a/src/dsp/x86/common_sse4.h b/src/dsp/x86/common_sse4.h
new file mode 100644
index 0000000..c510f8c
--- /dev/null
+++ b/src/dsp/x86/common_sse4.h
@@ -0,0 +1,265 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#if 0
+#include <cinttypes>
+#include <cstdio>
+
+// Useful macros and functions for debugging. Left here for convenience.
+inline void PrintReg(const __m128i r, const char* const name, int size) {
+ int n;
+ union {
+ __m128i r;
+ uint8_t i8[16];
+ uint16_t i16[8];
+ uint32_t i32[4];
+ uint64_t i64[2];
+ } tmp;
+ tmp.r = r;
+ fprintf(stderr, "%s\t: ", name);
+ if (size == 8) {
+ for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]);
+ } else if (size == 16) {
+ for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]);
+ } else if (size == 32) {
+ for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]);
+ } else {
+ for (n = 0; n < 2; ++n)
+ fprintf(stderr, "%.16" PRIx64 " ", static_cast<uint64_t>(tmp.i64[n]));
+ }
+ fprintf(stderr, "\n");
+}
+
+inline void PrintReg(const int r, const char* const name) {
+ fprintf(stderr, "%s: %d\n", name, r);
+}
+
+inline void PrintRegX(const int r, const char* const name) {
+ fprintf(stderr, "%s: %.8x\n", name, r);
+}
+
+#define PR(var, N) PrintReg(var, #var, N)
+#define PD(var) PrintReg(var, #var);
+#define PX(var) PrintRegX(var, #var);
+#endif // 0
+
+namespace libgav1 {
+namespace dsp {
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+inline __m128i Load2(const void* src) {
+ int16_t val;
+ memcpy(&val, src, sizeof(val));
+ return _mm_cvtsi32_si128(val);
+}
+
+inline __m128i Load2x2(const void* src1, const void* src2) {
+ uint16_t val1;
+ uint16_t val2;
+ memcpy(&val1, src1, sizeof(val1));
+ memcpy(&val2, src2, sizeof(val2));
+ return _mm_cvtsi32_si128(val1 | (val2 << 16));
+}
+
+// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
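+// For example, Load2<3>(buf, val) returns |val| with bytes 6 and 7 replaced
+// by the two bytes at |buf|.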
+template <int lane>
+inline __m128i Load2(const void* const buf, __m128i val) {
+ uint16_t temp;
+ memcpy(&temp, buf, 2);
+ return _mm_insert_epi16(val, temp, lane);
+}
+
+inline __m128i Load4(const void* src) {
+ // Newer compilers (e.g. clang 8.0.0) provide the _mm_loadu_si32 intrinsic;
+ // both _mm_loadu_si32(src) and the code here compile to a movss instruction.
+ //
+ // Until compiler support for _mm_loadu_si32 is widespread, its use is
+ // banned.
+ int val;
+ memcpy(&val, src, sizeof(val));
+ return _mm_cvtsi32_si128(val);
+}
+
+inline __m128i Load4x2(const void* src1, const void* src2) {
+ // Newer compilers (e.g. clang 8.0.0) provide the _mm_loadu_si32 intrinsic;
+ // both _mm_loadu_si32(src) and the code here compile to a movss instruction.
+ //
+ // Until compiler support for _mm_loadu_si32 is widespread, its use is
+ // banned.
+ int val1, val2;
+ memcpy(&val1, src1, sizeof(val1));
+ memcpy(&val2, src2, sizeof(val2));
+ return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1);
+}
+
+inline __m128i LoadLo8(const void* a) {
+ return _mm_loadl_epi64(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadHi8(const __m128i v, const void* a) {
+ const __m128 x =
+ _mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a));
+ return _mm_castps_si128(x);
+}
+
+inline __m128i LoadUnaligned16(const void* a) {
+ return _mm_loadu_si128(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadAligned16(const void* a) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
+ return _mm_load_si128(static_cast<const __m128i*>(a));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline __m128i MaskOverreads(const __m128i source,
+ const ptrdiff_t over_read_in_bytes) {
+ __m128i dst = source;
+#if LIBGAV1_MSAN
+ if (over_read_in_bytes > 0) {
+ __m128i mask = _mm_set1_epi8(-1);
+ for (ptrdiff_t i = 0; i < over_read_in_bytes; ++i) {
+ mask = _mm_srli_si128(mask, 1);
+ }
+ dst = _mm_and_si128(dst, mask);
+ }
+#else
+ static_cast<void>(over_read_in_bytes);
+#endif
+ return dst;
+}
+
+inline __m128i LoadLo8Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8);
+}
+
+inline __m128i LoadHi8Msan(const __m128i v, const void* source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadHi8(v, source), over_read_in_bytes);
+}
+
+inline __m128i LoadAligned16Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadAligned16(source), over_read_in_bytes);
+}
+
+inline __m128i LoadUnaligned16Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes);
+}
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+inline void Store2(void* dst, const __m128i x) {
+ const int val = _mm_cvtsi128_si32(x);
+ memcpy(dst, &val, 2);
+}
+
+inline void Store4(void* dst, const __m128i x) {
+ const int val = _mm_cvtsi128_si32(x);
+ memcpy(dst, &val, sizeof(val));
+}
+
+inline void StoreLo8(void* a, const __m128i v) {
+ _mm_storel_epi64(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreHi8(void* a, const __m128i v) {
+ _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v));
+}
+
+inline void StoreAligned16(void* a, const __m128i v) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
+ _mm_store_si128(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreUnaligned16(void* a, const __m128i v) {
+ _mm_storeu_si128(static_cast<__m128i*>(a), v);
+}
+
+//------------------------------------------------------------------------------
+// Arithmetic utilities.
+
+inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) {
+ assert(bits <= 16);
+ // Shift out all but the last bit.
+ const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
+ // Avg with zero will shift by 1 and round.
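+ // _mm_avg_epu16(a, 0) computes (a + 1) >> 1, so the result equals
+ // (v_val_d + (1 << (bits - 1))) >> bits; e.g. bits == 4, v_val_d == 25
+ // yields 2.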
+ return _mm_avg_epu16(v_tmp_d, _mm_setzero_si128());
+}
+
+inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) {
+ assert(bits <= 16);
+ const __m128i v_bias_d =
+ _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
+ const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
+ return _mm_srai_epi16(v_tmp_d, bits);
+}
+
+inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_srli_epi32(v_tmp_d, bits);
+}
+
+inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+//------------------------------------------------------------------------------
+// Masking utilities
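+// MaskHighNBytes(n) returns a mask whose high |n| bytes are 0xff and whose
+// low (16 - n) bytes are zero, e.g. n == 3 sets only the top 3 bytes.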
+inline __m128i MaskHighNBytes(int n) {
+ static constexpr uint8_t kMask[32] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ };
+
+ return LoadUnaligned16(kMask + n);
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+#endif // LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
diff --git a/src/dsp/x86/convolve_avx2.cc b/src/dsp/x86/convolve_avx2.cc
new file mode 100644
index 0000000..3df2120
--- /dev/null
+++ b/src/dsp/x86/convolve_avx2.cc
@@ -0,0 +1,534 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+constexpr int kHorizontalOffset = 3;
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1, which keeps the final
+// sum within the range of int16_t.
+template <int filter_index>
+__m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
+ __m256i sum;
+ if (filter_index < 2) {
+ // 6 taps.
+ const __m256i v_madd_21 = _mm256_maddubs_epi16(src[0], taps[0]); // k2k1
+ const __m256i v_madd_43 = _mm256_maddubs_epi16(src[1], taps[1]); // k4k3
+ const __m256i v_madd_65 = _mm256_maddubs_epi16(src[2], taps[2]); // k6k5
+ sum = _mm256_add_epi16(v_madd_21, v_madd_43);
+ sum = _mm256_add_epi16(sum, v_madd_65);
+ } else if (filter_index == 2) {
+ // 8 taps.
+ const __m256i v_madd_10 = _mm256_maddubs_epi16(src[0], taps[0]); // k1k0
+ const __m256i v_madd_32 = _mm256_maddubs_epi16(src[1], taps[1]); // k3k2
+ const __m256i v_madd_54 = _mm256_maddubs_epi16(src[2], taps[2]); // k5k4
+ const __m256i v_madd_76 = _mm256_maddubs_epi16(src[3], taps[3]); // k7k6
+ const __m256i v_sum_3210 = _mm256_add_epi16(v_madd_10, v_madd_32);
+ const __m256i v_sum_7654 = _mm256_add_epi16(v_madd_54, v_madd_76);
+ sum = _mm256_add_epi16(v_sum_7654, v_sum_3210);
+ } else if (filter_index == 3) {
+ // 2 taps.
+ sum = _mm256_maddubs_epi16(src[0], taps[0]); // k4k3
+ } else {
+ // 4 taps.
+ const __m256i v_madd_32 = _mm256_maddubs_epi16(src[0], taps[0]); // k3k2
+ const __m256i v_madd_54 = _mm256_maddubs_epi16(src[1], taps[1]); // k5k4
+ sum = _mm256_add_epi16(v_madd_32, v_madd_54);
+ }
+ return sum;
+}
+
+template <int filter_index>
+__m256i SumHorizontalTaps(const __m256i* const src,
+ const __m256i* const v_tap) {
+ __m256i v_src[4];
+ const __m256i src_long = *src;
+ const __m256i src_long_dup_lo = _mm256_unpacklo_epi8(src_long, src_long);
+ const __m256i src_long_dup_hi = _mm256_unpackhi_epi8(src_long, src_long);
+
+ if (filter_index < 2) {
+ // 6 taps.
+ v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21
+ v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
+ v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65
+ } else if (filter_index == 2) {
+ // 8 taps.
+ v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10
+ v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
+ v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
+ v_src[3] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76
+ } else if (filter_index == 3) {
+ // 2 taps.
+ v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
+ } else if (filter_index > 3) {
+ // 4 taps.
+ v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
+ v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
+ }
+ return SumOnePassTaps<filter_index>(v_src, v_tap);
+}
+
+template <int filter_index>
+__m256i SimpleHorizontalTaps(const __m256i* const src,
+ const __m256i* const v_tap) {
+ __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
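+ // The skipped first shift is by (kInterRoundBitsHorizontal - 1), so its
+ // rounding offset is 1 << (kInterRoundBitsHorizontal - 2); the single shift
+ // below covers both passes: (kInterRoundBitsHorizontal - 1) +
+ // (kFilterBits - kInterRoundBitsHorizontal) == kFilterBits - 1.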
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = _mm256_add_epi16(sum, _mm256_set1_epi16(first_shift_rounding_bit));
+ sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+ return _mm256_packus_epi16(sum, sum);
+}
+
+template <int filter_index>
+__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]);
+
+ if (filter_index == 3) {
+ // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17
+ const __m128i v_src_43 = _mm_shuffle_epi8(
+ v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403));
+ const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
+ return v_sum_43;
+ }
+
+ // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
+ const __m128i v_src_32 = _mm_shuffle_epi8(
+ v_src, _mm_set_epi32(0x0e0d0d0c, 0x0c0b0b0a, 0x06050504, 0x04030302));
+ // 04 05 05 06 06 07 07 xx 14 15 15 16 16 17 17 xx
+ const __m128i v_src_54 = _mm_shuffle_epi8(
+ v_src, _mm_set_epi32(0x800f0f0e, 0x0e0d0d0c, 0x80070706, 0x06050504));
+ const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
+ const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
+ const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
+ return v_sum_5432;
+}
+
+template <int filter_index>
+__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+ sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+ return _mm_packus_epi16(sum, sum);
+}
+
+template <int filter_index>
+__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ const __m128i sum =
+ SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+// Filter 2xh sizes.
+template <int num_taps, int step, int filter_index, bool is_2d = false,
+ bool is_compound = false>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int /*width*/, const int height,
+ const __m128i* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+
+ // Horizontal passes only need to account for |num_taps| 2 and 4 when
+ // |width| <= 4.
+ assert(num_taps <= 4);
+ if (num_taps <= 4) {
+ if (!is_compound) {
+ int y = 0;
+ do {
+ if (is_2d) {
+ const __m128i sum =
+ HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+ Store4(&dest16[0], sum);
+ dest16 += pred_stride;
+ Store4(&dest16[0], _mm_srli_si128(sum, 8));
+ dest16 += pred_stride;
+ } else {
+ const __m128i sum =
+ SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ Store2(dest8, sum);
+ dest8 += pred_stride;
+ Store2(dest8, _mm_srli_si128(sum, 4));
+ dest8 += pred_stride;
+ }
+
+ src += src_stride << 1;
+ y += 2;
+ } while (y < height - 1);
+
+ // The 2d filters have an odd |height| because the horizontal pass
+ // generates context for the vertical pass.
+ if (is_2d) {
+ assert(height % 2 == 1);
+ __m128i sum;
+ const __m128i input = LoadLo8(&src[2]);
+ if (filter_index == 3) {
+ // 03 04 04 05 05 06 06 07 ....
+ const __m128i v_src_43 =
+ _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
+ sum = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
+ } else {
+ // 02 03 03 04 04 05 05 06 06 07 ....
+ const __m128i v_src_32 =
+ _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1);
+ // 04 05 05 06 06 07 07 08 ...
+ const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4);
+ const __m128i v_madd_32 =
+ _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
+ const __m128i v_madd_54 =
+ _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
+ sum = _mm_add_epi16(v_madd_54, v_madd_32);
+ }
+ sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+ Store4(dest16, sum);
+ }
+ }
+ }
+}
+
+// Filter widths >= 4.
+template <int num_taps, int step, int filter_index, bool is_2d = false,
+ bool is_compound = false>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int width, const int height,
+ const __m256i* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+
+ if (width >= 32) {
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ if (is_2d || is_compound) {
+ // placeholder
+ } else {
+ // Load src used to calculate dest8[7:0] and dest8[23:16].
+ const __m256i src_long = LoadUnaligned32(&src[x]);
+ const __m256i result =
+ SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ // Load src used to calculate dest8[15:8] and dest8[31:24].
+ const __m256i src_long2 = LoadUnaligned32(&src[x + 8]);
+ const __m256i result2 =
+ SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+ // Combine results and store.
+ StoreUnaligned32(&dest8[x], _mm256_unpacklo_epi64(result, result2));
+ }
+ x += step * 4;
+ } while (x < width);
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (--y != 0);
+ } else if (width == 16) {
+ int y = height;
+ do {
+ if (is_2d || is_compound) {
+ // placeholder
+ } else {
+ // Load into two 128-bit lanes.
+ const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]),
+ LoadUnaligned16(&src[src_stride]));
+ const __m256i result =
+ SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ const __m256i src_long2 = SetrM128i(
+ LoadUnaligned16(&src[8]), LoadUnaligned16(&src[8 + src_stride]));
+ const __m256i result2 =
+ SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+ const __m256i packed_result = _mm256_unpacklo_epi64(result, result2);
+ StoreUnaligned16(&dest8[0], _mm256_castsi256_si128(packed_result));
+ StoreUnaligned16(&dest8[pred_stride],
+ _mm256_extracti128_si256(packed_result, 1));
+ }
+ src += src_stride * 2;
+ dest8 += pred_stride * 2;
+ dest16 += pred_stride * 2;
+ y -= 2;
+ } while (y != 0);
+ } else if (width == 8) {
+ int y = height;
+ do {
+ if (is_2d || is_compound) {
+ // placeholder
+ } else {
+ const __m128i this_row = LoadUnaligned16(&src[0]);
+ const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+ // Load into two 128-bit lanes.
+ const __m256i src_long = SetrM128i(this_row, next_row);
+ const __m256i result =
+ SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ StoreLo8(&dest8[0], _mm256_castsi256_si128(result));
+ StoreLo8(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
+ }
+ src += src_stride * 2;
+ dest8 += pred_stride * 2;
+ dest16 += pred_stride * 2;
+ y -= 2;
+ } while (y != 0);
+ } else { // width == 4
+ int y = height;
+ do {
+ if (is_2d || is_compound) {
+ // placeholder
+ } else {
+ const __m128i this_row = LoadUnaligned16(&src[0]);
+ const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+ // Load into two 128-bit lanes.
+ const __m256i src_long = SetrM128i(this_row, next_row);
+ const __m256i result =
+ SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ Store4(&dest8[0], _mm256_castsi256_si128(result));
+ Store4(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
+ }
+ src += src_stride * 2;
+ dest8 += pred_stride * 2;
+ dest16 += pred_stride * 2;
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+ __m128i* v_tap) {
+ if (num_taps == 8) {
+ v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0
+ v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
+ v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
+ v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+ v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
+ }
+ } else if (num_taps == 6) {
+ const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+ v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1
+ v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
+ v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ }
+ } else if (num_taps == 4) {
+ v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
+ v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ }
+ } else { // num_taps == 2
+ const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+ v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ }
+ }
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+ __m256i* v_tap) {
+ if (num_taps == 8) {
+ v_tap[0] = _mm256_broadcastw_epi16(*filter); // k1k0
+ v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2
+ v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4
+ v_tap[3] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 6)); // k7k6
+ if (is_2d_vertical) {
+ // placeholder
+ }
+ } else if (num_taps == 6) {
+ v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 1)); // k2k1
+ v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3
+ v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 5)); // k6k5
+ if (is_2d_vertical) {
+ // placeholder
+ }
+ } else if (num_taps == 4) {
+ v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2
+ v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4
+ if (is_2d_vertical) {
+ // placeholder
+ }
+ } else { // num_taps == 2
+ v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3
+ if (is_2d_vertical) {
+ // placeholder
+ }
+ }
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(
+ const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+ const ptrdiff_t dst_stride, const int width, const int height,
+ const int filter_id, const int filter_index) {
+ assert(filter_id != 0);
+ __m128i v_tap[4];
+ const __m128i v_horizontal_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+ if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 8, 4, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 5) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 8, 5, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else { // 2 tap.
+ SetupTaps<2>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<2, 8, 3, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ }
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+ const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+ const ptrdiff_t dst_stride, const int width, const int height,
+ const int filter_id, const int filter_index) {
+ assert(filter_id != 0);
+ __m256i v_tap[4];
+ const __m128i v_horizontal_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+ if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<8, 8, 2, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 1) { // 6 tap.
+ SetupTaps<6>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<6, 8, 1, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 0) { // 6 tap.
+ SetupTaps<6>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<6, 8, 0, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 8, 4, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 5) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 8, 5, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else { // 2 tap.
+ SetupTaps<2>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<2, 8, 3, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ }
+}
+
+void ConvolveHorizontal_AVX2(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int /*vertical_filter_index*/,
+ const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ // Set |src| to the outermost tap.
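+ // kHorizontalOffset == 3 corresponds to the widest (8-tap) filter, which
+ // reaches 3 pixels to the left of the output position.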
+ const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width > 2) {
+ DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+ horizontal_filter_id, filter_index);
+ } else {
+ // Use the non-AVX2 version for smaller widths.
+ DoHorizontalPass2xH(src, reference_stride, dest, pred_stride, width, height,
+ horizontal_filter_id, filter_index);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_AVX2;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void ConvolveInit_AVX2() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_AVX2
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit_AVX2() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_AVX2
diff --git a/src/dsp/x86/convolve_avx2.h b/src/dsp/x86/convolve_avx2.h
new file mode 100644
index 0000000..6179d98
--- /dev/null
+++ b/src/dsp/x86/convolve_avx2.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve; see the defines below for specifics. This
+// function is not thread-safe.
+void ConvolveInit_AVX2();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If AVX2 is enabled and the baseline isn't already set by a higher level of
+// optimization, signal that the AVX2 implementation should be used.
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_AVX2
+#endif
+
+#endif // LIBGAV1_TARGETING_AVX2
+
+#endif // LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc
new file mode 100644
index 0000000..3a0fff5
--- /dev/null
+++ b/src/dsp/x86/convolve_sse4.cc
@@ -0,0 +1,2830 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/convolve.inc"
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1, which keeps the final
+// sum within the range of int16_t.
+template <int filter_index>
+__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
+ __m128i sum;
+ if (filter_index < 2) {
+ // 6 taps.
+ const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]); // k2k1
+ const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]); // k4k3
+ const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]); // k6k5
+ sum = _mm_add_epi16(v_madd_21, v_madd_43);
+ sum = _mm_add_epi16(sum, v_madd_65);
+ } else if (filter_index == 2) {
+ // 8 taps.
+ const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]); // k1k0
+ const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]); // k3k2
+ const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]); // k5k4
+ const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]); // k7k6
+ const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
+ const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
+ sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
+ } else if (filter_index == 3) {
+ // 2 taps.
+ sum = _mm_maddubs_epi16(src[0], taps[0]); // k4k3
+ } else {
+ // 4 taps.
+ const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]); // k3k2
+ const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]); // k5k4
+ sum = _mm_add_epi16(v_madd_32, v_madd_54);
+ }
+ return sum;
+}
+
+template <int filter_index>
+__m128i SumHorizontalTaps(const uint8_t* const src,
+ const __m128i* const v_tap) {
+ __m128i v_src[4];
+ const __m128i src_long = LoadUnaligned16(src);
+ const __m128i src_long_dup_lo = _mm_unpacklo_epi8(src_long, src_long);
+ const __m128i src_long_dup_hi = _mm_unpackhi_epi8(src_long, src_long);
+
+ if (filter_index < 2) {
+ // 6 taps.
+ v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21
+ v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
+ v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65
+ } else if (filter_index == 2) {
+ // 8 taps.
+ v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10
+ v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
+ v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
+ v_src[3] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76
+ } else if (filter_index == 3) {
+ // 2 taps.
+ v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
+ } else if (filter_index > 3) {
+ // 4 taps.
+ v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
+ v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
+ }
+ const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+ return sum;
+}
+
+template <int filter_index>
+__m128i SimpleHorizontalTaps(const uint8_t* const src,
+ const __m128i* const v_tap) {
+ __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+ sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+ return _mm_packus_epi16(sum, sum);
+}
+
+template <int filter_index>
+__m128i HorizontalTaps8To16(const uint8_t* const src,
+ const __m128i* const v_tap) {
+ const __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int filter_index>
+__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ const __m128i input0 = LoadLo8(&src[2]);
+ const __m128i input1 = LoadLo8(&src[2 + src_stride]);
+
+ if (filter_index == 3) {
+ // 03 04 04 05 05 06 06 07 ....
+ const __m128i input0_dup =
+ _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 3);
+ // 13 14 14 15 15 16 16 17 ....
+ const __m128i input1_dup =
+ _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 3);
+ const __m128i v_src_43 = _mm_unpacklo_epi64(input0_dup, input1_dup);
+ const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
+ return v_sum_43;
+ }
+
+ // 02 03 03 04 04 05 05 06 06 07 ....
+ const __m128i input0_dup =
+ _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 1);
+ // 12 13 13 14 14 15 15 16 16 17 ....
+ const __m128i input1_dup =
+ _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 1);
+ // 04 05 05 06 06 07 07 08 ...
+ const __m128i input0_dup_54 = _mm_srli_si128(input0_dup, 4);
+ // 14 15 15 16 16 17 17 18 ...
+ const __m128i input1_dup_54 = _mm_srli_si128(input1_dup, 4);
+ const __m128i v_src_32 = _mm_unpacklo_epi64(input0_dup, input1_dup);
+ const __m128i v_src_54 = _mm_unpacklo_epi64(input0_dup_54, input1_dup_54);
+ const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
+ const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
+ const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
+ return v_sum_5432;
+}
+
+template <int filter_index>
+__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+ sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+ return _mm_packus_epi16(sum, sum);
+}
+
+template <int filter_index>
+__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ const __m128i sum =
+ SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int num_taps, int step, int filter_index, bool is_2d = false,
+ bool is_compound = false>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int width, const int height,
+ const __m128i* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+
+ // 4 tap filters are never used when width > 4.
+ if (num_taps != 4 && width > 4) {
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ if (is_2d || is_compound) {
+ const __m128i v_sum =
+ HorizontalTaps8To16<filter_index>(&src[x], v_tap);
+ if (is_2d) {
+ StoreAligned16(&dest16[x], v_sum);
+ } else {
+ StoreUnaligned16(&dest16[x], v_sum);
+ }
+ } else {
+ const __m128i result =
+ SimpleHorizontalTaps<filter_index>(&src[x], v_tap);
+ StoreLo8(&dest8[x], result);
+ }
+ x += step;
+ } while (x < width);
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (++y < height);
+ return;
+ }
+
+ // Horizontal passes only need to account for |num_taps| 2 and 4 when
+ // |width| <= 4.
+ assert(width <= 4);
+ assert(num_taps <= 4);
+ if (num_taps <= 4) {
+ if (width == 4) {
+ int y = 0;
+ do {
+ if (is_2d || is_compound) {
+ const __m128i v_sum = HorizontalTaps8To16<filter_index>(src, v_tap);
+ StoreLo8(dest16, v_sum);
+ } else {
+ const __m128i result = SimpleHorizontalTaps<filter_index>(src, v_tap);
+ Store4(&dest8[0], result);
+ }
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (++y < height);
+ return;
+ }
+
+ if (!is_compound) {
+ int y = 0;
+ do {
+ if (is_2d) {
+ const __m128i sum =
+ HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+ Store4(&dest16[0], sum);
+ dest16 += pred_stride;
+ Store4(&dest16[0], _mm_srli_si128(sum, 8));
+ dest16 += pred_stride;
+ } else {
+ const __m128i sum =
+ SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ Store2(dest8, sum);
+ dest8 += pred_stride;
+ Store2(dest8, _mm_srli_si128(sum, 4));
+ dest8 += pred_stride;
+ }
+
+ src += src_stride << 1;
+ y += 2;
+ } while (y < height - 1);
+
+ // The 2d filters have an odd |height| because the horizontal pass
+ // generates context for the vertical pass.
+ if (is_2d) {
+ assert(height % 2 == 1);
+ __m128i sum;
+ const __m128i input = LoadLo8(&src[2]);
+ if (filter_index == 3) {
+ // 03 04 04 05 05 06 06 07 ....
+ const __m128i v_src_43 =
+ _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
+ sum = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
+ } else {
+ // 02 03 03 04 04 05 05 06 06 07 ....
+ const __m128i v_src_32 =
+ _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1);
+ // 04 05 05 06 06 07 07 08 ...
+ const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4);
+ const __m128i v_madd_32 =
+ _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
+ const __m128i v_madd_54 =
+ _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
+ sum = _mm_add_epi16(v_madd_54, v_madd_32);
+ }
+ sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+ Store4(dest16, sum);
+ }
+ }
+ }
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+ __m128i* v_tap) {
+ if (num_taps == 8) {
+ v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0
+ v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
+ v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
+ v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+ v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
+ }
+ } else if (num_taps == 6) {
+ const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+ v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1
+ v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
+ v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ }
+ } else if (num_taps == 4) {
+ v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
+ v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ }
+ } else { // num_taps == 2
+ const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+ v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ }
+ }
+}
+
+template <int num_taps, bool is_compound>
+__m128i SimpleSum2DVerticalTaps(const __m128i* const src,
+ const __m128i* const taps) {
+ __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]);
+ __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]);
+ if (num_taps >= 4) {
+ __m128i madd_lo =
+ _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]);
+ __m128i madd_hi =
+ _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]);
+ sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+ if (num_taps >= 6) {
+ madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]);
+ madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]);
+ sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+ if (num_taps == 8) {
+ madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]);
+ madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]);
+ sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+ }
+ }
+ }
+
+ if (is_compound) {
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ RightShiftWithRounding_S32(sum_hi,
+ kInterRoundBitsCompoundVertical - 1));
+ }
+
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+ RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const __m128i* const taps) {
+ assert(width >= 8);
+ constexpr int next_row = num_taps - 1;
+ // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
+ const ptrdiff_t src_stride = width;
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ int x = 0;
+ do {
+ __m128i srcs[8];
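+ // |srcs| is a sliding window over the intermediate rows: the loads below
+ // prime rows [0, num_taps - 2], then each iteration of the |y| loop loads
+ // one new row into srcs[next_row], filters, and shifts the window down by
+ // one row.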
+ const uint16_t* src_x = src + x;
+ srcs[0] = LoadAligned16(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = LoadAligned16(src_x);
+ src_x += src_stride;
+ srcs[2] = LoadAligned16(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = LoadAligned16(src_x);
+ src_x += src_stride;
+ srcs[4] = LoadAligned16(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = LoadAligned16(src_x);
+ src_x += src_stride;
+ srcs[6] = LoadAligned16(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[next_row] = LoadAligned16(src_x);
+ src_x += src_stride;
+
+ const __m128i sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ StoreUnaligned16(dst16 + x + y * dst_stride, sum);
+ } else {
+ StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(sum, sum));
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (++y < height);
+ x += 8;
+ } while (x < width);
+}
+
+// Take advantage of |src_stride| == |width| to process two rows at a time.
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical4xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const __m128i* const taps) {
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ __m128i srcs[9];
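+ // Each aligned 16-byte load holds two consecutive 4-wide rows. Even |srcs|
+ // entries come straight from memory; the odd entries are stitched together
+ // from the neighboring even entries so that srcs[i] always starts at row i.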
+ srcs[0] = LoadAligned16(src);
+ src += 8;
+ if (num_taps >= 4) {
+ srcs[2] = LoadAligned16(src);
+ src += 8;
+ srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]);
+ if (num_taps >= 6) {
+ srcs[4] = LoadAligned16(src);
+ src += 8;
+ srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]);
+ if (num_taps == 8) {
+ srcs[6] = LoadAligned16(src);
+ src += 8;
+ srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]);
+ }
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[num_taps] = LoadAligned16(src);
+ src += 8;
+ srcs[num_taps - 1] = _mm_unpacklo_epi64(
+ _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]);
+
+ const __m128i sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ StoreUnaligned16(dst16, sum);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results = _mm_packus_epi16(sum, sum);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ if (num_taps == 8) {
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ }
+ }
+ }
+ y += 2;
+ } while (y < height);
+}
+
+// Take advantage of |src_stride| == |width| to process four rows at a time.
+template <int num_taps>
+void Filter2DVertical2xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const __m128i* const taps) {
+ constexpr int next_row = (num_taps < 6) ? 4 : 8;
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ __m128i srcs[9];
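+ // Each 16-byte load holds four consecutive 2-wide rows (4 bytes per row),
+ // so _mm_alignr_epi8 with a 4-byte offset advances the window by one row.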
+ srcs[0] = LoadAligned16(src);
+ src += 8;
+ if (num_taps >= 6) {
+ srcs[4] = LoadAligned16(src);
+ src += 8;
+ srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+ if (num_taps == 8) {
+ srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+ srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[next_row] = LoadAligned16(src);
+ src += 8;
+ if (num_taps == 2) {
+ srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+ } else if (num_taps == 4) {
+ srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+ srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+ srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+ } else if (num_taps == 6) {
+ srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+ srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+ srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+ } else if (num_taps == 8) {
+ srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+ srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8);
+ srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12);
+ }
+
+ const __m128i sum =
+ SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
+ const __m128i results = _mm_packus_epi16(sum, sum);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
+ // Therefore we don't need to check this condition when |height| > 4.
+ if (num_taps <= 4 && height == 2) return;
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ if (num_taps == 6) {
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ } else if (num_taps == 8) {
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ }
+
+ y += 4;
+ } while (y < height);
+}
+
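+// Dispatches the horizontal pass to the FilterHorizontal instantiation that
+// matches |filter_index|: 8 tap (2), 6 tap (0 and 1), 4 tap (4 and 5) or
+// 2 tap (3).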
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+ const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+ const ptrdiff_t dst_stride, const int width, const int height,
+ const int filter_id, const int filter_index) {
+ assert(filter_id != 0);
+ __m128i v_tap[4];
+ const __m128i v_horizontal_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+ if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<8, 8, 2, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 1) { // 6 tap.
+ SetupTaps<6>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<6, 8, 1, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 0) { // 6 tap.
+ SetupTaps<6>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<6, 8, 0, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 8, 4, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 5) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 8, 5, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else { // 2 tap.
+ SetupTaps<2>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<2, 8, 3, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ }
+}
+
+void Convolve2D_SSE4_1(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+
+ // The output of the horizontal filter is guaranteed to fit in 16 bits.
+ alignas(16) uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+ const int intermediate_height = height + vertical_taps - 1;
+
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+
+ DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
+ width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+
+ // Vertical filter.
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ __m128i taps[4];
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ }
+}
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// vertical calculations.
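+// A scalar sketch of the rounding applied here; the "- 1" compensates for the
+// halved filter taps used throughout this file:
+//   int bits = kInterRoundBitsHorizontal - 1;
+//   compound_result = (sum + (1 << (bits - 1))) >> bits;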
+__m128i Compound1DShift(const __m128i sum) {
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int filter_index>
+__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
+ __m128i v_src[4];
+
+ if (filter_index < 2) {
+ // 6 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+ } else if (filter_index == 2) {
+ // 8 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+ v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
+ } else if (filter_index == 3) {
+ // 2 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ } else if (filter_index > 3) {
+ // 4 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+ }
+ const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+ return sum;
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int width, const int height,
+ const __m128i* const v_tap) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ const int next_row = num_taps - 1;
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+ assert(width >= 8);
+
+ int x = 0;
+ do {
+ const uint8_t* src_x = src + x;
+ __m128i srcs[8];
+ srcs[0] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[2] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[4] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[6] = LoadLo8(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[next_row] = LoadLo8(src_x);
+ src_x += src_stride;
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16 + x + y * dst_stride, results);
+ } else {
+ const __m128i results =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(results, results));
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (++y < height);
+ x += 8;
+ } while (x < width);
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int height, const __m128i* const v_tap) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ __m128i srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ y += 2;
+ } while (y < height);
+ } else if (num_taps == 4) {
+ srcs[4] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+
+ int y = 0;
+ do {
+ // 30 31 32 33
+ const __m128i b = Load4(src);
+ // 20 21 22 23 30 31 32 33
+ srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+ src += src_stride;
+ // 40 41 42 43
+ srcs[4] = Load4(src);
+ src += src_stride;
+ // 30 31 32 33 40 41 42 43
+ srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ y += 2;
+ } while (y < height);
+ } else if (num_taps == 6) {
+ srcs[6] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+ // 30 31 32 33
+ const __m128i b = Load4(src);
+ // 20 21 22 23 30 31 32 33
+ srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+ src += src_stride;
+ // 40 41 42 43
+ srcs[4] = Load4(src);
+ src += src_stride;
+ // 30 31 32 33 40 41 42 43
+ srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+
+ int y = 0;
+ do {
+ // 50 51 52 53
+ const __m128i c = Load4(src);
+ // 40 41 42 43 50 51 52 53
+ srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
+ src += src_stride;
+ // 60 61 62 63
+ srcs[6] = Load4(src);
+ src += src_stride;
+ // 50 51 52 53 60 61 62 63
+ srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ y += 2;
+ } while (y < height);
+ } else if (num_taps == 8) {
+ srcs[8] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+ // 30 31 32 33
+ const __m128i b = Load4(src);
+ // 20 21 22 23 30 31 32 33
+ srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+ src += src_stride;
+ // 40 41 42 43
+ srcs[4] = Load4(src);
+ src += src_stride;
+ // 30 31 32 33 40 41 42 43
+ srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+ // 50 51 52 53
+ const __m128i c = Load4(src);
+ // 40 41 42 43 50 51 52 53
+ srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
+ src += src_stride;
+ // 60 61 62 63
+ srcs[6] = Load4(src);
+ src += src_stride;
+ // 50 51 52 53 60 61 62 63
+ srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
+
+ int y = 0;
+ do {
+ // 70 71 72 73
+ const __m128i d = Load4(src);
+ // 60 61 62 63 70 71 72 73
+ srcs[6] = _mm_unpacklo_epi32(srcs[6], d);
+ src += src_stride;
+ // 80 81 82 83
+ srcs[8] = Load4(src);
+ src += src_stride;
+ // 70 71 72 73 80 81 82 83
+ srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ y += 2;
+ } while (y < height);
+ }
+}
+
+template <int filter_index, bool negative_outside_taps = false>
+void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int height, const __m128i* const v_tap) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ __m128i srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = _mm_setzero_si128();
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[2] = Load2<0>(src, srcs[2]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41
+ const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_2, 2);
+ // This uses srcs[0]..srcs[1].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[2];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 4) {
+ srcs[4] = _mm_setzero_si128();
+
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[4] = Load2<0>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_4, 2);
+ // 20 21 30 31 40 41 50 51
+ srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+ // 30 31 40 41 50 51 60 61
+ srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+
+ // This uses srcs[0]..srcs[3].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 6) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = _mm_setzero_si128();
+
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[4] = Load2(src);
+ src += src_stride;
+    // 00 01 10 11 20 21 30 31 40 41
+ const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_4x, 2);
+
+ int y = 0;
+ do {
+ // 40 41 50 51
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61 70 71
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ // 80 81
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 20 21 30 31 40 41 50 51
+ srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+ // 30 31 40 41 50 51 60 61
+ srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+ const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
+ // 50 51 60 61 70 71 80 81
+ srcs[5] = _mm_srli_si128(srcs_4_8, 2);
+
+ // This uses srcs[0]..srcs[5].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 8) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = _mm_setzero_si128();
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[4] = Load2(src);
+ src += src_stride;
+ // 40 41 50 51
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_4, 2);
+ // 20 21 30 31 40 41 50 51
+ srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+ // 30 31 40 41 50 51 60 61
+ srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+
+ int y = 0;
+ do {
+ // 40 41 50 51 60 61 70 71
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ // 80 81
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+ // 80 81 90 91
+ srcs[8] = Load2<1>(src, srcs[8]);
+ src += src_stride;
+ // 80 81 90 91 a0 a1
+ srcs[8] = Load2<2>(src, srcs[8]);
+ src += src_stride;
+
+ // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1
+ const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
+ // 50 51 60 61 70 71 80 81
+ srcs[5] = _mm_srli_si128(srcs_4_8, 2);
+ // 60 61 70 71 80 81 90 91
+ srcs[6] = _mm_srli_si128(srcs_4_8, 4);
+ // 70 71 80 81 90 91 a0 a1
+ srcs[7] = _mm_srli_si128(srcs_4_8, 6);
+
+ // This uses srcs[0]..srcs[7].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ y += 4;
+ } while (y < height);
+ }
+}
+
+void ConvolveVertical_SSE4_1(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int vertical_filter_index,
+ const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ __m128i taps[4];
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+ if (filter_index < 2) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else {
+ // TODO(slavarnway): Investigate adding |filter_index| == 1 special cases.
+ // See convolve_neon.cc
+ SetupTaps<4>(&v_filter, taps);
+
+ if (width == 2) {
+ FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ }
+}
+
+void ConvolveCompoundCopy_SSE4(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/,
+ const int width, const int height,
+ void* prediction, const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ const ptrdiff_t src_stride = reference_stride;
+ auto* dest = static_cast<uint16_t*>(prediction);
+ constexpr int kRoundBitsVertical =
+ kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
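+  // A scalar sketch of the copy (assuming the 8-bpp constants, where
+  // kInterRoundBitsVertical - kInterRoundBitsCompoundVertical == 4):
+  //   dest[x] = src[x] << 4;
+  // which places the plain copy on the same scale as the filtered compound
+  // predictions produced elsewhere in this file.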
+ if (width >= 16) {
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&src[x]);
+ const __m128i v_src_ext_lo = _mm_cvtepu8_epi16(v_src);
+ const __m128i v_src_ext_hi =
+ _mm_cvtepu8_epi16(_mm_srli_si128(v_src, 8));
+ const __m128i v_dest_lo =
+ _mm_slli_epi16(v_src_ext_lo, kRoundBitsVertical);
+ const __m128i v_dest_hi =
+ _mm_slli_epi16(v_src_ext_hi, kRoundBitsVertical);
+ // TODO(slavarnway): Investigate using aligned stores.
+ StoreUnaligned16(&dest[x], v_dest_lo);
+ StoreUnaligned16(&dest[x + 8], v_dest_hi);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ dest += pred_stride;
+ } while (--y != 0);
+ } else if (width == 8) {
+ int y = height;
+ do {
+ const __m128i v_src = LoadLo8(&src[0]);
+ const __m128i v_src_ext = _mm_cvtepu8_epi16(v_src);
+ const __m128i v_dest = _mm_slli_epi16(v_src_ext, kRoundBitsVertical);
+ StoreUnaligned16(&dest[0], v_dest);
+ src += src_stride;
+ dest += pred_stride;
+ } while (--y != 0);
+ } else { /* width == 4 */
+ int y = height;
+ do {
+ const __m128i v_src0 = Load4(&src[0]);
+ const __m128i v_src1 = Load4(&src[src_stride]);
+ const __m128i v_src = _mm_unpacklo_epi32(v_src0, v_src1);
+ const __m128i v_src_ext = _mm_cvtepu8_epi16(v_src);
+ const __m128i v_dest = _mm_slli_epi16(v_src_ext, kRoundBitsVertical);
+ StoreLo8(&dest[0], v_dest);
+ StoreHi8(&dest[pred_stride], v_dest);
+ src += src_stride * 2;
+ dest += pred_stride * 2;
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+void ConvolveCompoundVertical_SSE4_1(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int vertical_filter_index,
+ const int /*horizontal_filter_id*/, const int vertical_filter_id,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* dest = static_cast<uint16_t*>(prediction);
+ assert(vertical_filter_id != 0);
+
+ __m128i taps[4];
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+ if (filter_index < 2) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps);
+ if (width == 4) {
+ FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps);
+
+ if (width == 4) {
+ FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps);
+
+ if (width == 4) {
+ FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_filter, taps);
+
+ if (width == 4) {
+ FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else {
+ SetupTaps<4>(&v_filter, taps);
+
+ if (width == 4) {
+ FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ }
+}
+
+void ConvolveHorizontal_SSE4_1(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int /*vertical_filter_index*/,
+ const int horizontal_filter_id,
+ const int /*vertical_filter_id*/,
+ const int width, const int height,
+ void* prediction, const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ // Set |src| to the outermost tap.
+ const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+ horizontal_filter_id, filter_index);
+}
+
+void ConvolveCompoundHorizontal_SSE4_1(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int horizontal_filter_index, const int /*vertical_filter_index*/,
+ const int horizontal_filter_id, const int /*vertical_filter_id*/,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* dest = static_cast<uint16_t*>(prediction);
+
+ DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
+ src, reference_stride, dest, width, width, height, horizontal_filter_id,
+ filter_index);
+}
+
+void ConvolveCompound2D_SSE4_1(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ alignas(16) uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [3, 5].
+ // Similarly for height.
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+ const int intermediate_height = height + vertical_taps - 1;
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* const src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride -
+ kHorizontalOffset;
+
+ DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+ src, src_stride, intermediate_result, width, width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+
+ // Vertical filter.
+ auto* dest = static_cast<uint16_t*>(prediction);
+ assert(vertical_filter_id != 0);
+
+ const ptrdiff_t dest_stride = width;
+ __m128i taps[4];
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<8, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<6, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<4, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<2, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ }
+}
+
+// Pre-transposed filters.
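+// Each row of the tables below holds one filter tap across all 16 sub-pixel
+// phases (the columns of the original filter tables), so a single pshufb
+// with the per-pixel phase id selects that tap's coefficient for every lane.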
+template <int filter_index>
+inline void GetHalfSubPixelFilter(__m128i* output) {
+ // Filter 0
+ alignas(
+ 16) static constexpr int8_t kHalfSubPixel6TapSignedFilterColumns[6][16] =
+ {{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
+ {0, -3, -5, -6, -7, -7, -8, -7, -7, -6, -6, -6, -5, -4, -2, -1},
+ {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+ {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+ {0, -1, -2, -4, -5, -6, -6, -6, -7, -7, -8, -7, -7, -6, -5, -3},
+ {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+ // Filter 1
+ alignas(16) static constexpr int8_t
+ kHalfSubPixel6TapMixedSignedFilterColumns[6][16] = {
+ {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
+ {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
+ {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+ {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+ {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14},
+ {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
+ // Filter 2
+ alignas(
+ 16) static constexpr int8_t kHalfSubPixel8TapSignedFilterColumns[8][16] =
+ {{0, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, 0},
+ {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
+ {0, -3, -6, -9, -11, -11, -12, -12, -12, -11, -10, -9, -7, -5, -3, -1},
+ {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
+ {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
+ {0, -1, -3, -5, -7, -9, -10, -11, -12, -12, -12, -11, -11, -9, -6, -3},
+ {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
+ {0, 0, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1}};
+ // Filter 3
+ alignas(16) static constexpr uint8_t kHalfSubPixel2TapFilterColumns[2][16] = {
+ {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
+ {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
+ // Filter 4
+ alignas(
+ 16) static constexpr int8_t kHalfSubPixel4TapSignedFilterColumns[4][16] =
+ {{0, -2, -4, -5, -6, -6, -7, -6, -6, -5, -5, -5, -4, -3, -2, -1},
+ {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+ {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+ {0, -1, -2, -3, -4, -5, -5, -5, -6, -6, -7, -6, -6, -5, -4, -2}};
+ // Filter 5
+ alignas(
+ 16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
+ {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
+ {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+ {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+ {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
+ switch (filter_index) {
+ case 0:
+ output[0] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[1]);
+ output[2] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[2]);
+ output[3] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[3]);
+ output[4] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[4]);
+ output[5] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[5]);
+ break;
+ case 1:
+ // The term "mixed" refers to the fact that the outer taps have a mix of
+ // negative and positive values.
+ output[0] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[1]);
+ output[2] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[2]);
+ output[3] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[3]);
+ output[4] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[4]);
+ output[5] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[5]);
+ break;
+ case 2:
+ output[0] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[1]);
+ output[2] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[2]);
+ output[3] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[3]);
+ output[4] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[4]);
+ output[5] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[5]);
+ output[6] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[6]);
+ output[7] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[7]);
+ break;
+ case 3:
+ output[0] = LoadAligned16(kHalfSubPixel2TapFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel2TapFilterColumns[1]);
+ break;
+ case 4:
+ output[0] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[1]);
+ output[2] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[2]);
+ output[3] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[3]);
+ break;
+ default:
+ assert(filter_index == 5);
+ output[0] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[0]);
+ output[1] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[1]);
+ output[2] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[2]);
+ output[3] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[3]);
+ break;
+ }
+}
+
+// There are many opportunities for overreading in scaled convolve, because
+// the range of starting points for filter windows is anywhere from 0 to 16
+// for 8 destination pixels, and the window sizes range from 2 to 8. To
+// accommodate this range concisely, we use |grade_x| to mean the most steps
+// in src that can be traversed in a single |step_x| increment, i.e. 1 or 2.
+// More importantly, |grade_x| answers the question "how many vector loads are
+// needed to cover the source values?"
+// When |grade_x| == 1, the maximum number of source values needed is 8 separate
+// starting positions plus 7 more to cover taps, all fitting into 16 bytes.
+// When |grade_x| > 1, we are guaranteed to exceed 8 whole steps in src for
+// every 8 |step_x| increments, on top of 8 possible taps. The first load covers
+// the starting sources for each kernel, while the final load covers the taps.
+// Since the offset value of src_x cannot exceed 8 and |num_taps| does not
+// exceed 4 when width <= 4, |grade_x| is set to 1 regardless of the value of
+// |step_x|.
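+// A sketch of the two load patterns used below: with |grade_x| == 1 a single
+// LoadUnaligned16() feeds every tap (each subsequent pair of taps reuses the
+// same 16 bytes shifted right by another 2 bytes), while with |grade_x| == 2
+// an extra LoadLo8(src + 16) is stitched on with _mm_alignr_epi8 so the
+// filters can reach source bytes up to offset 23.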
+template <int num_taps, int grade_x>
+inline void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices,
+ __m128i* const source /*[num_taps >> 1]*/) {
+ const __m128i src_vals = LoadUnaligned16(src);
+ source[0] = _mm_shuffle_epi8(src_vals, src_indices);
+ if (grade_x == 1) {
+ if (num_taps > 2) {
+ source[1] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 2), src_indices);
+ }
+ if (num_taps > 4) {
+ source[2] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 4), src_indices);
+ }
+ if (num_taps > 6) {
+ source[3] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 6), src_indices);
+ }
+ } else {
+ assert(grade_x > 1);
+ assert(num_taps != 4);
+    // |grade_x| > 1 also implies |width| >= 8 && |num_taps| != 4.
+ const __m128i src_vals_ext = LoadLo8(src + 16);
+ if (num_taps > 2) {
+ source[1] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 2),
+ src_indices);
+ source[2] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 4),
+ src_indices);
+ }
+ if (num_taps > 6) {
+ source[3] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 6),
+ src_indices);
+ }
+ }
+}
+
+template <int num_taps>
+inline void PrepareHorizontalTaps(const __m128i subpel_indices,
+ const __m128i* filter_taps,
+ __m128i* out_taps) {
+ const __m128i scale_index_offsets =
+ _mm_srli_epi16(subpel_indices, kFilterIndexShift);
+ const __m128i filter_index_mask = _mm_set1_epi8(kSubPixelMask);
+ const __m128i filter_indices =
+ _mm_and_si128(_mm_packus_epi16(scale_index_offsets, scale_index_offsets),
+ filter_index_mask);
+ // Line up taps for maddubs_epi16.
+ // The unpack is also assumed to be lighter than shift+alignr.
+ for (int k = 0; k < (num_taps >> 1); ++k) {
+ const __m128i taps0 = _mm_shuffle_epi8(filter_taps[2 * k], filter_indices);
+ const __m128i taps1 =
+ _mm_shuffle_epi8(filter_taps[2 * k + 1], filter_indices);
+ out_taps[k] = _mm_unpacklo_epi8(taps0, taps1);
+ }
+}
+
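+// Returns, for each of the eight output pixels, the byte offset of its first
+// source sample within the 16-byte load, interleaved with that offset + 1.
+// The result is the pshufb mask PrepareSourceVectors() uses to pair adjacent
+// samples for maddubs.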
+inline __m128i HorizontalScaleIndices(const __m128i subpel_indices) {
+ const __m128i src_indices16 =
+ _mm_srli_epi16(subpel_indices, kScaleSubPixelBits);
+ const __m128i src_indices = _mm_packus_epi16(src_indices16, src_indices16);
+ return _mm_unpacklo_epi8(src_indices,
+ _mm_add_epi8(src_indices, _mm_set1_epi8(1)));
+}
+
+template <int grade_x, int filter_index, int num_taps>
+inline void ConvolveHorizontalScale(const uint8_t* src, ptrdiff_t src_stride,
+ int width, int subpixel_x, int step_x,
+ int intermediate_height,
+ int16_t* intermediate) {
+  // Account for the 0-taps that precede the nonzero taps.
+ const int kernel_offset = (8 - num_taps) >> 1;
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ __m128i filter_taps[num_taps];
+ GetHalfSubPixelFilter<filter_index>(filter_taps);
+ const __m128i index_steps =
+ _mm_mullo_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
+ _mm_set1_epi16(static_cast<int16_t>(step_x)));
+
+ __m128i taps[num_taps >> 1];
+ __m128i source[num_taps >> 1];
+ int p = subpixel_x;
+  // Filter indices >= 3 are the only ones for which |width| <= 4 is possible.
+ if (filter_index >= 3) {
+ if (filter_index > 3 || width <= 4) {
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const __m128i p_fraction = _mm_set1_epi16(p & 1023);
+ const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
+ PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps);
+ const __m128i packed_indices = HorizontalScaleIndices(subpel_indices);
+
+ int y = intermediate_height;
+ do {
+ // Load and line up source values with the taps. Width 4 means no need
+ // to load extended source.
+ PrepareSourceVectors<num_taps, /*grade_x=*/1>(src_x, packed_indices,
+ source);
+
+ StoreLo8(intermediate, RightShiftWithRounding_S16(
+ SumOnePassTaps<filter_index>(source, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate += kIntermediateStride;
+ } while (--y != 0);
+ return;
+ }
+ }
+
+ // |width| >= 8
+ int x = 0;
+ do {
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ int16_t* intermediate_x = intermediate + x;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const __m128i p_fraction = _mm_set1_epi16(p & 1023);
+ const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
+ PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps);
+ const __m128i packed_indices = HorizontalScaleIndices(subpel_indices);
+
+ int y = intermediate_height;
+ do {
+      // For each x, a lane of source[k] contains src_x[k].
+ PrepareSourceVectors<num_taps, grade_x>(src_x, packed_indices, source);
+
+ // Shift by one less because the taps are halved.
+ StoreAligned16(
+ intermediate_x,
+ RightShiftWithRounding_S16(SumOnePassTaps<filter_index>(source, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate_x += kIntermediateStride;
+ } while (--y != 0);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+template <int num_taps>
+inline void PrepareVerticalTaps(const int8_t* taps, __m128i* output) {
+ // Avoid overreading the filter due to starting at kernel_offset.
+ // The only danger of overread is in the final filter, which has 4 taps.
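+  // For example, with |num_taps| == 4 the |taps| pointer already starts at
+  // kernel_offset == 2 within the 8-entry filter row, so a LoadLo8() would
+  // read two bytes past the row; Load4() keeps the read in bounds.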
+ const __m128i filter =
+ _mm_cvtepi8_epi16((num_taps > 4) ? LoadLo8(taps) : Load4(taps));
+ output[0] = _mm_shuffle_epi32(filter, 0);
+ if (num_taps > 2) {
+ output[1] = _mm_shuffle_epi32(filter, 0x55);
+ }
+ if (num_taps > 4) {
+ output[2] = _mm_shuffle_epi32(filter, 0xAA);
+ }
+ if (num_taps > 6) {
+ output[3] = _mm_shuffle_epi32(filter, 0xFF);
+ }
+}
+
+// Process eight 16 bit inputs and output eight 16 bit values.
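+// A scalar sketch of one output lane, where row[k] is the k-th intermediate
+// row sample above the output (the "- 1" again reflects the halved taps):
+//   int32_t sum = 0;
+//   for (int k = 0; k < num_taps; ++k) sum += row[k] * taps[k];
+//   const int shift = (is_compound ? kInterRoundBitsCompoundVertical
+//                                  : kInterRoundBitsVertical) - 1;
+//   output = Clip3(RightShiftWithRounding(sum, shift), INT16_MIN, INT16_MAX);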
+template <int num_taps, bool is_compound>
+inline __m128i Sum2DVerticalTaps(const __m128i* const src,
+ const __m128i* taps) {
+ const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]);
+ __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps[0]);
+ const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]);
+ __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps[0]);
+ if (num_taps > 2) {
+ const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps[1]));
+ const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps[1]));
+ }
+ if (num_taps > 4) {
+ const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps[2]));
+ const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps[2]));
+ }
+ if (num_taps > 6) {
+ const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps[3]));
+ const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps[3]));
+ }
+ if (is_compound) {
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ RightShiftWithRounding_S32(sum_hi,
+ kInterRoundBitsCompoundVertical - 1));
+ }
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+ RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+// Bottom half of each src[k] is the source for one filter, and the top half
+// is the source for the other filter, for the next destination row.
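+// In the scaled vertical pass each destination row may use a different filter
+// phase, which is why there are separate |taps_lo| (low halves, first row)
+// and |taps_hi| (high halves, second row) coefficient sets here.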
+template <int num_taps, bool is_compound>
+__m128i Sum2DVerticalTaps4x2(const __m128i* const src, const __m128i* taps_lo,
+ const __m128i* taps_hi) {
+ const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]);
+ __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps_lo[0]);
+ const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]);
+ __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps_hi[0]);
+ if (num_taps > 2) {
+ const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps_lo[1]));
+ const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps_hi[1]));
+ }
+ if (num_taps > 4) {
+ const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps_lo[2]));
+ const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps_hi[2]));
+ }
+ if (num_taps > 6) {
+ const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps_lo[3]));
+ const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps_hi[3]));
+ }
+
+ if (is_compound) {
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ RightShiftWithRounding_S32(sum_hi,
+ kInterRoundBitsCompoundVertical - 1));
+ }
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+ RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+// |width_class| is 2, 4, or 8, according to the Store function that should be
+// used.
+template <int num_taps, int width_class, bool is_compound>
+#if LIBGAV1_MSAN
+__attribute__((no_sanitize_memory)) void ConvolveVerticalScale(
+#else
+inline void ConvolveVerticalScale(
+#endif
+ const int16_t* src, const int width, const int subpixel_y,
+ const int filter_index, const int step_y, const int height, void* dest,
+ const ptrdiff_t dest_stride) {
+ constexpr ptrdiff_t src_stride = kIntermediateStride;
+ constexpr int kernel_offset = (8 - num_taps) / 2;
+ const int16_t* src_y = src;
+ // |dest| is 16-bit in compound mode, Pixel otherwise.
+ auto* dest16_y = static_cast<uint16_t*>(dest);
+ auto* dest_y = static_cast<uint8_t*>(dest);
+ __m128i s[num_taps];
+
+ int p = subpixel_y & 1023;
+ int y = height;
+ if (width_class <= 4) {
+ __m128i filter_taps_lo[num_taps >> 1];
+ __m128i filter_taps_hi[num_taps >> 1];
+ do { // y > 0
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = LoadLo8(src_y + i * src_stride);
+ }
+ int filter_id = (p >> 6) & kSubPixelMask;
+ const int8_t* filter0 =
+ kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+ PrepareVerticalTaps<num_taps>(filter0, filter_taps_lo);
+ p += step_y;
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = LoadHi8(s[i], src_y + i * src_stride);
+ }
+ filter_id = (p >> 6) & kSubPixelMask;
+ const int8_t* filter1 =
+ kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+ PrepareVerticalTaps<num_taps>(filter1, filter_taps_hi);
+ p += step_y;
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+
+ const __m128i sums = Sum2DVerticalTaps4x2<num_taps, is_compound>(
+ s, filter_taps_lo, filter_taps_hi);
+ if (is_compound) {
+ assert(width_class > 2);
+ StoreLo8(dest16_y, sums);
+ dest16_y += dest_stride;
+ StoreHi8(dest16_y, sums);
+ dest16_y += dest_stride;
+ } else {
+ const __m128i result = _mm_packus_epi16(sums, sums);
+ if (width_class == 2) {
+ Store2(dest_y, result);
+ dest_y += dest_stride;
+ Store2(dest_y, _mm_srli_si128(result, 4));
+ } else {
+ Store4(dest_y, result);
+ dest_y += dest_stride;
+ Store4(dest_y, _mm_srli_si128(result, 4));
+ }
+ dest_y += dest_stride;
+ }
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ // |width_class| >= 8
+ __m128i filter_taps[num_taps >> 1];
+ do { // y > 0
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+ const int filter_id = (p >> 6) & kSubPixelMask;
+ const int8_t* filter =
+ kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+ PrepareVerticalTaps<num_taps>(filter, filter_taps);
+
+ int x = 0;
+ do { // x < width
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = LoadUnaligned16(src_y + i * src_stride);
+ }
+
+ const __m128i sums =
+ Sum2DVerticalTaps<num_taps, is_compound>(s, filter_taps);
+ if (is_compound) {
+ StoreUnaligned16(dest16_y + x, sums);
+ } else {
+ StoreLo8(dest_y + x, _mm_packus_epi16(sums, sums));
+ }
+ x += 8;
+ src_y += 8;
+ } while (x < width);
+ p += step_y;
+ dest_y += dest_stride;
+ dest16_y += dest_stride;
+ } while (--y != 0);
+}
+
+template <bool is_compound>
+void ConvolveScale2D_SSE4_1(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int subpixel_x, const int subpixel_y,
+ const int step_x, const int step_y, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ assert(step_x <= 2048);
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ // TODO(petersonab): Reduce intermediate block stride to width to make smaller
+ // blocks faster.
+ alignas(16) int16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (2 * kMaxSuperBlockSizeInPixels + kSubPixelTaps)];
+ const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
+ const int intermediate_height =
+ (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+ kScaleSubPixelBits) +
+ num_vert_taps;
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [3, 5].
+ // Similarly for height.
+ int16_t* intermediate = intermediate_result;
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference);
+ const int vert_kernel_offset = (8 - num_vert_taps) / 2;
+ src += vert_kernel_offset * src_stride;
+
+  // Derive the maximum value of |step_x| at which all source values fit in
+  // one 16-byte load, i.e. the final index src_x + |num_taps| - 1 must stay
+  // below 16.
+  // |step_x| * 7 is the final base sub-pixel index for the shuffle mask for
+  // filter inputs in each iteration on large blocks. When |step_x| is large,
+  // a second register and alignr are required to gather all filter inputs.
+  // |num_taps| - 1 is the offset for the shuffle of inputs to the final tap.
+ const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+ const int kernel_start_ceiling = 16 - num_horiz_taps;
+ // This truncated quotient |grade_x_threshold| selects |step_x| such that:
+ // (step_x * 7) >> kScaleSubPixelBits < single load limit
+ const int grade_x_threshold =
+ (kernel_start_ceiling << kScaleSubPixelBits) / 7;
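+  // For example, with 6-tap horizontal filters |kernel_start_ceiling| is
+  // 16 - 6 = 10, so any |step_x| up to (10 << kScaleSubPixelBits) / 7 keeps
+  // the single-load (|grade_x| == 1) path.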
+ switch (horiz_filter_index) {
+ case 0:
+ if (step_x > grade_x_threshold) {
+ ConvolveHorizontalScale<2, 0, 6>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveHorizontalScale<1, 0, 6>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 1:
+ if (step_x > grade_x_threshold) {
+ ConvolveHorizontalScale<2, 1, 6>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+
+ } else {
+ ConvolveHorizontalScale<1, 1, 6>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 2:
+ if (step_x > grade_x_threshold) {
+ ConvolveHorizontalScale<2, 2, 8>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveHorizontalScale<1, 2, 8>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 3:
+ if (step_x > grade_x_threshold) {
+ ConvolveHorizontalScale<2, 3, 2>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveHorizontalScale<1, 3, 2>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 4:
+ assert(width <= 4);
+ ConvolveHorizontalScale<1, 4, 4>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ break;
+ default:
+ assert(horiz_filter_index == 5);
+ assert(width <= 4);
+ ConvolveHorizontalScale<1, 5, 4>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+
+ // Vertical filter.
+ intermediate = intermediate_result;
+ switch (vert_filter_index) {
+ case 0:
+ case 1:
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale<6, 2, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale<6, 4, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<6, 8, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ break;
+ case 2:
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale<8, 2, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale<8, 4, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<8, 8, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ break;
+ case 3:
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale<2, 2, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale<2, 4, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<2, 8, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ break;
+ default:
+ assert(vert_filter_index == 4 || vert_filter_index == 5);
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale<4, 2, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale<4, 4, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<4, 8, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ }
+}
+
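+// _mm_avg_epu8 computes (a + b + 1) >> 1, so this is the rounding average of
+// each pixel with its right neighbor, i.e. the half-pel horizontal filter used
+// by the intra block copy functions below.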
+inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
+ const __m128i left = LoadUnaligned16(src);
+ const __m128i right = LoadUnaligned16(src + 1);
+ StoreUnaligned16(dst, _mm_avg_epu8(left, right));
+}
+
+template <int width>
+inline void IntraBlockCopyHorizontal(const uint8_t* src,
+ const ptrdiff_t src_stride,
+ const int height, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+
+ int y = height;
+ do {
+ HalfAddHorizontal(src, dst);
+ if (width >= 32) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ if (width >= 64) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ if (width == 128) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ }
+ }
+ }
+ src += src_remainder_stride;
+ dst += dst_remainder_stride;
+ } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyHorizontal_SSE4_1(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+ const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
+ const int height, void* const prediction, const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width == 128) {
+ IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 8) {
+ int y = height;
+ do {
+ const __m128i left = LoadLo8(src);
+ const __m128i right = LoadLo8(src + 1);
+ StoreLo8(dest, _mm_avg_epu8(left, right));
+
+ src += reference_stride;
+ dest += pred_stride;
+ } while (--y != 0);
+ } else if (width == 4) {
+ int y = height;
+ do {
+ __m128i left = Load4(src);
+ __m128i right = Load4(src + 1);
+ src += reference_stride;
+ left = _mm_unpacklo_epi32(left, Load4(src));
+ right = _mm_unpacklo_epi32(right, Load4(src + 1));
+ src += reference_stride;
+
+ const __m128i result = _mm_avg_epu8(left, right);
+
+ Store4(dest, result);
+ dest += pred_stride;
+ Store4(dest, _mm_srli_si128(result, 4));
+ dest += pred_stride;
+ y -= 2;
+ } while (y != 0);
+ } else {
+ assert(width == 2);
+ __m128i left = _mm_setzero_si128();
+ __m128i right = _mm_setzero_si128();
+ int y = height;
+ do {
+ left = Load2<0>(src, left);
+ right = Load2<0>(src + 1, right);
+ src += reference_stride;
+ left = Load2<1>(src, left);
+ right = Load2<1>(src + 1, right);
+ src += reference_stride;
+
+ const __m128i result = _mm_avg_epu8(left, right);
+
+ Store2(dest, result);
+ dest += pred_stride;
+ Store2(dest, _mm_srli_si128(result, 2));
+ dest += pred_stride;
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
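+// Vertical half-pel intra block copy: each output row is the rounding average
+// of a source row and the row below it. The previously loaded bottom row is
+// carried over in |row| so every source row is loaded only once.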
+template <int width>
+inline void IntraBlockCopyVertical(const uint8_t* src,
+ const ptrdiff_t src_stride, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+ __m128i row[8], below[8];
+
+ row[0] = LoadUnaligned16(src);
+ if (width >= 32) {
+ src += 16;
+ row[1] = LoadUnaligned16(src);
+ if (width >= 64) {
+ src += 16;
+ row[2] = LoadUnaligned16(src);
+ src += 16;
+ row[3] = LoadUnaligned16(src);
+ if (width == 128) {
+ src += 16;
+ row[4] = LoadUnaligned16(src);
+ src += 16;
+ row[5] = LoadUnaligned16(src);
+ src += 16;
+ row[6] = LoadUnaligned16(src);
+ src += 16;
+ row[7] = LoadUnaligned16(src);
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ int y = height;
+ do {
+ below[0] = LoadUnaligned16(src);
+ if (width >= 32) {
+ src += 16;
+ below[1] = LoadUnaligned16(src);
+ if (width >= 64) {
+ src += 16;
+ below[2] = LoadUnaligned16(src);
+ src += 16;
+ below[3] = LoadUnaligned16(src);
+ if (width == 128) {
+ src += 16;
+ below[4] = LoadUnaligned16(src);
+ src += 16;
+ below[5] = LoadUnaligned16(src);
+ src += 16;
+ below[6] = LoadUnaligned16(src);
+ src += 16;
+ below[7] = LoadUnaligned16(src);
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ StoreUnaligned16(dst, _mm_avg_epu8(row[0], below[0]));
+ row[0] = below[0];
+ if (width >= 32) {
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[1], below[1]));
+ row[1] = below[1];
+ if (width >= 64) {
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[2], below[2]));
+ row[2] = below[2];
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[3], below[3]));
+ row[3] = below[3];
+        if (width == 128) {
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[4], below[4]));
+ row[4] = below[4];
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[5], below[5]));
+ row[5] = below[5];
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[6], below[6]));
+ row[6] = below[6];
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[7], below[7]));
+ row[7] = below[7];
+ }
+ }
+ }
+ dst += dst_remainder_stride;
+ } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyVertical_SSE4_1(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+ const int width, const int height, void* const prediction,
+ const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width == 128) {
+ IntraBlockCopyVertical<128>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopyVertical<64>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopyVertical<32>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopyVertical<16>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 8) {
+ __m128i row, below;
+ row = LoadLo8(src);
+ src += reference_stride;
+
+ int y = height;
+ do {
+ below = LoadLo8(src);
+ src += reference_stride;
+
+ StoreLo8(dest, _mm_avg_epu8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (--y != 0);
+ } else if (width == 4) {
+ __m128i row = Load4(src);
+ src += reference_stride;
+
+ int y = height;
+ do {
+ __m128i below = Load4(src);
+ src += reference_stride;
+
+ Store4(dest, _mm_avg_epu8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (--y != 0);
+ } else {
+ assert(width == 2);
+ __m128i row = Load2(src);
+ __m128i below = _mm_setzero_si128();
+ src += reference_stride;
+
+ int y = height;
+ do {
+ below = Load2<0>(src, below);
+ src += reference_stride;
+
+ Store2(dest, _mm_avg_epu8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (--y != 0);
+ }
+}
+
+// Loads two groups of 8 uint8_t values, widens them to uint16_t, and returns
+// their sum.
+inline __m128i LoadU8AndAddLong(const uint8_t* src, const uint8_t* src1) {
+ const __m128i a = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i b = _mm_cvtepu8_epi16(LoadLo8(src1));
+ return _mm_add_epi16(a, b);
+}
+
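+// Adds two vectors of 16-bit pair sums and scales back to 8-bit pixels. The
+// two shift-with-round steps below are equivalent to (v0 + v1 + 2) >> 2.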
+inline __m128i AddU16RightShift2AndPack(__m128i v0, __m128i v1) {
+ const __m128i a = _mm_add_epi16(v0, v1);
+ const __m128i b = _mm_srli_epi16(a, 1);
+ // Use avg here to shift right by 1 with round.
+ const __m128i c = _mm_avg_epu16(b, _mm_setzero_si128());
+ return _mm_packus_epi16(c, c);
+}
+
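+// 2D half-pel intra block copy: horizontal pair sums are kept as 16-bit values
+// in |row| and reused as the top input when the next source row is processed.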
+template <int width>
+inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
+ const int height, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
+ __m128i row[16];
+ row[0] = LoadU8AndAddLong(src, src + 1);
+ if (width >= 16) {
+ src += 8;
+ row[1] = LoadU8AndAddLong(src, src + 1);
+ if (width >= 32) {
+ src += 8;
+ row[2] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[3] = LoadU8AndAddLong(src, src + 1);
+ if (width >= 64) {
+ src += 8;
+ row[4] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[5] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[6] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[7] = LoadU8AndAddLong(src, src + 1);
+ if (width == 128) {
+ src += 8;
+ row[8] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[9] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[10] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[11] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[12] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[13] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[14] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[15] = LoadU8AndAddLong(src, src + 1);
+ }
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ int y = height;
+ do {
+ const __m128i below_0 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[0], below_0));
+ row[0] = below_0;
+ if (width >= 16) {
+ src += 8;
+ dst += 8;
+
+ const __m128i below_1 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[1], below_1));
+ row[1] = below_1;
+ if (width >= 32) {
+ src += 8;
+ dst += 8;
+
+ const __m128i below_2 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[2], below_2));
+ row[2] = below_2;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_3 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[3], below_3));
+ row[3] = below_3;
+ if (width >= 64) {
+ src += 8;
+ dst += 8;
+
+ const __m128i below_4 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[4], below_4));
+ row[4] = below_4;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_5 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[5], below_5));
+ row[5] = below_5;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_6 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[6], below_6));
+ row[6] = below_6;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_7 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[7], below_7));
+ row[7] = below_7;
+ if (width == 128) {
+ src += 8;
+ dst += 8;
+
+ const __m128i below_8 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[8], below_8));
+ row[8] = below_8;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_9 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[9], below_9));
+ row[9] = below_9;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_10 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[10], below_10));
+ row[10] = below_10;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_11 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[11], below_11));
+ row[11] = below_11;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_12 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[12], below_12));
+ row[12] = below_12;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_13 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[13], below_13));
+ row[13] = below_13;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_14 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[14], below_14));
+ row[14] = below_14;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_15 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[15], below_15));
+ row[15] = below_15;
+ }
+ }
+ }
+ }
+ src += src_remainder_stride;
+ dst += dst_remainder_stride;
+ } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopy2D_SSE4_1(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+ const int width, const int height, void* const prediction,
+ const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+  // Note: the 2D filter reads |height| + 1 rows. Because this function is
+  // only used for the u/v planes of intra block copy, the extra row is
+  // guaranteed to be within the prediction block.
+
+ if (width == 128) {
+ IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 8) {
+ IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 4) {
+ __m128i left = _mm_cvtepu8_epi16(Load4(src));
+ __m128i right = _mm_cvtepu8_epi16(Load4(src + 1));
+ src += reference_stride;
+
+ __m128i row = _mm_add_epi16(left, right);
+
+ int y = height;
+ do {
+ left = Load4(src);
+ right = Load4(src + 1);
+ src += reference_stride;
+ left = _mm_unpacklo_epi32(left, Load4(src));
+ right = _mm_unpacklo_epi32(right, Load4(src + 1));
+ src += reference_stride;
+
+ const __m128i below =
+ _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
+ const __m128i result =
+ AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below);
+
+ Store4(dest, result);
+ dest += pred_stride;
+ Store4(dest, _mm_srli_si128(result, 4));
+ dest += pred_stride;
+
+ row = _mm_srli_si128(below, 8);
+ y -= 2;
+ } while (y != 0);
+ } else {
+ __m128i left = Load2(src);
+ __m128i right = Load2(src + 1);
+ src += reference_stride;
+
+ __m128i row =
+ _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
+
+ int y = height;
+ do {
+ left = Load2<0>(src, left);
+ right = Load2<0>(src + 1, right);
+ src += reference_stride;
+ left = Load2<2>(src, left);
+ right = Load2<2>(src + 1, right);
+ src += reference_stride;
+
+ const __m128i below =
+ _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
+ const __m128i result =
+ AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below);
+
+ Store2(dest, result);
+ dest += pred_stride;
+ Store2(dest, _mm_srli_si128(result, 4));
+ dest += pred_stride;
+
+ row = _mm_srli_si128(below, 8);
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
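+  // The convolve table appears to be indexed as [is_intra_block_copy]
+  // [is_compound][has_vertical_filter][has_horizontal_filter]; e.g.
+  // [0][0][0][1] below is the plain horizontal filter.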
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_SSE4_1;
+ dsp->convolve[0][0][1][0] = ConvolveVertical_SSE4_1;
+ dsp->convolve[0][0][1][1] = Convolve2D_SSE4_1;
+
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4;
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_SSE4_1;
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1;
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1;
+
+ dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_SSE4_1;
+ dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_SSE4_1;
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_SSE4_1;
+
+ dsp->convolve_scale[0] = ConvolveScale2D_SSE4_1<false>;
+ dsp->convolve_scale[1] = ConvolveScale2D_SSE4_1<true>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void ConvolveInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/convolve_sse4.h b/src/dsp/x86/convolve_sse4.h
new file mode 100644
index 0000000..d6c3155
--- /dev/null
+++ b/src/dsp/x86/convolve_sse4.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve; see the defines below for specifics. This
+// function is not thread-safe.
+void ConvolveInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical
+#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Convolve2D
+#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundCopy
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundCopy LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompound2D
+#define LIBGAV1_Dsp8bpp_ConvolveCompound2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveScale2D
+#define LIBGAV1_Dsp8bpp_ConvolveScale2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
diff --git a/src/dsp/x86/distance_weighted_blend_sse4.cc b/src/dsp/x86/distance_weighted_blend_sse4.cc
new file mode 100644
index 0000000..deb57ef
--- /dev/null
+++ b/src/dsp/x86/distance_weighted_blend_sse4.cc
@@ -0,0 +1,230 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/distance_weighted_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <xmmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>  // memcpy
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kInterPostRoundBit = 4;
+
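+// |weights| holds interleaved (weight_0, weight_1) pairs, so _mm_madd_epi16
+// yields weight_0 * pred_0 + weight_1 * pred_1 in each 32-bit lane. The extra
+// shift by 4 divides by the weight pair sum, which is 16 for AV1 distance
+// weighted blending.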
+inline __m128i ComputeWeightedAverage8(const __m128i& pred0,
+ const __m128i& pred1,
+ const __m128i& weights) {
+ // TODO(https://issuetracker.google.com/issues/150325685): Investigate range.
+ const __m128i preds_lo = _mm_unpacklo_epi16(pred0, pred1);
+ const __m128i mult_lo = _mm_madd_epi16(preds_lo, weights);
+ const __m128i result_lo =
+ RightShiftWithRounding_S32(mult_lo, kInterPostRoundBit + 4);
+
+ const __m128i preds_hi = _mm_unpackhi_epi16(pred0, pred1);
+ const __m128i mult_hi = _mm_madd_epi16(preds_hi, weights);
+ const __m128i result_hi =
+ RightShiftWithRounding_S32(mult_hi, kInterPostRoundBit + 4);
+
+ return _mm_packs_epi32(result_lo, result_hi);
+}
+
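+// Handles four 4-pixel rows per loop iteration: rows 0-1 and rows 2-3 are
+// blended in separate 8-lane registers, then packed so that each 32-bit lane
+// of |result_pixels| holds one output row.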
+template <int height>
+inline void DistanceWeightedBlend4xH_SSE4_1(
+ const int16_t* pred_0, const int16_t* pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+
+ for (int y = 0; y < height; y += 4) {
+ // TODO(b/150326556): Use larger loads.
+ const __m128i src_00 = LoadLo8(pred_0);
+ const __m128i src_10 = LoadLo8(pred_1);
+ pred_0 += 4;
+ pred_1 += 4;
+ __m128i src_0 = LoadHi8(src_00, pred_0);
+ __m128i src_1 = LoadHi8(src_10, pred_1);
+ pred_0 += 4;
+ pred_1 += 4;
+ const __m128i res0 = ComputeWeightedAverage8(src_0, src_1, weights);
+
+ const __m128i src_01 = LoadLo8(pred_0);
+ const __m128i src_11 = LoadLo8(pred_1);
+ pred_0 += 4;
+ pred_1 += 4;
+ src_0 = LoadHi8(src_01, pred_0);
+ src_1 = LoadHi8(src_11, pred_1);
+ pred_0 += 4;
+ pred_1 += 4;
+ const __m128i res1 = ComputeWeightedAverage8(src_0, src_1, weights);
+
+ const __m128i result_pixels = _mm_packus_epi16(res0, res1);
+ Store4(dst, result_pixels);
+ dst += dest_stride;
+ const int result_1 = _mm_extract_epi32(result_pixels, 1);
+ memcpy(dst, &result_1, sizeof(result_1));
+ dst += dest_stride;
+ const int result_2 = _mm_extract_epi32(result_pixels, 2);
+ memcpy(dst, &result_2, sizeof(result_2));
+ dst += dest_stride;
+ const int result_3 = _mm_extract_epi32(result_pixels, 3);
+ memcpy(dst, &result_3, sizeof(result_3));
+ dst += dest_stride;
+ }
+}
+
+template <int height>
+inline void DistanceWeightedBlend8xH_SSE4_1(
+ const int16_t* pred_0, const int16_t* pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+
+ for (int y = 0; y < height; y += 2) {
+ const __m128i src_00 = LoadAligned16(pred_0);
+ const __m128i src_10 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res0 = ComputeWeightedAverage8(src_00, src_10, weights);
+
+ const __m128i src_01 = LoadAligned16(pred_0);
+ const __m128i src_11 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res1 = ComputeWeightedAverage8(src_01, src_11, weights);
+
+ const __m128i result_pixels = _mm_packus_epi16(res0, res1);
+ StoreLo8(dst, result_pixels);
+ dst += dest_stride;
+ StoreHi8(dst, result_pixels);
+ dst += dest_stride;
+ }
+}
+
+inline void DistanceWeightedBlendLarge_SSE4_1(
+ const int16_t* pred_0, const int16_t* pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, const int width, const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const __m128i src_0_lo = LoadAligned16(pred_0 + x);
+ const __m128i src_1_lo = LoadAligned16(pred_1 + x);
+ const __m128i res_lo =
+ ComputeWeightedAverage8(src_0_lo, src_1_lo, weights);
+
+ const __m128i src_0_hi = LoadAligned16(pred_0 + x + 8);
+ const __m128i src_1_hi = LoadAligned16(pred_1 + x + 8);
+ const __m128i res_hi =
+ ComputeWeightedAverage8(src_0_hi, src_1_hi, weights);
+
+ StoreUnaligned16(dst + x, _mm_packus_epi16(res_lo, res_hi));
+ x += 16;
+ } while (x < width);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+}
+
+void DistanceWeightedBlend_SSE4_1(const void* prediction_0,
+ const void* prediction_1,
+ const uint8_t weight_0,
+ const uint8_t weight_1, const int width,
+ const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ if (width == 4) {
+ if (height == 4) {
+ DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
+ dest, dest_stride);
+ } else if (height == 8) {
+ DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
+ dest, dest_stride);
+ } else {
+ assert(height == 16);
+ DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
+ dest, dest_stride);
+ }
+ return;
+ }
+
+ if (width == 8) {
+ switch (height) {
+ case 4:
+ DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
+ dest, dest_stride);
+ return;
+ case 8:
+ DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
+ dest, dest_stride);
+ return;
+ case 16:
+ DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
+ dest, dest_stride);
+ return;
+ default:
+ assert(height == 32);
+ DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight_0, weight_1,
+ dest, dest_stride);
+ return;
+ }
+ }
+
+ DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight_0, weight_1, width,
+ height, dest, dest_stride);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(DistanceWeightedBlend)
+ dsp->distance_weighted_blend = DistanceWeightedBlend_SSE4_1;
+#endif
+}
+
+} // namespace
+
+void DistanceWeightedBlendInit_SSE4_1() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void DistanceWeightedBlendInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/distance_weighted_blend_sse4.h b/src/dsp/x86/distance_weighted_blend_sse4.h
new file mode 100644
index 0000000..8646eca
--- /dev/null
+++ b/src/dsp/x86/distance_weighted_blend_sse4.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::distance_weighted_blend. This function is not thread-safe.
+void DistanceWeightedBlendInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_DistanceWeightedBlend
+#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_
diff --git a/src/dsp/x86/intra_edge_sse4.cc b/src/dsp/x86/intra_edge_sse4.cc
new file mode 100644
index 0000000..4a8658d
--- /dev/null
+++ b/src/dsp/x86/intra_edge_sse4.cc
@@ -0,0 +1,270 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <xmmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring> // memcpy
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kKernelTaps = 5;
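+// Each kernel's taps sum to 16, so filter outputs are normalized with a
+// rounded right shift by 4.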
+constexpr int kKernels[3][kKernelTaps] = {
+ {0, 4, 8, 4, 0}, {0, 5, 6, 5, 0}, {2, 4, 4, 4, 2}};
+constexpr int kMaxEdgeBufferSize = 129;
+
+// This function applies the kernel [0, 4, 8, 4, 0] to 12 values.
+// Assumes |edge| has 16 packed byte values. Produces 12 filter outputs to
+// write as overlapping sets of 8 bytes.
+inline void ComputeKernel1Store12(uint8_t* dest, const uint8_t* source) {
+ const __m128i edge_lo = LoadUnaligned16(source);
+ const __m128i edge_hi = _mm_srli_si128(edge_lo, 6);
+ // Samples matched with the '4' tap, expanded to 16-bit.
+ const __m128i outers_lo = _mm_cvtepu8_epi16(edge_lo);
+ const __m128i outers_hi = _mm_cvtepu8_epi16(edge_hi);
+ // Samples matched with the '8' tap, expanded to 16-bit.
+ const __m128i centers_lo = _mm_srli_si128(outers_lo, 2);
+ const __m128i centers_hi = _mm_srli_si128(outers_hi, 2);
+
+ // Apply the taps by shifting.
+ const __m128i outers4_lo = _mm_slli_epi16(outers_lo, 2);
+ const __m128i outers4_hi = _mm_slli_epi16(outers_hi, 2);
+ const __m128i centers8_lo = _mm_slli_epi16(centers_lo, 3);
+ const __m128i centers8_hi = _mm_slli_epi16(centers_hi, 3);
+ // Move latter 4x values down to add with first 4x values for each output.
+ const __m128i partial_sums_lo =
+ _mm_add_epi16(outers4_lo, _mm_srli_si128(outers4_lo, 4));
+ const __m128i partial_sums_hi =
+ _mm_add_epi16(outers4_hi, _mm_srli_si128(outers4_hi, 4));
+ // Move 6x values down to add for the final kernel sum for each output.
+ const __m128i sums_lo = RightShiftWithRounding_U16(
+ _mm_add_epi16(partial_sums_lo, centers8_lo), 4);
+ const __m128i sums_hi = RightShiftWithRounding_U16(
+ _mm_add_epi16(partial_sums_hi, centers8_hi), 4);
+
+ const __m128i result_lo = _mm_packus_epi16(sums_lo, sums_lo);
+ const __m128i result_hi = _mm_packus_epi16(sums_hi, sums_hi);
+ const __m128i result =
+ _mm_alignr_epi8(result_hi, _mm_slli_si128(result_lo, 10), 10);
+ StoreUnaligned16(dest, result);
+}
+
+// This function applies the kernel [0, 5, 6, 5, 0] to 12 values.
+// Assumes 16 packed byte values can be loaded from |edge|; outputs depending
+// on the trailing invalid values are overwritten or safely discarded.
+inline void ComputeKernel2Store12(uint8_t* dest, const uint8_t* source) {
+ const __m128i edge_lo = LoadUnaligned16(source);
+ const __m128i edge_hi = _mm_srli_si128(edge_lo, 6);
+ const __m128i outers_lo = _mm_cvtepu8_epi16(edge_lo);
+ const __m128i centers_lo = _mm_srli_si128(outers_lo, 2);
+ const __m128i outers_hi = _mm_cvtepu8_epi16(edge_hi);
+ const __m128i centers_hi = _mm_srli_si128(outers_hi, 2);
+ // Samples matched with the '5' tap, expanded to 16-bit. Add x + 4x.
+ const __m128i outers5_lo =
+ _mm_add_epi16(outers_lo, _mm_slli_epi16(outers_lo, 2));
+ const __m128i outers5_hi =
+ _mm_add_epi16(outers_hi, _mm_slli_epi16(outers_hi, 2));
+ // Samples matched with the '6' tap, expanded to 16-bit. Add 2x + 4x.
+ const __m128i centers6_lo = _mm_add_epi16(_mm_slli_epi16(centers_lo, 1),
+ _mm_slli_epi16(centers_lo, 2));
+ const __m128i centers6_hi = _mm_add_epi16(_mm_slli_epi16(centers_hi, 1),
+ _mm_slli_epi16(centers_hi, 2));
+ // Move latter 5x values down to add with first 5x values for each output.
+ const __m128i partial_sums_lo =
+ _mm_add_epi16(outers5_lo, _mm_srli_si128(outers5_lo, 4));
+ // Move 6x values down to add for the final kernel sum for each output.
+ const __m128i sums_lo = RightShiftWithRounding_U16(
+ _mm_add_epi16(centers6_lo, partial_sums_lo), 4);
+ // Shift latter 5x values to add with first 5x values for each output.
+ const __m128i partial_sums_hi =
+ _mm_add_epi16(outers5_hi, _mm_srli_si128(outers5_hi, 4));
+ // Move 6x values down to add for the final kernel sum for each output.
+ const __m128i sums_hi = RightShiftWithRounding_U16(
+ _mm_add_epi16(centers6_hi, partial_sums_hi), 4);
+ // First 6 values are valid outputs.
+ const __m128i result_lo = _mm_packus_epi16(sums_lo, sums_lo);
+ const __m128i result_hi = _mm_packus_epi16(sums_hi, sums_hi);
+ const __m128i result =
+ _mm_alignr_epi8(result_hi, _mm_slli_si128(result_lo, 10), 10);
+ StoreUnaligned16(dest, result);
+}
+
+// This function applies the kernel [2, 4, 4, 4, 2] to 8 values.
+inline void ComputeKernel3Store8(uint8_t* dest, const uint8_t* source) {
+ const __m128i edge_lo = LoadUnaligned16(source);
+ const __m128i edge_hi = _mm_srli_si128(edge_lo, 4);
+ // Finish |edge_lo| life cycle quickly.
+ // Multiply for 2x.
+ const __m128i source2_lo = _mm_slli_epi16(_mm_cvtepu8_epi16(edge_lo), 1);
+ // Multiply 2x by 2 and align.
+ const __m128i source4_lo = _mm_srli_si128(_mm_slli_epi16(source2_lo, 1), 2);
+ // Finish |source2| life cycle quickly.
+ // Move latter 2x values down to add with first 2x values for each output.
+ __m128i sum = _mm_add_epi16(source2_lo, _mm_srli_si128(source2_lo, 8));
+ // First 4x values already aligned to add with running total.
+ sum = _mm_add_epi16(sum, source4_lo);
+ // Move second 4x values down to add with running total.
+ sum = _mm_add_epi16(sum, _mm_srli_si128(source4_lo, 2));
+ // Move third 4x values down to add with running total.
+ sum = _mm_add_epi16(sum, _mm_srli_si128(source4_lo, 4));
+ // Multiply for 2x.
+ const __m128i source2_hi = _mm_slli_epi16(_mm_cvtepu8_epi16(edge_hi), 1);
+ // Multiply 2x by 2 and align.
+ const __m128i source4_hi = _mm_srli_si128(_mm_slli_epi16(source2_hi, 1), 2);
+ // Move latter 2x values down to add with first 2x values for each output.
+ __m128i sum_hi = _mm_add_epi16(source2_hi, _mm_srli_si128(source2_hi, 8));
+ // First 4x values already aligned to add with running total.
+ sum_hi = _mm_add_epi16(sum_hi, source4_hi);
+ // Move second 4x values down to add with running total.
+ sum_hi = _mm_add_epi16(sum_hi, _mm_srli_si128(source4_hi, 2));
+ // Move third 4x values down to add with running total.
+ sum_hi = _mm_add_epi16(sum_hi, _mm_srli_si128(source4_hi, 4));
+
+ // Because we have only 8 values here, it is safe to align before packing down
+ // to 8-bit without losing data.
+ sum = _mm_alignr_epi8(sum_hi, _mm_slli_si128(sum, 8), 8);
+ sum = RightShiftWithRounding_U16(sum, 4);
+ StoreLo8(dest, _mm_packus_epi16(sum, sum));
+}
+
+void IntraEdgeFilter_SSE4_1(void* buffer, int size, int strength) {
+ uint8_t edge[kMaxEdgeBufferSize + 4];
+ memcpy(edge, buffer, size);
+ auto* dst_buffer = static_cast<uint8_t*>(buffer);
+
+  // Only elements 1..|size| - 1 are filtered, so a single-element buffer
+  // needs no work.
+ if (size == 1) return;
+
+ int i = 0;
+ switch (strength) {
+ case 1:
+      // To avoid overwriting, we stop short of the total write size plus the
+ // initial offset. In this case 12 valid values are written in two blocks
+ // of 8 bytes each.
+ for (; i < size - 17; i += 12) {
+ ComputeKernel1Store12(dst_buffer + i + 1, edge + i);
+ }
+ break;
+ case 2:
+ // See the comment for case 1.
+ for (; i < size - 17; i += 12) {
+ ComputeKernel2Store12(dst_buffer + i + 1, edge + i);
+ }
+ break;
+ default:
+ assert(strength == 3);
+ // The first filter input is repeated for taps of value 2 and 4.
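+      // For output 1 the taps cover edge[-1..3]; clamping edge[-1] to edge[0]
+      // merges the 2 and 4 taps into the 6 * edge[0] term below.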
+ dst_buffer[1] = RightShiftWithRounding(
+ (6 * edge[0] + 4 * edge[1] + 4 * edge[2] + 2 * edge[3]), 4);
+ // In this case, one block of 8 bytes is written in each iteration, with
+ // an offset of 2.
+ for (; i < size - 10; i += 8) {
+ ComputeKernel3Store8(dst_buffer + i + 2, edge + i);
+ }
+ }
+ const int kernel_index = strength - 1;
+ for (int final_index = Clip3(i, 1, size - 2); final_index < size;
+ ++final_index) {
+ int sum = 0;
+ for (int j = 0; j < kKernelTaps; ++j) {
+ const int k = Clip3(final_index + j - 2, 0, size - 1);
+ sum += kKernels[kernel_index][j] * edge[k];
+ }
+ dst_buffer[final_index] = RightShiftWithRounding(sum, 4);
+ }
+}
+
+constexpr int kMaxUpsampleSize = 16;
+
+// Applies the upsampling kernel [-1, 9, 9, -1] to produce a new half-sample
+// pixel between each pair of original values and interleaves the results with
+// the originals. This implementation assumes it is safe to write the maximum
+// number of upsampled pixels (32) to the edge buffer, even when |size| is
+// small.
+void IntraEdgeUpsampler_SSE4_1(void* buffer, int size) {
+ assert(size % 4 == 0 && size <= kMaxUpsampleSize);
+ auto* const pixel_buffer = static_cast<uint8_t*>(buffer);
+ uint8_t temp[kMaxUpsampleSize + 8];
+ temp[0] = temp[1] = pixel_buffer[-1];
+ memcpy(temp + 2, pixel_buffer, sizeof(temp[0]) * size);
+ temp[size + 2] = pixel_buffer[size - 1];
+
+ pixel_buffer[-2] = temp[0];
+ const __m128i data = LoadUnaligned16(temp);
+ const __m128i src_lo = _mm_cvtepu8_epi16(data);
+ const __m128i src_hi = _mm_unpackhi_epi8(data, _mm_setzero_si128());
+ const __m128i src9_hi = _mm_add_epi16(src_hi, _mm_slli_epi16(src_hi, 3));
+ const __m128i src9_lo = _mm_add_epi16(src_lo, _mm_slli_epi16(src_lo, 3));
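+  // With src9 = 9 * src, the alignr/add/sub sequence below forms
+  // 9 * b + 9 * c - a - d for each output, i.e. the [-1, 9, 9, -1] kernel,
+  // before the rounded shift by 4.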
+ __m128i sum_lo = _mm_sub_epi16(_mm_alignr_epi8(src9_hi, src9_lo, 2), src_lo);
+ sum_lo = _mm_add_epi16(sum_lo, _mm_alignr_epi8(src9_hi, src9_lo, 4));
+ sum_lo = _mm_sub_epi16(sum_lo, _mm_alignr_epi8(src_hi, src_lo, 6));
+ sum_lo = RightShiftWithRounding_S16(sum_lo, 4);
+ const __m128i result_lo = _mm_unpacklo_epi8(_mm_packus_epi16(sum_lo, sum_lo),
+ _mm_srli_si128(data, 2));
+ StoreUnaligned16(pixel_buffer - 1, result_lo);
+ if (size > 8) {
+ const __m128i src_hi_extra = _mm_cvtepu8_epi16(LoadLo8(temp + 16));
+ const __m128i src9_hi_extra =
+ _mm_add_epi16(src_hi_extra, _mm_slli_epi16(src_hi_extra, 3));
+ __m128i sum_hi =
+ _mm_sub_epi16(_mm_alignr_epi8(src9_hi_extra, src9_hi, 2), src_hi);
+ sum_hi = _mm_add_epi16(sum_hi, _mm_alignr_epi8(src9_hi_extra, src9_hi, 4));
+ sum_hi = _mm_sub_epi16(sum_hi, _mm_alignr_epi8(src_hi_extra, src_hi, 6));
+ sum_hi = RightShiftWithRounding_S16(sum_hi, 4);
+ const __m128i result_hi =
+ _mm_unpacklo_epi8(_mm_packus_epi16(sum_hi, sum_hi), LoadLo8(temp + 10));
+ StoreUnaligned16(pixel_buffer + 15, result_hi);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(IntraEdgeFilter)
+ dsp->intra_edge_filter = IntraEdgeFilter_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(IntraEdgeUpsampler)
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_SSE4_1;
+#endif
+}
+
+} // namespace
+
+void IntraEdgeInit_SSE4_1() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraEdgeInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intra_edge_sse4.h b/src/dsp/x86/intra_edge_sse4.h
new file mode 100644
index 0000000..6ed4d40
--- /dev/null
+++ b/src/dsp/x86/intra_edge_sse4.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_edge_filter and Dsp::intra_edge_upsampler. This
+// function is not thread-safe.
+void IntraEdgeInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeFilter
+#define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeUpsampler
+#define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_
diff --git a/src/dsp/x86/intrapred_cfl_sse4.cc b/src/dsp/x86/intrapred_cfl_sse4.cc
new file mode 100644
index 0000000..fac1556
--- /dev/null
+++ b/src/dsp/x86/intrapred_cfl_sse4.cc
@@ -0,0 +1,976 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>  // memcpy
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor_SSE4_1
+
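+// _mm_mulhrs_epi16(x, y) computes (x * y + (1 << 14)) >> 15, so with
+// |alpha_q12| == |alpha| << 9 the product below is (|ac_q3| * |alpha|) >> 6
+// with rounding; the combined sign of alpha and the AC value is then applied.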
+inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
+ __m128i alpha_sign, __m128i dc_q0) {
+ __m128i ac_q3 = LoadUnaligned16(input);
+ __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
+ __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
+ scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
+ return _mm_add_epi16(scaled_luma_q0, dc_q0);
+}
+
+template <int width, int height>
+void CflIntraPredictor_SSE4_1(
+ void* const dest, ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i alpha_sign = _mm_set1_epi16(alpha);
+ const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+ auto* row = reinterpret_cast<const __m128i*>(luma);
+ const int kCflLumaBufferStrideLog2_16i = 5;
+ const int kCflLumaBufferStrideLog2_128i = kCflLumaBufferStrideLog2_16i - 3;
+ const __m128i* row_end = row + (height << kCflLumaBufferStrideLog2_128i);
+ const __m128i dc_val = _mm_set1_epi16(dst[0]);
+ do {
+ __m128i res = CflPredictUnclipped(row, alpha_q12, alpha_sign, dc_val);
+ if (width < 16) {
+ res = _mm_packus_epi16(res, res);
+ if (width == 4) {
+ Store4(dst, res);
+ } else {
+ StoreLo8(dst, res);
+ }
+ } else {
+ __m128i next =
+ CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
+ res = _mm_packus_epi16(res, next);
+ StoreUnaligned16(dst, res);
+ if (width == 32) {
+ res = CflPredictUnclipped(row + 2, alpha_q12, alpha_sign, dc_val);
+ next = CflPredictUnclipped(row + 3, alpha_q12, alpha_sign, dc_val);
+ res = _mm_packus_epi16(res, next);
+ StoreUnaligned16(dst + 16, res);
+ }
+ }
+ dst += stride;
+ } while ((row += (1 << kCflLumaBufferStrideLog2_128i)) < row_end);
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 4, "");
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const auto* src = static_cast<const uint8_t*>(source);
+ __m128i sum = _mm_setzero_si128();
+ int16_t* luma_ptr = luma[0];
+ const __m128i zero = _mm_setzero_si128();
+ __m128i samples;
+ int y = 0;
+ do {
+ samples = Load4(src);
+ src += stride;
+ int src_bytes;
+ memcpy(&src_bytes, src, 4);
+ samples = _mm_insert_epi32(samples, src_bytes, 1);
+ src += stride;
+ samples = _mm_slli_epi16(_mm_cvtepu8_epi16(samples), 3);
+ StoreLo8(luma_ptr, samples);
+ luma_ptr += kCflLumaBufferStride;
+ StoreHi8(luma_ptr, samples);
+ luma_ptr += kCflLumaBufferStride;
+
+    // The maximum value here is 2**bd * H * 2**shift. Since the maximum H for
+    // 4xH is 16 = 2**4, we have 2**(8 + 4 + 3) = 2**15, which fits in 16 bits.
+ sum = _mm_add_epi16(sum, samples);
+ y += 2;
+ } while (y < visible_height);
+
+ if (!is_inside) {
+ int y = visible_height;
+ do {
+ StoreHi8(luma_ptr, samples);
+ luma_ptr += kCflLumaBufferStride;
+ sum = _mm_add_epi16(sum, samples);
+ ++y;
+ } while (y < block_height);
+ }
+
+ __m128i sum_tmp = _mm_unpackhi_epi16(sum, zero);
+ sum = _mm_cvtepu16_epi32(sum);
+ sum = _mm_add_epi32(sum, sum_tmp);
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+ __m128i averages = RightShiftWithRounding_U32(
+ sum, block_height_log2 + 2 /* log2 of width 4 */);
+ averages = _mm_shufflelo_epi16(averages, 0);
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ const __m128i samples = LoadLo8(luma_ptr);
+ StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 4, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 4;
+
+ if (block_height <= max_luma_height && block_width <= max_luma_width) {
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+template <int block_height_log2, bool inside>
+void CflSubsampler444_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 5, "");
+ const int block_height = 1 << block_height_log2, block_width = 8;
+ const int visible_height = max_luma_height;
+ const int invisible_width = inside ? 0 : block_width - max_luma_width;
+ const int visible_width = max_luma_width;
+ const __m128i blend_mask =
+ inside ? _mm_setzero_si128() : MaskHighNBytes(8 + invisible_width);
+ const __m128i dup16 = _mm_set1_epi32(0x01000100);
+ const auto* src = static_cast<const uint8_t*>(source);
+ int16_t* luma_ptr = luma[0];
+ const __m128i zero = _mm_setzero_si128();
+  // Since the maximum height is 32, splitting the rows by parity means each
+  // accumulator only sums 16 rows. As in the 4xH case, the sums therefore fit
+  // in 16 bits without widening to 32 bits.
+ __m128i sum_even = _mm_setzero_si128(), sum_odd = _mm_setzero_si128();
+ __m128i sum;
+ __m128i samples1;
+
+ int y = 0;
+ do {
+ __m128i samples0 = LoadLo8(src);
+ if (!inside) {
+ const __m128i border0 =
+ _mm_set1_epi8(static_cast<int8_t>(src[visible_width - 1]));
+ samples0 = _mm_blendv_epi8(samples0, border0, blend_mask);
+ }
+ src += stride;
+ samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples0), 3);
+ StoreUnaligned16(luma_ptr, samples0);
+ luma_ptr += kCflLumaBufferStride;
+
+ sum_even = _mm_add_epi16(sum_even, samples0);
+
+ samples1 = LoadLo8(src);
+ if (!inside) {
+ const __m128i border1 =
+ _mm_set1_epi8(static_cast<int8_t>(src[visible_width - 1]));
+ samples1 = _mm_blendv_epi8(samples1, border1, blend_mask);
+ }
+ src += stride;
+ samples1 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples1), 3);
+ StoreUnaligned16(luma_ptr, samples1);
+ luma_ptr += kCflLumaBufferStride;
+
+ sum_odd = _mm_add_epi16(sum_odd, samples1);
+ y += 2;
+ } while (y < visible_height);
+
+ if (!inside) {
+ for (int y = visible_height; y < block_height; y += 2) {
+ sum_even = _mm_add_epi16(sum_even, samples1);
+ StoreUnaligned16(luma_ptr, samples1);
+ luma_ptr += kCflLumaBufferStride;
+
+ sum_odd = _mm_add_epi16(sum_odd, samples1);
+ StoreUnaligned16(luma_ptr, samples1);
+ luma_ptr += kCflLumaBufferStride;
+ }
+ }
+
+ sum = _mm_add_epi32(_mm_unpackhi_epi16(sum_even, zero),
+ _mm_cvtepu16_epi32(sum_even));
+ sum = _mm_add_epi32(sum, _mm_unpackhi_epi16(sum_odd, zero));
+ sum = _mm_add_epi32(sum, _mm_cvtepu16_epi32(sum_odd));
+
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+ __m128i averages = RightShiftWithRounding_U32(
+ sum, block_height_log2 + 3 /* log2 of width 8 */);
+ averages = _mm_shuffle_epi8(averages, dup16);
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ const __m128i samples = LoadUnaligned16(luma_ptr);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 8;
+
+ const int horz_inside = block_width <= max_luma_width;
+ const int vert_inside = block_height <= max_luma_height;
+ if (horz_inside && vert_inside) {
+ CflSubsampler444_8xH_SSE4_1<block_height_log2, true>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler444_8xH_SSE4_1<block_height_log2, false>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+// This function will only work for block_width 16 and 32.
+template <int block_width_log2, int block_height_log2, bool inside>
+void CflSubsampler444_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_width_log2 == 4 || block_width_log2 == 5, "");
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 1 << block_width_log2;
+
+ const int visible_height = max_luma_height;
+ const int visible_width_16 = inside ? 16 : std::min(16, max_luma_width);
+ const int invisible_width_16 = 16 - visible_width_16;
+ const __m128i blend_mask_16 = MaskHighNBytes(invisible_width_16);
+ const int visible_width_32 = inside ? 32 : max_luma_width;
+ const int invisible_width_32 = 32 - visible_width_32;
+ const __m128i blend_mask_32 =
+ MaskHighNBytes(std::min(16, invisible_width_32));
+
+ const __m128i dup16 = _mm_set1_epi32(0x01000100);
+ const __m128i zero = _mm_setzero_si128();
+ const auto* src = static_cast<const uint8_t*>(source);
+ int16_t* luma_ptr = luma[0];
+ __m128i sum = _mm_setzero_si128();
+
+ __m128i samples0, samples1;
+ __m128i samples2, samples3;
+ __m128i inner_sum_lo, inner_sum_hi;
+ int y = 0;
+ do {
+#if LIBGAV1_MSAN // We can load uninitialized values here. Even though they are
+ // then masked off by blendv, MSAN isn't smart enough to
+ // understand that. So we switch to a C implementation here.
+ uint16_t c_arr[16];
+ for (int x = 0; x < 16; x++) {
+ const int x_index = std::min(x, visible_width_16 - 1);
+ c_arr[x] = src[x_index] << 3;
+ }
+ samples0 = LoadUnaligned16(c_arr);
+ samples1 = LoadUnaligned16(c_arr + 8);
+ static_cast<void>(blend_mask_16);
+#else
+ __m128i samples01 = LoadUnaligned16(src);
+
+ if (!inside) {
+ const __m128i border16 =
+ _mm_set1_epi8(static_cast<int8_t>(src[visible_width_16 - 1]));
+ samples01 = _mm_blendv_epi8(samples01, border16, blend_mask_16);
+ }
+ samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples01), 3);
+ samples1 = _mm_slli_epi16(_mm_unpackhi_epi8(samples01, zero), 3);
+#endif // LIBGAV1_MSAN
+
+ StoreUnaligned16(luma_ptr, samples0);
+ StoreUnaligned16(luma_ptr + 8, samples1);
+ __m128i inner_sum = _mm_add_epi16(samples0, samples1);
+
+ if (block_width == 32) {
+#if LIBGAV1_MSAN // We can load uninitialized values here. Even though they are
+ // then masked off by blendv, MSAN isn't smart enough to
+ // understand that. So we switch to a C implementation here.
+ uint16_t c_arr[16];
+ for (int x = 16; x < 32; x++) {
+ const int x_index = std::min(x, visible_width_32 - 1);
+ c_arr[x - 16] = src[x_index] << 3;
+ }
+ samples2 = LoadUnaligned16(c_arr);
+ samples3 = LoadUnaligned16(c_arr + 8);
+ static_cast<void>(blend_mask_32);
+#else
+ __m128i samples23 = LoadUnaligned16(src + 16);
+ if (!inside) {
+ const __m128i border32 =
+ _mm_set1_epi8(static_cast<int8_t>(src[visible_width_32 - 1]));
+ samples23 = _mm_blendv_epi8(samples23, border32, blend_mask_32);
+ }
+ samples2 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples23), 3);
+ samples3 = _mm_slli_epi16(_mm_unpackhi_epi8(samples23, zero), 3);
+#endif // LIBGAV1_MSAN
+
+ StoreUnaligned16(luma_ptr + 16, samples2);
+ StoreUnaligned16(luma_ptr + 24, samples3);
+ inner_sum = _mm_add_epi16(samples2, inner_sum);
+ inner_sum = _mm_add_epi16(samples3, inner_sum);
+ }
+
+ inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
+ inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
+ sum = _mm_add_epi32(sum, inner_sum_lo);
+ sum = _mm_add_epi32(sum, inner_sum_hi);
+ luma_ptr += kCflLumaBufferStride;
+ src += stride;
+ } while (++y < visible_height);
+
+ if (!inside) {
+ for (int y = visible_height; y < block_height;
+ luma_ptr += kCflLumaBufferStride, ++y) {
+ sum = _mm_add_epi32(sum, inner_sum_lo);
+ StoreUnaligned16(luma_ptr, samples0);
+ sum = _mm_add_epi32(sum, inner_sum_hi);
+ StoreUnaligned16(luma_ptr + 8, samples1);
+ if (block_width == 32) {
+ StoreUnaligned16(luma_ptr + 16, samples2);
+ StoreUnaligned16(luma_ptr + 24, samples3);
+ }
+ }
+ }
+
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+ __m128i averages =
+ RightShiftWithRounding_U32(sum, block_width_log2 + block_height_log2);
+ averages = _mm_shuffle_epi8(averages, dup16);
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ for (int x = 0; x < block_width; x += 8) {
+ __m128i samples = LoadUnaligned16(&luma_ptr[x]);
+ StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples, averages));
+ }
+ }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler444_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_width_log2 == 4 || block_width_log2 == 5, "");
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 1 << block_width_log2;
+ const int horz_inside = block_width <= max_luma_width;
+ const int vert_inside = block_height <= max_luma_height;
+ if (horz_inside && vert_inside) {
+ CflSubsampler444_SSE4_1<block_width_log2, block_height_log2, true>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler444_SSE4_1<block_width_log2, block_height_log2, false>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+// Takes in two sums of input row pairs, and completes the computation for two
+// output rows.
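+// The shift by 1 converts each sum of four luma samples to the Q3 (x8) scale
+// used by the CfL buffer: 2 * sum == 8 * average.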
+inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0,
+ const __m128i vertical_sum1,
+ int16_t* luma_ptr) {
+ __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
+ result = _mm_slli_epi16(result, 1);
+ StoreLo8(luma_ptr, result);
+ StoreHi8(luma_ptr + kCflLumaBufferStride, result);
+ return result;
+}
+
+// Takes two halves of a vertically added pair of rows and completes the
+// computation for one output row.
+inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0,
+ const __m128i vertical_sum1,
+ int16_t* luma_ptr) {
+ __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
+ result = _mm_slli_epi16(result, 1);
+ StoreUnaligned16(luma_ptr, result);
+ return result;
+}
+
+template <int block_height_log2>
+void CflSubsampler420_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint8_t*>(source);
+ int16_t* luma_ptr = luma[0];
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = 0;
+ do {
+ // Note that with 4:2:0 subsampling the luma row is twice the output width
+ // (8 samples); converting them to 16 bits fills the vector.
+ const __m128i samples_row0 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i samples_row1 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i luma_sum01 = _mm_add_epi16(samples_row0, samples_row1);
+
+ const __m128i samples_row2 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i samples_row3 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i luma_sum23 = _mm_add_epi16(samples_row2, samples_row3);
+ __m128i sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ const __m128i samples_row4 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i samples_row5 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i luma_sum45 = _mm_add_epi16(samples_row4, samples_row5);
+
+ const __m128i samples_row6 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i samples_row7 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i luma_sum67 = _mm_add_epi16(samples_row6, samples_row7);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+ y += 4;
+ } while (y < luma_height);
+ const __m128i final_fill = LoadLo8(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill_to_sum = _mm_cvtepu16_epi32(final_fill);
+ for (; y < block_height; ++y) {
+ StoreLo8(luma_ptr, final_fill);
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ }
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_U32(
+ final_sum, block_height_log2 + 2 /*log2 of width 4*/);
+
+ averages = _mm_shufflelo_epi16(averages, 0);
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ const __m128i samples = LoadLo8(luma_ptr);
+ StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+ }
+}
+
+// This duplicates the last two 16-bit values in |row|.
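+// Shuffling with 0xFF broadcasts dword 3, which holds samples 6 and 7.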
+inline __m128i LastRowSamples(const __m128i row) {
+ return _mm_shuffle_epi32(row, 0xFF);
+}
+
+// This duplicates the last 16-bit value in |row|.
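+// _mm_shufflehi_epi16 with 0xFF copies word 7 across the upper half; the
+// dword shuffle then broadcasts that pair across the register.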
+inline __m128i LastRowResult(const __m128i row) {
+ const __m128i dup_row = _mm_shufflehi_epi16(row, 0xFF);
+ return _mm_shuffle_epi32(dup_row, 0xFF);
+}
+
+template <int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint8_t*>(source);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ int16_t* luma_ptr = luma[0];
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = 0;
+
+ do {
+ const __m128i samples_row00 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row01 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row00);
+ src += stride;
+ const __m128i samples_row10 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row11 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row10);
+ src += stride;
+ const __m128i luma_sum00 = _mm_add_epi16(samples_row00, samples_row10);
+ const __m128i luma_sum01 = _mm_add_epi16(samples_row01, samples_row11);
+ __m128i sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row20 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row21 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row20);
+ src += stride;
+ const __m128i samples_row30 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row31 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row30);
+ src += stride;
+ const __m128i luma_sum10 = _mm_add_epi16(samples_row20, samples_row30);
+ const __m128i luma_sum11 = _mm_add_epi16(samples_row21, samples_row31);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row40 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row41 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row40);
+ src += stride;
+ const __m128i samples_row50 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row51 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row50);
+ src += stride;
+ const __m128i luma_sum20 = _mm_add_epi16(samples_row40, samples_row50);
+ const __m128i luma_sum21 = _mm_add_epi16(samples_row41, samples_row51);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row60 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row61 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row60);
+ src += stride;
+ const __m128i samples_row70 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row71 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row70);
+ src += stride;
+ const __m128i luma_sum30 = _mm_add_epi16(samples_row60, samples_row70);
+ const __m128i luma_sum31 = _mm_add_epi16(samples_row61, samples_row71);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+ y += 4;
+ } while (y < luma_height);
+ // Duplicate the final row downward to fill the rows beyond max_luma_height.
+ const __m128i final_fill = LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill_to_sum0 = _mm_cvtepi16_epi32(final_fill);
+ const __m128i final_fill_to_sum1 =
+ _mm_cvtepi16_epi32(_mm_srli_si128(final_fill, 8));
+ const __m128i final_fill_to_sum =
+ _mm_add_epi32(final_fill_to_sum0, final_fill_to_sum1);
+ for (; y < block_height; ++y) {
+ StoreUnaligned16(luma_ptr, final_fill);
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ }
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_S32(
+ final_sum, block_height_log2 + 3 /*log2 of width 8*/);
+
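+  // Broadcast the 16-bit average: shufflelo copies word 0 across the low four
+  // words, then the dword shuffle replicates dword 0 to every lane.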
+ averages = _mm_shufflelo_epi16(averages, 0);
+ averages = _mm_shuffle_epi32(averages, 0);
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ const __m128i samples = LoadUnaligned16(luma_ptr);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler420_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ if (max_luma_width == 8) {
+ CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 16>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+template <int block_width_log2, int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const auto* src = static_cast<const uint8_t*>(source);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ const int block_height = 1 << block_height_log2;
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+
+ int16_t* luma_ptr = luma[0];
+ __m128i final_row_result;
+ // Begin first y section, covering width up to 16.
+ int y = 0;
+ do {
+ const uint8_t* src_next = src + stride;
+ const __m128i samples_row0_lo = LoadUnaligned16(src);
+ const __m128i samples_row00 = _mm_cvtepu8_epi16(samples_row0_lo);
+ const __m128i samples_row01 = (max_luma_width >= 16)
+ ? _mm_unpackhi_epi8(samples_row0_lo, zero)
+ : LastRowSamples(samples_row00);
+ const __m128i samples_row0_hi = LoadUnaligned16(src + 16);
+ const __m128i samples_row02 = (max_luma_width >= 24)
+ ? _mm_cvtepu8_epi16(samples_row0_hi)
+ : LastRowSamples(samples_row01);
+ const __m128i samples_row03 = (max_luma_width == 32)
+ ? _mm_unpackhi_epi8(samples_row0_hi, zero)
+ : LastRowSamples(samples_row02);
+ const __m128i samples_row1_lo = LoadUnaligned16(src_next);
+ const __m128i samples_row10 = _mm_cvtepu8_epi16(samples_row1_lo);
+ const __m128i samples_row11 = (max_luma_width >= 16)
+ ? _mm_unpackhi_epi8(samples_row1_lo, zero)
+ : LastRowSamples(samples_row10);
+ const __m128i samples_row1_hi = LoadUnaligned16(src_next + 16);
+ const __m128i samples_row12 = (max_luma_width >= 24)
+ ? _mm_cvtepu8_epi16(samples_row1_hi)
+ : LastRowSamples(samples_row11);
+ const __m128i samples_row13 = (max_luma_width == 32)
+ ? _mm_unpackhi_epi8(samples_row1_hi, zero)
+ : LastRowSamples(samples_row12);
+ const __m128i luma_sum0 = _mm_add_epi16(samples_row00, samples_row10);
+ const __m128i luma_sum1 = _mm_add_epi16(samples_row01, samples_row11);
+ const __m128i luma_sum2 = _mm_add_epi16(samples_row02, samples_row12);
+ const __m128i luma_sum3 = _mm_add_epi16(samples_row03, samples_row13);
+ __m128i sum = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
+ final_row_result =
+ StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
+ sum = _mm_add_epi16(sum, final_row_result);
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+ src += stride << 1;
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < luma_height);
+
+ // Because max_luma_width is at most 32, any values beyond x=16 will
+ // necessarily be duplicated.
+ if (block_width_log2 == 5) {
+ const __m128i wide_fill = LastRowResult(final_row_result);
+ // Multiply duplicated value by number of occurrences, height * 4, since
+ // there are 16 in each row and the value appears in the vector 4 times.
+ final_sum = _mm_add_epi32(
+ final_sum,
+ _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), block_height_log2 + 2));
+ }
+
+ // Begin second y section.
+ if (y < block_height) {
+ const __m128i final_fill0 =
+ LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill1 =
+ LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8);
+ const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1);
+ const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum);
+ const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero);
+ const __m128i final_fill_to_sum =
+ _mm_add_epi32(final_inner_sum0, final_inner_sum1);
+
+ do {
+ StoreUnaligned16(luma_ptr, final_fill0);
+ StoreUnaligned16(luma_ptr + 8, final_fill1);
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ } while (++y < block_height);
+ } // End second y section.
+
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_S32(
+ final_sum, block_width_log2 + block_height_log2);
+ averages = _mm_shufflelo_epi16(averages, 0);
+ averages = _mm_shuffle_epi32(averages, 0);
+
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ const __m128i samples0 = LoadUnaligned16(luma_ptr);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples0, averages));
+ const __m128i samples1 = LoadUnaligned16(luma_ptr + 8);
+ final_row_result = _mm_sub_epi16(samples1, averages);
+ StoreUnaligned16(luma_ptr + 8, final_row_result);
+ }
+ if (block_width_log2 == 5) {
+ int16_t* wide_luma_ptr = luma[0] + 16;
+ const __m128i wide_fill = LastRowResult(final_row_result);
+ for (int i = 0; i < block_height;
+ ++i, wide_luma_ptr += kCflLumaBufferStride) {
+ StoreUnaligned16(wide_luma_ptr, wide_fill);
+ StoreUnaligned16(wide_luma_ptr + 8, wide_fill);
+ }
+ }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler420_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ switch (max_luma_width) {
+ case 8:
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ return;
+ case 16:
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 16>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ return;
+ case 24:
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 24>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ return;
+ default:
+ assert(max_luma_width == 32);
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 32>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ return;
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler420_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler420_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler420_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<5, 5>;
+#endif
+
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler444_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler444_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler444_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<5, 5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor_SSE4_1<4, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize4x16] =
+ CflIntraPredictor_SSE4_1<4, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor_SSE4_1<8, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor_SSE4_1<8, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x16] =
+ CflIntraPredictor_SSE4_1<8, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x32] =
+ CflIntraPredictor_SSE4_1<8, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x4] =
+ CflIntraPredictor_SSE4_1<16, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x8] =
+ CflIntraPredictor_SSE4_1<16, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor_SSE4_1<16, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor_SSE4_1<16, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize32x8] =
+ CflIntraPredictor_SSE4_1<32, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor_SSE4_1<32, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor_SSE4_1<32, 32>;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void IntraPredCflInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredCflInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intrapred_smooth_sse4.cc b/src/dsp/x86/intrapred_smooth_sse4.cc
new file mode 100644
index 0000000..e944ea3
--- /dev/null
+++ b/src/dsp/x86/intrapred_smooth_sse4.cc
@@ -0,0 +1,2662 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring> // memcpy
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Note these constants are duplicated from intrapred.cc to give the compiler
+// visibility of the values. This helps reduce loads and aids the creation of
+// the inverse weights.
+constexpr uint8_t kSmoothWeights[] = {
+ // block dimension = 4
+ 255, 149, 85, 64,
+ // block dimension = 8
+ 255, 197, 146, 105, 73, 50, 37, 32,
+ // block dimension = 16
+ 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+ // block dimension = 32
+ 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+ 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+ // block dimension = 64
+ 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+ 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
+ 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
+ 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4};
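+
+// The per-dimension tables above are stored contiguously, so the weights for
+// dimension d begin at kSmoothWeights[d - 4] (offsets 0, 4, 12, 28 and 60).
+// Inverse weights are derived as 256 - w, e.g. for block dimension 4:
+// 256 - {255, 149, 85, 64} = {1, 107, 171, 192}.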
+
+template <int y_mask>
+inline void WriteSmoothHorizontalSum4(void* const dest, const __m128i& left,
+ const __m128i& weights,
+ const __m128i& scaled_top_right,
+ const __m128i& round) {
+ const __m128i left_y = _mm_shuffle_epi32(left, y_mask);
+ const __m128i weighted_left_y = _mm_mullo_epi16(left_y, weights);
+ const __m128i pred_sum = _mm_add_epi32(scaled_top_right, weighted_left_y);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 8);
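+  // The mask 0x0C080400 gathers the low byte of each 32-bit lane (bytes 0, 4,
+  // 8, 12) into the first four bytes for the 4-pixel store.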
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8));
+}
+
+template <int y_mask>
+inline __m128i SmoothVerticalSum4(const __m128i& top, const __m128i& weights,
+ const __m128i& scaled_bottom_left) {
+ const __m128i weights_y = _mm_shuffle_epi32(weights, y_mask);
+ const __m128i weighted_top_y = _mm_mullo_epi16(top, weights_y);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi32(scaled_bottom_left, y_mask);
+ return _mm_add_epi32(scaled_bottom_left_y, weighted_top_y);
+}
+
+template <int y_mask>
+inline void WriteSmoothVerticalSum4(uint8_t* dest, const __m128i& top,
+ const __m128i& weights,
+ const __m128i& scaled_bottom_left,
+ const __m128i& round) {
+ __m128i pred_sum =
+ SmoothVerticalSum4<y_mask>(top, weights, scaled_bottom_left);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ pred_sum = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 8);
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ Store4(dest, _mm_shuffle_epi8(pred_sum, cvtepi32_epi8));
+}
+
+// For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V,
+// |pixels| is a segment of the top row or the whole top row, and |weights| is
+// repeated.
+inline __m128i SmoothDirectionalSum8(const __m128i& pixels,
+ const __m128i& weights,
+ const __m128i& scaled_corner) {
+ const __m128i weighted_px = _mm_mullo_epi16(pixels, weights);
+ return _mm_add_epi16(scaled_corner, weighted_px);
+}
+
+inline void WriteSmoothDirectionalSum8(uint8_t* dest, const __m128i& pixels,
+ const __m128i& weights,
+ const __m128i& scaled_corner,
+ const __m128i& round) {
+ const __m128i pred_sum =
+ SmoothDirectionalSum8(pixels, weights, scaled_corner);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, round), 8);
+ StoreLo8(dest, _mm_packus_epi16(pred, pred));
+}
+
+// For Horizontal, pixels1 and pixels2 are the same repeated value. For
+// Vertical, weights1 and weights2 are the same, and scaled_corner1 and
+// scaled_corner2 are the same.
+inline void WriteSmoothDirectionalSum16(uint8_t* dest, const __m128i& pixels1,
+ const __m128i& pixels2,
+ const __m128i& weights1,
+ const __m128i& weights2,
+ const __m128i& scaled_corner1,
+ const __m128i& scaled_corner2,
+ const __m128i& round) {
+ const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
+ const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
+ const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
+ const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
+ const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
+ StoreUnaligned16(dest, _mm_packus_epi16(pred1, pred2));
+}
+
+template <int y_mask>
+inline void WriteSmoothPredSum4(uint8_t* const dest, const __m128i& top,
+ const __m128i& left, const __m128i& weights_x,
+ const __m128i& weights_y,
+ const __m128i& scaled_bottom_left,
+ const __m128i& scaled_top_right,
+ const __m128i& round) {
+ const __m128i left_y = _mm_shuffle_epi32(left, y_mask);
+ const __m128i weighted_left_y = _mm_mullo_epi32(left_y, weights_x);
+ const __m128i weight_y = _mm_shuffle_epi32(weights_y, y_mask);
+ const __m128i weighted_top = _mm_mullo_epi32(weight_y, top);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi32(scaled_bottom_left, y_mask);
+ const __m128i col_pred = _mm_add_epi32(scaled_bottom_left_y, weighted_left_y);
+ const __m128i row_pred = _mm_add_epi32(scaled_top_right, weighted_top);
+ const __m128i pred_sum = _mm_add_epi32(row_pred, col_pred);
+
+ // Equivalent to RightShiftWithRounding(pred[x][y], 9).
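+  // The vertical and horizontal terms are each scaled by 256, so the combined
+  // sum carries a scale of 512; with |round| == 256 this is (v + 256) >> 9.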
+ const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 9);
+
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8));
+}
+
+// pixels[0]: above and below_pred interleave vector
+// pixels[1]: left vector
+// pixels[2]: right_pred vector
+inline void LoadSmoothPixels4(const uint8_t* above, const uint8_t* left,
+ const int height, __m128i* pixels) {
+ if (height == 4) {
+ pixels[1] = Load4(left);
+ } else if (height == 8) {
+ pixels[1] = LoadLo8(left);
+ } else {
+ pixels[1] = LoadUnaligned16(left);
+ }
+
+ const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+ const __m128i top = _mm_cvtepu8_epi16(Load4(above));
+ pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
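+  // Interleaving the top samples with the bottom-left value lets a single
+  // _mm_madd_epi16 against {w_y, 256 - w_y} pairs compute the vertical term
+  // w_y * top + (256 - w_y) * bottom_left per pixel.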
+ pixels[2] = _mm_set1_epi16(above[3]);
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], second half for height = 16 only
+// weight_h[3]: same as [1], second half for height = 16 only
+// weight_w[0]: weights_w and scale - weights_w interleave vector
+inline void LoadSmoothWeights4(const uint8_t* weight_array, const int height,
+ __m128i* weight_h, __m128i* weight_w) {
+ const __m128i scale = _mm_set1_epi16(256);
+ const __m128i x_weights = Load4(weight_array);
+ weight_h[0] = _mm_cvtepu8_epi16(x_weights);
+ weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
+ weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+
+ if (height == 8) {
+ const __m128i y_weights = LoadLo8(weight_array + 4);
+ weight_h[0] = _mm_cvtepu8_epi16(y_weights);
+ weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
+ } else if (height == 16) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i y_weights = LoadUnaligned16(weight_array + 12);
+ weight_h[0] = _mm_cvtepu8_epi16(y_weights);
+ weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(y_weights, zero);
+ weight_h[3] = _mm_sub_epi16(scale, weight_h[2]);
+ }
+}
+
+inline void WriteSmoothPred4x8(const __m128i* pixel, const __m128i* weights_y,
+ const __m128i* weight_x, uint8_t* dst,
+ const ptrdiff_t stride,
+ const bool use_second_half) {
+ const __m128i round = _mm_set1_epi32(256);
+ const __m128i mask_increment = _mm_set1_epi16(0x0202);
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i left = use_second_half ? _mm_unpackhi_epi8(pixel[1], zero)
+ : _mm_unpacklo_epi8(pixel[1], zero);
+ __m128i y_select = _mm_set1_epi16(0x0100);
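+  // |y_select| holds the byte indices of 16-bit element y ({0, 1} for row 0)
+  // and advances by two bytes per row, so _mm_shuffle_epi8 broadcasts the y-th
+  // weight and the y-th left sample.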
+
+ for (int i = 0; i < 8; ++i) {
+ const __m128i weight_y = _mm_shuffle_epi8(weights_y[0], y_select);
+ const __m128i inverted_weight_y = _mm_shuffle_epi8(weights_y[1], y_select);
+ const __m128i interleaved_weights =
+ _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+ __m128i vertical_pred = _mm_madd_epi16(pixel[0], interleaved_weights);
+
+ __m128i horizontal_vect = _mm_shuffle_epi8(left, y_select);
+ horizontal_vect = _mm_unpacklo_epi16(horizontal_vect, pixel[2]);
+ __m128i sum = _mm_madd_epi16(horizontal_vect, weight_x[0]);
+
+ sum = _mm_add_epi32(vertical_pred, sum);
+ sum = _mm_add_epi32(sum, round);
+ sum = _mm_srai_epi32(sum, 9);
+
+ sum = _mm_shuffle_epi8(sum, cvtepi32_epi8);
+ Store4(dst, sum);
+ dst += stride;
+
+ y_select = _mm_add_epi16(y_select, mask_increment);
+ }
+}
+
+// The interleaving approach has some overhead that causes it to underperform in
+// the 4x4 case.
+void Smooth4x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+ const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+ const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+ const __m128i scale = _mm_set1_epi32(256);
+  // The fourth 32-bit lane is top_row[3].
+ const __m128i top_right = _mm_shuffle_epi32(top, 0xFF);
+  // The fourth 32-bit lane is left_column[3].
+ const __m128i bottom_left = _mm_shuffle_epi32(left, 0xFF);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ auto* dst = static_cast<uint8_t*>(dest);
+ // AV1 spec 7.11.2.6 (3) describes the sum:
+  //   smoothPred[y][x:x+3] = weighted_top + scaled_right + weighted_left[y] +
+  //                          scaled_bottom[y]
+  // This could be a loop, were it not for the immediate values required by the
+  // shuffles.
+ WriteSmoothPredSum4<0>(dst, top, left, weights, weights, scaled_bottom_left,
+ scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothPredSum4<0x55>(dst, top, left, weights, weights,
+ scaled_bottom_left, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothPredSum4<0xAA>(dst, top, left, weights, weights,
+ scaled_bottom_left, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothPredSum4<0xFF>(dst, top, left, weights, weights,
+ scaled_bottom_left, scaled_top_right, scale);
+}
+
+void Smooth4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ __m128i weights_x[1];
+ __m128i weights_y[2];
+ LoadSmoothWeights4(kSmoothWeights, 8, weights_y, weights_x);
+ __m128i pixels[3];
+ LoadSmoothPixels4(top_ptr, left_ptr, 8, pixels);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false);
+}
+
+void Smooth4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ __m128i weights_x[1];
+ __m128i weights_y[4];
+ LoadSmoothWeights4(kSmoothWeights, 16, weights_y, weights_x);
+ __m128i pixels[3];
+ LoadSmoothPixels4(top_ptr, left_ptr, 16, pixels);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false);
+ dst += stride << 3;
+ WriteSmoothPred4x8(pixels, &weights_y[2], weights_x, dst, stride, true);
+}
+
+// pixels[0]: above and below_pred interleave vector, first half
+// pixels[1]: above and below_pred interleave vector, second half
+// pixels[2]: left vector
+// pixels[3]: right_pred vector
+// pixels[4]: above and below_pred interleave vector, first half
+// pixels[5]: above and below_pred interleave vector, second half
+// pixels[6]: left vector + 16
+// pixels[7]: right_pred vector
+inline void LoadSmoothPixels8(const uint8_t* above, const uint8_t* left,
+ const int height, __m128i* pixels) {
+ const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+ __m128i top_row = _mm_cvtepu8_epi16(LoadLo8(above));
+ pixels[0] = _mm_unpacklo_epi16(top_row, bottom_left);
+ pixels[1] = _mm_unpackhi_epi16(top_row, bottom_left);
+
+ pixels[3] = _mm_set1_epi16(above[7]);
+
+ if (height == 4) {
+ pixels[2] = Load4(left);
+ } else if (height == 8) {
+ pixels[2] = LoadLo8(left);
+ } else if (height == 16) {
+ pixels[2] = LoadUnaligned16(left);
+ } else {
+ pixels[2] = LoadUnaligned16(left);
+ pixels[4] = pixels[0];
+ pixels[5] = pixels[1];
+ pixels[6] = LoadUnaligned16(left + 16);
+ pixels[7] = pixels[3];
+ }
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], offset 8
+// weight_h[3]: same as [1], offset 8
+// weight_h[4]: same as [0], offset 16
+// weight_h[5]: same as [1], offset 16
+// weight_h[6]: same as [0], offset 24
+// weight_h[7]: same as [1], offset 24
+// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
+// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
+inline void LoadSmoothWeights8(const uint8_t* weight_array, const int height,
+ __m128i* weight_w, __m128i* weight_h) {
+ const int offset = (height < 8) ? 0 : 4;
+ __m128i loaded_weights = LoadUnaligned16(&weight_array[offset]);
+ weight_h[0] = _mm_cvtepu8_epi16(loaded_weights);
+ const __m128i inverter = _mm_set1_epi16(256);
+ weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
+
+ if (height == 4) {
+ loaded_weights = _mm_srli_si128(loaded_weights, 4);
+ __m128i weights_x = _mm_cvtepu8_epi16(loaded_weights);
+ __m128i inverted_weights_x = _mm_sub_epi16(inverter, weights_x);
+ weight_w[0] = _mm_unpacklo_epi16(weights_x, inverted_weights_x);
+ weight_w[1] = _mm_unpackhi_epi16(weights_x, inverted_weights_x);
+ } else {
+ weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+ weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
+ }
+
+ if (height == 16) {
+ const __m128i zero = _mm_setzero_si128();
+ loaded_weights = LoadUnaligned16(weight_array + 12);
+ weight_h[0] = _mm_cvtepu8_epi16(loaded_weights);
+ weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(loaded_weights, zero);
+ weight_h[3] = _mm_sub_epi16(inverter, weight_h[2]);
+ } else if (height == 32) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i weight_lo = LoadUnaligned16(weight_array + 28);
+ weight_h[0] = _mm_cvtepu8_epi16(weight_lo);
+ weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
+ weight_h[3] = _mm_sub_epi16(inverter, weight_h[2]);
+ const __m128i weight_hi = LoadUnaligned16(weight_array + 44);
+ weight_h[4] = _mm_cvtepu8_epi16(weight_hi);
+ weight_h[5] = _mm_sub_epi16(inverter, weight_h[4]);
+ weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
+ weight_h[7] = _mm_sub_epi16(inverter, weight_h[6]);
+ }
+}
+
+inline void WriteSmoothPred8xH(const __m128i* pixels, const __m128i* weights_x,
+ const __m128i* weights_y, const int height,
+ uint8_t* dst, const ptrdiff_t stride,
+ const bool use_second_half) {
+ const __m128i round = _mm_set1_epi32(256);
+ const __m128i mask_increment = _mm_set1_epi16(0x0202);
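+  // Gathers the low byte of every 16-bit lane (bytes 0, 2, ..., 14) into the
+  // low eight bytes of the result.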
+  const __m128i cvt_epu16_epi8 = _mm_set_epi32(0, 0, 0x0E0C0A08, 0x06040200);
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i left = use_second_half ? _mm_unpackhi_epi8(pixels[2], zero)
+ : _mm_unpacklo_epi8(pixels[2], zero);
+ __m128i y_select = _mm_set1_epi16(0x100);
+
+ for (int i = 0; i < height; ++i) {
+ const __m128i weight_y = _mm_shuffle_epi8(weights_y[0], y_select);
+ const __m128i inverted_weight_y = _mm_shuffle_epi8(weights_y[1], y_select);
+ const __m128i interleaved_weights =
+ _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+ const __m128i vertical_sum0 =
+ _mm_madd_epi16(pixels[0], interleaved_weights);
+ const __m128i vertical_sum1 =
+ _mm_madd_epi16(pixels[1], interleaved_weights);
+
+ __m128i horizontal_pixels = _mm_shuffle_epi8(left, y_select);
+ horizontal_pixels = _mm_unpacklo_epi16(horizontal_pixels, pixels[3]);
+ const __m128i horizontal_sum0 =
+ _mm_madd_epi16(horizontal_pixels, weights_x[0]);
+ const __m128i horizontal_sum1 =
+ _mm_madd_epi16(horizontal_pixels, weights_x[1]);
+
+ __m128i sum0 = _mm_add_epi32(vertical_sum0, horizontal_sum0);
+ sum0 = _mm_add_epi32(sum0, round);
+ sum0 = _mm_srai_epi32(sum0, 9);
+
+ __m128i sum1 = _mm_add_epi32(vertical_sum1, horizontal_sum1);
+ sum1 = _mm_add_epi32(sum1, round);
+ sum1 = _mm_srai_epi32(sum1, 9);
+
+ sum0 = _mm_packus_epi16(sum0, sum1);
+ sum0 = _mm_shuffle_epi8(sum0, cvt_epu16_epi8);
+ StoreLo8(dst, sum0);
+ dst += stride;
+
+ y_select = _mm_add_epi16(y_select, mask_increment);
+ }
+}
+
+void Smooth8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ __m128i pixels[4];
+ LoadSmoothPixels8(top_ptr, left_ptr, 4, pixels);
+
+ __m128i weights_x[2], weights_y[2];
+ LoadSmoothWeights8(kSmoothWeights, 4, weights_x, weights_y);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred8xH(pixels, weights_x, weights_y, 4, dst, stride, false);
+}
+
+void Smooth8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+
+ __m128i pixels[4];
+ LoadSmoothPixels8(top_ptr, left_ptr, 8, pixels);
+
+ __m128i weights_x[2], weights_y[2];
+ LoadSmoothWeights8(kSmoothWeights, 8, weights_x, weights_y);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
+}
+
+void Smooth8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ __m128i pixels[4];
+ LoadSmoothPixels8(top_ptr, left_ptr, 16, pixels);
+
+ __m128i weights_x[2], weights_y[4];
+ LoadSmoothWeights8(kSmoothWeights, 16, weights_x, weights_y);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
+ dst += stride << 3;
+ WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true);
+}
+
+void Smooth8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ __m128i pixels[8];
+ LoadSmoothPixels8(top_ptr, left_ptr, 32, pixels);
+
+ __m128i weights_x[2], weights_y[8];
+ LoadSmoothWeights8(kSmoothWeights, 32, weights_x, weights_y);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
+ dst += stride << 3;
+ WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true);
+ dst += stride << 3;
+ WriteSmoothPred8xH(&pixels[4], weights_x, &weights_y[4], 8, dst, stride,
+ false);
+ dst += stride << 3;
+ WriteSmoothPred8xH(&pixels[4], weights_x, &weights_y[6], 8, dst, stride,
+ true);
+}
+
+template <int width, int height>
+void SmoothWxH(void* const dest, const ptrdiff_t stride,
+ const void* const top_row, const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const uint8_t* const sm_weights_h = kSmoothWeights + height - 4;
+ const uint8_t* const sm_weights_w = kSmoothWeights + width - 4;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i scale_value = _mm_set1_epi16(256);
+ const __m128i bottom_left = _mm_cvtsi32_si128(left_ptr[height - 1]);
+ const __m128i top_right = _mm_set1_epi16(top_ptr[width - 1]);
+ const __m128i round = _mm_set1_epi32(256);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < height; ++y) {
+ const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
+ const __m128i left_y = _mm_cvtsi32_si128(left_ptr[y]);
+ const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
+ __m128i scaled_bottom_left =
+ _mm_mullo_epi16(scale_m_weights_y, bottom_left);
+ const __m128i weight_left_y =
+ _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
+ scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
+ scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
+ for (int x = 0; x < width; x += 8) {
+ const __m128i top_x = LoadLo8(top_ptr + x);
+ const __m128i weights_x = LoadLo8(sm_weights_w + x);
+ const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
+ const __m128i top_weights_x_lo = _mm_cvtepu8_epi16(top_weights_x);
+ const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);
+
+      // Each pixel is multiplied by the weight from the opposite direction:
+      // the madd of {top, w_x} pairs with {w_y, left} pairs yields
+      // w_y * top + w_x * left. The interleaving order is reflected in the
+      // variable names.
+ __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
+ __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);
+
+ // |scaled_bottom_left| is always scaled by the same weight each row, so
+ // we only derive |scaled_top_right| values here.
+ const __m128i inverted_weights_x =
+ _mm_sub_epi16(scale_value, _mm_cvtepu8_epi16(weights_x));
+ const __m128i scaled_top_right =
+ _mm_mullo_epi16(inverted_weights_x, top_right);
+ const __m128i scaled_top_right_lo = _mm_cvtepu16_epi32(scaled_top_right);
+ const __m128i scaled_top_right_hi =
+ _mm_unpackhi_epi16(scaled_top_right, zero);
+ pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
+ pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
+ pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
+ pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);
+
+ // The round value for RightShiftWithRounding was added with
+ // |scaled_bottom_left|.
+ pred_lo = _mm_srli_epi32(pred_lo, 9);
+ pred_hi = _mm_srli_epi32(pred_hi, 9);
+ const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+ StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
+ }
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal4x4_SSE4_1(void* dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi32(top_ptr[3]);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left = _mm_cvtepu8_epi32(Load4(left_ptr));
+ const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+ __m128i scale = _mm_set1_epi32(256);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi32(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi32(top[3]);
+ const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+ __m128i scale = _mm_set1_epi32(256);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi32(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+
+ left = _mm_cvtepu8_epi32(Load4(left_ptr + 4));
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi32(top[3]);
+ const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+ __m128i scale = _mm_set1_epi32(256);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi32(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+
+ left = _mm_cvtepu8_epi32(Load4(left_ptr + 4));
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+
+ left = _mm_cvtepu8_epi32(Load4(left_ptr + 8));
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+
+ left = _mm_cvtepu8_epi32(Load4(left_ptr + 12));
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[7]);
+ const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi16(128);
+ __m128i y_select = _mm_set1_epi32(0x01000100);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x03020302);
+ left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x05040504);
+ left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x07060706);
+ left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[7]);
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[7]);
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[7]);
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal16x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ scale = _mm_set1_epi16(128);
+ __m128i y_mask = _mm_set1_epi32(0x01000100);
+ __m128i left_y = _mm_shuffle_epi8(left, y_mask);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ y_mask = _mm_set1_epi32(0x03020302);
+ left_y = _mm_shuffle_epi8(left, y_mask);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ y_mask = _mm_set1_epi32(0x05040504);
+ left_y = _mm_shuffle_epi8(left, y_mask);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ y_mask = _mm_set1_epi32(0x07060706);
+ left_y = _mm_shuffle_epi8(left, y_mask);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+}
+
+void SmoothHorizontal16x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal16x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal16x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal16x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+ }
+}
+
+void SmoothHorizontal32x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[31]);
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal32x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[31]);
+ const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+ const __m128i left2 =
+ _mm_cvtepu8_epi16(LoadLo8(static_cast<const uint8_t*>(left_column) + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal32x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[31]);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal32x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[31]);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+ }
+}
+
+void SmoothHorizontal64x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[63]);
+ const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
+ const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
+ const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
+ const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
+ const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+ const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
+ const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+ const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+ const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+ const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+ const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+ const __m128i scaled_top_right5 =
+ _mm_mullo_epi16(inverted_weights5, top_right);
+ const __m128i scaled_top_right6 =
+ _mm_mullo_epi16(inverted_weights6, top_right);
+ const __m128i scaled_top_right7 =
+ _mm_mullo_epi16(inverted_weights7, top_right);
+ const __m128i scaled_top_right8 =
+ _mm_mullo_epi16(inverted_weights8, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+ const __m128i left2 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal64x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[63]);
+ const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
+ const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
+ const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
+ const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
+ const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+ const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
+ const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+ const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+ const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+ const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+ const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+ const __m128i scaled_top_right5 =
+ _mm_mullo_epi16(inverted_weights5, top_right);
+ const __m128i scaled_top_right6 =
+ _mm_mullo_epi16(inverted_weights6, top_right);
+ const __m128i scaled_top_right7 =
+ _mm_mullo_epi16(inverted_weights7, top_right);
+ const __m128i scaled_top_right8 =
+ _mm_mullo_epi16(inverted_weights8, top_right);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left2 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+ const __m128i left3 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+ const __m128i left4 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal64x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[63]);
+ const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
+ const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
+ const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
+ const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
+ const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+ const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
+ const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+ const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+ const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+ const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+ const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+ const __m128i scaled_top_right5 =
+ _mm_mullo_epi16(inverted_weights5, top_right);
+ const __m128i scaled_top_right6 =
+ _mm_mullo_epi16(inverted_weights6, top_right);
+ const __m128i scaled_top_right7 =
+ _mm_mullo_epi16(inverted_weights7, top_right);
+ const __m128i scaled_top_right8 =
+ _mm_mullo_epi16(inverted_weights8, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+ }
+}
+
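+// The SmoothVertical kernels below mirror the horizontal ones, blending each
+// top-row pixel against the bottom-left corner with one weight per row
+// (a scalar sketch):
+//   pred[y][x] = (w[y] * top[x] + (256 - w[y]) * bottom_left + 128) >> 8
+// with w[] again taken from kSmoothWeights for the block height.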
+inline void LoadSmoothVerticalPixels4(const uint8_t* above, const uint8_t* left,
+ const int height, __m128i* pixels) {
+ __m128i top = Load4(above);
+ const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+ top = _mm_cvtepu8_epi16(top);
+ pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
+}
+
+// The output |weights| alternates weight vectors loaded from |weight_array|
+// with their inverted (256 - w) counterparts. These are precomputed by the
+// compiler when the weights table is visible to this module; removing that
+// visibility can cut speed by up to half in both 4xH and 8xH transforms.
+inline void LoadSmoothVerticalWeights4(const uint8_t* weight_array,
+ const int height, __m128i* weights) {
+ const __m128i inverter = _mm_set1_epi16(256);
+
+ if (height == 4) {
+ const __m128i weight = Load4(weight_array);
+ weights[0] = _mm_cvtepu8_epi16(weight);
+ weights[1] = _mm_sub_epi16(inverter, weights[0]);
+ } else if (height == 8) {
+ const __m128i weight = LoadLo8(weight_array + 4);
+ weights[0] = _mm_cvtepu8_epi16(weight);
+ weights[1] = _mm_sub_epi16(inverter, weights[0]);
+ } else {
+ const __m128i weight = LoadUnaligned16(weight_array + 12);
+ const __m128i zero = _mm_setzero_si128();
+ weights[0] = _mm_cvtepu8_epi16(weight);
+ weights[1] = _mm_sub_epi16(inverter, weights[0]);
+ weights[2] = _mm_unpackhi_epi8(weight, zero);
+ weights[3] = _mm_sub_epi16(inverter, weights[2]);
+ }
+}
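+// For example (a sketch; w[] are the table entries at the offset selected
+// above): with height == 8, weights[0] holds {w[0], ..., w[7]} widened to
+// 16 bits and weights[1] holds {256 - w[0], ..., 256 - w[7]}, so broadcasting
+// lane y from each and interleaving them yields the (w[y], 256 - w[y]) pairs
+// consumed by the madd in WriteSmoothVertical4xH below.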
+
+inline void WriteSmoothVertical4xH(const __m128i* pixel, const __m128i* weight,
+ const int height, uint8_t* dst,
+ const ptrdiff_t stride) {
+ const __m128i pred_round = _mm_set1_epi32(128);
+ const __m128i mask_increment = _mm_set1_epi16(0x0202);
+ const __m128i cvtepu8_epi32 = _mm_set1_epi32(0xC080400);
+ __m128i y_select = _mm_set1_epi16(0x0100);
+
+ for (int y = 0; y < height; ++y) {
+ const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
+ const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
+ const __m128i alternate_weights =
+ _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+ // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
+ // The madd instruction yields four results of the form:
+ // (top_row[x] * weight[y] + corner * inverted_weight[y])
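+ // For example, with top_row[x] = 100, corner = 50 and weight[y] = 192:
+ //   192 * 100 + (256 - 192) * 50 = 22400, and (22400 + 128) >> 8 = 88.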
+ __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
+ sum = _mm_add_epi32(sum, pred_round);
+ sum = _mm_srai_epi32(sum, 8);
+ sum = _mm_shuffle_epi8(sum, cvtepu8_epi32);
+ Store4(dst, sum);
+ dst += stride;
+ y_select = _mm_add_epi16(y_select, mask_increment);
+ }
+}
+
+void SmoothVertical4x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+ const auto* const above = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i pixels;
+ LoadSmoothVerticalPixels4(above, left, 4, &pixels);
+
+ __m128i weights[2];
+ LoadSmoothVerticalWeights4(kSmoothWeights, 4, weights);
+
+ WriteSmoothVertical4xH(&pixels, weights, 4, dst, stride);
+}
+
+void SmoothVertical4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+ const auto* const above = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i pixels;
+ LoadSmoothVerticalPixels4(above, left, 8, &pixels);
+
+ __m128i weights[2];
+ LoadSmoothVerticalWeights4(kSmoothWeights, 8, weights);
+
+ WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride);
+}
+
+void SmoothVertical4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+ const auto* const above = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i pixels;
+ LoadSmoothVerticalPixels4(above, left, 16, &pixels);
+
+ __m128i weights[4];
+ LoadSmoothVerticalWeights4(kSmoothWeights, 16, weights);
+
+ WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride);
+ dst += stride << 3;
+ WriteSmoothVertical4xH(&pixels, &weights[2], 8, dst, stride);
+}
+
+void SmoothVertical8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
+ const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ scale = _mm_set1_epi16(128);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i y_select = _mm_set1_epi32(0x01000100);
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x03020302);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x05040504);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x07060706);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+}
+
+void SmoothVertical8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ scale = _mm_set1_epi16(128);
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ scale = _mm_set1_epi16(128);
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical16x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
+ const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ scale = _mm_set1_epi16(128);
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+ __m128i y_select = _mm_set1_epi32(0x01000100);
+ __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x03020302);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x05040504);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x07060706);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+}
+
+void SmoothVertical16x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ scale = _mm_set1_epi16(128);
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical16x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+ scale = _mm_set1_epi16(128);
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical16x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ scale = _mm_set1_epi16(128);
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical16x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
+ const __m128i scale = _mm_set1_epi16(256);
+ const __m128i round = _mm_set1_epi16(128);
+ const __m128i zero = _mm_setzero_si128();
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+ const uint8_t* weights_base_ptr = kSmoothWeights + 60;
+ for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+ const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+ const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ }
+}
+
+void SmoothVertical32x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
+ const __m128i top_lo = LoadUnaligned16(top_ptr);
+ const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ scale = _mm_set1_epi16(128);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical32x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+ const __m128i top_lo = LoadUnaligned16(top_ptr);
+ const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ scale = _mm_set1_epi16(128);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical32x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i top_lo = LoadUnaligned16(top_ptr);
+ const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ scale = _mm_set1_epi16(128);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical32x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
+ const __m128i top_lo = LoadUnaligned16(top_ptr);
+ const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i scale = _mm_set1_epi16(256);
+ const __m128i round = _mm_set1_epi16(128);
+ const uint8_t* weights_base_ptr = kSmoothWeights + 60;
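+  // kSmoothWeights packs the weight sets for dimensions 4, 8, 16, 32 and 64
+  // back to back, so the 64-entry set begins at offset 4 + 8 + 16 + 32 = 60
+  // and is consumed 16 entries at a time below.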
+ for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+ const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+ const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ }
+}
+
+void SmoothVertical64x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top_lolo = LoadUnaligned16(top_ptr);
+ const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
+ const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
+ const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+ const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
+ const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ scale = _mm_set1_epi16(128);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical64x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+ const __m128i top_lolo = LoadUnaligned16(top_ptr);
+ const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+ const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
+ const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
+ const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+ const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
+ const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ scale = _mm_set1_epi16(128);
+
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical64x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
+ const __m128i top_lolo = LoadUnaligned16(top_ptr);
+ const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+ const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
+ const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
+ const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+ const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
+ const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+ const __m128i scale = _mm_set1_epi16(256);
+ const __m128i round = _mm_set1_epi16(128);
+ const uint8_t* weights_base_ptr = kSmoothWeights + 60;
+ for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+ const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+ const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+ Smooth4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+ Smooth4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+ Smooth4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+ Smooth8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+ Smooth8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+ Smooth8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+ Smooth8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+ SmoothWxH<16, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+ SmoothWxH<16, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+ SmoothWxH<16, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+ SmoothWxH<16, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+ SmoothWxH<16, 64>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+ SmoothWxH<32, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+ SmoothWxH<32, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+ SmoothWxH<32, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+ SmoothWxH<32, 64>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+ SmoothWxH<64, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+ SmoothWxH<64, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+ SmoothWxH<64, 64>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical16x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical16x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical16x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical16x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+ SmoothVertical16x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical32x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical32x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical32x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+ SmoothVertical32x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical64x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical64x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+ SmoothVertical64x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal32x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal32x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal32x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal32x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal64x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal64x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal64x64_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void IntraPredSmoothInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredSmoothInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intrapred_sse4.cc b/src/dsp/x86/intrapred_sse4.cc
new file mode 100644
index 0000000..9938dfe
--- /dev/null
+++ b/src/dsp/x86/intrapred_sse4.cc
@@ -0,0 +1,3535 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <xmmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring> // memcpy
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// Utility Functions
+
+// This is a fast way to divide by a number of the form 2^n + 2^k, n > k.
+// Divide by 2^k by right shifting by k, leaving a denominator of 2^(n-k) + 1.
+// In the block size cases, n - k is 1 or 2 (the block is proportional to 1x2
+// or 1x4), so we use a multiplier that reflects division by 2+1=3 or 4+1=5 in
+// the high bits.
+constexpr int kThreeInverse = 0x5556;
+constexpr int kFiveInverse = 0x3334;
+template <int shiftk, int multiplier>
+inline __m128i DivideByMultiplyShift_U32(const __m128i dividend) {
+ const __m128i interm = _mm_srli_epi32(dividend, shiftk);
+ return _mm_mulhi_epi16(interm, _mm_cvtsi32_si128(multiplier));
+}
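+// For example, the DC of a 4x8 block divides by 12 = 2^2 * 3:
+// DivideByMultiplyShift_U32<2, kThreeInverse>(x) computes
+// ((x >> 2) * 0x5556) >> 16, and 0x5556 / 65536 is just over 1/3, so the
+// result approximates x / 12 for the sums that occur here.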
+
+// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which
+// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes.
+constexpr int kDuplicateFirstHalf = 0x44;
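+// For example, _mm_shuffle_epi32(v, kDuplicateFirstHalf) turns the 32-bit
+// lanes {a, b, c, d} into {a, b, a, b}.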
+
+//------------------------------------------------------------------------------
+// DcPredFuncs_SSE4_1
+
+using DcSumFunc = __m128i (*)(const void* ref);
+using DcStoreFunc = void (*)(void* dest, ptrdiff_t stride, const __m128i dc);
+using WriteDuplicateFunc = void (*)(void* dest, ptrdiff_t stride,
+ const __m128i column);
+// For copying an entire column across a block.
+using ColumnStoreFunc = void (*)(void* dest, ptrdiff_t stride,
+ const void* column);
+
+// DC intra-predictors for square and non-square blocks.
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+ DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+struct DcPredFuncs_SSE4_1 {
+ DcPredFuncs_SSE4_1() = delete;
+
+ static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+};
+
+// Directional intra-predictors, parameterized on the column-store function.
+template <ColumnStoreFunc col_storefn>
+struct DirectionalPredFuncs_SSE4_1 {
+ DirectionalPredFuncs_SSE4_1() = delete;
+
+ static void Vertical(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Horizontal(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+};
+
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+ DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
+ shiftk, dc_mult>::DcTop(void* const dest,
+ ptrdiff_t stride,
+ const void* const top_row,
+ const void* /*left_column*/) {
+ const __m128i rounder = _mm_set1_epi32(1 << (width_log2 - 1));
+ const __m128i sum = top_sumfn(top_row);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2);
+ storefn(dest, stride, dc);
+}
+
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+ DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
+ shiftk,
+ dc_mult>::DcLeft(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const __m128i rounder = _mm_set1_epi32(1 << (height_log2 - 1));
+ const __m128i sum = left_sumfn(left_column);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), height_log2);
+ storefn(dest, stride, dc);
+}
+
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+ DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
+ shiftk, dc_mult>::Dc(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i rounder =
+ _mm_set1_epi32((1 << (width_log2 - 1)) + (1 << (height_log2 - 1)));
+ const __m128i sum_top = top_sumfn(top_row);
+ const __m128i sum_left = left_sumfn(left_column);
+ const __m128i sum = _mm_add_epi32(sum_top, sum_left);
+ if (width_log2 == height_log2) {
+ const __m128i dc =
+ _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2 + 1);
+ storefn(dest, stride, dc);
+ } else {
+ const __m128i dc =
+ DivideByMultiplyShift_U32<shiftk, dc_mult>(_mm_add_epi32(sum, rounder));
+ storefn(dest, stride, dc);
+ }
+}
+
+//------------------------------------------------------------------------------
+// DirectionalPredFuncs_SSE4_1
+
+template <ColumnStoreFunc col_storefn>
+void DirectionalPredFuncs_SSE4_1<col_storefn>::Horizontal(
+ void* const dest, ptrdiff_t stride, const void* /*top_row*/,
+ const void* const left_column) {
+ col_storefn(dest, stride, left_column);
+}
+
+} // namespace
+
+//------------------------------------------------------------------------------
+namespace low_bitdepth {
+namespace {
+
+// |ref| points to 4 bytes containing 4 packed 8-bit pixel values.
+inline __m128i DcSum4_SSE4_1(const void* const ref) {
+ const __m128i vals = Load4(ref);
+ const __m128i zero = _mm_setzero_si128();
+ return _mm_sad_epu8(vals, zero);
+}
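+// _mm_sad_epu8 against zero sums each group of 8 bytes into the low 16 bits of
+// its 64-bit half; the wider sums below fold the two halves together with a
+// shift and add.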
+
+inline __m128i DcSum8_SSE4_1(const void* const ref) {
+ const __m128i vals = LoadLo8(ref);
+ const __m128i zero = _mm_setzero_si128();
+ return _mm_sad_epu8(vals, zero);
+}
+
+inline __m128i DcSum16_SSE4_1(const void* const ref) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i vals = LoadUnaligned16(ref);
+ const __m128i partial_sum = _mm_sad_epu8(vals, zero);
+ return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
+}
+
+inline __m128i DcSum32_SSE4_1(const void* const ref) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i vals1 = LoadUnaligned16(ref);
+ const __m128i vals2 = LoadUnaligned16(static_cast<const uint8_t*>(ref) + 16);
+ const __m128i partial_sum1 = _mm_sad_epu8(vals1, zero);
+ const __m128i partial_sum2 = _mm_sad_epu8(vals2, zero);
+ const __m128i partial_sum = _mm_add_epi16(partial_sum1, partial_sum2);
+ return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
+}
+
+inline __m128i DcSum64_SSE4_1(const void* const ref) {
+ const auto* const ref_ptr = static_cast<const uint8_t*>(ref);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i vals1 = LoadUnaligned16(ref_ptr);
+ const __m128i vals2 = LoadUnaligned16(ref_ptr + 16);
+ const __m128i vals3 = LoadUnaligned16(ref_ptr + 32);
+ const __m128i vals4 = LoadUnaligned16(ref_ptr + 48);
+ const __m128i partial_sum1 = _mm_sad_epu8(vals1, zero);
+ const __m128i partial_sum2 = _mm_sad_epu8(vals2, zero);
+ __m128i partial_sum = _mm_add_epi16(partial_sum1, partial_sum2);
+ const __m128i partial_sum3 = _mm_sad_epu8(vals3, zero);
+ partial_sum = _mm_add_epi16(partial_sum, partial_sum3);
+ const __m128i partial_sum4 = _mm_sad_epu8(vals4, zero);
+ partial_sum = _mm_add_epi16(partial_sum, partial_sum4);
+ return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
+}
+
+template <int height>
+inline void DcStore4xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ Store4(dst, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ Store4(dst, dc_dup);
+}
+
+template <int height>
+inline void DcStore8xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ StoreLo8(dst, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ StoreLo8(dst, dc_dup);
+}
+
+template <int height>
+inline void DcStore16xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ StoreUnaligned16(dst, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ StoreUnaligned16(dst, dc_dup);
+}
+
+template <int height>
+inline void DcStore32xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ StoreUnaligned16(dst, dc_dup);
+ StoreUnaligned16(dst + 16, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ StoreUnaligned16(dst, dc_dup);
+ StoreUnaligned16(dst + 16, dc_dup);
+}
+
+template <int height>
+inline void DcStore64xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ StoreUnaligned16(dst, dc_dup);
+ StoreUnaligned16(dst + 16, dc_dup);
+ StoreUnaligned16(dst + 32, dc_dup);
+ StoreUnaligned16(dst + 48, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ StoreUnaligned16(dst, dc_dup);
+ StoreUnaligned16(dst + 16, dc_dup);
+ StoreUnaligned16(dst + 32, dc_dup);
+ StoreUnaligned16(dst + 48, dc_dup);
+}
+
+// WriteDuplicateN assumes dup has 4 sets of 4 identical bytes that are meant to
+// be copied for width N into dest.
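+// For example, WriteDuplicate8x4 writes the 4 identical bytes of |dup32| lane 0
+// twice to fill row 0, lane 1 twice to fill row 1, and so on for 4 rows.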
+inline void WriteDuplicate4x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ Store4(dst, dup32);
+ dst += stride;
+ const int row1 = _mm_extract_epi32(dup32, 1);
+ memcpy(dst, &row1, 4);
+ dst += stride;
+ const int row2 = _mm_extract_epi32(dup32, 2);
+ memcpy(dst, &row2, 4);
+ dst += stride;
+ const int row3 = _mm_extract_epi32(dup32, 3);
+ memcpy(dst, &row3, 4);
+}
+
+inline void WriteDuplicate8x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+ auto* dst = static_cast<uint8_t*>(dest);
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_lo);
+ dst += stride;
+ _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_lo));
+ dst += stride;
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_hi);
+ dst += stride;
+ _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_hi));
+}
+
+inline void WriteDuplicate16x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+}
+
+inline void WriteDuplicate32x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+}
+
+inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_3);
+}
+
+// ColStoreN<writefn> copies each of the N values in |column| across its
+// corresponding row in dest.
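+// For example, ColStore4 expands the 4 column bytes {c0, c1, c2, c3} to
+// {c0 x4, c1 x4, c2 x4, c3 x4} and hands that to writefn, which copies each
+// 4-byte group across one row.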
+template <WriteDuplicateFunc writefn>
+inline void ColStore4_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const __m128i col_data = Load4(column);
+ const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
+ const __m128i col_dup32 = _mm_unpacklo_epi16(col_dup16, col_dup16);
+ writefn(dest, stride, col_dup32);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ const __m128i col_data = LoadLo8(column);
+ const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
+ const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_dup16, col_dup16);
+ auto* dst = static_cast<uint8_t*>(dest);
+ writefn(dst, stride, col_dup32_lo);
+ dst += stride4;
+ const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_dup16, col_dup16);
+ writefn(dst, stride, col_dup32_hi);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ const __m128i col_data = _mm_loadu_si128(static_cast<const __m128i*>(column));
+ const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+ const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+ const __m128i col_dup32_lolo = _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+ auto* dst = static_cast<uint8_t*>(dest);
+ writefn(dst, stride, col_dup32_lolo);
+ dst += stride4;
+ const __m128i col_dup32_lohi = _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+ writefn(dst, stride, col_dup32_lohi);
+ dst += stride4;
+ const __m128i col_dup32_hilo = _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hilo);
+ dst += stride4;
+ const __m128i col_dup32_hihi = _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hihi);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < 32; y += 16) {
+ const __m128i col_data =
+ LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+ const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+ const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+ const __m128i col_dup32_lolo =
+ _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+ writefn(dst, stride, col_dup32_lolo);
+ dst += stride4;
+ const __m128i col_dup32_lohi =
+ _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+ writefn(dst, stride, col_dup32_lohi);
+ dst += stride4;
+ const __m128i col_dup32_hilo =
+ _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hilo);
+ dst += stride4;
+ const __m128i col_dup32_hihi =
+ _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hihi);
+ dst += stride4;
+ }
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore64_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < 64; y += 16) {
+ const __m128i col_data =
+ LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+ const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+ const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+ const __m128i col_dup32_lolo =
+ _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+ writefn(dst, stride, col_dup32_lolo);
+ dst += stride4;
+ const __m128i col_dup32_lohi =
+ _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+ writefn(dst, stride, col_dup32_lohi);
+ dst += stride4;
+ const __m128i col_dup32_hilo =
+ _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hilo);
+ dst += stride4;
+ const __m128i col_dup32_hihi =
+ _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hihi);
+ dst += stride4;
+ }
+}
+
+struct DcDefs {
+ DcDefs() = delete;
+
+ using _4x4 = DcPredFuncs_SSE4_1<2, 2, DcSum4_SSE4_1, DcSum4_SSE4_1,
+ DcStore4xH_SSE4_1<4>, 0, 0>;
+  // shiftk is the smaller of width_log2 and height_log2.
+  // dc_mult is the 16-bit fixed-point reciprocal of 3 or 5, matching whether
+  // the larger block dimension is 2x or 4x the smaller one.
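+  // For example, a 4x8 block sums 12 edge pixels; Dc() adds the rounder
+  // (1 << 1) + (1 << 2) = 6 and then DivideByMultiplyShift_U32<2,
+  // kThreeInverse> approximates division by 12.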
+ using _4x8 = DcPredFuncs_SSE4_1<2, 3, DcSum4_SSE4_1, DcSum8_SSE4_1,
+ DcStore4xH_SSE4_1<8>, 2, kThreeInverse>;
+ using _4x16 = DcPredFuncs_SSE4_1<2, 4, DcSum4_SSE4_1, DcSum16_SSE4_1,
+ DcStore4xH_SSE4_1<16>, 2, kFiveInverse>;
+
+ using _8x4 = DcPredFuncs_SSE4_1<3, 2, DcSum8_SSE4_1, DcSum4_SSE4_1,
+ DcStore8xH_SSE4_1<4>, 2, kThreeInverse>;
+ using _8x8 = DcPredFuncs_SSE4_1<3, 3, DcSum8_SSE4_1, DcSum8_SSE4_1,
+ DcStore8xH_SSE4_1<8>, 0, 0>;
+ using _8x16 = DcPredFuncs_SSE4_1<3, 4, DcSum8_SSE4_1, DcSum16_SSE4_1,
+ DcStore8xH_SSE4_1<16>, 3, kThreeInverse>;
+ using _8x32 = DcPredFuncs_SSE4_1<3, 5, DcSum8_SSE4_1, DcSum32_SSE4_1,
+ DcStore8xH_SSE4_1<32>, 3, kFiveInverse>;
+
+ using _16x4 = DcPredFuncs_SSE4_1<4, 2, DcSum16_SSE4_1, DcSum4_SSE4_1,
+ DcStore16xH_SSE4_1<4>, 2, kFiveInverse>;
+ using _16x8 = DcPredFuncs_SSE4_1<4, 3, DcSum16_SSE4_1, DcSum8_SSE4_1,
+ DcStore16xH_SSE4_1<8>, 3, kThreeInverse>;
+ using _16x16 = DcPredFuncs_SSE4_1<4, 4, DcSum16_SSE4_1, DcSum16_SSE4_1,
+ DcStore16xH_SSE4_1<16>, 0, 0>;
+ using _16x32 = DcPredFuncs_SSE4_1<4, 5, DcSum16_SSE4_1, DcSum32_SSE4_1,
+ DcStore16xH_SSE4_1<32>, 4, kThreeInverse>;
+ using _16x64 = DcPredFuncs_SSE4_1<4, 6, DcSum16_SSE4_1, DcSum64_SSE4_1,
+ DcStore16xH_SSE4_1<64>, 4, kFiveInverse>;
+
+ using _32x8 = DcPredFuncs_SSE4_1<5, 3, DcSum32_SSE4_1, DcSum8_SSE4_1,
+ DcStore32xH_SSE4_1<8>, 3, kFiveInverse>;
+ using _32x16 = DcPredFuncs_SSE4_1<5, 4, DcSum32_SSE4_1, DcSum16_SSE4_1,
+ DcStore32xH_SSE4_1<16>, 4, kThreeInverse>;
+ using _32x32 = DcPredFuncs_SSE4_1<5, 5, DcSum32_SSE4_1, DcSum32_SSE4_1,
+ DcStore32xH_SSE4_1<32>, 0, 0>;
+ using _32x64 = DcPredFuncs_SSE4_1<5, 6, DcSum32_SSE4_1, DcSum64_SSE4_1,
+ DcStore32xH_SSE4_1<64>, 5, kThreeInverse>;
+
+ using _64x16 = DcPredFuncs_SSE4_1<6, 4, DcSum64_SSE4_1, DcSum16_SSE4_1,
+ DcStore64xH_SSE4_1<16>, 4, kFiveInverse>;
+ using _64x32 = DcPredFuncs_SSE4_1<6, 5, DcSum64_SSE4_1, DcSum32_SSE4_1,
+ DcStore64xH_SSE4_1<32>, 5, kThreeInverse>;
+ using _64x64 = DcPredFuncs_SSE4_1<6, 6, DcSum64_SSE4_1, DcSum64_SSE4_1,
+ DcStore64xH_SSE4_1<64>, 0, 0>;
+};
+
+struct DirDefs {
+ DirDefs() = delete;
+
+ using _4x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate4x4>>;
+ using _4x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate4x4>>;
+ using _4x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate4x4>>;
+ using _8x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate8x4>>;
+ using _8x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate8x4>>;
+ using _8x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate8x4>>;
+ using _8x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate8x4>>;
+ using _16x4 =
+ DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate16x4>>;
+ using _16x8 =
+ DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate16x4>>;
+ using _16x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate16x4>>;
+ using _16x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate16x4>>;
+ using _16x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate16x4>>;
+ using _32x8 =
+ DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate32x4>>;
+ using _32x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate32x4>>;
+ using _32x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate32x4>>;
+ using _32x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate32x4>>;
+ using _64x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate64x4>>;
+ using _64x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate64x4>>;
+ using _64x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate64x4>>;
+};
+
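+// Scalar form of the Paeth selection implemented below: with
+// base = left + top - top_left, choose left when |base - left| <= |base - top|
+// and |base - left| <= |base - top_left|, otherwise top when
+// |base - top| <= |base - top_left|, otherwise top_left. In these functions
+// |left_dists| holds |base - left| = |top - top_left|, |top_dists| holds
+// |base - top| = |left - top_left|, and |top_left_diffs| holds
+// top - 2 * top_left, so |left + top_left_diffs| = |base - top_left|.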
+template <int y_mask>
+inline void WritePaethLine4(uint8_t* dst, const __m128i& top,
+ const __m128i& left, const __m128i& top_lefts,
+ const __m128i& top_dists, const __m128i& left_dists,
+ const __m128i& top_left_diffs) {
+ const __m128i top_dists_y = _mm_shuffle_epi32(top_dists, y_mask);
+
+ const __m128i lefts_y = _mm_shuffle_epi32(left, y_mask);
+ const __m128i top_left_dists =
+ _mm_abs_epi32(_mm_add_epi32(lefts_y, top_left_diffs));
+
+ // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
+ // operation is unavailable, so the logic for selecting top, left, or
+ // top_left is inverted.
+ __m128i not_select_left = _mm_cmpgt_epi32(left_dists, top_left_dists);
+ not_select_left =
+ _mm_or_si128(not_select_left, _mm_cmpgt_epi32(left_dists, top_dists_y));
+ const __m128i not_select_top = _mm_cmpgt_epi32(top_dists_y, top_left_dists);
+
+ const __m128i left_out = _mm_andnot_si128(not_select_left, lefts_y);
+
+ const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
+ __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
+ top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
+ top_or_top_left_out = _mm_and_si128(not_select_left, top_or_top_left_out);
+
+ // The sequence of 32-bit packed operations was found (see CL via blame) to
+ // outperform 16-bit operations, despite the availability of the packus
+ // function, when tested on a Xeon E7 v3.
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ const __m128i pred = _mm_shuffle_epi8(
+ _mm_or_si128(left_out, top_or_top_left_out), cvtepi32_epi8);
+ Store4(dst, pred);
+}
+
+// top_left_diffs is the only variable whose ints may exceed 8 bits. Otherwise
+// we would be able to do all of these operations as epi8 for a 16-pixel version
+// of this function. Still, since lefts_y is just a vector of duplicates, it
+// could pay off to accommodate top_left_dists for cmpgt, and repack into epi8
+// for the blends.
+template <int y_mask>
+inline void WritePaethLine8(uint8_t* dst, const __m128i& top,
+ const __m128i& left, const __m128i& top_lefts,
+ const __m128i& top_dists, const __m128i& left_dists,
+ const __m128i& top_left_diffs) {
+ const __m128i select_y = _mm_set1_epi32(y_mask);
+ const __m128i top_dists_y = _mm_shuffle_epi8(top_dists, select_y);
+
+ const __m128i lefts_y = _mm_shuffle_epi8(left, select_y);
+ const __m128i top_left_dists =
+ _mm_abs_epi16(_mm_add_epi16(lefts_y, top_left_diffs));
+
+ // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
+ // operation is unavailable, so the logic for selecting top, left, or
+ // top_left is inverted.
+ __m128i not_select_left = _mm_cmpgt_epi16(left_dists, top_left_dists);
+ not_select_left =
+ _mm_or_si128(not_select_left, _mm_cmpgt_epi16(left_dists, top_dists_y));
+ const __m128i not_select_top = _mm_cmpgt_epi16(top_dists_y, top_left_dists);
+
+ const __m128i left_out = _mm_andnot_si128(not_select_left, lefts_y);
+
+ const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
+ __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
+ top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
+ top_or_top_left_out = _mm_and_si128(not_select_left, top_or_top_left_out);
+
+ const __m128i pred = _mm_packus_epi16(
+ _mm_or_si128(left_out, top_or_top_left_out), /* unused */ left_out);
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), pred);
+}
+
+// |top| is an epi8 of length 16
+// |left| is epi8 of unknown length, as y_mask specifies access
+// |top_lefts| is an epi8 of 16 duplicates
+// |top_dists| is an epi8 of unknown length, as y_mask specifies access
+// |left_dists| is an epi8 of length 16
+// |left_dists_lo| is an epi16 of length 8
+// |left_dists_hi| is an epi16 of length 8
+// |top_left_diffs_lo| is an epi16 of length 8
+// |top_left_diffs_hi| is an epi16 of length 8
+// The latter two vectors are epi16 because their values may reach -510.
+// |left_dists| is provided alongside its spread out version because it doesn't
+// change between calls and interacts with both kinds of packing.
+template <int y_mask>
+inline void WritePaethLine16(uint8_t* dst, const __m128i& top,
+ const __m128i& left, const __m128i& top_lefts,
+ const __m128i& top_dists,
+ const __m128i& left_dists,
+ const __m128i& left_dists_lo,
+ const __m128i& left_dists_hi,
+ const __m128i& top_left_diffs_lo,
+ const __m128i& top_left_diffs_hi) {
+ const __m128i select_y = _mm_set1_epi32(y_mask);
+ const __m128i top_dists_y8 = _mm_shuffle_epi8(top_dists, select_y);
+ const __m128i top_dists_y16 = _mm_cvtepu8_epi16(top_dists_y8);
+ const __m128i lefts_y8 = _mm_shuffle_epi8(left, select_y);
+ const __m128i lefts_y16 = _mm_cvtepu8_epi16(lefts_y8);
+
+ const __m128i top_left_dists_lo =
+ _mm_abs_epi16(_mm_add_epi16(lefts_y16, top_left_diffs_lo));
+ const __m128i top_left_dists_hi =
+ _mm_abs_epi16(_mm_add_epi16(lefts_y16, top_left_diffs_hi));
+
+ const __m128i left_gt_top_left_lo = _mm_packs_epi16(
+ _mm_cmpgt_epi16(left_dists_lo, top_left_dists_lo), left_dists_lo);
+ const __m128i left_gt_top_left_hi =
+ _mm_packs_epi16(_mm_cmpgt_epi16(left_dists_hi, top_left_dists_hi),
+ /* unused second arg for pack */ left_dists_hi);
+ const __m128i left_gt_top_left = _mm_alignr_epi8(
+ left_gt_top_left_hi, _mm_slli_si128(left_gt_top_left_lo, 8), 8);
+
+ const __m128i not_select_top_lo =
+ _mm_packs_epi16(_mm_cmpgt_epi16(top_dists_y16, top_left_dists_lo),
+ /* unused second arg for pack */ top_dists_y16);
+ const __m128i not_select_top_hi =
+ _mm_packs_epi16(_mm_cmpgt_epi16(top_dists_y16, top_left_dists_hi),
+ /* unused second arg for pack */ top_dists_y16);
+ const __m128i not_select_top = _mm_alignr_epi8(
+ not_select_top_hi, _mm_slli_si128(not_select_top_lo, 8), 8);
+
+ const __m128i left_leq_top =
+ _mm_cmpeq_epi8(left_dists, _mm_min_epu8(top_dists_y8, left_dists));
+ const __m128i select_left = _mm_andnot_si128(left_gt_top_left, left_leq_top);
+
+ // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
+ // operation is unavailable, so the logic for selecting top, left, or
+ // top_left is inverted.
+ const __m128i left_out = _mm_and_si128(select_left, lefts_y8);
+
+ const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
+ __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
+ top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
+ top_or_top_left_out = _mm_andnot_si128(select_left, top_or_top_left_out);
+ const __m128i pred = _mm_or_si128(left_out, top_or_top_left_out);
+
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), pred);
+}
+
+void Paeth4x4_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row, const void* const left_column) {
+ const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+ const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
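+ // pTopLeft = abs(base - top[-1]) = abs(top[x] + left[y] - 2*top[-1]), which
+ // is computed below via top_left_diff = top[x] - 2*top[-1].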
+ const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
+ const __m128i top_dists = _mm_abs_epi32(_mm_sub_epi32(left, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine4<0>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+}
+
+void Paeth4x8_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row, const void* const left_column) {
+ const __m128i left = LoadLo8(left_column);
+ const __m128i left_lo = _mm_cvtepu8_epi32(left);
+ const __m128i left_hi = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
+
+ const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+ const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
+ const __m128i top_dists_lo = _mm_abs_epi32(_mm_sub_epi32(left_lo, top_lefts));
+ const __m128i top_dists_hi = _mm_abs_epi32(_mm_sub_epi32(left_hi, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine4<0>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+ top_left_diff);
+}
+
+void Paeth4x16_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left = LoadUnaligned16(left_column);
+ const __m128i left_0 = _mm_cvtepu8_epi32(left);
+ const __m128i left_1 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
+ const __m128i left_2 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 8));
+ const __m128i left_3 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 12));
+
+ const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+ const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
+ const __m128i top_dists_0 = _mm_abs_epi32(_mm_sub_epi32(left_0, top_lefts));
+ const __m128i top_dists_1 = _mm_abs_epi32(_mm_sub_epi32(left_1, top_lefts));
+ const __m128i top_dists_2 = _mm_abs_epi32(_mm_sub_epi32(left_2, top_lefts));
+ const __m128i top_dists_3 = _mm_abs_epi32(_mm_sub_epi32(left_3, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine4<0>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
+ top_left_diff);
+}
+
+void Paeth8x4_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row, const void* const left_column) {
+ const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+ const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
+ const __m128i top_dists = _mm_abs_epi16(_mm_sub_epi16(left, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine8<0x01000100>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x03020302>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x05040504>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x07060706>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+}
+
+void Paeth8x8_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row, const void* const left_column) {
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+ const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
+ const __m128i top_dists = _mm_abs_epi16(_mm_sub_epi16(left, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine8<0x01000100>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x03020302>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x05040504>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x07060706>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x09080908>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0B0A0B0A>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0D0C0D0C>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0F0E0F0E>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+}
+
+void Paeth8x16_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left = LoadUnaligned16(left_column);
+ const __m128i left_lo = _mm_cvtepu8_epi16(left);
+ const __m128i left_hi = _mm_cvtepu8_epi16(_mm_srli_si128(left, 8));
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+ const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
+ const __m128i top_dists_lo = _mm_abs_epi16(_mm_sub_epi16(left_lo, top_lefts));
+ const __m128i top_dists_hi = _mm_abs_epi16(_mm_sub_epi16(left_hi, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine8<0x01000100>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x03020302>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x05040504>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x07060706>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x09080908>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0B0A0B0A>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0D0C0D0C>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0F0E0F0E>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x01000100>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x03020302>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x05040504>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x07060706>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x09080908>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0B0A0B0A>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0D0C0D0C>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0F0E0F0E>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+}
+
+void Paeth8x32_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* const dst = static_cast<uint8_t*>(dest);
+ Paeth8x16_SSE4_1(dst, stride, top_row, left_column);
+ Paeth8x16_SSE4_1(dst + (stride << 4), stride, top_row, left_ptr + 16);
+}
+
+void Paeth16x4_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left = Load4(left_column);
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts16 = _mm_set1_epi16(top_ptr[-1]);
+ const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_ptr[-1]));
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+
+ const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
+ _mm_subs_epu8(top_lefts8, top));
+ const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
+ const __m128i left_dists_hi =
+ _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
+ const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
+ _mm_subs_epu8(top_lefts8, left));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
+ const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
+ const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
+ left_dists_lo, left_dists_hi, top_left_diff_lo,
+ top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+}
+
+// Inlined for calling with offsets in larger transform sizes, mainly to
+// preserve top_left.
+inline void WritePaeth16x8(void* const dest, ptrdiff_t stride,
+ const uint8_t top_left, const __m128i top,
+ const __m128i left) {
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+ const __m128i top_lefts16 = _mm_set1_epi16(top_left);
+ const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_left));
+
+ // Given that the spec defines "base" as top[x] + left[y] - top_left,
+ // pLeft = abs(base - left[y]) = abs(top[x] - top_left)
+ // pTop = abs(base - top[x]) = abs(left[y] - top_left)
+
+ const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
+ _mm_subs_epu8(top_lefts8, top));
+ const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
+ const __m128i left_dists_hi =
+ _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
+ const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
+ _mm_subs_epu8(top_lefts8, left));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
+ const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
+ const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
+ left_dists_lo, left_dists_hi, top_left_diff_lo,
+ top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x04040404>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x05050505>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x06060606>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x07070707>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+}
+
+void Paeth16x8_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i left = LoadLo8(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ WritePaeth16x8(static_cast<uint8_t*>(dest), stride, top_ptr[-1], top, left);
+}
+
+void WritePaeth16x16(void* const dest, ptrdiff_t stride, const uint8_t top_left,
+ const __m128i top, const __m128i left) {
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+ const __m128i top_lefts16 = _mm_set1_epi16(top_left);
+ const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_left));
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+
+ const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
+ _mm_subs_epu8(top_lefts8, top));
+ const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
+ const __m128i left_dists_hi =
+ _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
+ const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
+ _mm_subs_epu8(top_lefts8, left));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
+ const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
+ const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
+ left_dists_lo, left_dists_hi, top_left_diff_lo,
+ top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x04040404>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x05050505>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x06060606>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x07070707>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x08080808>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x09090909>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0A0A0A0A>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0B0B0B0B>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0C0C0C0C>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0D0D0D0D>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0E0E0E0E>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0F0F0F0F>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+}
+
+void Paeth16x16_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left = LoadUnaligned16(left_column);
+ const __m128i top = LoadUnaligned16(top_row);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ WritePaeth16x16(static_cast<uint8_t*>(dest), stride, top_ptr[-1], top, left);
+}
+
+void Paeth16x32_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left_0 = LoadUnaligned16(left_column);
+ const __m128i top = LoadUnaligned16(top_row);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const uint8_t top_left = top_ptr[-1];
+ auto* const dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top, left_0);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ WritePaeth16x16(dst + (stride << 4), stride, top_left, top, left_1);
+}
+
+void Paeth16x64_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const ptrdiff_t stride16 = stride << 4;
+ const __m128i left_0 = LoadUnaligned16(left_column);
+ const __m128i top = LoadUnaligned16(top_row);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top, left_0);
+ dst += stride16;
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ WritePaeth16x16(dst, stride, top_left, top, left_1);
+ dst += stride16;
+ const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
+ WritePaeth16x16(dst, stride, top_left, top, left_2);
+ dst += stride16;
+ const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
+ WritePaeth16x16(dst, stride, top_left, top, left_3);
+}
+
+void Paeth32x8_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left = LoadLo8(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_row);
+ const uint8_t top_left = top_ptr[-1];
+ auto* const dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x8(dst, stride, top_left, top_0, left);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ WritePaeth16x8(dst + 16, stride, top_left, top_1, left);
+}
+
+void Paeth32x16_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left = LoadUnaligned16(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_row);
+ const uint8_t top_left = top_ptr[-1];
+ auto* const dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left);
+}
+
+void Paeth32x32_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_0 = LoadUnaligned16(left_ptr);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_ptr);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+}
+
+void Paeth32x64_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_0 = LoadUnaligned16(left_ptr);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_ptr);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
+ const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_2);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_2);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_3);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3);
+}
+
+void Paeth64x16_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left = LoadUnaligned16(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_ptr);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left);
+}
+
+void Paeth64x32_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_0 = LoadUnaligned16(left_ptr);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_ptr);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_0);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_0);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_1);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1);
+}
+
+void Paeth64x64_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_0 = LoadUnaligned16(left_ptr);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
+ const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_ptr);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_0);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_0);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_1);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_2);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_2);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_2);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_2);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_3);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_3);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_3);
+}
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
+// upsampling is ruled out. In addition, the bits masked by 0x3F for
+// |shift_val| are 0 for all multiples of 64, so the formula
+// val = top[top_base_x]*(32-shift) + top[top_base_x+1]*shift reduces to
+// val = top[top_base_x] << 5, meaning each row is an unweighted copy of |top|.
+// Since |top_base_x| starts at 1 for the first row, |top| is offset by 1.
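+// For example, for row y the caller's top_x is (y + 1) * 64, so top_base_x is
+// y + 1 and the row copies top[y + 1 .. y + width].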
+inline void DirectionalZone1_Step64(uint8_t* dst, ptrdiff_t stride,
+ const uint8_t* const top, const int width,
+ const int height) {
+ ptrdiff_t offset = 1;
+ if (height == 4) {
+ memcpy(dst, top + offset, width);
+ dst += stride;
+ memcpy(dst, top + offset + 1, width);
+ dst += stride;
+ memcpy(dst, top + offset + 2, width);
+ dst += stride;
+ memcpy(dst, top + offset + 3, width);
+ return;
+ }
+ int y = 0;
+ do {
+ memcpy(dst, top + offset, width);
+ dst += stride;
+ memcpy(dst, top + offset + 1, width);
+ dst += stride;
+ memcpy(dst, top + offset + 2, width);
+ dst += stride;
+ memcpy(dst, top + offset + 3, width);
+ dst += stride;
+ memcpy(dst, top + offset + 4, width);
+ dst += stride;
+ memcpy(dst, top + offset + 5, width);
+ dst += stride;
+ memcpy(dst, top + offset + 6, width);
+ dst += stride;
+ memcpy(dst, top + offset + 7, width);
+ dst += stride;
+
+ offset += 8;
+ y += 8;
+ } while (y < height);
+}
+
+inline void DirectionalZone1_4xH(uint8_t* dst, ptrdiff_t stride,
+ const uint8_t* const top, const int height,
+ const int xstep, const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const int rounding_bits = 5;
+ const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
+ const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
+ const __m128i sampler = upsampled ? _mm_set_epi64x(0, 0x0706050403020100)
+ : _mm_set_epi64x(0, 0x0403030202010100);
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+ // not supported for packed integers.
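+ // That is, (top_base_x + i + 1 > max_base_x) is equivalent to
+ // (top_base_x + i >= max_base_x) for these integer indices.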
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ // All rows from |min_corner_only_y| down are filled with the corner pixel
+ // via memset. |max_base_x| is always greater than |height|, so clamping
+ // |xstep_units| to at least 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+ // Rows up to |min_corner_only_y| still need the |max_base_x| check inside
+ // the loop below; the remaining rows are corner-only.
+ int y = 0;
+ int top_x = xstep;
+
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ const int top_base_x = top_x >> scale_bits;
+
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+
+ // Load 8 values because we will select the sampled values based on
+ // |upsampled|.
+ const __m128i values = LoadLo8(top + top_base_x);
+ const __m128i sampled_values = _mm_shuffle_epi8(values, sampler);
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ __m128i prod = _mm_maddubs_epi16(sampled_values, shifts);
+ prod = RightShiftWithRounding_U16(prod, rounding_bits);
+ // Replace pixels from invalid range with top-right corner.
+ prod = _mm_blendv_epi8(prod, final_top_val, past_max);
+ Store4(dst, _mm_packus_epi16(prod, prod));
+ }
+
+ // Fill in corner-only rows.
+ for (; y < height; ++y) {
+ memset(dst, top[max_base_x], /* width */ 4);
+ dst += stride;
+ }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_Large(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const int width, const int height,
+ const int xstep, const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const __m128i sampler =
+ upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ const int scale_bits = 6 - upsample_shift;
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const int rounding_bits = 5;
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+
+ // All rows from |min_corner_only_y| down are filled with the corner pixel
+ // via memset. |max_base_x| is always greater than |height|, so clamping
+ // |xstep_units| to at least 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+ // Rows up to this y-value can be computed without checking for bounds.
+ const int max_no_corner_y = std::min(
+ LeftShift((max_base_x - (base_step * width)), scale_bits) / xstep,
+ height);
+ // No need to check for exceeding |max_base_x| in the first loop.
+ int y = 0;
+ int top_x = xstep;
+ for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
+ int top_base_x = top_x >> scale_bits;
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ int x = 0;
+ do {
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ }
+
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+ // not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+ const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+ const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+ for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+ int top_base_x = top_x >> scale_bits;
+
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+ int x = 0;
+ const int min_corner_only_x =
+ std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
+ for (; x < min_corner_only_x;
+ x += 8, top_base_x += base_step8,
+ top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
+ // reading out of bounds. If all indices are past max and we don't need to
+ // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
+ // reset for the next |y|.
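+ // (|past_max| lanes are 0xFFFF when past the limit, so once the first lanes
+ // pass it the low 32 bits are all ones and the AND-NOT below clears
+ // |top_base_x|.)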
+ top_base_x &= ~_mm_cvtsi128_si32(past_max);
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ }
+ // Corner-only section of the row.
+ memset(dest + x, top_row[max_base_x], width - x);
+ }
+ // Fill in corner-only rows.
+ for (; y < height; ++y) {
+ memset(dest, top_row[max_base_x], width);
+ dest += stride;
+ }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_SSE4_1(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const int width, const int height,
+ const int xstep, const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ if (xstep == 64) {
+ DirectionalZone1_Step64(dest, stride, top_row, width, height);
+ return;
+ }
+ if (width == 4) {
+ DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled);
+ return;
+ }
+ if (width >= 32) {
+ DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
+ upsampled);
+ return;
+ }
+ const __m128i sampler =
+ upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ const int scale_bits = 6 - upsample_shift;
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const int rounding_bits = 5;
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+
+ // No need to check for exceeding |max_base_x| in the loops.
+ if (((xstep * height) >> scale_bits) + base_step * width < max_base_x) {
+ int top_x = xstep;
+ int y = 0;
+ do {
+ int top_base_x = top_x >> scale_bits;
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ int x = 0;
+ do {
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ dest += stride;
+ top_x += xstep;
+ } while (++y < height);
+ return;
+ }
+
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+ // not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+ const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+ const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+ int top_x = xstep;
+ int y = 0;
+ do {
+ int top_base_x = top_x >> scale_bits;
+
+ if (top_base_x >= max_base_x) {
+ for (int i = y; i < height; ++i) {
+ memset(dest, top_row[max_base_x], width);
+ dest += stride;
+ }
+ return;
+ }
+
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+ int x = 0;
+ for (; x < width - 8;
+ x += 8, top_base_x += base_step8,
+ top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
+ // reading out of bounds. If all indices are past max and we don't need to
+ // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
+ // reset for the next |y|.
+ top_base_x &= ~_mm_cvtsi128_si32(past_max);
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ }
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ __m128i vals;
+ if (upsampled) {
+ vals = LoadUnaligned16(top_row + top_base_x);
+ } else {
+ const __m128i top_vals = LoadLo8(top_row + top_base_x);
+ vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_insert_epi8(vals, top_row[top_base_x + 8], 15);
+ }
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ dest += stride;
+ top_x += xstep;
+ } while (++y < height);
+}
+
+void DirectionalIntraPredictorZone1_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const int width, const int height,
+ const int xstep,
+ const bool upsampled_top) {
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ DirectionalZone1_SSE4_1(dst, stride, top_ptr, width, height, xstep,
+ upsampled_top);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4x4(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const left_column,
+ const int base_left_y, const int ystep) {
+ // For use in the non-upsampled case.
+ const __m128i sampler = _mm_set_epi64x(0, 0x0403030202010100);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const int rounding_bits = 5;
+
+ __m128i result_block[4];
+ for (int x = 0, left_y = base_left_y; x < 4; x++, left_y += ystep) {
+ const int left_base_y = left_y >> scale_bits;
+ const int shift_val = ((left_y << upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i vals;
+ if (upsampled) {
+ vals = LoadLo8(left_column + left_base_y);
+ } else {
+ const __m128i top_vals = LoadLo8(left_column + left_base_y);
+ vals = _mm_shuffle_epi8(top_vals, sampler);
+ }
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ result_block[x] = _mm_packus_epi16(vals, vals);
+ }
+ const __m128i result = Transpose4x4_U8(result_block);
+ // This is result_row0.
+ Store4(dest, result);
+ dest += stride;
+ const int result_row1 = _mm_extract_epi32(result, 1);
+ memcpy(dest, &result_row1, sizeof(result_row1));
+ dest += stride;
+ const int result_row2 = _mm_extract_epi32(result, 2);
+ memcpy(dest, &result_row2, sizeof(result_row2));
+ dest += stride;
+ const int result_row3 = _mm_extract_epi32(result, 3);
+ memcpy(dest, &result_row3, sizeof(result_row3));
+}
+
+template <bool upsampled, int height>
+inline void DirectionalZone3_8xH(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const left_column,
+ const int base_left_y, const int ystep) {
+ // For use in the non-upsampled case.
+ const __m128i sampler =
+ _mm_set_epi64x(0x0807070606050504, 0x0403030202010100);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const int rounding_bits = 5;
+
+ __m128i result_block[8];
+ for (int x = 0, left_y = base_left_y; x < 8; x++, left_y += ystep) {
+ const int left_base_y = left_y >> scale_bits;
+ const int shift_val = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i vals;
+ if (upsampled) {
+ vals = LoadUnaligned16(left_column + left_base_y);
+ } else {
+ const __m128i top_vals = LoadUnaligned16(left_column + left_base_y);
+ vals = _mm_shuffle_epi8(top_vals, sampler);
+ }
+ vals = _mm_maddubs_epi16(vals, shifts);
+ result_block[x] = RightShiftWithRounding_U16(vals, rounding_bits);
+ }
+ Transpose8x8_U16(result_block, result_block);
+ for (int y = 0; y < height; ++y) {
+ StoreLo8(dest, _mm_packus_epi16(result_block[y], result_block[y]));
+ dest += stride;
+ }
+}
+
+// 7.11.2.4 (9) angle > 180
+void DirectionalIntraPredictorZone3_SSE4_1(void* dest, ptrdiff_t stride,
+ const void* const left_column,
+ const int width, const int height,
+ const int ystep,
+ const bool upsampled) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int upsample_shift = static_cast<int>(upsampled);
+ if (width == 4 || height == 4) {
+ const ptrdiff_t stride4 = stride << 2;
+ if (upsampled) {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_4x4<true>(
+ dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+ dst_x += stride4;
+ y += 4;
+ } while (y < height);
+ left_y += ystep << 2;
+ x += 4;
+ } while (x < width);
+ } else {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_4x4<false>(dst_x, stride, left_ptr + y, left_y,
+ ystep);
+ dst_x += stride4;
+ y += 4;
+ } while (y < height);
+ left_y += ystep << 2;
+ x += 4;
+ } while (x < width);
+ }
+ return;
+ }
+
+ const ptrdiff_t stride8 = stride << 3;
+ if (upsampled) {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_8xH<true, 8>(
+ dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+ dst_x += stride8;
+ y += 8;
+ } while (y < height);
+ left_y += ystep << 3;
+ x += 8;
+ } while (x < width);
+ } else {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_8xH<false, 8>(
+ dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+ dst_x += stride8;
+ y += 8;
+ } while (y < height);
+ left_y += ystep << 3;
+ x += 8;
+ } while (x < width);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Directional Zone 2 Functions
+// 7.11.2.4 (8)
+
+// DirectionalBlend* selectively overwrites the values written by
+// DirectionalZone2FromLeftCol*. |zone_bounds| has one 16-bit index for each
+// row.
+template <int y_selector>
+inline void DirectionalBlend4_SSE4_1(uint8_t* dest,
+ const __m128i& dest_index_vect,
+ const __m128i& vals,
+ const __m128i& zone_bounds) {
+ const __m128i max_dest_x_vect = _mm_shufflelo_epi16(zone_bounds, y_selector);
+ const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
+ const __m128i original_vals = _mm_cvtepu8_epi16(Load4(dest));
+ const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
+ Store4(dest, _mm_packus_epi16(blended_vals, blended_vals));
+}
+
+inline void DirectionalBlend8_SSE4_1(uint8_t* dest,
+ const __m128i& dest_index_vect,
+ const __m128i& vals,
+ const __m128i& zone_bounds,
+ const __m128i& bounds_selector) {
+ const __m128i max_dest_x_vect =
+ _mm_shuffle_epi8(zone_bounds, bounds_selector);
+ const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
+ const __m128i original_vals = _mm_cvtepu8_epi16(LoadLo8(dest));
+ const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
+ StoreLo8(dest, _mm_packus_epi16(blended_vals, blended_vals));
+}
+
+constexpr int kDirectionalWeightBits = 5;
+// |source| is packed with 4 or 8 pairs of 8-bit values from left or top.
+// |shifts| is named to match the specification, with 4 or 8 pairs of (32 -
+// shift) and shift. Shift is guaranteed to be between 0 and 32.
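+// For each output pixel the shuffle selects an adjacent pair of source bytes,
+// so the maddubs computes
+//   val = RightShiftWithRounding(source[k]*(32 - shift) + source[k+1]*shift, 5)
+// which is the weighted blend used by the directional prediction formula.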
+inline __m128i DirectionalZone2FromSource_SSE4_1(const uint8_t* const source,
+ const __m128i& shifts,
+ const __m128i& sampler) {
+ const __m128i src_vals = LoadUnaligned16(source);
+ __m128i vals = _mm_shuffle_epi8(src_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ return RightShiftWithRounding_U16(vals, kDirectionalWeightBits);
+}
+
+// Because the source values "move backwards" as the row index increases, the
+// indices derived from ystep are generally negative. This is accommodated by
+// making sure the relative indices are within [-15, 0] when the function is
+// called, and sliding them into the inclusive range [0, 15], relative to a
+// lower base address.
+constexpr int kPositiveIndexOffset = 15;
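+// For example, a relative index of -3 becomes shuffle index 12 after the
+// offset is added, while the source pointer is moved back by 15 bytes to
+// compensate.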
+
+template <bool upsampled>
+inline void DirectionalZone2FromLeftCol_4x4_SSE4_1(
+ uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column_base,
+ __m128i left_y) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shifts = _mm_set1_epi8(32);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+ const __m128i index_increment = _mm_cvtsi32_si128(0x01010101);
+ const __m128i positive_offset = _mm_set1_epi8(kPositiveIndexOffset);
+ // |left_column| and |sampler| are both offset by 15 so the shuffle indices
+ // are always nonnegative.
+ const uint8_t* left_column = left_column_base - kPositiveIndexOffset;
+ for (int y = 0; y < 4; dst += stride, ++y) {
+ __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
+ offset_y = _mm_packs_epi16(offset_y, offset_y);
+
+ const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
+ __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
+ // Slide valid |offset_y| indices from range [-15, 0] to [0, 15] so they
+ // can work as shuffle indices. Some values may be out of bounds, but their
+ // pred results will be masked over by top prediction.
+ sampler = _mm_add_epi8(sampler, positive_offset);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+ left_column + (y << upsample_shift), shifts, sampler);
+ Store4(dst, _mm_packus_epi16(vals, vals));
+ }
+}
+
+// The height at which a load of 16 bytes will not contain enough source pixels
+// from |left_column| to supply an accurate row when computing 8 pixels at a
+// time. The values are found by inspection. By coincidence, all angles that
+// share a value of ystep >> 6 map to the same limit, so it is enough to look
+// up by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
+constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
+ 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
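+
+// Typical use (see DirectionalZone2_SSE4_1 below): the shuffle-based left
+// column path is only taken while
+//   y < std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]);
+// past that point the bounds-safe DirectionalZone3_8xH path is used instead.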
+
+template <bool upsampled>
+inline void DirectionalZone2FromLeftCol_8x8_SSE4_1(
+ uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column,
+ __m128i left_y) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shifts = _mm_set1_epi8(32);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+ const __m128i index_increment = _mm_set1_epi8(1);
+ const __m128i denegation = _mm_set1_epi8(kPositiveIndexOffset);
+ for (int y = 0; y < 8; dst += stride, ++y) {
+ __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
+ offset_y = _mm_packs_epi16(offset_y, offset_y);
+ const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
+
+ // Offset the relative index because ystep is negative in Zone 2 and shuffle
+ // indices must be nonnegative.
+ __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
+ sampler = _mm_add_epi8(sampler, denegation);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+
+ // The specification adds (y << 6) to left_y, which is subject to
+ // upsampling, but this puts sampler indices out of the 0-15 range. It is
+ // equivalent to offsetting the source address by (y << upsample_shift)
+ // instead.
+ const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+ left_column - kPositiveIndexOffset + (y << upsample_shift), shifts,
+ sampler);
+ StoreLo8(dst, _mm_packus_epi16(vals, vals));
+ }
+}
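+
+// In both FromLeftCol helpers above, |offset_y| supplies the integer part of
+// the left-column index for each lane and |shifts| supplies the fractional
+// weight pair (32 - shift, shift), so each output pixel is an interpolation
+// of two vertically adjacent left-column pixels, computed by
+// DirectionalZone2FromSource_SSE4_1.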
+
+// |zone_bounds| is an epi16 vector of the relative x index at which
+// base >= -(1 << upsampled_top), for each row. When there are 4 values, they
+// can be duplicated with a non-register (immediate) shuffle mask.
+// |shifts| is one pair of weights that applies throughout a given row.
+template <bool upsampled_top>
+inline void DirectionalZone1Blend_4x4(
+ uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
+ __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
+ const __m128i& dest_index_x, int top_x, const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled_top);
+ const int scale_bits_x = 6 - upsample_shift;
+ top_x -= xstep;
+
+ int top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals0 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x00), sampler);
+ DirectionalBlend4_SSE4_1<0x00>(dest, dest_index_x, vals0, zone_bounds);
+ top_x -= xstep;
+ dest += stride;
+
+ top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals1 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x55), sampler);
+ DirectionalBlend4_SSE4_1<0x55>(dest, dest_index_x, vals1, zone_bounds);
+ top_x -= xstep;
+ dest += stride;
+
+ top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals2 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xAA), sampler);
+ DirectionalBlend4_SSE4_1<0xAA>(dest, dest_index_x, vals2, zone_bounds);
+ top_x -= xstep;
+ dest += stride;
+
+ top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals3 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xFF), sampler);
+ DirectionalBlend4_SSE4_1<0xFF>(dest, dest_index_x, vals3, zone_bounds);
+}
+
+template <bool upsampled_top, int height>
+inline void DirectionalZone1Blend_8xH(
+ uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
+ __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
+ const __m128i& dest_index_x, int top_x, const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled_top);
+ const int scale_bits_x = 6 - upsample_shift;
+
+ __m128i y_selector = _mm_set1_epi32(0x01000100);
+ const __m128i index_increment = _mm_set1_epi32(0x02020202);
+ for (int y = 0; y < height; ++y,
+ y_selector = _mm_add_epi8(y_selector, index_increment),
+ dest += stride) {
+ top_x -= xstep;
+ const int top_base_x = top_x >> scale_bits_x;
+ const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shuffle_epi8(shifts, y_selector), sampler);
+ DirectionalBlend8_SSE4_1(dest, dest_index_x, vals, zone_bounds, y_selector);
+ }
+}
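+
+// In both blend helpers above, |zone_bounds| supplies, per row, the x index
+// at which the prediction switches from the left-derived values already in
+// |dest| to the freshly computed top-derived values; _mm_blendv_epi8 keeps
+// the existing pixel wherever dest_index_x < zone_bound.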
+
+// 7.11.2.4 (8) 90 < angle < 180
+// The strategy for this function is to know how many blocks can be processed
+// with just pixels from |top_row|, then handle mixed blocks, then handle only
+// blocks that take from |left_column|. Additionally, a fast index-shuffle
+// approach is used for pred values from |left_column| in sections that permit
+// it.
+template <bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const uint8_t* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const ptrdiff_t stride8 = stride << 3;
+ const __m128i dest_index_x =
+ _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
+ const __m128i sampler_top =
+ upsampled_top
+ ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+ // All columns from |min_top_only_x| to the right will only need |top_row| to
+ // compute. This assumes minimum |xstep| is 3.
+ const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+ // For steep angles, the source pixels from left_column may not fit in a
+ // 16-byte load for shuffling.
+ // TODO(petersonab): Find a more precise formula for this subject to x.
+ const int max_shuffle_height =
+ std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]);
+
+ const int xstep8 = xstep << 3;
+ const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
+ // Accumulate xstep across 8 rows.
+ const __m128i xstep_dup = _mm_set1_epi16(-xstep);
+ const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ const __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
+ // Offsets the original zone bound value to simplify
+ // x < (y + 1) * xstep / 64 - 1.
+ const __m128i scaled_one = _mm_set1_epi16(-64);
+ __m128i xstep_bounds_base =
+ (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
+ : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+ const int ystep8 = ystep << 3;
+ const int left_base_increment8 = ystep8 >> 6;
+ const int ystep_remainder8 = ystep8 & 0x3F;
+ const __m128i increment_left8 = _mm_set1_epi16(-ystep_remainder8);
+
+ // If the 64 scaling is regarded as a decimal point, the first value of the
+ // left_y vector omits the portion which is covered under the left_column
+ // offset. Following values need the full ystep as a relative offset.
+ const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
+ const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+ __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
+ left_y = _mm_add_epi16(ystep_init, left_y);
+
+ const __m128i increment_top8 = _mm_set1_epi16(8 << 6);
+ int x = 0;
+
+ // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
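+ // In outline (illustrative pseudocode only):
+ //   for each 8-wide group of columns x:
+ //     rows [0, max_top_only_y)                   -> top-only (Zone1)
+ //     rows [max_top_only_y, left_shuffle_stop_y) -> shuffled left + blend
+ //     rows [left_shuffle_stop_y, min_left_only_y) -> Zone3 left + blend
+ //     rows [min_left_only_y, height)             -> left-only (Zone3)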
+ for (int left_offset = -left_base_increment; x < min_top_only_x;
+ x += 8,
+ xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
+ // Note that |left_y| keeps accumulating here and can still grow large.
+ left_y = _mm_add_epi16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ uint8_t* dst_x = dst + x;
+
+ // Round down to the nearest multiple of 8.
+ const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
+ DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+ DirectionalZone1_4xH(dst_x + 4, stride,
+ top_row + ((x + 4) << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+ const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+ // All rows from |min_left_only_y| down for this set of columns only need
+ // |left_column| to compute.
+ const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
+ // At high angles such that min_left_only_y < 8, ystep is low and xstep is
+ // high. This means that max_shuffle_height is unbounded and xstep_bounds
+ // will overflow in 16 bits. This is prevented by stopping the first
+ // blending loop at min_left_only_y for such cases, which means we skip over
+ // the second blending loop as well.
+ const int left_shuffle_stop_y =
+ std::min(max_shuffle_height, min_left_only_y);
+ __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+ __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+ int top_x = -xstep_y;
+
+ for (; y < left_shuffle_stop_y;
+ y += 8, dst_x += stride8,
+ xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+ xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+ top_x -= xstep8) {
+ DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+ shift_mask),
+ 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+ DirectionalZone1Blend_8xH<upsampled_top, 8>(
+ dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+ xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+ }
+ // Pick up from the last y-value, using the 10% slower but bounds-safe
+ // method for left prediction.
+ const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
+ for (; y < min_left_only_y;
+ y += 8, dst_x += stride8,
+ xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+ xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+ top_x -= xstep8) {
+ const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+
+ DirectionalZone3_8xH<upsampled_left, 8>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+ shift_mask),
+ 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ DirectionalZone1Blend_8xH<upsampled_top, 8>(
+ dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+ xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+ }
+ // Loop over y for left_only rows.
+ for (; y < height; y += 8, dst_x += stride8) {
+ DirectionalZone3_8xH<upsampled_left, 8>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep);
+ }
+ }
+ for (; x < width; x += 4) {
+ DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
+ height, -xstep, upsampled_top);
+ }
+}
+
+template <bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const uint8_t* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const ptrdiff_t stride4 = stride << 2;
+ const __m128i dest_index_x = _mm_set_epi32(0, 0, 0x00030002, 0x00010000);
+ const __m128i sampler_top =
+ upsampled_top
+ ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ // All columns from |min_top_only_x| to the right will only need |top_row| to
+ // compute.
+ assert(xstep >= 3);
+ const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+ const int xstep4 = xstep << 2;
+ const __m128i xstep4_vect = _mm_set1_epi16(xstep4);
+ const __m128i xstep_dup = _mm_set1_epi16(-xstep);
+ const __m128i increments = _mm_set_epi32(0, 0, 0x00040003, 0x00020001);
+ __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
+ const __m128i scaled_one = _mm_set1_epi16(-64);
+ // Offsets the original zone bound value to simplify
+ // x < (y + 1) * xstep / 64 - 1.
+ __m128i xstep_bounds_base =
+ (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
+ : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+ const int ystep4 = ystep << 2;
+ const int left_base_increment4 = ystep4 >> 6;
+ // This is guaranteed to be less than 64, but accumulation may bring it past
+ // 64 for higher x values.
+ const int ystep_remainder4 = ystep4 & 0x3F;
+ const __m128i increment_left4 = _mm_set1_epi16(-ystep_remainder4);
+ const __m128i increment_top4 = _mm_set1_epi16(4 << 6);
+
+ // If the 64 scaling is regarded as a decimal point, the first value of the
+ // left_y vector omits the portion which will go into the left_column offset.
+ // Following values need the full ystep as a relative offset.
+ const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
+ const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+ __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
+ left_y = _mm_add_epi16(ystep_init, left_y);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+
+ int x = 0;
+ // Loop over x for columns with a mixture of sources.
+ for (int left_offset = -left_base_increment; x < min_top_only_x; x += 4,
+ xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top4),
+ left_y = _mm_add_epi16(left_y, increment_left4),
+ left_offset -= left_base_increment4) {
+ uint8_t* dst_x = dst + x;
+
+ // Round down to the nearest multiple of 4.
+ const int max_top_only_y = std::min((x << 6) / xstep, height) & ~3;
+ DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+ const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+ // All rows from |min_left_only_y| down for this set of columns only need
+ // |left_column| to compute. Rounded up to the nearest multiple of 4.
+ const int min_left_only_y = std::min(((x + 4) << 6) / xstep, height);
+
+ __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+ __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+ int top_x = -xstep_y;
+
+ // Loop over y for mixed rows.
+ for (; y < min_left_only_y;
+ y += 4, dst_x += stride4,
+ xstep_bounds = _mm_add_epi16(xstep_bounds, xstep4_vect),
+ xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep4_vect),
+ top_x -= xstep4) {
+ DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
+ dst_x, stride,
+ left_column + ((left_offset + y) * (1 << upsample_left_shift)),
+ left_y);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+ shift_mask),
+ 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ const __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+ DirectionalZone1Blend_4x4<upsampled_top>(
+ dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+ xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+ }
+ // Loop over y for left-only rows, if any.
+ for (; y < height; y += 4, dst_x += stride4) {
+ DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y);
+ }
+ }
+ // Loop over top-only columns, if any.
+ for (; x < width; x += 4) {
+ DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
+ height, -xstep, upsampled_top);
+ }
+}
+
+void DirectionalIntraPredictorZone2_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep,
+ const bool upsampled_top,
+ const bool upsampled_left) {
+ // Increasing the negative buffer for this function allows more rows to be
+ // processed at a time without branching in an inner loop to check the base.
+ uint8_t top_buffer[288];
+ uint8_t left_buffer[288];
+ memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
+ memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
+ const uint8_t* top_ptr = top_buffer + 144;
+ const uint8_t* left_ptr = left_buffer + 144;
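+ // With the copies above, top_ptr[-16 .. 143] and left_ptr[-16 .. 143] hold
+ // valid neighbor data, and more negative indices (down to -144) still land
+ // inside the local buffers, so speculative loads cannot read out of bounds.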
+ if (width == 4 || height == 4) {
+ if (upsampled_left) {
+ if (upsampled_top) {
+ DirectionalZone2_4_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_4_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ } else {
+ if (upsampled_top) {
+ DirectionalZone2_4_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_4_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ }
+ return;
+ }
+ if (upsampled_left) {
+ if (upsampled_top) {
+ DirectionalZone2_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ } else {
+ if (upsampled_top) {
+ DirectionalZone2_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// FilterIntraPredictor_SSE4_1
+
+// Apply all filter taps to the given 7 packed 8-bit pixel values, keeping the
+// 8th byte at zero to preserve the sum.
+inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride,
+ const __m128i& pixels, const __m128i& taps_0_1,
+ const __m128i& taps_2_3, const __m128i& taps_4_5,
+ const __m128i& taps_6_7) {
+ const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1);
+ const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3);
+ // |output_half| contains 8 partial sums.
+ __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
+ __m128i output = _mm_hadd_epi16(output_half, output_half);
+ const __m128i output_row0 =
+ _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
+ /* arbitrary pack arg */ output);
+ Store4(dst, output_row0);
+ const __m128i mul_1_01 = _mm_maddubs_epi16(pixels, taps_4_5);
+ const __m128i mul_1_23 = _mm_maddubs_epi16(pixels, taps_6_7);
+ output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
+ output = _mm_hadd_epi16(output_half, output_half);
+ const __m128i output_row1 =
+ _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
+ /* arbitrary pack arg */ output);
+ Store4(dst + stride, output_row1);
+}
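+
+// In scalar terms, each of the 8 output pixels above is, approximately,
+//   ((sum of tap[i] * neighbor[i] over the 7 neighbors + 8) >> 4)
+// clipped to [0, 255], where the neighbors are the top-left, four top, and
+// two left pixels; the 8th tap is fixed at 0 so the zero-padded byte in
+// |pixels| does not affect the sum.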
+
+// 4xH transform sizes are given special treatment because LoadLo8 goes out
+// of bounds and every block involves the left column. This implementation
+// loads TL from the top row for the first block, so it is not taken from
+// |left_column| for that block.
+inline void Filter4xH(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const top_ptr,
+ const uint8_t* const left_ptr, FilterIntraPredictor pred,
+ const int height) {
+ const __m128i taps_0_1 = LoadUnaligned16(kFilterIntraTaps[pred][0]);
+ const __m128i taps_2_3 = LoadUnaligned16(kFilterIntraTaps[pred][2]);
+ const __m128i taps_4_5 = LoadUnaligned16(kFilterIntraTaps[pred][4]);
+ const __m128i taps_6_7 = LoadUnaligned16(kFilterIntraTaps[pred][6]);
+ __m128i top = Load4(top_ptr - 1);
+ __m128i pixels = _mm_insert_epi8(top, top_ptr[3], 4);
+ __m128i left = (height == 4 ? Load4(left_ptr) : LoadLo8(left_ptr));
+ left = _mm_slli_si128(left, 5);
+
+ // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1],
+ // left[2], left[3], left[4], left[5], left[6], left[7]
+ pixels = _mm_or_si128(left, pixels);
+
+ // Duplicate first 8 bytes.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 1.
+ pixels = Load4(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1],
+ // left[0], left[1], ...
+ pixels = _mm_or_si128(left, pixels);
+
+ // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last
+ // byte is an unused value, which shall be multiplied by 0 when we apply the
+ // filter.
+ constexpr int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;
+
+ // Insert left[-1] in front as TL and put left[0] and left[1] at the end.
+ const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 2.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 3.
+
+ // Compute the middle 8 rows before using common code for the final 4 rows.
+ // The common code below this block assumes that |left| holds the next TL
+ // value at position 8, so this block must leave |left| in that state.
+ if (height == 16) {
+ // This shift allows us to use pixel_order2 twice after shifting by 2 later.
+ left = _mm_slli_si128(left, 1);
+ pixels = Load4(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4],
+ // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3]
+ pixels = _mm_or_si128(left, pixels);
+
+ // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 10, 11, 15. The
+ // last byte is an unused value, as above. The top-left was shifted to
+ // position nine to keep two empty spaces after the top pixels.
+ constexpr int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;
+
+ // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at
+ // the end.
+ const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ dest += stride; // Move to y = 4.
+
+ // First 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+
+ // Clear all but final pixel in the first 8 of left column.
+ __m128i keep_top_left = _mm_srli_si128(left, 13);
+ dest += stride; // Move to y = 5.
+ pixels = Load4(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-6],
+ // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
+ pixels = _mm_or_si128(left, pixels);
+ left = LoadLo8(left_ptr + 8);
+
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ dest += stride; // Move to y = 6.
+
+ // Second 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+
+ // Position TL value so we can use pixel_order1.
+ keep_top_left = _mm_slli_si128(keep_top_left, 6);
+ dest += stride; // Move to y = 7.
+ pixels = Load4(dest);
+ left = _mm_slli_si128(left, 7);
+ left = _mm_or_si128(left, keep_top_left);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 8.
+
+ // Third 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 9.
+
+ // Prepare final inputs.
+ pixels = Load4(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 10.
+
+ // Fourth 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 11.
+ }
+
+ // In both the 8 and 16 case, we assume that the left vector has the next TL
+ // at position 8.
+ if (height > 4) {
+ // Erase prior left pixels by shifting TL to position 0.
+ left = _mm_srli_si128(left, 8);
+ left = _mm_slli_si128(left, 6);
+ pixels = Load4(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 12 or 4.
+
+ // First of final two 4x2 blocks.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 13 or 5.
+ pixels = Load4(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 14 or 6.
+
+ // Last of final two 4x2 blocks.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ }
+}
+
+void FilterIntraPredictor_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column,
+ FilterIntraPredictor pred, const int width,
+ const int height) {
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ if (width == 4) {
+ Filter4xH(dst, stride, top_ptr, left_ptr, pred, height);
+ return;
+ }
+
+ // There is one set of 7 taps for each of the 4x2 output pixels.
+ const __m128i taps_0_1 = LoadUnaligned16(kFilterIntraTaps[pred][0]);
+ const __m128i taps_2_3 = LoadUnaligned16(kFilterIntraTaps[pred][2]);
+ const __m128i taps_4_5 = LoadUnaligned16(kFilterIntraTaps[pred][4]);
+ const __m128i taps_6_7 = LoadUnaligned16(kFilterIntraTaps[pred][6]);
+
+ // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at
+ // the end is an unused value, which shall be multiplied by 0 when we apply
+ // the filter.
+ constexpr int64_t kCondenseLeftMask = 0x0F09080403020100;
+
+ // Takes the "left section" and puts it right after p0-p4.
+ const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);
+
+ // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last
+ // byte is unused as above.
+ constexpr int64_t kInsertTopLeftMask = 0x0F0A090302010008;
+
+ // Shuffles the "top left" from the left section, to the front. Used when
+ // grabbing data from left_column and not top_row.
+ const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);
+
+ // This first pass takes care of the cases where the top left pixel comes from
+ // top_row.
+ __m128i pixels = LoadLo8(top_ptr - 1);
+ __m128i left = _mm_slli_si128(Load4(left_column), 8);
+ pixels = _mm_or_si128(pixels, left);
+
+ // Two sets of the same pixels to multiply with two sets of taps.
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, taps_6_7);
+ left = _mm_srli_si128(left, 1);
+
+ // Load the output row at y = 1; it provides the top neighbors for the next
+ // 4x2 block (rows 2 and 3).
+ pixels = Load4(dst + stride);
+
+ // Because of the above shift, this OR 'invades' the final byte of the first
+ // 8 bytes of |pixels|. This is acceptable because the 8th filter tap is
+ // always a padded 0.
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ const ptrdiff_t stride2 = stride << 1;
+ const ptrdiff_t stride4 = stride << 2;
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dst += 4;
+ for (int x = 3; x < width - 4; x += 4) {
+ pixels = Load4(top_ptr + x);
+ pixels = _mm_insert_epi8(pixels, top_ptr[x + 4], 4);
+ pixels = _mm_insert_epi8(pixels, dst[-1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ pixels = Load4(dst + stride - 1);
+ pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride + stride2 - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+ taps_4_5, taps_6_7);
+ dst += 4;
+ }
+
+ // Now we handle heights that reference previous blocks rather than top_row.
+ for (int y = 4; y < height; y += 4) {
+ // Leftmost 4x4 block for this height.
+ dst -= width;
+ dst += stride4;
+
+ // The top-left pixel is not available by offset from |dst| in these
+ // leftmost blocks, so it is read from left_ptr[y - 1] instead.
+ pixels = Load4(dst - stride);
+ left = _mm_slli_si128(Load4(left_ptr + y - 1), 8);
+ left = _mm_insert_epi8(left, left_ptr[y + 3], 12);
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+
+ // The bytes shifted into positions 6 and 7 will be ignored by the shuffle.
+ left = _mm_srli_si128(left, 2);
+ pixels = Load4(dst + stride);
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+ taps_4_5, taps_6_7);
+
+ dst += 4;
+
+ // Remaining 4x4 blocks for this height.
+ for (int x = 4; x < width; x += 4) {
+ pixels = Load4(dst - stride - 1);
+ pixels = _mm_insert_epi8(pixels, dst[-stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, dst[-1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ pixels = Load4(dst + stride - 1);
+ pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride2 + stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+ taps_4_5, taps_6_7);
+ dst += 4;
+ }
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+// These guards check that this version of the function was not superseded by
+// a higher optimization level, such as AVX. The corresponding #define also
+// prevents the C version from being added to the table.
+#if DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor)
+ dsp->filter_intra_predictor = FilterIntraPredictor_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone1)
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone2)
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone3)
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ DcDefs::_4x4::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+ DcDefs::_4x8::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+ DcDefs::_4x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+ DcDefs::_8x4::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+ DcDefs::_8x8::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+ DcDefs::_8x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+ DcDefs::_8x32::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+ DcDefs::_16x4::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+ DcDefs::_16x8::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+ DcDefs::_16x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+ DcDefs::_16x32::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+ DcDefs::_16x64::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+ DcDefs::_32x8::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+ DcDefs::_32x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+ DcDefs::_32x32::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+ DcDefs::_32x64::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+ DcDefs::_64x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+ DcDefs::_64x32::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+ DcDefs::_64x64::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ DcDefs::_4x4::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+ DcDefs::_4x8::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+ DcDefs::_4x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+ DcDefs::_8x4::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+ DcDefs::_8x8::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+ DcDefs::_8x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+ DcDefs::_8x32::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+ DcDefs::_16x4::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+ DcDefs::_16x8::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+ DcDefs::_16x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+ DcDefs::_16x32::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+ DcDefs::_16x64::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+ DcDefs::_32x8::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+ DcDefs::_32x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+ DcDefs::_32x32::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+ DcDefs::_32x64::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+ DcDefs::_64x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+ DcDefs::_64x32::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+ DcDefs::_64x64::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+ DcDefs::_4x4::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+ DcDefs::_4x8::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+ DcDefs::_4x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+ DcDefs::_8x4::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+ DcDefs::_8x8::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+ DcDefs::_8x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+ DcDefs::_8x32::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+ DcDefs::_16x4::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+ DcDefs::_16x8::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+ DcDefs::_16x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+ DcDefs::_16x32::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+ DcDefs::_16x64::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+ DcDefs::_32x8::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+ DcDefs::_32x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+ DcDefs::_32x32::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+ DcDefs::_32x64::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+ DcDefs::_64x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+ DcDefs::_64x32::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+ DcDefs::_64x64::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+ Paeth4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+ Paeth4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+ Paeth4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+ Paeth8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+ Paeth8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+ Paeth8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+ Paeth8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+ Paeth16x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+ Paeth16x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+ Paeth16x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+ Paeth16x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+ Paeth16x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+ Paeth32x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+ Paeth32x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+ Paeth32x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+ Paeth32x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+ Paeth64x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+ Paeth64x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+ Paeth64x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+ DirDefs::_4x4::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ DirDefs::_4x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ DirDefs::_4x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+ DirDefs::_8x4::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ DirDefs::_8x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+ DirDefs::_8x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ DirDefs::_8x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+ DirDefs::_16x4::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ DirDefs::_16x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+ DirDefs::_16x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+ DirDefs::_16x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+ DirDefs::_16x64::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+ DirDefs::_32x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+ DirDefs::_32x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+ DirDefs::_32x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ DirDefs::_32x64::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+ DirDefs::_64x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+ DirDefs::_64x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+ DirDefs::_64x64::Horizontal;
+#endif
+} // NOLINT(readability/fn_size)
+// TODO(petersonab): Split Init8bpp function into family-specific files.
+
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+template <int height>
+inline void DcStore4xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ StoreLo8(dst, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ StoreLo8(dst, dc_dup);
+}
+
+// WriteDuplicateN assumes |dup32| has 4 32-bit "units," each comprising 2
+// identical 16-bit pixels, and writes enough copies of each unit to fill one
+// N-pixel row of |dest|. The unpacking works the same as in the 8bpp case,
+// except that each 32-bit unit needs twice as many copies.
+inline void WriteDuplicate4x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ auto* dst = static_cast<uint8_t*>(dest);
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_lo);
+ dst += stride;
+ _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_lo));
+ dst += stride;
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_hi);
+ dst += stride;
+ _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_hi));
+}
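+
+// For example, if |dup32| holds the 16-bit lanes {a, a, b, b, c, c, d, d},
+// the four rows written above are a a a a / b b b b / c c c c / d d d d,
+// each row being four 16-bit pixels (8 bytes).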
+
+inline void WriteDuplicate8x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+}
+
+inline void WriteDuplicate16x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+}
+
+inline void WriteDuplicate32x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_3);
+}
+
+inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ for (int x = 0; x < 128; x += 16) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_0);
+ }
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ for (int x = 0; x < 128; x += 16) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_1);
+ }
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ for (int x = 0; x < 128; x += 16) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_2);
+ }
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ for (int x = 0; x < 128; x += 16) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_3);
+ }
+}
+
+// ColStoreN<height> copies each of the |height| values in |column| across its
+// corresponding row in dest.
+template <WriteDuplicateFunc writefn>
+inline void ColStore4_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const __m128i col_data = LoadLo8(column);
+ const __m128i col_dup32 = _mm_unpacklo_epi16(col_data, col_data);
+ writefn(dest, stride, col_dup32);
+}
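+
+// Example: for a Wx4 horizontal predictor, |column| holds left[0..3] as
+// int16; the unpack above produces the {l0, l0, l1, l1, l2, l2, l3, l3}
+// layout expected by the WriteDuplicate functions, which then paint one row
+// per value.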
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const __m128i col_data = LoadUnaligned16(column);
+ const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+ const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+ auto* dst = static_cast<uint8_t*>(dest);
+ writefn(dst, stride, col_dup32_lo);
+ const ptrdiff_t stride4 = stride << 2;
+ dst += stride4;
+ writefn(dst, stride, col_dup32_hi);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < 32; y += 16) {
+ const __m128i col_data =
+ LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+ const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+ const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+ writefn(dst, stride, col_dup32_lo);
+ dst += stride4;
+ writefn(dst, stride, col_dup32_hi);
+ dst += stride4;
+ }
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < 64; y += 16) {
+ const __m128i col_data =
+ LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+ const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+ const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+ writefn(dst, stride, col_dup32_lo);
+ dst += stride4;
+ writefn(dst, stride, col_dup32_hi);
+ dst += stride4;
+ }
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore64_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < 128; y += 16) {
+ const __m128i col_data =
+ LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+ const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+ const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+ writefn(dst, stride, col_dup32_lo);
+ dst += stride4;
+ writefn(dst, stride, col_dup32_hi);
+ dst += stride4;
+ }
+}
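Taken together, the ColStoreN helpers and the WriteDuplicateWx4 writers above implement the horizontal intra predictor for 10bpp blocks: every pixel in row y receives the single left-column value column[y]. A minimal scalar sketch of that behavior, for reference only (the function name and explicit width/height parameters are illustrative, not part of this file):

// Scalar reference: duplicate column[y] across row y of a width x height
// block of 10bpp (uint16_t) pixels. |stride| is in bytes, as in the SIMD code.
inline void HorizontalPredictorScalar(void* const dest, ptrdiff_t stride,
                                      const uint16_t* const column, int width,
                                      int height) {
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y = 0; y < height; ++y) {
    auto* const row = reinterpret_cast<uint16_t*>(dst);
    for (int x = 0; x < width; ++x) row[x] = column[y];
    dst += stride;
  }
}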
+
+// |ref| points to 8 bytes containing 4 packed int16 values.
+inline __m128i DcSum4_SSE4_1(const void* ref) {
+ const __m128i vals = _mm_loadl_epi64(static_cast<const __m128i*>(ref));
+ const __m128i ones = _mm_set1_epi16(1);
+
+ // half_sum[31:0] = a1+a2
+ // half_sum[63:32] = a3+a4
+ const __m128i half_sum = _mm_madd_epi16(vals, ones);
+ // Place half_sum[63:32] in shift_sum[31:0].
+ const __m128i shift_sum = _mm_srli_si128(half_sum, 4);
+ return _mm_add_epi32(half_sum, shift_sum);
+}
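The madd-with-ones trick above is simply a horizontal add. A scalar equivalent, shown only to make the lane arithmetic explicit (the name is illustrative):

// Scalar reference for DcSum4_SSE4_1: _mm_madd_epi16 with a vector of ones
// produces the pairwise sums (a1+a2, a3+a4); the shift and add then combine
// the two partial sums into the final total.
inline int32_t DcSum4Scalar(const uint16_t* const ref) {
  return static_cast<int32_t>(ref[0]) + ref[1] + ref[2] + ref[3];
}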
+
+struct DcDefs {
+ DcDefs() = delete;
+
+ using _4x4 = DcPredFuncs_SSE4_1<2, 2, DcSum4_SSE4_1, DcSum4_SSE4_1,
+ DcStore4xH_SSE4_1<4>, 0, 0>;
+};
+
+struct DirDefs {
+ DirDefs() = delete;
+
+ using _4x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate4x4>>;
+ using _4x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate4x4>>;
+ using _4x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate4x4>>;
+ using _8x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate8x4>>;
+ using _8x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate8x4>>;
+ using _8x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate8x4>>;
+ using _8x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate8x4>>;
+ using _16x4 =
+ DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate16x4>>;
+ using _16x8 =
+ DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate16x4>>;
+ using _16x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate16x4>>;
+ using _16x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate16x4>>;
+ using _16x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate16x4>>;
+ using _32x8 =
+ DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate32x4>>;
+ using _32x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate32x4>>;
+ using _32x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate32x4>>;
+ using _32x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate32x4>>;
+ using _64x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate64x4>>;
+ using _64x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate64x4>>;
+ using _64x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate64x4>>;
+};
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ DcDefs::_4x4::DcTop;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ DcDefs::_4x4::DcLeft;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+ DcDefs::_4x4::Dc;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+ DirDefs::_4x4::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ DirDefs::_4x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ DirDefs::_4x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+ DirDefs::_8x4::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ DirDefs::_8x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+ DirDefs::_8x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ DirDefs::_8x32::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+ DirDefs::_16x4::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ DirDefs::_16x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+ DirDefs::_16x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+ DirDefs::_16x32::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+ DirDefs::_16x64::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+ DirDefs::_32x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+ DirDefs::_32x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+ DirDefs::_32x32::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ DirDefs::_32x64::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+ DirDefs::_64x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+ DirDefs::_64x32::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+ DirDefs::_64x64::Horizontal;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intrapred_sse4.h b/src/dsp/x86/intrapred_sse4.h
new file mode 100644
index 0000000..7f4fcd7
--- /dev/null
+++ b/src/dsp/x86/intrapred_sse4.h
@@ -0,0 +1,1060 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*,
+// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and
+// Dsp::filter_intra_predictor; see the defines below for specifics. These
+// functions are not thread-safe.
+void IntraPredInit_SSE4_1();
+void IntraPredCflInit_SSE4_1();
+void IntraPredSmoothInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
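These Init functions fill in the writable Dsp table entries with SSE4.1 versions; in libgav1 they are typically invoked from the library's internal one-time DSP initialization rather than by applications. A hedged usage sketch (the wrapper function name is hypothetical):

// Sketch: register the SSE4.1 intra-prediction code paths once, before any
// decoding threads read the Dsp table (the Init functions are not
// thread-safe).
void RegisterSse41IntraPredictors() {
  libgav1::dsp::IntraPredInit_SSE4_1();
  libgav1::dsp::IntraPredCflInit_SSE4_1();
  libgav1::dsp::IntraPredSmoothInit_SSE4_1();
}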
+
+// If SSE4.1 is enabled and the baseline isn't already set (because a higher
+// level of optimization is enabled), signal that the SSE4.1 implementation
+// should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
+#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
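Each #ifndef block below implements a simple priority scheme: whichever optimization header defines the symbol first owns that function slot, so a header for a higher instruction-set level only needs to be considered earlier to take precedence. A sketch of the pattern with hypothetical symbol names (the AVX2 value is used purely as an example of a higher level):

// Hypothetical illustration only. If a higher-level header had already
// claimed the slot, the SSE4.1 definition would be skipped:
#define LIBGAV1_Dsp8bpp_SomeFunction LIBGAV1_CPU_AVX2  // claimed first
#ifndef LIBGAV1_Dsp8bpp_SomeFunction
#define LIBGAV1_Dsp8bpp_SomeFunction LIBGAV1_CPU_SSE4_1  // never reached
#endif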
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+//------------------------------------------------------------------------------
+// 10bpp
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_
diff --git a/src/dsp/x86/inverse_transform_sse4.cc b/src/dsp/x86/inverse_transform_sse4.cc
new file mode 100644
index 0000000..787d706
--- /dev/null
+++ b/src/dsp/x86/inverse_transform_sse4.cc
@@ -0,0 +1,3086 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+template <int store_width, int store_count>
+LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* dst, int32_t stride, int32_t idx,
+ const __m128i* s) {
+ // NOTE: It is expected that the compiler will unroll these loops.
+ if (store_width == 16) {
+ for (int i = 0; i < store_count; i += 4) {
+ StoreUnaligned16(&dst[i * stride + idx], s[i]);
+ StoreUnaligned16(&dst[(i + 1) * stride + idx], s[i + 1]);
+ StoreUnaligned16(&dst[(i + 2) * stride + idx], s[i + 2]);
+ StoreUnaligned16(&dst[(i + 3) * stride + idx], s[i + 3]);
+ }
+ }
+ if (store_width == 8) {
+ for (int i = 0; i < store_count; i += 4) {
+ StoreLo8(&dst[i * stride + idx], s[i]);
+ StoreLo8(&dst[(i + 1) * stride + idx], s[i + 1]);
+ StoreLo8(&dst[(i + 2) * stride + idx], s[i + 2]);
+ StoreLo8(&dst[(i + 3) * stride + idx], s[i + 3]);
+ }
+ }
+}
+
+template <int load_width, int load_count>
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int16_t* src, int32_t stride,
+ int32_t idx, __m128i* x) {
+ // NOTE: It is expected that the compiler will unroll these loops.
+ if (load_width == 16) {
+ for (int i = 0; i < load_count; i += 4) {
+ x[i] = LoadUnaligned16(&src[i * stride + idx]);
+ x[i + 1] = LoadUnaligned16(&src[(i + 1) * stride + idx]);
+ x[i + 2] = LoadUnaligned16(&src[(i + 2) * stride + idx]);
+ x[i + 3] = LoadUnaligned16(&src[(i + 3) * stride + idx]);
+ }
+ }
+ if (load_width == 8) {
+ for (int i = 0; i < load_count; i += 4) {
+ x[i] = LoadLo8(&src[i * stride + idx]);
+ x[i + 1] = LoadLo8(&src[(i + 1) * stride + idx]);
+ x[i + 2] = LoadLo8(&src[(i + 2) * stride + idx]);
+ x[i + 3] = LoadLo8(&src[(i + 3) * stride + idx]);
+ }
+ }
+}
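LoadSrc and StoreDst are parameterized on the byte width of each row access (16 or 8) and on how many rows to move; |stride| and |idx| are in int16_t elements. A small usage sketch (the wrapper name is illustrative):

// Sketch: round-trip an 8x8 block of contiguous int16_t coefficients through
// the helpers. Each 16-byte access covers one row of eight coefficients.
inline void CopyBlock8x8(int16_t* const block) {
  __m128i rows[8];
  LoadSrc<16, 8>(block, /*stride=*/8, /*idx=*/0, rows);
  StoreDst<16, 8>(block, /*stride=*/8, /*idx=*/0, rows);
}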
+
+// Butterfly rotate 4 values.
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(__m128i* a, __m128i* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const __m128i psin_pcos = _mm_set1_epi32(
+ static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
+ const __m128i ba = _mm_unpacklo_epi16(*a, *b);
+ const __m128i ab = _mm_unpacklo_epi16(*b, *a);
+ const __m128i sign =
+ _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001);
+ // -sin cos, -sin cos, -sin cos, -sin cos
+ const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
+ const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
+ const __m128i y0 = _mm_madd_epi16(ab, psin_pcos);
+ const __m128i x1 = RightShiftWithRounding_S32(x0, 12);
+ const __m128i y1 = RightShiftWithRounding_S32(y0, 12);
+ const __m128i x = _mm_packs_epi32(x1, x1);
+ const __m128i y = _mm_packs_epi32(y1, y1);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
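Per lane, the butterfly rotation is a fixed-point 2-D rotation: Cos128/Sin128 return cos(angle*pi/128) and sin(angle*pi/128) scaled by 2^12, and the products are rounded back down by 12 bits with saturation to int16_t. A scalar sketch of the flip == false case (the function name is illustrative):

// Scalar reference for one lane of ButterflyRotation_4/_8:
//   x = round((a * cos - b * sin) / 4096), y = round((a * sin + b * cos) / 4096)
// When |flip| is true the two outputs are simply swapped.
inline void ButterflyRotationScalar(int16_t* a, int16_t* b, int angle) {
  const int32_t cos128 = Cos128(angle);
  const int32_t sin128 = Sin128(angle);
  const int32_t x = (*a * cos128 - *b * sin128 + 2048) >> 12;
  const int32_t y = (*a * sin128 + *b * cos128 + 2048) >> 12;
  *a = static_cast<int16_t>(std::min(32767, std::max(-32768, x)));  // packs saturation
  *b = static_cast<int16_t>(std::min(32767, std::max(-32768, y)));
}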
+
+// Butterfly rotate 8 values.
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(__m128i* a, __m128i* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const __m128i psin_pcos = _mm_set1_epi32(
+ static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
+ const __m128i sign =
+ _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001);
+ // -sin cos, -sin cos, -sin cos, -sin cos
+ const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
+ const __m128i ba = _mm_unpacklo_epi16(*a, *b);
+ const __m128i ab = _mm_unpacklo_epi16(*b, *a);
+ const __m128i ba_hi = _mm_unpackhi_epi16(*a, *b);
+ const __m128i ab_hi = _mm_unpackhi_epi16(*b, *a);
+ const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
+ const __m128i y0 = _mm_madd_epi16(ab, psin_pcos);
+ const __m128i x0_hi = _mm_madd_epi16(ba_hi, msin_pcos);
+ const __m128i y0_hi = _mm_madd_epi16(ab_hi, psin_pcos);
+ const __m128i x1 = RightShiftWithRounding_S32(x0, 12);
+ const __m128i y1 = RightShiftWithRounding_S32(y0, 12);
+ const __m128i x1_hi = RightShiftWithRounding_S32(x0_hi, 12);
+ const __m128i y1_hi = RightShiftWithRounding_S32(y0_hi, 12);
+ const __m128i x = _mm_packs_epi32(x1, x1_hi);
+ const __m128i y = _mm_packs_epi32(y1, y1_hi);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(__m128i* a, __m128i* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const __m128i pcos = _mm_set1_epi16(cos128 << 3);
+ const __m128i psin = _mm_set1_epi16(-(sin128 << 3));
+ const __m128i x = _mm_mulhrs_epi16(*b, psin);
+ const __m128i y = _mm_mulhrs_epi16(*b, pcos);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(__m128i* a,
+ __m128i* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const __m128i pcos = _mm_set1_epi16(cos128 << 3);
+ const __m128i psin = _mm_set1_epi16(sin128 << 3);
+ const __m128i x = _mm_mulhrs_epi16(*a, pcos);
+ const __m128i y = _mm_mulhrs_epi16(*a, psin);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
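Both *IsZero variants lean on an identity of _mm_mulhrs_epi16: with the 12-bit trig constant pre-shifted left by 3, the instruction's (v * c * 2 + 2^15) >> 16 computation becomes an exact rounded shift by 12. In scalar form (the helper name is illustrative; the final truncation to 16 bits mirrors the instruction):

// Scalar equivalent of _mm_mulhrs_epi16(v, _mm_set1_epi16(c << 3)) when |c|
// is a 12-bit Cos128/Sin128 constant:
//   ((v * (c << 3)) * 2 + (1 << 15)) >> 16  ==  (v * c + (1 << 11)) >> 12.
inline int16_t MulhrsByShiftedConstant(int16_t v, int16_t c) {
  return static_cast<int16_t>((v * c + (1 << 11)) >> 12);
}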
+
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(__m128i* a, __m128i* b, bool flip) {
+ __m128i x, y;
+ if (flip) {
+ y = _mm_adds_epi16(*b, *a);
+ x = _mm_subs_epi16(*b, *a);
+ } else {
+ x = _mm_adds_epi16(*a, *b);
+ y = _mm_subs_epi16(*a, *b);
+ }
+ *a = x;
+ *b = y;
+}
+
+using ButterflyRotationFunc = void (*)(__m128i* a, __m128i* b, int angle,
+ bool flip);
+
+LIBGAV1_ALWAYS_INLINE __m128i ShiftResidual(const __m128i residual,
+ const __m128i v_row_shift_add,
+ const __m128i v_row_shift) {
+ const __m128i k7ffd = _mm_set1_epi16(0x7ffd);
+ // The max row_shift is 2, so int16_t values greater than 0x7ffd may
+ // overflow. Generate a mask for this case.
+ const __m128i mask = _mm_cmpgt_epi16(residual, k7ffd);
+ const __m128i x = _mm_add_epi16(residual, v_row_shift_add);
+ // Assume int16_t values.
+ const __m128i a = _mm_sra_epi16(x, v_row_shift);
+ // Assume uint16_t values.
+ const __m128i b = _mm_srl_epi16(x, v_row_shift);
+ // Select the correct shifted value.
+ return _mm_blendv_epi8(a, b, mask);
+}
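In scalar terms, ShiftResidual applies a rounded right shift to each residual but switches from an arithmetic to a logical shift for the few large positive values where adding the rounding bias would wrap the 16-bit lane negative. A sketch of one lane (names are illustrative; |shift_add| is the rounding bias the caller pairs with |row_shift|):

// Scalar reference for one lane of ShiftResidual (row_shift <= 2).
inline int16_t ShiftResidualScalar(int16_t residual, uint16_t shift_add,
                                   int row_shift) {
  const uint16_t x = static_cast<uint16_t>(residual) + shift_add;  // wrapping add
  if (residual > 0x7ffd) {
    return static_cast<int16_t>(x >> row_shift);  // logical shift (srl path)
  }
  return static_cast<int16_t>(static_cast<int16_t>(x) >> row_shift);  // sra path
}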
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
+template <int width>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src_lo = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
+ const __m128i v_src =
+ (width == 4) ? v_src_lo : _mm_shuffle_epi32(v_src_lo, 0);
+ const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+ const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask);
+ const int16_t cos128 = Cos128(32);
+ const __m128i xy = _mm_mulhrs_epi16(s0, _mm_set1_epi16(cos128 << 3));
+
+ // Expand to 32 bits to prevent int16_t overflows during the shift add.
+ const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+ const __m128i a = _mm_cvtepi16_epi32(xy);
+ const __m128i a1 = _mm_cvtepi16_epi32(_mm_srli_si128(xy, 8));
+ const __m128i b = _mm_add_epi32(a, v_row_shift_add);
+ const __m128i b1 = _mm_add_epi32(a1, v_row_shift_add);
+ const __m128i c = _mm_sra_epi32(b, v_row_shift);
+ const __m128i c1 = _mm_sra_epi32(b1, v_row_shift);
+ const __m128i xy_shifted = _mm_packs_epi32(c, c1);
+
+ if (width == 4) {
+ StoreLo8(dst, xy_shifted);
+ } else {
+ for (int i = 0; i < width; i += 8) {
+ StoreUnaligned16(dst, xy_shifted);
+ dst += 8;
+ }
+ }
+ return true;
+}
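DctDcOnly handles the adjusted_tx_height <= 1 case, where only the DC coefficient contributes: the row pass collapses to an optional row-rounding multiply, a scale by cos(pi/4), and the row shift, with the result broadcast across the row. A scalar sketch of the broadcast value (the function name is illustrative; kTransformRowMultiplier and Cos128 come from inverse_transform.inc, and the final saturation of the SIMD pack is omitted):

// Scalar reference for the value DctDcOnly writes across the first row
// (row_shift <= 2, so the rounding bias equals (1 << row_shift) >> 1).
inline int16_t DctDcOnlyValueScalar(int16_t dc, bool should_round,
                                    int row_shift) {
  int32_t v = dc;
  if (should_round) {
    v = (v * kTransformRowMultiplier + (1 << 11)) >> 12;
  }
  v = (v * Cos128(32) + (1 << 11)) >> 12;  // cos(pi/4) in 12-bit fixed point
  return static_cast<int16_t>((v + ((1 << row_shift) >> 1)) >> row_shift);
}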
+
+template <int height>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16_t cos128 = Cos128(32);
+
+ // Calculate dc values for first row.
+ if (width == 4) {
+ const __m128i v_src = LoadLo8(dst);
+ const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
+ StoreLo8(dst, xy);
+ } else {
+ int i = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&dst[i]);
+ const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
+ StoreUnaligned16(&dst[i], xy);
+ i += 8;
+ } while (i < width);
+ }
+
+ // Copy first row to the rest of the block.
+ for (int y = 1; y < height; ++y) {
+ memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
+ }
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct4Stages(__m128i* s) {
+ // stage 12.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
+ ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
+ } else {
+ butterfly_rotation(&s[0], &s[1], 32, true);
+ butterfly_rotation(&s[2], &s[3], 48, false);
+ }
+
+ // stage 17.
+ HadamardRotation(&s[0], &s[3], false);
+ HadamardRotation(&s[1], &s[2], false);
+}
+
+// Process 4 dct4 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct4_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[4], x[4];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[8];
+ LoadSrc<8, 8>(dst, step, 0, input);
+ Transpose4x8To8x4_U16(input, x);
+ } else {
+ LoadSrc<16, 4>(dst, step, 0, x);
+ }
+ } else {
+ LoadSrc<8, 4>(dst, step, 0, x);
+ if (transpose) {
+ Transpose4x4_U16(x, x);
+ }
+ }
+ // stage 1.
+ // kBitReverseLookup 0, 2, 1, 3
+ s[0] = x[0];
+ s[1] = x[2];
+ s[2] = x[1];
+ s[3] = x[3];
+
+ Dct4Stages<butterfly_rotation>(s);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[8];
+ Transpose8x4To4x8_U16(s, output);
+ StoreDst<8, 8>(dst, step, 0, output);
+ } else {
+ StoreDst<16, 4>(dst, step, 0, s);
+ }
+ } else {
+ if (transpose) {
+ Transpose4x4_U16(s, s);
+ }
+ StoreDst<8, 4>(dst, step, 0, s);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct8Stages(__m128i* s) {
+ // stage 8.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
+ ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
+ } else {
+ butterfly_rotation(&s[4], &s[7], 56, false);
+ butterfly_rotation(&s[5], &s[6], 24, false);
+ }
+
+ // stage 13.
+ HadamardRotation(&s[4], &s[5], false);
+ HadamardRotation(&s[6], &s[7], true);
+
+ // stage 18.
+ butterfly_rotation(&s[6], &s[5], 32, true);
+
+ // stage 22.
+ HadamardRotation(&s[0], &s[7], false);
+ HadamardRotation(&s[1], &s[6], false);
+ HadamardRotation(&s[2], &s[5], false);
+ HadamardRotation(&s[3], &s[4], false);
+}
+
+// Process dct8 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct8_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[8], x[8];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8_U16(input, x);
+ } else {
+ LoadSrc<8, 8>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, 0, input);
+ Transpose8x8_U16(input, x);
+ } else {
+ LoadSrc<16, 8>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1.
+ // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
+ s[0] = x[0];
+ s[1] = x[4];
+ s[2] = x[2];
+ s[3] = x[6];
+ s[4] = x[1];
+ s[5] = x[5];
+ s[6] = x[3];
+ s[7] = x[7];
+
+ Dct4Stages<butterfly_rotation>(s);
+ Dct8Stages<butterfly_rotation>(s);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[4];
+ Transpose4x8To8x4_U16(s, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ } else {
+ StoreDst<8, 8>(dst, step, 0, s);
+ }
+ } else {
+ if (transpose) {
+ __m128i output[8];
+ Transpose8x8_U16(s, output);
+ StoreDst<16, 8>(dst, step, 0, output);
+ } else {
+ StoreDst<16, 8>(dst, step, 0, s);
+ }
+ }
+}
+
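+// Stages operating on s[8..15]; the final Hadamard stage merges them with the
+// s[0..7] results produced by Dct4Stages and Dct8Stages.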
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct16Stages(__m128i* s) {
+ // stage 5.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
+ ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
+ ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
+ ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
+ } else {
+ butterfly_rotation(&s[8], &s[15], 60, false);
+ butterfly_rotation(&s[9], &s[14], 28, false);
+ butterfly_rotation(&s[10], &s[13], 44, false);
+ butterfly_rotation(&s[11], &s[12], 12, false);
+ }
+
+ // stage 9.
+ HadamardRotation(&s[8], &s[9], false);
+ HadamardRotation(&s[10], &s[11], true);
+ HadamardRotation(&s[12], &s[13], false);
+ HadamardRotation(&s[14], &s[15], true);
+
+ // stage 14.
+ butterfly_rotation(&s[14], &s[9], 48, true);
+ butterfly_rotation(&s[13], &s[10], 112, true);
+
+ // stage 19.
+ HadamardRotation(&s[8], &s[11], false);
+ HadamardRotation(&s[9], &s[10], false);
+ HadamardRotation(&s[12], &s[15], true);
+ HadamardRotation(&s[13], &s[14], true);
+
+ // stage 23.
+ butterfly_rotation(&s[13], &s[10], 32, true);
+ butterfly_rotation(&s[12], &s[11], 32, true);
+
+ // stage 26.
+ HadamardRotation(&s[0], &s[15], false);
+ HadamardRotation(&s[1], &s[14], false);
+ HadamardRotation(&s[2], &s[13], false);
+ HadamardRotation(&s[3], &s[12], false);
+ HadamardRotation(&s[4], &s[11], false);
+ HadamardRotation(&s[5], &s[10], false);
+ HadamardRotation(&s[6], &s[9], false);
+ HadamardRotation(&s[7], &s[8], false);
+}
+
+// Process dct16 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct16_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[16], x[16];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8_U16(input, x);
+ LoadSrc<16, 4>(dst, step, 8, input);
+ Transpose8x4To4x8_U16(input, &x[8]);
+ } else {
+ LoadSrc<8, 16>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, idx, input);
+ Transpose8x8_U16(input, &x[idx]);
+ }
+ } else {
+ LoadSrc<16, 16>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1
+ // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+ s[0] = x[0];
+ s[1] = x[8];
+ s[2] = x[4];
+ s[3] = x[12];
+ s[4] = x[2];
+ s[5] = x[10];
+ s[6] = x[6];
+ s[7] = x[14];
+ s[8] = x[1];
+ s[9] = x[9];
+ s[10] = x[5];
+ s[11] = x[13];
+ s[12] = x[3];
+ s[13] = x[11];
+ s[14] = x[7];
+ s[15] = x[15];
+
+ Dct4Stages<butterfly_rotation>(s);
+ Dct8Stages<butterfly_rotation>(s);
+ Dct16Stages<butterfly_rotation>(s);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[4];
+ Transpose4x8To8x4_U16(s, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ Transpose4x8To8x4_U16(&s[8], output);
+ StoreDst<16, 4>(dst, step, 8, output);
+ } else {
+ StoreDst<8, 16>(dst, step, 0, s);
+ }
+ } else {
+ if (transpose) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ __m128i output[8];
+ Transpose8x8_U16(&s[idx], output);
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 16>(dst, step, 0, s);
+ }
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct32Stages(__m128i* s) {
+ // stage 3
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
+ ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
+ ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
+ ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
+ ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
+ ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
+ ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
+ ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
+ } else {
+ butterfly_rotation(&s[16], &s[31], 62, false);
+ butterfly_rotation(&s[17], &s[30], 30, false);
+ butterfly_rotation(&s[18], &s[29], 46, false);
+ butterfly_rotation(&s[19], &s[28], 14, false);
+ butterfly_rotation(&s[20], &s[27], 54, false);
+ butterfly_rotation(&s[21], &s[26], 22, false);
+ butterfly_rotation(&s[22], &s[25], 38, false);
+ butterfly_rotation(&s[23], &s[24], 6, false);
+ }
+ // stage 6.
+ HadamardRotation(&s[16], &s[17], false);
+ HadamardRotation(&s[18], &s[19], true);
+ HadamardRotation(&s[20], &s[21], false);
+ HadamardRotation(&s[22], &s[23], true);
+ HadamardRotation(&s[24], &s[25], false);
+ HadamardRotation(&s[26], &s[27], true);
+ HadamardRotation(&s[28], &s[29], false);
+ HadamardRotation(&s[30], &s[31], true);
+
+ // stage 10.
+ butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+ butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+ butterfly_rotation(&s[26], &s[21], 24, true);
+ butterfly_rotation(&s[25], &s[22], 24 + 64, true);
+
+ // stage 15.
+ HadamardRotation(&s[16], &s[19], false);
+ HadamardRotation(&s[17], &s[18], false);
+ HadamardRotation(&s[20], &s[23], true);
+ HadamardRotation(&s[21], &s[22], true);
+ HadamardRotation(&s[24], &s[27], false);
+ HadamardRotation(&s[25], &s[26], false);
+ HadamardRotation(&s[28], &s[31], true);
+ HadamardRotation(&s[29], &s[30], true);
+
+ // stage 20.
+ butterfly_rotation(&s[29], &s[18], 48, true);
+ butterfly_rotation(&s[28], &s[19], 48, true);
+ butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+ butterfly_rotation(&s[26], &s[21], 48 + 64, true);
+
+ // stage 24.
+ HadamardRotation(&s[16], &s[23], false);
+ HadamardRotation(&s[17], &s[22], false);
+ HadamardRotation(&s[18], &s[21], false);
+ HadamardRotation(&s[19], &s[20], false);
+ HadamardRotation(&s[24], &s[31], true);
+ HadamardRotation(&s[25], &s[30], true);
+ HadamardRotation(&s[26], &s[29], true);
+ HadamardRotation(&s[27], &s[28], true);
+
+ // stage 27.
+ butterfly_rotation(&s[27], &s[20], 32, true);
+ butterfly_rotation(&s[26], &s[21], 32, true);
+ butterfly_rotation(&s[25], &s[22], 32, true);
+ butterfly_rotation(&s[24], &s[23], 32, true);
+
+ // stage 29.
+ HadamardRotation(&s[0], &s[31], false);
+ HadamardRotation(&s[1], &s[30], false);
+ HadamardRotation(&s[2], &s[29], false);
+ HadamardRotation(&s[3], &s[28], false);
+ HadamardRotation(&s[4], &s[27], false);
+ HadamardRotation(&s[5], &s[26], false);
+ HadamardRotation(&s[6], &s[25], false);
+ HadamardRotation(&s[7], &s[24], false);
+ HadamardRotation(&s[8], &s[23], false);
+ HadamardRotation(&s[9], &s[22], false);
+ HadamardRotation(&s[10], &s[21], false);
+ HadamardRotation(&s[11], &s[20], false);
+ HadamardRotation(&s[12], &s[19], false);
+ HadamardRotation(&s[13], &s[18], false);
+ HadamardRotation(&s[14], &s[17], false);
+ HadamardRotation(&s[15], &s[16], false);
+}
+
+// Process dct32 rows or columns, depending on the transpose flag.
+LIBGAV1_ALWAYS_INLINE void Dct32_SSE4_1(void* dest, const int32_t step,
+ const bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[32], x[32];
+
+ if (transpose) {
+ for (int idx = 0; idx < 32; idx += 8) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, idx, input);
+ Transpose8x8_U16(input, &x[idx]);
+ }
+ } else {
+ LoadSrc<16, 32>(dst, step, 0, x);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+ s[0] = x[0];
+ s[1] = x[16];
+ s[2] = x[8];
+ s[3] = x[24];
+ s[4] = x[4];
+ s[5] = x[20];
+ s[6] = x[12];
+ s[7] = x[28];
+ s[8] = x[2];
+ s[9] = x[18];
+ s[10] = x[10];
+ s[11] = x[26];
+ s[12] = x[6];
+ s[13] = x[22];
+ s[14] = x[14];
+ s[15] = x[30];
+
+ // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+ s[16] = x[1];
+ s[17] = x[17];
+ s[18] = x[9];
+ s[19] = x[25];
+ s[20] = x[5];
+ s[21] = x[21];
+ s[22] = x[13];
+ s[23] = x[29];
+ s[24] = x[3];
+ s[25] = x[19];
+ s[26] = x[11];
+ s[27] = x[27];
+ s[28] = x[7];
+ s[29] = x[23];
+ s[30] = x[15];
+ s[31] = x[31];
+
+ Dct4Stages<ButterflyRotation_8>(s);
+ Dct8Stages<ButterflyRotation_8>(s);
+ Dct16Stages<ButterflyRotation_8>(s);
+ Dct32Stages<ButterflyRotation_8>(s);
+
+ if (transpose) {
+ for (int idx = 0; idx < 32; idx += 8) {
+ __m128i output[8];
+ Transpose8x8_U16(&s[idx], output);
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 32>(dst, step, 0, s);
+ }
+}
+
+// Allow the compiler to call this function instead of force inlining. Tests
+// show this is slightly faster.
+void Dct64_SSE4_1(void* dest, int32_t step, bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[64], x[32];
+
+ if (transpose) {
+ // The last 32 values of every row are always zero if the |tx_width| is
+ // 64.
+ for (int idx = 0; idx < 32; idx += 8) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, idx, input);
+ Transpose8x8_U16(input, &x[idx]);
+ }
+ } else {
+ // The last 32 values of every column are always zero if the |tx_height| is
+ // 64.
+ LoadSrc<16, 32>(dst, step, 0, x);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+ s[0] = x[0];
+ s[2] = x[16];
+ s[4] = x[8];
+ s[6] = x[24];
+ s[8] = x[4];
+ s[10] = x[20];
+ s[12] = x[12];
+ s[14] = x[28];
+
+ // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+ s[16] = x[2];
+ s[18] = x[18];
+ s[20] = x[10];
+ s[22] = x[26];
+ s[24] = x[6];
+ s[26] = x[22];
+ s[28] = x[14];
+ s[30] = x[30];
+
+ // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+ s[32] = x[1];
+ s[34] = x[17];
+ s[36] = x[9];
+ s[38] = x[25];
+ s[40] = x[5];
+ s[42] = x[21];
+ s[44] = x[13];
+ s[46] = x[29];
+
+ // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
+ s[48] = x[3];
+ s[50] = x[19];
+ s[52] = x[11];
+ s[54] = x[27];
+ s[56] = x[7];
+ s[58] = x[23];
+ s[60] = x[15];
+ s[62] = x[31];
+
+ Dct4Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+ Dct8Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+ Dct16Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+ Dct32Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+
+ //-- start dct 64 stages
+ // stage 2.
+ ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
+ ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
+ ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
+ ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
+ ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
+ ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
+ ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
+ ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
+ ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
+ ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
+ ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
+ ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
+ ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
+ ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
+ ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
+ ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
+
+ // stage 4.
+ HadamardRotation(&s[32], &s[33], false);
+ HadamardRotation(&s[34], &s[35], true);
+ HadamardRotation(&s[36], &s[37], false);
+ HadamardRotation(&s[38], &s[39], true);
+ HadamardRotation(&s[40], &s[41], false);
+ HadamardRotation(&s[42], &s[43], true);
+ HadamardRotation(&s[44], &s[45], false);
+ HadamardRotation(&s[46], &s[47], true);
+ HadamardRotation(&s[48], &s[49], false);
+ HadamardRotation(&s[50], &s[51], true);
+ HadamardRotation(&s[52], &s[53], false);
+ HadamardRotation(&s[54], &s[55], true);
+ HadamardRotation(&s[56], &s[57], false);
+ HadamardRotation(&s[58], &s[59], true);
+ HadamardRotation(&s[60], &s[61], false);
+ HadamardRotation(&s[62], &s[63], true);
+
+ // stage 7.
+ ButterflyRotation_8(&s[62], &s[33], 60 - 0, true);
+ ButterflyRotation_8(&s[61], &s[34], 60 - 0 + 64, true);
+ ButterflyRotation_8(&s[58], &s[37], 60 - 32, true);
+ ButterflyRotation_8(&s[57], &s[38], 60 - 32 + 64, true);
+ ButterflyRotation_8(&s[54], &s[41], 60 - 16, true);
+ ButterflyRotation_8(&s[53], &s[42], 60 - 16 + 64, true);
+ ButterflyRotation_8(&s[50], &s[45], 60 - 48, true);
+ ButterflyRotation_8(&s[49], &s[46], 60 - 48 + 64, true);
+
+ // stage 11.
+ HadamardRotation(&s[32], &s[35], false);
+ HadamardRotation(&s[33], &s[34], false);
+ HadamardRotation(&s[36], &s[39], true);
+ HadamardRotation(&s[37], &s[38], true);
+ HadamardRotation(&s[40], &s[43], false);
+ HadamardRotation(&s[41], &s[42], false);
+ HadamardRotation(&s[44], &s[47], true);
+ HadamardRotation(&s[45], &s[46], true);
+ HadamardRotation(&s[48], &s[51], false);
+ HadamardRotation(&s[49], &s[50], false);
+ HadamardRotation(&s[52], &s[55], true);
+ HadamardRotation(&s[53], &s[54], true);
+ HadamardRotation(&s[56], &s[59], false);
+ HadamardRotation(&s[57], &s[58], false);
+ HadamardRotation(&s[60], &s[63], true);
+ HadamardRotation(&s[61], &s[62], true);
+
+ // stage 16.
+ ButterflyRotation_8(&s[61], &s[34], 56, true);
+ ButterflyRotation_8(&s[60], &s[35], 56, true);
+ ButterflyRotation_8(&s[59], &s[36], 56 + 64, true);
+ ButterflyRotation_8(&s[58], &s[37], 56 + 64, true);
+ ButterflyRotation_8(&s[53], &s[42], 56 - 32, true);
+ ButterflyRotation_8(&s[52], &s[43], 56 - 32, true);
+ ButterflyRotation_8(&s[51], &s[44], 56 - 32 + 64, true);
+ ButterflyRotation_8(&s[50], &s[45], 56 - 32 + 64, true);
+
+ // stage 21.
+ HadamardRotation(&s[32], &s[39], false);
+ HadamardRotation(&s[33], &s[38], false);
+ HadamardRotation(&s[34], &s[37], false);
+ HadamardRotation(&s[35], &s[36], false);
+ HadamardRotation(&s[40], &s[47], true);
+ HadamardRotation(&s[41], &s[46], true);
+ HadamardRotation(&s[42], &s[45], true);
+ HadamardRotation(&s[43], &s[44], true);
+ HadamardRotation(&s[48], &s[55], false);
+ HadamardRotation(&s[49], &s[54], false);
+ HadamardRotation(&s[50], &s[53], false);
+ HadamardRotation(&s[51], &s[52], false);
+ HadamardRotation(&s[56], &s[63], true);
+ HadamardRotation(&s[57], &s[62], true);
+ HadamardRotation(&s[58], &s[61], true);
+ HadamardRotation(&s[59], &s[60], true);
+
+ // stage 25.
+ ButterflyRotation_8(&s[59], &s[36], 48, true);
+ ButterflyRotation_8(&s[58], &s[37], 48, true);
+ ButterflyRotation_8(&s[57], &s[38], 48, true);
+ ButterflyRotation_8(&s[56], &s[39], 48, true);
+ ButterflyRotation_8(&s[55], &s[40], 112, true);
+ ButterflyRotation_8(&s[54], &s[41], 112, true);
+ ButterflyRotation_8(&s[53], &s[42], 112, true);
+ ButterflyRotation_8(&s[52], &s[43], 112, true);
+
+ // stage 28.
+ HadamardRotation(&s[32], &s[47], false);
+ HadamardRotation(&s[33], &s[46], false);
+ HadamardRotation(&s[34], &s[45], false);
+ HadamardRotation(&s[35], &s[44], false);
+ HadamardRotation(&s[36], &s[43], false);
+ HadamardRotation(&s[37], &s[42], false);
+ HadamardRotation(&s[38], &s[41], false);
+ HadamardRotation(&s[39], &s[40], false);
+ HadamardRotation(&s[48], &s[63], true);
+ HadamardRotation(&s[49], &s[62], true);
+ HadamardRotation(&s[50], &s[61], true);
+ HadamardRotation(&s[51], &s[60], true);
+ HadamardRotation(&s[52], &s[59], true);
+ HadamardRotation(&s[53], &s[58], true);
+ HadamardRotation(&s[54], &s[57], true);
+ HadamardRotation(&s[55], &s[56], true);
+
+ // stage 30.
+ ButterflyRotation_8(&s[55], &s[40], 32, true);
+ ButterflyRotation_8(&s[54], &s[41], 32, true);
+ ButterflyRotation_8(&s[53], &s[42], 32, true);
+ ButterflyRotation_8(&s[52], &s[43], 32, true);
+ ButterflyRotation_8(&s[51], &s[44], 32, true);
+ ButterflyRotation_8(&s[50], &s[45], 32, true);
+ ButterflyRotation_8(&s[49], &s[46], 32, true);
+ ButterflyRotation_8(&s[48], &s[47], 32, true);
+
+ // stage 31.
+ for (int i = 0; i < 32; i += 4) {
+ HadamardRotation(&s[i], &s[63 - i], false);
+ HadamardRotation(&s[i + 1], &s[63 - i - 1], false);
+ HadamardRotation(&s[i + 2], &s[63 - i - 2], false);
+ HadamardRotation(&s[i + 3], &s[63 - i - 3], false);
+ }
+ //-- end dct 64 stages
+
+ if (transpose) {
+ for (int idx = 0; idx < 64; idx += 8) {
+ __m128i output[8];
+ Transpose8x8_U16(&s[idx], output);
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 64>(dst, step, 0, s);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
+
+template <bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst4_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[8], x[4];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[8];
+ LoadSrc<8, 8>(dst, step, 0, input);
+ Transpose4x8To8x4_U16(input, x);
+ } else {
+ LoadSrc<16, 4>(dst, step, 0, x);
+ }
+ } else {
+ LoadSrc<8, 4>(dst, step, 0, x);
+ if (transpose) {
+ Transpose4x4_U16(x, x);
+ }
+ }
+
+ const __m128i kAdst4Multiplier_1 = _mm_set1_epi16(kAdst4Multiplier[1]);
+ const __m128i kAdst4Multiplier_2 = _mm_set1_epi16(kAdst4Multiplier[2]);
+ const __m128i kAdst4Multiplier_3 = _mm_set1_epi16(kAdst4Multiplier[3]);
+ const __m128i kAdst4Multiplier_m0_1 =
+ _mm_set1_epi32(static_cast<uint16_t>(kAdst4Multiplier[1]) |
+ (static_cast<uint32_t>(-kAdst4Multiplier[0]) << 16));
+ const __m128i kAdst4Multiplier_3_0 =
+ _mm_set1_epi32(static_cast<uint16_t>(kAdst4Multiplier[0]) |
+ (static_cast<uint32_t>(kAdst4Multiplier[3]) << 16));
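+ // Packing two multipliers per 32-bit lane lets a single _mm_madd_epi16 on
+ // the interleaved inputs compute x0 * k[0] + x2 * k[3] (for s[0]) and
+ // x0 * k[1] - x2 * k[0] (for s[1]).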
+
+ // stage 1.
+ const __m128i x3_x0 = _mm_unpacklo_epi16(x[0], x[3]);
+ const __m128i x2_x0 = _mm_unpacklo_epi16(x[0], x[2]);
+ const __m128i zero_x1 = _mm_cvtepu16_epi32(x[1]);
+ const __m128i zero_x2 = _mm_cvtepu16_epi32(x[2]);
+ const __m128i zero_x3 = _mm_cvtepu16_epi32(x[3]);
+
+ s[5] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_1);
+ s[6] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_3);
+
+ // stage 2.
+ // ((src[0] - src[2]) + src[3]) * kAdst4Multiplier[2]
+ const __m128i k2_x3_x0 = _mm_madd_epi16(x3_x0, kAdst4Multiplier_2);
+ const __m128i k2_zero_x2 = _mm_madd_epi16(zero_x2, kAdst4Multiplier_2);
+ const __m128i b7 = _mm_sub_epi32(k2_x3_x0, k2_zero_x2);
+
+ // stage 3.
+ s[0] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_3_0);
+ s[1] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_m0_1);
+ s[2] = b7;
+ s[3] = _mm_madd_epi16(zero_x1, kAdst4Multiplier_2);
+
+ // stage 4.
+ s[0] = _mm_add_epi32(s[0], s[5]);
+ s[1] = _mm_sub_epi32(s[1], s[6]);
+
+ // stages 5 and 6.
+ x[0] = _mm_add_epi32(s[0], s[3]);
+ x[1] = _mm_add_epi32(s[1], s[3]);
+ x[2] = _mm_add_epi32(s[0], s[1]);
+ x[3] = _mm_sub_epi32(x[2], s[3]);
+
+ x[0] = RightShiftWithRounding_S32(x[0], 12);
+ x[1] = RightShiftWithRounding_S32(x[1], 12);
+ x[2] = RightShiftWithRounding_S32(s[2], 12);
+ x[3] = RightShiftWithRounding_S32(x[3], 12);
+
+ x[0] = _mm_packs_epi32(x[0], x[1]);
+ x[2] = _mm_packs_epi32(x[2], x[3]);
+ x[1] = _mm_srli_si128(x[0], 8);
+ x[3] = _mm_srli_si128(x[2], 8);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[8];
+ Transpose8x4To4x8_U16(x, output);
+ StoreDst<8, 8>(dst, step, 0, output);
+ } else {
+ StoreDst<16, 4>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ Transpose4x4_U16(x, x);
+ }
+ StoreDst<8, 4>(dst, step, 0, x);
+ }
+}
+
+constexpr int16_t kAdst4DcOnlyMultiplier[8] = {1321, 0, 2482, 0,
+ 3344, 0, 2482, 1321};
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src =
+ _mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0), 0);
+ const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+ const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask);
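+ // |v_mask| is all ones when |should_round| is true, so the blend selects
+ // the pre-rounded source; otherwise the original coefficient passes
+ // through. The same selection pattern is used by the other DcOnly helpers
+ // below.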
+ const __m128i v_kAdst4DcOnlyMultipliers =
+ LoadUnaligned16(kAdst4DcOnlyMultiplier);
+ // s0*k0 s0*k1 s0*k2 s0*k1
+ // +
+ // s0*0 s0*0 s0*0 s0*k0
+ const __m128i x3 = _mm_madd_epi16(s0, v_kAdst4DcOnlyMultipliers);
+ const __m128i dst_0 = RightShiftWithRounding_S32(x3, 12);
+ const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+ const __m128i a = _mm_add_epi32(dst_0, v_row_shift_add);
+ const __m128i b = _mm_sra_epi32(a, v_row_shift);
+ const __m128i c = _mm_packs_epi32(b, b);
+ StoreLo8(dst, c);
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int i = 0;
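+ // Process the single nonzero input row four columns at a time. Each input
+ // value c expands to k[0]*c, k[1]*c, k[2]*c and (k[0]+k[1])*c, each right
+ // shifted with rounding by 12 bits and written to rows 0..3 of the block.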
+ do {
+ const __m128i v_src = _mm_cvtepi16_epi32(LoadLo8(&dst[i]));
+ const __m128i kAdst4Multiplier_0 = _mm_set1_epi32(kAdst4Multiplier[0]);
+ const __m128i kAdst4Multiplier_1 = _mm_set1_epi32(kAdst4Multiplier[1]);
+ const __m128i kAdst4Multiplier_2 = _mm_set1_epi32(kAdst4Multiplier[2]);
+ const __m128i s0 = _mm_mullo_epi32(kAdst4Multiplier_0, v_src);
+ const __m128i s1 = _mm_mullo_epi32(kAdst4Multiplier_1, v_src);
+ const __m128i s2 = _mm_mullo_epi32(kAdst4Multiplier_2, v_src);
+ const __m128i x0 = s0;
+ const __m128i x1 = s1;
+ const __m128i x2 = s2;
+ const __m128i x3 = _mm_add_epi32(s0, s1);
+ const __m128i dst_0 = RightShiftWithRounding_S32(x0, 12);
+ const __m128i dst_1 = RightShiftWithRounding_S32(x1, 12);
+ const __m128i dst_2 = RightShiftWithRounding_S32(x2, 12);
+ const __m128i dst_3 = RightShiftWithRounding_S32(x3, 12);
+ const __m128i dst_0_1 = _mm_packs_epi32(dst_0, dst_1);
+ const __m128i dst_2_3 = _mm_packs_epi32(dst_2, dst_3);
+ StoreLo8(&dst[i], dst_0_1);
+ StoreHi8(&dst[i + width * 1], dst_0_1);
+ StoreLo8(&dst[i + width * 2], dst_2_3);
+ StoreHi8(&dst[i + width * 3], dst_2_3);
+ i += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst8_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[8], x[8];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8_U16(input, x);
+ } else {
+ LoadSrc<8, 8>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, 0, input);
+ Transpose8x8_U16(input, x);
+ } else {
+ LoadSrc<16, 8>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1.
+ s[0] = x[7];
+ s[1] = x[0];
+ s[2] = x[5];
+ s[3] = x[2];
+ s[4] = x[3];
+ s[5] = x[4];
+ s[6] = x[1];
+ s[7] = x[6];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 60 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 60 - 16, true);
+ butterfly_rotation(&s[4], &s[5], 60 - 32, true);
+ butterfly_rotation(&s[6], &s[7], 60 - 48, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[4], false);
+ HadamardRotation(&s[1], &s[5], false);
+ HadamardRotation(&s[2], &s[6], false);
+ HadamardRotation(&s[3], &s[7], false);
+
+ // stage 4.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[2], false);
+ HadamardRotation(&s[4], &s[6], false);
+ HadamardRotation(&s[1], &s[3], false);
+ HadamardRotation(&s[5], &s[7], false);
+
+ // stage 6.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ const __m128i v_zero = _mm_setzero_si128();
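+ // Negate with a saturating subtract from zero so that -INT16_MIN clamps to
+ // INT16_MAX instead of wrapping.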
+ x[0] = s[0];
+ x[1] = _mm_subs_epi16(v_zero, s[4]);
+ x[2] = s[6];
+ x[3] = _mm_subs_epi16(v_zero, s[2]);
+ x[4] = s[3];
+ x[5] = _mm_subs_epi16(v_zero, s[7]);
+ x[6] = s[5];
+ x[7] = _mm_subs_epi16(v_zero, s[1]);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[4];
+ Transpose4x8To8x4_U16(x, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ } else {
+ StoreDst<8, 8>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ __m128i output[8];
+ Transpose8x8_U16(x, output);
+ StoreDst<16, 8>(dst, step, 0, output);
+ } else {
+ StoreDst<16, 8>(dst, step, 0, x);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ __m128i s[8];
+
+ const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
+ const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+ // stage 1.
+ s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask);
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ __m128i x[8];
+ const __m128i v_zero = _mm_setzero_si128();
+ x[0] = s[0];
+ x[1] = _mm_subs_epi16(v_zero, s[4]);
+ x[2] = s[6];
+ x[3] = _mm_subs_epi16(v_zero, s[2]);
+ x[4] = s[3];
+ x[5] = _mm_subs_epi16(v_zero, s[7]);
+ x[6] = s[5];
+ x[7] = _mm_subs_epi16(v_zero, s[1]);
+
+ const __m128i x1_x0 = _mm_unpacklo_epi16(x[0], x[1]);
+ const __m128i x3_x2 = _mm_unpacklo_epi16(x[2], x[3]);
+ const __m128i x5_x4 = _mm_unpacklo_epi16(x[4], x[5]);
+ const __m128i x7_x6 = _mm_unpacklo_epi16(x[6], x[7]);
+ const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2);
+ const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6);
+
+ const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+ const __m128i a = _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add);
+ const __m128i a1 = _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add);
+ const __m128i b = _mm_sra_epi32(a, v_row_shift);
+ const __m128i b1 = _mm_sra_epi32(a1, v_row_shift);
+ StoreUnaligned16(dst, _mm_packs_epi32(b, b1));
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ __m128i s[8];
+
+ int i = 0;
+ do {
+ const __m128i v_src = LoadLo8(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ __m128i x[8];
+ const __m128i v_zero = _mm_setzero_si128();
+ x[0] = s[0];
+ x[1] = _mm_subs_epi16(v_zero, s[4]);
+ x[2] = s[6];
+ x[3] = _mm_subs_epi16(v_zero, s[2]);
+ x[4] = s[3];
+ x[5] = _mm_subs_epi16(v_zero, s[7]);
+ x[6] = s[5];
+ x[7] = _mm_subs_epi16(v_zero, s[1]);
+
+ for (int j = 0; j < 8; ++j) {
+ StoreLo8(&dst[j * width], x[j]);
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst16_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[16], x[16];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8_U16(input, x);
+ LoadSrc<16, 4>(dst, step, 8, input);
+ Transpose8x4To4x8_U16(input, &x[8]);
+ } else {
+ LoadSrc<8, 16>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, idx, input);
+ Transpose8x8_U16(input, &x[idx]);
+ }
+ } else {
+ LoadSrc<16, 16>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1.
+ s[0] = x[15];
+ s[1] = x[0];
+ s[2] = x[13];
+ s[3] = x[2];
+ s[4] = x[11];
+ s[5] = x[4];
+ s[6] = x[9];
+ s[7] = x[6];
+ s[8] = x[7];
+ s[9] = x[8];
+ s[10] = x[5];
+ s[11] = x[10];
+ s[12] = x[3];
+ s[13] = x[12];
+ s[14] = x[1];
+ s[15] = x[14];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 62 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 62 - 8, true);
+ butterfly_rotation(&s[4], &s[5], 62 - 16, true);
+ butterfly_rotation(&s[6], &s[7], 62 - 24, true);
+ butterfly_rotation(&s[8], &s[9], 62 - 32, true);
+ butterfly_rotation(&s[10], &s[11], 62 - 40, true);
+ butterfly_rotation(&s[12], &s[13], 62 - 48, true);
+ butterfly_rotation(&s[14], &s[15], 62 - 56, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[8], false);
+ HadamardRotation(&s[1], &s[9], false);
+ HadamardRotation(&s[2], &s[10], false);
+ HadamardRotation(&s[3], &s[11], false);
+ HadamardRotation(&s[4], &s[12], false);
+ HadamardRotation(&s[5], &s[13], false);
+ HadamardRotation(&s[6], &s[14], false);
+ HadamardRotation(&s[7], &s[15], false);
+
+ // stage 4.
+ butterfly_rotation(&s[8], &s[9], 56 - 0, true);
+ butterfly_rotation(&s[13], &s[12], 8 + 0, true);
+ butterfly_rotation(&s[10], &s[11], 56 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 8 + 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[4], false);
+ HadamardRotation(&s[8], &s[12], false);
+ HadamardRotation(&s[1], &s[5], false);
+ HadamardRotation(&s[9], &s[13], false);
+ HadamardRotation(&s[2], &s[6], false);
+ HadamardRotation(&s[10], &s[14], false);
+ HadamardRotation(&s[3], &s[7], false);
+ HadamardRotation(&s[11], &s[15], false);
+
+ // stage 6.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[12], &s[13], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 48 - 32, true);
+
+ // stage 7.
+ HadamardRotation(&s[0], &s[2], false);
+ HadamardRotation(&s[4], &s[6], false);
+ HadamardRotation(&s[8], &s[10], false);
+ HadamardRotation(&s[12], &s[14], false);
+ HadamardRotation(&s[1], &s[3], false);
+ HadamardRotation(&s[5], &s[7], false);
+ HadamardRotation(&s[9], &s[11], false);
+ HadamardRotation(&s[13], &s[15], false);
+
+ // stage 8.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+ butterfly_rotation(&s[10], &s[11], 32, true);
+ butterfly_rotation(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ const __m128i v_zero = _mm_setzero_si128();
+ x[0] = s[0];
+ x[1] = _mm_subs_epi16(v_zero, s[8]);
+ x[2] = s[12];
+ x[3] = _mm_subs_epi16(v_zero, s[4]);
+ x[4] = s[6];
+ x[5] = _mm_subs_epi16(v_zero, s[14]);
+ x[6] = s[10];
+ x[7] = _mm_subs_epi16(v_zero, s[2]);
+ x[8] = s[3];
+ x[9] = _mm_subs_epi16(v_zero, s[11]);
+ x[10] = s[15];
+ x[11] = _mm_subs_epi16(v_zero, s[7]);
+ x[12] = s[5];
+ x[13] = _mm_subs_epi16(v_zero, s[13]);
+ x[14] = s[9];
+ x[15] = _mm_subs_epi16(v_zero, s[1]);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[4];
+ Transpose4x8To8x4_U16(x, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ Transpose4x8To8x4_U16(&x[8], output);
+ StoreDst<16, 4>(dst, step, 8, output);
+ } else {
+ StoreDst<8, 16>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ __m128i output[8];
+ Transpose8x8_U16(&x[idx], output);
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 16>(dst, step, 0, x);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(__m128i* s, __m128i* x) {
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);
+
+ // stage 3.
+ s[8] = s[0];
+ s[9] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[8], &s[9], 56, true);
+
+ // stage 5.
+ s[4] = s[0];
+ s[12] = s[8];
+ s[5] = s[1];
+ s[13] = s[9];
+
+ // stage 6.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+ ButterflyRotation_4(&s[12], &s[13], 48, true);
+
+ // stage 7.
+ s[2] = s[0];
+ s[6] = s[4];
+ s[10] = s[8];
+ s[14] = s[12];
+ s[3] = s[1];
+ s[7] = s[5];
+ s[11] = s[9];
+ s[15] = s[13];
+
+ // stage 8.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+ ButterflyRotation_4(&s[10], &s[11], 32, true);
+ ButterflyRotation_4(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ const __m128i v_zero = _mm_setzero_si128();
+ x[0] = s[0];
+ x[1] = _mm_subs_epi16(v_zero, s[8]);
+ x[2] = s[12];
+ x[3] = _mm_subs_epi16(v_zero, s[4]);
+ x[4] = s[6];
+ x[5] = _mm_subs_epi16(v_zero, s[14]);
+ x[6] = s[10];
+ x[7] = _mm_subs_epi16(v_zero, s[2]);
+ x[8] = s[3];
+ x[9] = _mm_subs_epi16(v_zero, s[11]);
+ x[10] = s[15];
+ x[11] = _mm_subs_epi16(v_zero, s[7]);
+ x[12] = s[5];
+ x[13] = _mm_subs_epi16(v_zero, s[13]);
+ x[14] = s[9];
+ x[15] = _mm_subs_epi16(v_zero, s[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ __m128i s[16];
+ __m128i x[16];
+
+ const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
+ const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+ // stage 1.
+ s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask);
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int i = 0; i < 2; ++i) {
+ const __m128i x1_x0 = _mm_unpacklo_epi16(x[0 + i * 8], x[1 + i * 8]);
+ const __m128i x3_x2 = _mm_unpacklo_epi16(x[2 + i * 8], x[3 + i * 8]);
+ const __m128i x5_x4 = _mm_unpacklo_epi16(x[4 + i * 8], x[5 + i * 8]);
+ const __m128i x7_x6 = _mm_unpacklo_epi16(x[6 + i * 8], x[7 + i * 8]);
+ const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2);
+ const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6);
+
+ const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+ const __m128i a = _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add);
+ const __m128i a1 =
+ _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add);
+ const __m128i b = _mm_sra_epi32(a, v_row_shift);
+ const __m128i b1 = _mm_sra_epi32(a1, v_row_shift);
+ StoreUnaligned16(&dst[i * 8], _mm_packs_epi32(b, b1));
+ }
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
+ int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int i = 0;
+ do {
+ __m128i s[16];
+ __m128i x[16];
+ const __m128i v_src = LoadUnaligned16(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int j = 0; j < 16; ++j) {
+ StoreLo8(&dst[j * width], x[j]);
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+//------------------------------------------------------------------------------
+// Identity Transforms.
+
+template <bool is_row_shift>
+LIBGAV1_ALWAYS_INLINE void Identity4_SSE4_1(void* dest, int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ if (is_row_shift) {
+ const int shift = 1;
+ const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+ const __m128i v_multiplier_one =
+ _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001);
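+ // Interleaving the rounding constant with the source lets one
+ // _mm_madd_epi16 produce src * kIdentity4Multiplier + ((1 + 2 * shift) << 11)
+ // per 32-bit lane; the arithmetic shift by (12 + shift) then yields the
+ // rounded, row-shifted result.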
+ for (int i = 0; i < 4; i += 2) {
+ const __m128i v_src = LoadUnaligned16(&dst[i * step]);
+ const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src);
+ const __m128i v_src_round_hi = _mm_unpackhi_epi16(v_dual_round, v_src);
+ const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one);
+ const __m128i a_hi = _mm_madd_epi16(v_src_round_hi, v_multiplier_one);
+ const __m128i b = _mm_srai_epi32(a, 12 + shift);
+ const __m128i b_hi = _mm_srai_epi32(a_hi, 12 + shift);
+ StoreUnaligned16(&dst[i * step], _mm_packs_epi32(b, b_hi));
+ }
+ } else {
+ const __m128i v_multiplier =
+ _mm_set1_epi16(kIdentity4MultiplierFraction << 3);
+ for (int i = 0; i < 4; i += 2) {
+ const __m128i v_src = LoadUnaligned16(&dst[i * step]);
+ const __m128i a = _mm_mulhrs_epi16(v_src, v_multiplier);
+ const __m128i b = _mm_adds_epi16(a, v_src);
+ StoreUnaligned16(&dst[i * step], b);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int tx_height) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+ const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+ const __m128i v_src = _mm_blendv_epi8(v_src0, v_src_round, v_mask);
+
+ const int shift = (tx_height < 16) ? 0 : 1;
+ const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+ const __m128i v_multiplier_one =
+ _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001);
+ const __m128i v_src_round_lo = _mm_unpacklo_epi16(v_dual_round, v_src);
+ const __m128i a = _mm_madd_epi16(v_src_round_lo, v_multiplier_one);
+ const __m128i b = _mm_srai_epi32(a, 12 + shift);
+ dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity4ColumnStoreToFrame(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source) {
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+
+ const __m128i v_multiplier_fraction =
+ _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3));
+ const __m128i v_eight = _mm_set1_epi16(8);
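+ // Apply the identity4 column scale as src + mulhrs(src, fraction << 3),
+ // round the result with (x + 8) >> 4, then add it to the frame pixels and
+ // clamp to [0, 255].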
+
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ const __m128i v_src = LoadLo8(&source[i * tx_width]);
+ const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
+ const __m128i frame_data = Load4(dst);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ Store4(dst, _mm_packus_epi16(d, d));
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&source[row + j]);
+ const __m128i v_src_mult =
+ _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
+ const __m128i frame_data = LoadLo8(dst + j);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ StoreLo8(dst + j, _mm_packus_epi16(d, d));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source) {
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+
+ const __m128i v_multiplier_fraction =
+ _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3));
+ const __m128i v_eight = _mm_set1_epi16(8);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ const __m128i v_src = LoadLo8(&source[i * tx_width]);
+ const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
+ const __m128i frame_data = Load4(dst);
+ const __m128i v_dst_row = _mm_adds_epi16(v_src_mult, v_src);
+ const __m128i v_src_mult2 =
+ _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction);
+ const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data);
+ const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row);
+ const __m128i a = _mm_adds_epi16(v_dst_col, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_adds_epi16(frame_data16, b);
+ Store4(dst, _mm_packus_epi16(c, c));
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&source[row + j]);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+ const __m128i v_dst_row = _mm_adds_epi16(v_src_round, v_src_round);
+ const __m128i v_src_mult2 =
+ _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction);
+ const __m128i frame_data = LoadLo8(dst + j);
+ const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data);
+ const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row);
+ const __m128i a = _mm_adds_epi16(v_dst_col, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_adds_epi16(frame_data16, b);
+ StoreLo8(dst + j, _mm_packus_epi16(c, c));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_SSE4_1(void* dest, int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height equal to 32 can be simplified from
+ // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
+ const __m128i v_row_multiplier = _mm_set1_epi16(1 << 14);
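+ // _mm_mulhrs_epi16 with 1 << 14 computes exactly (A + 1) >> 1 for any
+ // 16-bit A.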
+ for (int h = 0; h < 4; ++h) {
+ const __m128i v_src = LoadUnaligned16(&dst[h * step]);
+ const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_row_multiplier);
+ StoreUnaligned16(&dst[h * step], v_src_mult);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_SSE4_1(void* dest, int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ for (int h = 0; h < 4; ++h) {
+ const __m128i v_src = LoadUnaligned16(&dst[h * step]);
+ // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
+ // saturating add here is ok.
+ const __m128i a = _mm_adds_epi16(v_src, v_src);
+ StoreUnaligned16(&dst[h * step], a);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+ const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+ const __m128i v_src =
+ _mm_cvtepi16_epi32(_mm_blendv_epi8(v_src0, v_src_round, v_mask));
+ const __m128i v_srcx2 = _mm_add_epi32(v_src, v_src);
+ const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+ const __m128i a = _mm_add_epi32(v_srcx2, v_row_shift_add);
+ const __m128i b = _mm_sra_epi32(a, v_row_shift);
+ dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8ColumnStoreToFrame_SSE4_1(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source) {
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+ const __m128i v_eight = _mm_set1_epi16(8);
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ const __m128i v_src = LoadLo8(&source[row]);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
+ const __m128i frame_data = Load4(dst);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ Store4(dst, _mm_packus_epi16(d, d));
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&source[row + j]);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
+ const __m128i frame_data = LoadLo8(dst + j);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ StoreLo8(dst + j, _mm_packus_epi16(d, d));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16Row_SSE4_1(void* dest, int32_t step,
+ int shift) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+ const __m128i v_multiplier_one =
+ _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001);
+ const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);
+
+ for (int h = 0; h < 4; ++h) {
+ const __m128i v_src = LoadUnaligned16(&dst[h * step]);
+ const __m128i v_src2 = LoadUnaligned16(&dst[h * step + 8]);
+ const __m128i v_src_round0 = _mm_unpacklo_epi16(v_dual_round, v_src);
+ const __m128i v_src_round1 = _mm_unpackhi_epi16(v_dual_round, v_src);
+ const __m128i v_src2_round0 = _mm_unpacklo_epi16(v_dual_round, v_src2);
+ const __m128i v_src2_round1 = _mm_unpackhi_epi16(v_dual_round, v_src2);
+ const __m128i madd0 = _mm_madd_epi16(v_src_round0, v_multiplier_one);
+ const __m128i madd1 = _mm_madd_epi16(v_src_round1, v_multiplier_one);
+ const __m128i madd20 = _mm_madd_epi16(v_src2_round0, v_multiplier_one);
+ const __m128i madd21 = _mm_madd_epi16(v_src2_round1, v_multiplier_one);
+ const __m128i shift0 = _mm_sra_epi32(madd0, v_shift);
+ const __m128i shift1 = _mm_sra_epi32(madd1, v_shift);
+ const __m128i shift20 = _mm_sra_epi32(madd20, v_shift);
+ const __m128i shift21 = _mm_sra_epi32(madd21, v_shift);
+ StoreUnaligned16(&dst[h * step], _mm_packs_epi32(shift0, shift1));
+ StoreUnaligned16(&dst[h * step + 8], _mm_packs_epi32(shift20, shift21));
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+ const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round0 =
+ _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+ const __m128i v_src = _mm_blendv_epi8(v_src0, v_src_round0, v_mask);
+ const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+ const __m128i v_multiplier_one =
+ _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001);
+ const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);
+ const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src);
+ const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one);
+ const __m128i b = _mm_sra_epi32(a, v_shift);
+ dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16ColumnStoreToFrame_SSE4_1(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source) {
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+ const __m128i v_eight = _mm_set1_epi16(8);
+ const __m128i v_multiplier =
+ _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 4));
+
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ const __m128i v_src = LoadLo8(&source[i * tx_width]);
+ const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier);
+ const __m128i frame_data = Load4(dst);
+ const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ Store4(dst, _mm_packus_epi16(d, d));
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&source[row + j]);
+ const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier);
+ const __m128i frame_data = LoadLo8(dst + j);
+ const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ StoreLo8(dst + j, _mm_packus_epi16(d, d));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity32Row16_SSE4_1(void* dest,
+ const int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ // When combining the identity32 multiplier with the row shift, the
+ // calculation for tx_height equal to 16 can be simplified from
+ // (((A * 4) + 1) >> 1) to (A * 2).
+ for (int h = 0; h < 4; ++h) {
+ for (int i = 0; i < 32; i += 8) {
+ const __m128i v_src = LoadUnaligned16(&dst[h * step + i]);
+ // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
+ // saturating add here is ok.
+ const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
+ StoreUnaligned16(&dst[h * step + i], v_dst_i);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest,
+ int adjusted_tx_height) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src = _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+
+ // When combining the identity32 multiplier with the row shift, the
+ // calculation for tx_height equal to 16 can be simplified from
+ // (((A * 4) + 1) >> 1) to (A * 2).
+ const __m128i v_dst_0 = _mm_adds_epi16(v_src, v_src);
+ dst[0] = _mm_extract_epi16(v_dst_0, 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity32ColumnStoreToFrame(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source) {
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+ const __m128i v_two = _mm_set1_epi16(2);
+
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const __m128i v_dst_i = LoadUnaligned16(&source[row + j]);
+ const __m128i frame_data = LoadLo8(dst + j);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_two);
+ const __m128i b = _mm_srai_epi16(a, 2);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ StoreLo8(dst + j, _mm_packus_epi16(d, d));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+}
+
+//------------------------------------------------------------------------------
+// Walsh Hadamard Transform.
+
+// Process 4 wht4 rows and columns.
+LIBGAV1_ALWAYS_INLINE void Wht4_SSE4_1(Array2DView<uint8_t> frame,
+ const int start_x, const int start_y,
+ const void* source,
+ const int adjusted_tx_height) {
+ const auto* const src = static_cast<const int16_t*>(source);
+ __m128i s[4], x[4];
+
+ if (adjusted_tx_height == 1) {
+ // Special case: only src[0] is nonzero.
+ // src[0] 0 0 0
+ // 0 0 0 0
+ // 0 0 0 0
+ // 0 0 0 0
+ //
+ // After the row and column transforms are applied, we have:
+ // f h h h
+ // g i i i
+ // g i i i
+ // g i i i
+ // where f, g, h, i are computed as follows.
+ int16_t f = (src[0] >> 2) - (src[0] >> 3);
+ const int16_t g = f >> 1;
+ f = f - (f >> 1);
+ const int16_t h = (src[0] >> 3) - (src[0] >> 4);
+ const int16_t i = (src[0] >> 4);
+ s[0] = _mm_set1_epi16(h);
+ s[0] = _mm_insert_epi16(s[0], f, 0);
+ s[1] = _mm_set1_epi16(i);
+ s[1] = _mm_insert_epi16(s[1], g, 0);
+ s[2] = s[3] = s[1];
+ } else {
+ x[0] = LoadLo8(&src[0 * 4]);
+ x[2] = LoadLo8(&src[1 * 4]);
+ x[3] = LoadLo8(&src[2 * 4]);
+ x[1] = LoadLo8(&src[3 * 4]);
+
+ // Row transforms.
+ Transpose4x4_U16(x, x);
+ s[0] = _mm_srai_epi16(x[0], 2);
+ s[2] = _mm_srai_epi16(x[1], 2);
+ s[3] = _mm_srai_epi16(x[2], 2);
+ s[1] = _mm_srai_epi16(x[3], 2);
+ s[0] = _mm_add_epi16(s[0], s[2]);
+ s[3] = _mm_sub_epi16(s[3], s[1]);
+ __m128i e = _mm_sub_epi16(s[0], s[3]);
+ e = _mm_srai_epi16(e, 1);
+ s[1] = _mm_sub_epi16(e, s[1]);
+ s[2] = _mm_sub_epi16(e, s[2]);
+ s[0] = _mm_sub_epi16(s[0], s[1]);
+ s[3] = _mm_add_epi16(s[3], s[2]);
+ Transpose4x4_U16(s, s);
+
+ // Column transforms.
+ s[0] = _mm_add_epi16(s[0], s[2]);
+ s[3] = _mm_sub_epi16(s[3], s[1]);
+ e = _mm_sub_epi16(s[0], s[3]);
+ e = _mm_srai_epi16(e, 1);
+ s[1] = _mm_sub_epi16(e, s[1]);
+ s[2] = _mm_sub_epi16(e, s[2]);
+ s[0] = _mm_sub_epi16(s[0], s[1]);
+ s[3] = _mm_add_epi16(s[3], s[2]);
+ }
+
+ // Store to frame.
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+ for (int row = 0; row < 4; ++row) {
+ const __m128i frame_data = Load4(dst);
+ const __m128i a = _mm_cvtepu8_epi16(frame_data);
+ // Saturate to prevent overflowing int16_t
+ const __m128i b = _mm_adds_epi16(a, s[row]);
+ Store4(dst, _mm_packus_epi16(b, b));
+ dst += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loops
+
+template <bool enable_flip_rows = false>
+LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source,
+ TransformType tx_type) {
+ const bool flip_rows =
+ enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
+ const __m128i v_eight = _mm_set1_epi16(8);
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
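+ // Round each residual with (x + 8) >> 4, add it to the reconstructed frame
+ // pixels and clamp to [0, 255]. When |flip_rows| is set the residual rows
+ // are read in reverse order.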
+ if (tx_width == 4) {
+ for (int i = 0; i < tx_height; ++i) {
+ const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
+ const __m128i residual = LoadLo8(&source[row]);
+ const __m128i frame_data = Load4(dst);
+ // Saturate to prevent overflowing int16_t
+ const __m128i a = _mm_adds_epi16(residual, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ Store4(dst, _mm_packus_epi16(d, d));
+ dst += stride;
+ }
+ } else if (tx_width == 8) {
+ for (int i = 0; i < tx_height; ++i) {
+ const int row = flip_rows ? (tx_height - i - 1) * 8 : i * 8;
+ const __m128i residual = LoadUnaligned16(&source[row]);
+ const __m128i frame_data = LoadLo8(dst);
+ // Saturate to prevent overflowing int16_t
+ const __m128i b = _mm_adds_epi16(residual, v_eight);
+ const __m128i c = _mm_srai_epi16(b, 4);
+ const __m128i d = _mm_cvtepu8_epi16(frame_data);
+ const __m128i e = _mm_adds_epi16(d, c);
+ StoreLo8(dst, _mm_packus_epi16(e, e));
+ dst += stride;
+ }
+ } else {
+ for (int i = 0; i < tx_height; ++i) {
+ const int y = start_y + i;
+ const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
+ int j = 0;
+ do {
+ const int x = start_x + j;
+ const __m128i residual = LoadUnaligned16(&source[row + j]);
+ const __m128i residual_hi = LoadUnaligned16(&source[row + j + 8]);
+ const __m128i frame_data = LoadUnaligned16(frame[y] + x);
+ const __m128i b = _mm_adds_epi16(residual, v_eight);
+ const __m128i b_hi = _mm_adds_epi16(residual_hi, v_eight);
+ const __m128i c = _mm_srai_epi16(b, 4);
+ const __m128i c_hi = _mm_srai_epi16(b_hi, 4);
+ const __m128i d = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d_hi = _mm_cvtepu8_epi16(_mm_srli_si128(frame_data, 8));
+ const __m128i e = _mm_adds_epi16(d, c);
+ const __m128i e_hi = _mm_adds_epi16(d_hi, c_hi);
+ StoreUnaligned16(frame[y] + x, _mm_packus_epi16(e, e_hi));
+ j += 16;
+ } while (j < tx_width);
+ }
+ }
+}
+
+template <int tx_height>
+LIBGAV1_ALWAYS_INLINE void FlipColumns(int16_t* source, int tx_width) {
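+ // word_reverse_8 reverses the order of the eight 16-bit values in a
+ // 128-bit register.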
+ const __m128i word_reverse_8 =
+ _mm_set_epi32(0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
+ if (tx_width >= 16) {
+ int i = 0;
+ do {
+ // Read 16 shorts.
+ const __m128i v3210 = LoadUnaligned16(&source[i]);
+ const __m128i v7654 = LoadUnaligned16(&source[i + 8]);
+ const __m128i v0123 = _mm_shuffle_epi8(v3210, word_reverse_8);
+ const __m128i v4567 = _mm_shuffle_epi8(v7654, word_reverse_8);
+ StoreUnaligned16(&source[i], v4567);
+ StoreUnaligned16(&source[i + 8], v0123);
+ i += 16;
+ } while (i < tx_width * tx_height);
+ } else if (tx_width == 8) {
+ for (int i = 0; i < 8 * tx_height; i += 8) {
+ const __m128i a = LoadUnaligned16(&source[i]);
+ const __m128i b = _mm_shuffle_epi8(a, word_reverse_8);
+ StoreUnaligned16(&source[i], b);
+ }
+ } else {
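+ // dual_word_reverse_4 reverses the four 16-bit values within each 64-bit
+ // half, so each packed 4-wide row is reversed independently.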
+ const __m128i dual_word_reverse_4 =
+ _mm_set_epi32(0x09080b0a, 0x0d0c0f0e, 0x01000302, 0x05040706);
+ // Process two rows per iteration.
+ for (int i = 0; i < 4 * tx_height; i += 8) {
+ const __m128i a = LoadUnaligned16(&source[i]);
+ const __m128i b = _mm_shuffle_epi8(a, dual_word_reverse_4);
+ StoreUnaligned16(&source[i], b);
+ }
+ }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void ApplyRounding(int16_t* source, int num_rows) {
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
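+ // _mm_mulhrs_epi16(a, m << 3) evaluates to (a * m + 2048) >> 12, so the
+ // row multiplier is applied with a rounded shift by 12.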
+ if (tx_width == 4) {
+ // Process two rows per iteration.
+ int i = 0;
+ do {
+ const __m128i a = LoadUnaligned16(&source[i]);
+ const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
+ StoreUnaligned16(&source[i], b);
+ i += 8;
+ } while (i < tx_width * num_rows);
+ } else {
+ int i = 0;
+ do {
+ // The last 32 values of every row are always zero if the |tx_width| is
+ // 64.
+ const int non_zero_width = (tx_width < 64) ? tx_width : 32;
+ int j = 0;
+ do {
+ const __m128i a = LoadUnaligned16(&source[i * tx_width + j]);
+ const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
+ StoreUnaligned16(&source[i * tx_width + j], b);
+ j += 8;
+ } while (j < non_zero_width);
+ } while (++i < num_rows);
+ }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void RowShift(int16_t* source, int num_rows,
+ int row_shift) {
+ const __m128i v_row_shift_add = _mm_set1_epi16(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu16_epi64(v_row_shift_add);
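+ // Note: row_shift is 1 or 2 here, so the broadcast row_shift value doubles
+ // as the rounding bias 1 << (row_shift - 1).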
+ if (tx_width == 4) {
+ // Process two rows per iteration.
+ int i = 0;
+ do {
+ const __m128i residual = LoadUnaligned16(&source[i]);
+ const __m128i shifted_residual =
+ ShiftResidual(residual, v_row_shift_add, v_row_shift);
+ StoreUnaligned16(&source[i], shifted_residual);
+ i += 8;
+ } while (i < tx_width * num_rows);
+ } else {
+ int i = 0;
+ do {
+ for (int j = 0; j < tx_width; j += 8) {
+ const __m128i residual = LoadUnaligned16(&source[i * tx_width + j]);
+ const __m128i shifted_residual =
+ ShiftResidual(residual, v_row_shift_add, v_row_shift);
+ StoreUnaligned16(&source[i * tx_width + j], shifted_residual);
+ }
+ } while (++i < num_rows);
+ }
+}
+
+void Dct4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+ const int row_shift = static_cast<int>(tx_height == 16);
+
+ if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d dct4 rows in parallel.
+ Dct4_SSE4_1<ButterflyRotation_4, false>(src, /*step=*/4,
+ /*transpose=*/true);
+ } else {
+ // Process 8 1d dct4 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i * 4], /*step=*/4,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ if (tx_height == 16) {
+ RowShift<4>(src, adjusted_tx_height, 1);
+ }
+}
+
+void Dct4TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d dct4 columns in parallel.
+ Dct4_SSE4_1<ButterflyRotation_4, false>(src, tx_width,
+ /*transpose=*/false);
+ } else {
+ // Process 8 1d dct4 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i], tx_width,
+ /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound(frame, start_x, start_y, tx_width, 4, src, tx_type);
+}
+
+void Dct8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d dct8 rows in parallel.
+ Dct8_SSE4_1<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
+ } else {
+ // Process 8 1d dct8 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], /*step=*/8,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ if (row_shift > 0) {
+ RowShift<8>(src, adjusted_tx_height, row_shift);
+ }
+}
+
+void Dct8TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d dct8 columns in parallel.
+ Dct8_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ // Process 8 1d dct8 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+ /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound(frame, start_x, start_y, tx_width, 8, src, tx_type);
+}
+
+void Dct16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d dct16 rows in parallel.
+ Dct16_SSE4_1<ButterflyRotation_4, true>(src, 16, /*transpose=*/true);
+ } else {
+ int i = 0;
+ do {
+ // Process 8 1d dct16 rows in parallel per iteration.
+ Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], 16,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ // row_shift is always non-zero here.
+ RowShift<16>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct16TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d dct16 columns in parallel.
+ Dct16_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ int i = 0;
+ do {
+ // Process 8 1d dct16 columns in parallel per iteration.
+ Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+ /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound(frame, start_x, start_y, tx_width, 16, src, tx_type);
+}
+
+void Dct32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<32>(src, adjusted_tx_height);
+ }
+ // Process 8 1d dct32 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct32_SSE4_1(&src[i * 32], 32, /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ // row_shift is always non-zero here.
+ RowShift<32>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct32TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
+ // Process 8 1d dct32 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Dct32_SSE4_1(&src[i], tx_width, /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound(frame, start_x, start_y, tx_width, 32, src, tx_type);
+}
+
+void Dct64TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<64>(src, adjusted_tx_height);
+ }
+ // Process 8 1d dct64 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct64_SSE4_1(&src[i * 64], 64, /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ // row_shift is always non-zero here.
+ RowShift<64>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct64TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
+ // Process 8 1d dct64 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Dct64_SSE4_1(&src[i], tx_width, /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound(frame, start_x, start_y, tx_width, 64, src, tx_type);
+}
+
+void Adst4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const int row_shift = static_cast<int>(tx_height == 16);
+ const bool should_round = (tx_height == 8);
+
+ if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d adst4 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Adst4_SSE4_1<false>(&src[i * 4], /*step=*/4, /*transpose=*/true);
+ i += 4;
+ } while (i < adjusted_tx_height);
+
+ if (row_shift != 0) {
+ RowShift<4>(src, adjusted_tx_height, 1);
+ }
+}
+
+void Adst4TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d adst4 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Adst4_SSE4_1<false>(&src[i], tx_width, /*transpose=*/false);
+ i += 4;
+ } while (i < tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, 4, src, tx_type);
+}
+
+void Adst8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d adst8 rows in parallel.
+ Adst8_SSE4_1<ButterflyRotation_4, true>(src, /*step=*/8,
+ /*transpose=*/true);
+ } else {
+ // Process 8 1d adst8 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], /*step=*/8,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ if (row_shift > 0) {
+ RowShift<8>(src, adjusted_tx_height, row_shift);
+ }
+}
+
+void Adst8TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d adst8 columns in parallel.
+ Adst8_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ // Process 8 1d adst8 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+ /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, 8, src, tx_type);
+}
+
+void Adst16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d adst16 rows in parallel.
+ Adst16_SSE4_1<ButterflyRotation_4, true>(src, 16, /*transpose=*/true);
+ } else {
+ int i = 0;
+ do {
+ // Process 8 1d adst16 rows in parallel per iteration.
+ Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], 16,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ // row_shift is always non-zero here.
+ RowShift<16>(src, adjusted_tx_height, row_shift);
+}
+
+void Adst16TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d adst16 columns in parallel.
+ Adst16_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ int i = 0;
+ do {
+ // Process 8 1d adst16 columns in parallel per iteration.
+ Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+ /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ }
+ StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, 16, src, tx_type);
+}
+
+void Identity4TransformLoopRow_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize4x4) {
+ return;
+ }
+
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+ if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+ if (tx_height < 16) {
+ int i = 0;
+ do {
+ Identity4_SSE4_1<false>(&src[i * 4], /*step=*/4);
+ i += 4;
+ } while (i < adjusted_tx_height);
+ } else {
+ int i = 0;
+ do {
+ Identity4_SSE4_1<true>(&src[i * 4], /*step=*/4);
+ i += 4;
+ } while (i < adjusted_tx_height);
+ }
+}
+
+void Identity4TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ // Special case: Process row calculations during column transform call.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
+ Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+ return;
+ }
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ Identity4ColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity8TransformLoopRow_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize8x4) {
+ return;
+ }
+
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+ if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 16 can be simplified
+ // from ((A * 2) + 1) >> 1 to A.
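+ // The mask 0x18 below selects tx_height == 8 and tx_height == 16.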
+ if ((tx_height & 0x18) != 0) {
+ return;
+ }
+ if (tx_height == 32) {
+ int i = 0;
+ do {
+ Identity8Row32_SSE4_1(&src[i * 8], /*step=*/8);
+ i += 4;
+ } while (i < adjusted_tx_height);
+ return;
+ }
+
+ assert(tx_size == kTransformSize8x4);
+ int i = 0;
+ do {
+ Identity8Row4_SSE4_1(&src[i * 8], /*step=*/8);
+ i += 4;
+ } while (i < adjusted_tx_height);
+}
+
+void Identity8TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ Identity8ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+ if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+ int i = 0;
+ do {
+ Identity16Row_SSE4_1(&src[i * 16], /*step=*/16,
+ kTransformRowShift[tx_size]);
+ i += 4;
+ } while (i < adjusted_tx_height);
+}
+
+void Identity16TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ Identity16ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ const int tx_height = kTransformHeight[tx_size];
+ // When combining the identity32 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 32 can be simplified
+ // from ((A * 4) + 2) >> 2 to A.
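+ // The mask 0x28 below selects tx_height == 8 and tx_height == 32.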
+ if ((tx_height & 0x28) != 0) {
+ return;
+ }
+
+ // Process kTransformSize32x16. The src is always rounded before the
+ // identity transform and shifted by 1 afterwards.
+ auto* src = static_cast<int16_t*>(src_buffer);
+ if (Identity32DcOnly(src, adjusted_tx_height)) {
+ return;
+ }
+
+ assert(tx_size == kTransformSize32x16);
+ ApplyRounding<32>(src, adjusted_tx_height);
+ int i = 0;
+ do {
+ Identity32Row16_SSE4_1(&src[i * 32], /*step=*/32);
+ i += 4;
+ } while (i < adjusted_tx_height);
+}
+
+void Identity32TransformLoopColumn_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ Identity32ColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Wht4TransformLoopRow_SSE4_1(TransformType tx_type, TransformSize tx_size,
+ int /*adjusted_tx_height*/,
+ void* /*src_buffer*/, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ assert(tx_type == kTransformTypeDctDct);
+ assert(tx_size == kTransformSize4x4);
+ static_cast<void>(tx_type);
+ static_cast<void>(tx_size);
+ // Do both row and column transforms in the column-transform pass.
+}
+
+void Wht4TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ assert(tx_type == kTransformTypeDctDct);
+ assert(tx_size == kTransformSize4x4);
+ static_cast<void>(tx_type);
+ static_cast<void>(tx_size);
+
+ // Do both row and column transforms in the column-transform pass.
+ // Process 4 1d wht4 rows and columns in parallel.
+ const auto* src = static_cast<int16_t*>(src_buffer);
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ Wht4_SSE4_1(frame, start_x, start_y, src, adjusted_tx_height);
+}
+
+//------------------------------------------------------------------------------
+
+template <typename Residual, typename Pixel>
+void InitAll(Dsp* const dsp) {
+ // Maximum transform size for Dct is 64.
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+ Dct4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ Dct4TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+ Dct8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ Dct8TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+ Dct16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ Dct16TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ Dct32TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ Dct32TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ Dct64TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ Dct64TransformLoopColumn_SSE4_1;
+
+ // Maximum transform size for Adst is 16.
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+ Adst4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ Adst4TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+ Adst8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ Adst8TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ Adst16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ Adst16TransformLoopColumn_SSE4_1;
+
+ // Maximum transform size for Identity transform is 32.
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+ Identity4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ Identity4TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+ Identity8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ Identity8TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+ Identity16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ Identity16TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+ Identity32TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+ Identity32TransformLoopColumn_SSE4_1;
+
+ // Maximum transform size for Wht is 4.
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+ Wht4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+ Wht4TransformLoopColumn_SSE4_1;
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ InitAll<int16_t, uint8_t>(dsp);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformDct)
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+ Dct4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ Dct4TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformDct)
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+ Dct8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ Dct8TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformDct)
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+ Dct16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ Dct16TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformDct)
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ Dct32TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ Dct32TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize64_1DTransformDct)
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ Dct64TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ Dct64TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformAdst)
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+ Adst4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ Adst4TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformAdst)
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+ Adst8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ Adst8TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformAdst)
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ Adst16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ Adst16TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformIdentity)
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+ Identity4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ Identity4TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformIdentity)
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+ Identity8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ Identity8TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformIdentity)
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+ Identity16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ Identity16TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformIdentity)
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+ Identity32TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+ Identity32TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformWht)
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+ Wht4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+ Wht4TransformLoopColumn_SSE4_1;
+#endif
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void InverseTransformInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void InverseTransformInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/inverse_transform_sse4.h b/src/dsp/x86/inverse_transform_sse4.h
new file mode 100644
index 0000000..106084b
--- /dev/null
+++ b/src/dsp/x86/inverse_transform_sse4.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::inverse_transforms, see the defines below for specifics.
+// This function is not thread-safe.
+void InverseTransformInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct
+#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformDct
+#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformDct
+#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformDct
+#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize64_1DTransformDct
+#define LIBGAV1_Dsp8bpp_1DTransformSize64_1DTransformDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformAdst
+#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformAdst LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformAdst
+#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformAdst LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformAdst
+#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformAdst LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformIdentity
+#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformIdentity
+#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformIdentity
+#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity
+#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht
+#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+#endif // LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_
diff --git a/src/dsp/x86/loop_filter_sse4.cc b/src/dsp/x86/loop_filter_sse4.cc
new file mode 100644
index 0000000..d67b450
--- /dev/null
+++ b/src/dsp/x86/loop_filter_sse4.cc
@@ -0,0 +1,2256 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
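+// Returns |total| + |a1| + |a2| - |s1| - |s2| for each 16-bit lane. Used to
+// slide the running filter sum from one output tap to the next.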
+inline __m128i FilterAdd2Sub2(const __m128i& total, const __m128i& a1,
+ const __m128i& a2, const __m128i& s1,
+ const __m128i& s2) {
+ __m128i x = _mm_add_epi16(a1, total);
+ x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(s1, s2)), a2);
+ return x;
+}
+
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
+inline __m128i AbsDiff(const __m128i& a, const __m128i& b) {
+ return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+}
+
+inline __m128i CheckOuterThreshF4(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& outer_thresh) {
+ const __m128i fe = _mm_set1_epi8(static_cast<int8_t>(0xfe));
+ // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh;
+ const __m128i abs_pmq = AbsDiff(p1p0, q1q0);
+ const __m128i a = _mm_adds_epu8(abs_pmq, abs_pmq);
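+ // Masking with 0xfe keeps the 16-bit shift from pulling a neighboring
+ // byte's low bit into the halved value.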
+ const __m128i b = _mm_srli_epi16(_mm_and_si128(abs_pmq, fe), 1);
+ const __m128i c = _mm_adds_epu8(a, _mm_srli_si128(b, 4));
+ return _mm_subs_epu8(c, outer_thresh);
+}
+
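+// Computes the high edge variance mask: a lane is set when
+// max(|p1 - p0|, |q1 - q0|) > hev_thresh.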
+inline __m128i Hev(const __m128i& qp1, const __m128i& qp0,
+ const __m128i& hev_thresh) {
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq =
+ _mm_max_epu8(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 4));
+ const __m128i hev_mask0 = _mm_cvtepu8_epi16(max_pq);
+ const __m128i hev_mask1 = _mm_cmpgt_epi16(hev_mask0, hev_thresh);
+ const __m128i hev_mask = _mm_packs_epi16(hev_mask1, hev_mask1);
+ return hev_mask;
+}
+
+inline __m128i AddShift3(const __m128i& a, const __m128i& b) {
+ const __m128i c = _mm_adds_epi8(a, b);
+ const __m128i d = _mm_unpacklo_epi8(c, c);
+ const __m128i e = _mm_srai_epi16(d, 11); /* >> 3 */
+ return _mm_packs_epi16(e, e);
+}
+
+inline __m128i AddShift1(const __m128i& a, const __m128i& b) {
+ const __m128i c = _mm_adds_epi8(a, b);
+ const __m128i d = _mm_unpacklo_epi8(c, c);
+ const __m128i e = _mm_srai_epi16(d, 9); /* >> 1 */
+ return _mm_packs_epi16(e, e);
+}
+
+//------------------------------------------------------------------------------
+// 4-tap filters
+
+inline __m128i NeedsFilter4(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i inner_mask = _mm_subs_epu8(
+ _mm_max_epu8(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 4)), inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi8(a, zero);
+ return b;
+}
+
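+// Applies the 4-tap filter. The filter value is (p1 - q1) & hev plus
+// 3 * (q0 - p0), accumulated with signed saturation and masked by |mask|;
+// p0/q0 receive the (filter + 3) >> 3 and (filter + 4) >> 3 adjustments and
+// p1/q1 receive a rounded half of the latter when hev is not set.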
+inline void Filter4(const __m128i& qp1, const __m128i& qp0, __m128i* oqp1,
+ __m128i* oqp0, const __m128i& mask, const __m128i& hev) {
+ const __m128i t80 = _mm_set1_epi8(static_cast<int8_t>(0x80));
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i qp1qp0 = _mm_unpacklo_epi64(qp0, qp1);
+ const __m128i qps1qps0 = _mm_xor_si128(qp1qp0, t80);
+ const __m128i ps1qs0 = _mm_shuffle_epi32(qps1qps0, 0x09);
+ const __m128i qs1ps0 = _mm_shuffle_epi32(qps1qps0, 0x0c);
+ const __m128i _hev = _mm_unpacklo_epi32(hev, hev);
+ const __m128i x = _mm_subs_epi8(ps1qs0, qs1ps0);
+ __m128i a = _mm_and_si128(_mm_srli_si128(x, 4), _hev);
+
+ a = _mm_adds_epi8(a, x);
+ a = _mm_adds_epi8(a, x);
+ a = _mm_adds_epi8(a, x);
+ a = _mm_and_si128(a, mask);
+ a = _mm_unpacklo_epi32(a, a);
+
+ const __m128i t4t3 = _mm_set_epi32(0x0, 0x0, 0x04040404, 0x03030303);
+ const __m128i a1a2 = AddShift3(a, t4t3);
+ const __m128i a1a1 = _mm_shuffle_epi32(a1a2, 0x55);
+ const __m128i a3a3 = _mm_andnot_si128(_hev, AddShift1(a1a1, t1));
+ // -1 -1 -1 -1 1 1 1 1 -1 -1 -1 -1 1 1 1 1
+ const __m128i adjust_sign_for_add =
+ _mm_unpacklo_epi32(t1, _mm_cmpeq_epi8(t1, t1));
+
+ const __m128i a3a3a1a2 = _mm_unpacklo_epi64(a1a2, a3a3);
+ const __m128i ma3a3ma1a2 = _mm_sign_epi8(a3a3a1a2, adjust_sign_for_add);
+
+ const __m128i b = _mm_adds_epi8(qps1qps0, ma3a3ma1a2);
+ const __m128i c = _mm_xor_si128(b, t80);
+
+ *oqp0 = c;
+ *oqp1 = _mm_srli_si128(c, 8);
+}
+
+void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh), 0);
+
+ const __m128i p1 = Load4(dst - 2 * stride);
+ const __m128i p0 = Load4(dst - 1 * stride);
+ const __m128i q0 = Load4(dst + 0 * stride);
+ const __m128i q1 = Load4(dst + 1 * stride);
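+ // Interleave the p and q pixels so a single register holds both sides of
+ // the edge and the filter processes them together.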
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ Store4(dst - 2 * stride, oqp1);
+ Store4(dst - 1 * stride, oqp0);
+ Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+ Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+inline void Transpose4x4(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3, __m128i* d0,
+ __m128i* d1, __m128i* d2, __m128i* d3) {
+ // input
+ // x0 00 01 02 03 xx xx xx xx xx xx xx xx xx xx xx xx
+ // x1 10 11 12 13 xx xx xx xx xx xx xx xx xx xx xx xx
+ // x2 20 21 22 23 xx xx xx xx xx xx xx xx xx xx xx xx
+ // x3 30 31 32 33 xx xx xx xx xx xx xx xx xx xx xx xx
+ // output
+ // d0 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ // d1 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ // d2 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ // d3 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ *d0 = _mm_unpacklo_epi16(w0, w1);
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d1 = _mm_srli_si128(*d0, 4);
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d2 = _mm_srli_si128(*d0, 8);
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d3 = _mm_srli_si128(*d0, 12);
+}
+
+void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ __m128i x0 = Load4(dst - 2 + 0 * stride);
+ __m128i x1 = Load4(dst - 2 + 1 * stride);
+ __m128i x2 = Load4(dst - 2 + 2 * stride);
+ __m128i x3 = Load4(dst - 2 + 3 * stride);
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ const __m128i d0 = _mm_unpacklo_epi16(w0, w1);
+ const __m128i qp1 = _mm_shuffle_epi32(d0, 0xc);
+ const __m128i qp0 = _mm_srli_si128(d0, 4);
+ const __m128i q1q0 = _mm_srli_si128(d0, 8);
+ const __m128i p1p0 = _mm_shuffle_epi32(d0, 0x1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i p1 = oqp1;
+ const __m128i p0 = oqp0;
+ const __m128i q0 = _mm_srli_si128(oqp0, 4);
+ const __m128i q1 = _mm_srli_si128(oqp1, 4);
+
+ Transpose4x4(p1, p0, q0, q1, &x0, &x1, &x2, &x3);
+
+ Store4(dst - 2 + 0 * stride, x0);
+ Store4(dst - 2 + 1 * stride, x1);
+ Store4(dst - 2 + 2 * stride, x2);
+ Store4(dst - 2 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 5-tap (chroma) filters
+
+inline __m128i NeedsFilter6(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+ const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq = _mm_max_epu8(abs_qp2mqp1, abs_qp1mqp0);
+ const __m128i inner_mask = _mm_subs_epu8(
+ _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi8(a, zero);
+ return b;
+}
+
+inline __m128i IsFlat3(const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, const __m128i& flat_thresh) {
+ const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq = _mm_max_epu8(abs_pq2mpq0, abs_qp1mqp0);
+ const __m128i flat_mask = _mm_subs_epu8(
+ _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), flat_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_cmpeq_epi8(flat_mask, zero);
+ return a;
+}
+
+inline void Filter6(const __m128i& qp2, const __m128i& qp1, const __m128i& qp0,
+ __m128i* oqp1, __m128i* oqp0) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2);
+ const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1);
+ const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0);
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+ __m128i f6_lo =
+ _mm_add_epi16(_mm_add_epi16(qp2_lo, four), _mm_add_epi16(qp2_lo, qp2_lo));
+
+ f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp1_lo), qp1_lo);
+
+ f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp0_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p2 * 3 + p1 * 2 + p0 * 2 + q0
+ // q2 * 3 + q1 * 2 + q0 * 2 + p0
+ *oqp1 = _mm_srli_epi16(f6_lo, 3);
+ *oqp1 = _mm_packus_epi16(*oqp1, *oqp1);
+
+ // p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1
+ // q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1
+ f6_lo = FilterAdd2Sub2(f6_lo, pq0_lo, pq1_lo, qp2_lo, qp2_lo);
+ *oqp0 = _mm_srli_epi16(f6_lo, 3);
+ *oqp0 = _mm_packus_epi16(*oqp0, *oqp0);
+}
+
+void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ const __m128i p2 = Load4(dst - 3 * stride);
+ const __m128i p1 = Load4(dst - 2 * stride);
+ const __m128i p0 = Load4(dst - 1 * stride);
+ const __m128i q0 = Load4(dst + 0 * stride);
+ const __m128i q1 = Load4(dst + 1 * stride);
+ const __m128i q2 = Load4(dst + 2 * stride);
+ const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter6(q1q0, p1p0, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ __m128i oqp1_f6;
+ __m128i oqp0_f6;
+
+ Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+ }
+
+ Store4(dst - 2 * stride, oqp1);
+ Store4(dst - 1 * stride, oqp0);
+ Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+ Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+inline void Transpose8x4To4x8(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3, __m128i* d0,
+ __m128i* d1, __m128i* d2, __m128i* d3,
+ __m128i* d4, __m128i* d5, __m128i* d6,
+ __m128i* d7) {
+ // input
+ // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+ // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+ // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+ // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+ // output
+ // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ const __m128i ww0 = _mm_unpacklo_epi16(w0, w1);
+ // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ const __m128i ww1 = _mm_unpackhi_epi16(w0, w1);
+
+ // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d0 = ww0;
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d1 = _mm_srli_si128(ww0, 4);
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d2 = _mm_srli_si128(ww0, 8);
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d3 = _mm_srli_si128(ww0, 12);
+ // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d4 = ww1;
+ // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d5 = _mm_srli_si128(ww1, 4);
+ // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d6 = _mm_srli_si128(ww1, 8);
+ // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d7 = _mm_srli_si128(ww1, 12);
+}
+
+void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ __m128i x0 = LoadLo8(dst - 3 + 0 * stride);
+ __m128i x1 = LoadLo8(dst - 3 + 1 * stride);
+ __m128i x2 = LoadLo8(dst - 3 + 2 * stride);
+ __m128i x3 = LoadLo8(dst - 3 + 3 * stride);
+
+ __m128i p2, p1, p0, q0, q1, q2;
+ __m128i z0, z1; // not used
+
+ Transpose8x4To4x8(x0, x1, x2, x3, &p2, &p1, &p0, &q0, &q1, &q2, &z0, &z1);
+
+ const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter6(q1q0, p1p0, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ __m128i oqp1_f6;
+ __m128i oqp0_f6;
+
+ Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+ }
+
+ p1 = oqp1;
+ p0 = oqp0;
+ q0 = _mm_srli_si128(oqp0, 4);
+ q1 = _mm_srli_si128(oqp1, 4);
+
+ Transpose4x4(p1, p0, q0, q1, &x0, &x1, &x2, &x3);
+
+ Store4(dst - 2 + 0 * stride, x0);
+ Store4(dst - 2 + 1 * stride, x1);
+ Store4(dst - 2 + 2 * stride, x2);
+ Store4(dst - 2 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 7-tap filters
+
+inline __m128i NeedsFilter8(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& qp3, const __m128i& qp2,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+ const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq_a = _mm_max_epu8(abs_qp2mqp1, abs_qp1mqp0);
+ const __m128i abs_pq3mpq2 = AbsDiff(qp3, qp2);
+ const __m128i max_pq = _mm_max_epu8(max_pq_a, abs_pq3mpq2);
+ const __m128i inner_mask = _mm_subs_epu8(
+ _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi8(a, zero);
+ return b;
+}
+
+inline __m128i IsFlat4(const __m128i& qp3, const __m128i& qp2,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& flat_thresh) {
+ const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq_a = _mm_max_epu8(abs_pq2mpq0, abs_qp1mqp0);
+ const __m128i abs_pq3mpq0 = AbsDiff(qp3, qp0);
+ const __m128i max_pq = _mm_max_epu8(max_pq_a, abs_pq3mpq0);
+ const __m128i flat_mask = _mm_subs_epu8(
+ _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), flat_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_cmpeq_epi8(flat_mask, zero);
+ return a;
+}
+
+inline void Filter8(const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, __m128i* oqp2, __m128i* oqp1,
+ __m128i* oqp0) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i qp3_lo = _mm_cvtepu8_epi16(qp3);
+ const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2);
+ const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1);
+ const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0);
+ const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+ __m128i f8_lo =
+ _mm_add_epi16(_mm_add_epi16(qp3_lo, four), _mm_add_epi16(qp3_lo, qp3_lo));
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp2_lo), qp2_lo);
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp1_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0
+ // q3 + q3 + q3 + 2 * q2 + q1 + q0 + p0
+ *oqp2 = _mm_srli_epi16(f8_lo, 3);
+ *oqp2 = _mm_packus_epi16(*oqp2, *oqp2);
+
+ // p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1
+ // q3 + q3 + q2 + 2 * q1 + q0 + p0 + p1
+ f8_lo = FilterAdd2Sub2(f8_lo, qp1_lo, pq1_lo, qp3_lo, qp2_lo);
+ *oqp1 = _mm_srli_epi16(f8_lo, 3);
+ *oqp1 = _mm_packus_epi16(*oqp1, *oqp1);
+
+ // p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2
+ // q3 + q2 + q1 + 2 * q0 + p0 + p1 + p2
+ f8_lo = FilterAdd2Sub2(f8_lo, qp0_lo, pq2_lo, qp3_lo, qp1_lo);
+ *oqp0 = _mm_srli_epi16(f8_lo, 3);
+ *oqp0 = _mm_packus_epi16(*oqp0, *oqp0);
+}
+
+void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ const __m128i p3 = Load4(dst - 4 * stride);
+ const __m128i p2 = Load4(dst - 3 * stride);
+ const __m128i p1 = Load4(dst - 2 * stride);
+ const __m128i p0 = Load4(dst - 1 * stride);
+ const __m128i q0 = Load4(dst + 0 * stride);
+ const __m128i q1 = Load4(dst + 1 * stride);
+ const __m128i q2 = Load4(dst + 2 * stride);
+ const __m128i q3 = Load4(dst + 3 * stride);
+
+ const __m128i qp3 = _mm_unpacklo_epi32(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+ v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+ Store4(dst - 3 * stride, oqp2_f8);
+ Store4(dst + 2 * stride, _mm_srli_si128(oqp2_f8, 4));
+ }
+
+ Store4(dst - 2 * stride, oqp1);
+ Store4(dst - 1 * stride, oqp0);
+ Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+ Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+inline void Transpose8x8To8x4(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3,
+ const __m128i& x4, const __m128i& x5,
+ const __m128i& x6, const __m128i& x7, __m128i* d0,
+ __m128i* d1, __m128i* d2, __m128i* d3) {
+ // input
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+ // output
+ // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
+ // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+ // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
+ // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ const __m128i w2 = _mm_unpacklo_epi8(x4, x5);
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ const __m128i w3 = _mm_unpacklo_epi8(x6, x7);
+
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ const __m128i w4 = _mm_unpacklo_epi16(w0, w1);
+ // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ const __m128i w5 = _mm_unpacklo_epi16(w2, w3);
+
+ // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ *d0 = _mm_unpacklo_epi32(w4, w5);
+ *d1 = _mm_srli_si128(*d0, 8);
+ // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ *d2 = _mm_unpackhi_epi32(w4, w5);
+ *d3 = _mm_srli_si128(*d2, 8);
+}
+
+void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ __m128i x0 = LoadLo8(dst - 4 + 0 * stride);
+ __m128i x1 = LoadLo8(dst - 4 + 1 * stride);
+ __m128i x2 = LoadLo8(dst - 4 + 2 * stride);
+ __m128i x3 = LoadLo8(dst - 4 + 3 * stride);
+
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+ Transpose8x4To4x8(x0, x1, x2, x3, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+
+ const __m128i qp3 = _mm_unpacklo_epi32(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+ v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ p2 = oqp2_f8;
+ q2 = _mm_srli_si128(oqp2_f8, 4);
+ }
+
+ p1 = oqp1;
+ p0 = oqp0;
+ q0 = _mm_srli_si128(oqp0, 4);
+ q1 = _mm_srli_si128(oqp1, 4);
+
+ Transpose8x8To8x4(p3, p2, p1, p0, q0, q1, q2, q3, &x0, &x1, &x2, &x3);
+
+ StoreLo8(dst - 4 + 0 * stride, x0);
+ StoreLo8(dst - 4 + 1 * stride, x1);
+ StoreLo8(dst - 4 + 2 * stride, x2);
+ StoreLo8(dst - 4 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 13-tap filters
+
+inline void Filter14(const __m128i& qp6, const __m128i& qp5, const __m128i& qp4,
+ const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, __m128i* oqp5, __m128i* oqp4,
+ __m128i* oqp3, __m128i* oqp2, __m128i* oqp1,
+ __m128i* oqp0) {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i qp6_lo = _mm_cvtepu8_epi16(qp6);
+ const __m128i qp5_lo = _mm_cvtepu8_epi16(qp5);
+ const __m128i qp4_lo = _mm_cvtepu8_epi16(qp4);
+ const __m128i qp3_lo = _mm_cvtepu8_epi16(qp3);
+ const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2);
+ const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1);
+ const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0);
+ const __m128i pq5_lo = _mm_shuffle_epi32(qp5_lo, 0x4e);
+ const __m128i pq4_lo = _mm_shuffle_epi32(qp4_lo, 0x4e);
+ const __m128i pq3_lo = _mm_shuffle_epi32(qp3_lo, 0x4e);
+ const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+ __m128i f14_lo =
+ _mm_add_epi16(eight, _mm_sub_epi16(_mm_slli_epi16(qp6_lo, 3), qp6_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp5_lo),
+ _mm_add_epi16(qp5_lo, qp4_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp4_lo),
+ _mm_add_epi16(qp3_lo, qp2_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp1_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0
+ // q6 * 7 + q5 * 2 + q4 * 2 + q3 + q2 + q1 + q0 + p0
+ *oqp5 = _mm_srli_epi16(f14_lo, 4);
+ *oqp5 = _mm_packus_epi16(*oqp5, *oqp5);
+
+ // p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1
+ // q6 * 5 + q5 * 2 + q4 * 2 + q3 * 2 + q2 + q1 + q0 + p0 + p1
+ f14_lo = FilterAdd2Sub2(f14_lo, qp3_lo, pq1_lo, qp6_lo, qp6_lo);
+ *oqp4 = _mm_srli_epi16(f14_lo, 4);
+ *oqp4 = _mm_packus_epi16(*oqp4, *oqp4);
+
+ // p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2
+ // q6 * 4 + q5 + q4 * 2 + q3 * 2 + q2 * 2 + q1 + q0 + p0 + p1 + p2
+ f14_lo = FilterAdd2Sub2(f14_lo, qp2_lo, pq2_lo, qp6_lo, qp5_lo);
+ *oqp3 = _mm_srli_epi16(f14_lo, 4);
+ *oqp3 = _mm_packus_epi16(*oqp3, *oqp3);
+
+ // p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3
+ // q6 * 3 + q5 + q4 + q3 * 2 + q2 * 2 + q1 * 2 + q0 + p0 + p1 + p2 + p3
+ f14_lo = FilterAdd2Sub2(f14_lo, qp1_lo, pq3_lo, qp6_lo, qp4_lo);
+ *oqp2 = _mm_srli_epi16(f14_lo, 4);
+ *oqp2 = _mm_packus_epi16(*oqp2, *oqp2);
+
+ // p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + q0 + q1 + q2 + q3 + q4
+ // q6 * 2 + q5 + q4 + q3 + q2 * 2 + q1 * 2 + q0 * 2 + p0 + p1 + p2 + p3 + p4
+ f14_lo = FilterAdd2Sub2(f14_lo, qp0_lo, pq4_lo, qp6_lo, qp3_lo);
+ *oqp1 = _mm_srli_epi16(f14_lo, 4);
+ *oqp1 = _mm_packus_epi16(*oqp1, *oqp1);
+
+ // p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + q2 + q3 + q4 + q5
+ // q6 + q5 + q4 + q3 + q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1 + p2 + p3 + p4 + p5
+ f14_lo = FilterAdd2Sub2(f14_lo, pq0_lo, pq5_lo, qp6_lo, qp2_lo);
+ *oqp0 = _mm_srli_epi16(f14_lo, 4);
+ *oqp0 = _mm_packus_epi16(*oqp0, *oqp0);
+}
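+// As with Filter8(), a scalar reference for the first output (*oqp5), assuming
+// 8-bit samples; hypothetical helper, illustration only:
+//
+//   uint8_t Filter14P5(int p6, int p5, int p4, int p3, int p2, int p1, int p0,
+//                      int q0) {
+//     return static_cast<uint8_t>(
+//         (7 * p6 + 2 * p5 + 2 * p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4);
+//   }
+//
+// The mirrored q-side output uses the same weights with p and q swapped, which
+// is why the function operates on packed qp registers.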
+
+void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ const __m128i p3 = Load4(dst - 4 * stride);
+ const __m128i p2 = Load4(dst - 3 * stride);
+ const __m128i p1 = Load4(dst - 2 * stride);
+ const __m128i p0 = Load4(dst - 1 * stride);
+ const __m128i q0 = Load4(dst + 0 * stride);
+ const __m128i q1 = Load4(dst + 1 * stride);
+ const __m128i q2 = Load4(dst + 2 * stride);
+ const __m128i q3 = Load4(dst + 3 * stride);
+
+ const __m128i qp3 = _mm_unpacklo_epi32(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+ v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ const __m128i p6 = Load4(dst - 7 * stride);
+ const __m128i p5 = Load4(dst - 6 * stride);
+ const __m128i p4 = Load4(dst - 5 * stride);
+ const __m128i q4 = Load4(dst + 4 * stride);
+ const __m128i q5 = Load4(dst + 5 * stride);
+ const __m128i q6 = Load4(dst + 6 * stride);
+ const __m128i qp6 = _mm_unpacklo_epi32(p6, q6);
+ const __m128i qp5 = _mm_unpacklo_epi32(p5, q5);
+ const __m128i qp4 = _mm_unpacklo_epi32(p4, q4);
+
+ const __m128i v_isflatouter4_mask =
+ IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+ const __m128i v_flat4_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_mask, v_isflatouter4_mask), 0);
+
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ if (_mm_test_all_zeros(v_flat4_mask,
+ _mm_cmpeq_epi8(v_flat4_mask, v_flat4_mask)) == 0) {
+ __m128i oqp5_f14;
+ __m128i oqp4_f14;
+ __m128i oqp3_f14;
+ __m128i oqp2_f14;
+ __m128i oqp1_f14;
+ __m128i oqp0_f14;
+
+ Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+ &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+ oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+ oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+ oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+ oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+
+ Store4(dst - 6 * stride, oqp5_f14);
+ Store4(dst - 5 * stride, oqp4_f14);
+ Store4(dst - 4 * stride, oqp3_f14);
+ Store4(dst + 3 * stride, _mm_srli_si128(oqp3_f14, 4));
+ Store4(dst + 4 * stride, _mm_srli_si128(oqp4_f14, 4));
+ Store4(dst + 5 * stride, _mm_srli_si128(oqp5_f14, 4));
+ }
+
+ Store4(dst - 3 * stride, oqp2_f8);
+ Store4(dst + 2 * stride, _mm_srli_si128(oqp2_f8, 4));
+ }
+
+ Store4(dst - 2 * stride, oqp1);
+ Store4(dst - 1 * stride, oqp0);
+ Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+ Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+// Each of the two 8x4 blocks of input data (p7-p0 and q0-q7) is transposed to
+// 4x8, then unpacked into the corresponding qp registers (qp7-qp0).
+//
+// p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+//
+// 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
+// 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
+// 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
+// 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
+
+inline void DualTranspose8x4To4x8(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3,
+ __m128i* q0p0, __m128i* q1p1, __m128i* q2p2,
+ __m128i* q3p3, __m128i* q4p4, __m128i* q5p5,
+ __m128i* q6p6, __m128i* q7p7) {
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+ // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f
+ const __m128i w2 = _mm_unpackhi_epi8(x0, x1);
+ // 28 38 29 39 2a 3a 2b 3b 2c 3c 2d 3d 2e 3e 2f 3f
+ const __m128i w3 = _mm_unpackhi_epi8(x2, x3);
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ const __m128i ww0 = _mm_unpacklo_epi16(w0, w1);
+ // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ const __m128i ww1 = _mm_unpackhi_epi16(w0, w1);
+ // 08 18 28 38 09 19 29 39 0a 1a 2a 3a 0b 1b 2b 3b
+ const __m128i ww2 = _mm_unpacklo_epi16(w2, w3);
+ // 0c 1c 2c 3c 0d 1d 2d 3d 0e 1e 2e 3e 0f 1f 2f 3f
+ const __m128i ww3 = _mm_unpackhi_epi16(w2, w3);
+ // 00 10 20 30 0f 1f 2f 3f xx xx xx xx xx xx xx xx
+ *q7p7 = _mm_unpacklo_epi32(ww0, _mm_srli_si128(ww3, 12));
+ // 01 11 21 31 0e 1e 2e 3e xx xx xx xx xx xx xx xx
+ *q6p6 = _mm_unpackhi_epi32(_mm_slli_si128(ww0, 4), ww3);
+ // 02 12 22 32 0d 1d 2d 3d xx xx xx xx xx xx xx xx
+ *q5p5 = _mm_unpackhi_epi32(ww0, _mm_slli_si128(ww3, 4));
+ // 03 13 23 33 0c 1c 2c 3c xx xx xx xx xx xx xx xx
+ *q4p4 = _mm_unpacklo_epi32(_mm_srli_si128(ww0, 12), ww3);
+ // 04 14 24 34 0b 1b 2b 3b xx xx xx xx xx xx xx xx
+ *q3p3 = _mm_unpacklo_epi32(ww1, _mm_srli_si128(ww2, 12));
+ // 05 15 25 35 0a 1a 2a 3a xx xx xx xx xx xx xx xx
+ *q2p2 = _mm_unpackhi_epi32(_mm_slli_si128(ww1, 4), ww2);
+ // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx
+ *q1p1 = _mm_unpackhi_epi32(ww1, _mm_slli_si128(ww2, 4));
+ // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx
+ *q0p0 = _mm_unpacklo_epi32(_mm_srli_si128(ww1, 12), ww2);
+}
+
+inline void DualTranspose4x8To8x4(const __m128i& qp7, const __m128i& qp6,
+ const __m128i& qp5, const __m128i& qp4,
+ const __m128i& qp3, const __m128i& qp2,
+ const __m128i& qp1, const __m128i& qp0,
+ __m128i* x0, __m128i* x1, __m128i* x2,
+ __m128i* x3) {
+ // qp7: 00 10 20 30 0f 1f 2f 3f xx xx xx xx xx xx xx xx
+ // qp6: 01 11 21 31 0e 1e 2e 3e xx xx xx xx xx xx xx xx
+ // qp5: 02 12 22 32 0d 1d 2d 3d xx xx xx xx xx xx xx xx
+ // qp4: 03 13 23 33 0c 1c 2c 3c xx xx xx xx xx xx xx xx
+ // qp3: 04 14 24 34 0b 1b 2b 3b xx xx xx xx xx xx xx xx
+ // qp2: 05 15 25 35 0a 1a 2a 3a xx xx xx xx xx xx xx xx
+ // qp1: 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx
+ // qp0: 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx
+
+ // 00 01 10 11 20 21 30 31 0f 0e 1f 1e 2f 2e 3f 3e
+ const __m128i w0 = _mm_unpacklo_epi8(qp7, qp6);
+ // 02 03 12 13 22 23 32 33 xx xx xx xx xx xx xx xx
+ const __m128i w1 = _mm_unpacklo_epi8(qp5, qp4);
+ // 04 05 14 15 24 25 34 35 xx xx xx xx xx xx xx xx
+ const __m128i w2 = _mm_unpacklo_epi8(qp3, qp2);
+ // 06 07 16 17 26 27 36 37 xx xx xx xx xx xx xx xx
+ const __m128i w3 = _mm_unpacklo_epi8(qp1, qp0);
+ // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
+ const __m128i w4 = _mm_unpacklo_epi16(w0, w1);
+ // 04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37
+ const __m128i w5 = _mm_unpacklo_epi16(w2, w3);
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ const __m128i d0 = _mm_unpacklo_epi32(w4, w5);
+ // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
+ const __m128i d2 = _mm_unpackhi_epi32(w4, w5);
+ // xx xx xx xx xx xx xx xx 08 09 18 19 28 29 38 39
+ const __m128i w10 = _mm_unpacklo_epi8(qp0, qp1);
+ // xx xx xx xx xx xx xx xx 0a 0b 1a 1b 2a 2b 3a 3b
+ const __m128i w11 = _mm_unpacklo_epi8(qp2, qp3);
+ // xx xx xx xx xx xx xx xx 0c 0d 1c 1d 2c 2d 3c 3d
+ const __m128i w12 = _mm_unpacklo_epi8(qp4, qp5);
+ // xx xx xx xx xx xx xx xx 0e 0f 1e 1f 2e 2f 3e 3f
+ const __m128i w13 = _mm_unpacklo_epi8(qp6, qp7);
+ // 08 09 0a 0b 18 19 1a 1b 28 29 2a 2b 38 39 3a 3b
+ const __m128i w14 = _mm_unpackhi_epi16(w10, w11);
+ // 0c 0d 0e 0f 1c 1d 1e 1f 2c 2d 2e 2f 3c 3d 3e 3f
+ const __m128i w15 = _mm_unpackhi_epi16(w12, w13);
+ // 08 09 0a 0b 0c 0d 0e 0f 18 19 1a 1b 1c 1d 1e 1f
+ const __m128i d1 = _mm_unpacklo_epi32(w14, w15);
+ // 28 29 2a 2b 2c 2d 2e 2f 38 39 3a 3b 3c 3d 3e 3f
+ const __m128i d3 = _mm_unpackhi_epi32(w14, w15);
+
+ // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+ //
+ // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
+ *x0 = _mm_unpacklo_epi64(d0, d1);
+ // 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
+ *x1 = _mm_unpackhi_epi64(d0, d1);
+ // 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
+ *x2 = _mm_unpacklo_epi64(d2, d3);
+ // 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
+ *x3 = _mm_unpackhi_epi64(d2, d3);
+}
+
+void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ __m128i x0 = LoadUnaligned16(dst - 8 + 0 * stride);
+ __m128i x1 = LoadUnaligned16(dst - 8 + 1 * stride);
+ __m128i x2 = LoadUnaligned16(dst - 8 + 2 * stride);
+ __m128i x3 = LoadUnaligned16(dst - 8 + 3 * stride);
+
+ __m128i qp7, qp6, qp5, qp4, qp3, qp2, qp1, qp0;
+
+ DualTranspose8x4To4x8(x0, x1, x2, x3, &qp0, &qp1, &qp2, &qp3, &qp4, &qp5,
+ &qp6, &qp7);
+
+ const __m128i qp1qp0 = _mm_unpacklo_epi64(qp0, qp1);
+ const __m128i q1q0 = _mm_shuffle_epi32(qp1qp0, 0x0d);
+ const __m128i p1p0 = _mm_shuffle_epi32(qp1qp0, 0x08);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+ v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ const __m128i v_isflatouter4_mask =
+ IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+ const __m128i v_flat4_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_mask, v_isflatouter4_mask), 0);
+
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ if (_mm_test_all_zeros(v_flat4_mask,
+ _mm_cmpeq_epi8(v_flat4_mask, v_flat4_mask)) == 0) {
+ __m128i oqp5_f14;
+ __m128i oqp4_f14;
+ __m128i oqp3_f14;
+ __m128i oqp2_f14;
+ __m128i oqp1_f14;
+ __m128i oqp0_f14;
+
+ Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+ &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+ oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+ oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+ oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+ oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+ qp3 = oqp3_f14;
+ qp4 = oqp4_f14;
+ qp5 = oqp5_f14;
+ }
+ qp2 = oqp2_f8;
+ }
+
+ DualTranspose4x8To8x4(qp7, qp6, qp5, qp4, qp3, qp2, oqp1, oqp0, &x0, &x1, &x2,
+ &x3);
+
+ StoreUnaligned16(dst - 8 + 0 * stride, x0);
+ StoreUnaligned16(dst - 8 + 1 * stride, x1);
+ StoreUnaligned16(dst - 8 + 2 * stride, x2);
+ StoreUnaligned16(dst - 8 + 3 * stride, x3);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = Horizontal4;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = Horizontal6;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = Horizontal8;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Horizontal14;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = Vertical14;
+#endif
+}
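+// A hedged usage sketch: after Init8bpp() runs, callers fetch the entry for a
+// given filter size and edge orientation from the dsp table and invoke it on a
+// pointer positioned at the edge. Hypothetical call, for illustration only
+// (GetDspTable() and the argument names are assumed here):
+//
+//   const Dsp* const dsp = GetDspTable(kBitdepth8);
+//   dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical](
+//       dst, stride, outer_thresh, inner_thresh, hev_thresh);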
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+namespace high_bitdepth {
+namespace {
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+
+template <int bitdepth>
+struct LoopFilterFuncs_SSE4_1 {
+ LoopFilterFuncs_SSE4_1() = delete;
+
+ static constexpr int kThreshShift = bitdepth - 8;
+
+ static void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+};
+
+inline __m128i Clamp(const __m128i& min, const __m128i& max,
+ const __m128i& val) {
+ const __m128i a = _mm_min_epi16(val, max);
+ const __m128i b = _mm_max_epi16(a, min);
+ return b;
+}
+
+inline __m128i AddShift3(const __m128i& a, const __m128i& b,
+ const __m128i& vmin, const __m128i& vmax) {
+ const __m128i c = _mm_adds_epi16(a, b);
+ const __m128i d = Clamp(vmin, vmax, c);
+ const __m128i e = _mm_srai_epi16(d, 3); /* >> 3 */
+ return e;
+}
+
+inline __m128i AddShift1(const __m128i& a, const __m128i& b) {
+ const __m128i c = _mm_adds_epi16(a, b);
+ const __m128i e = _mm_srai_epi16(c, 1); /* >> 1 */
+ return e;
+}
+
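+// Unsigned absolute difference for 16-bit lanes: one of the two saturating
+// subtractions is always zero, so OR-ing them yields |a - b| without needing a
+// wider type.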
+inline __m128i AbsDiff(const __m128i& a, const __m128i& b) {
+ return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
+}
+
+inline __m128i Hev(const __m128i& qp1, const __m128i& qp0,
+ const __m128i& hev_thresh) {
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq =
+ _mm_max_epu16(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 8));
+ const __m128i hev_mask = _mm_cmpgt_epi16(max_pq, hev_thresh);
+ return hev_mask;
+}
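+// Scalar equivalent of the "high edge variance" test, assuming hev_thresh has
+// already been shifted by kThreshShift; hypothetical helper, illustration only:
+//
+//   bool HevScalar(int p1, int p0, int q0, int q1, int hev_thresh) {
+//     return std::abs(p1 - p0) > hev_thresh || std::abs(q1 - q0) > hev_thresh;
+//   }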
+
+inline __m128i CheckOuterThreshF4(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& outer_thresh) {
+ // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh;
+ const __m128i abs_pmq = AbsDiff(p1p0, q1q0);
+ const __m128i a = _mm_adds_epu16(abs_pmq, abs_pmq);
+ const __m128i b = _mm_srli_epi16(abs_pmq, 1);
+ const __m128i c = _mm_adds_epu16(a, _mm_srli_si128(b, 8));
+ return _mm_subs_epu16(c, outer_thresh);
+}
+
+inline __m128i NeedsFilter4(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_abs_qp1mqp =
+ _mm_max_epu16(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 8));
+ const __m128i inner_mask = _mm_subs_epu16(max_abs_qp1mqp, inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi16(a, zero);
+ return b;
+}
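+// A scalar sketch of the combined check above (outer edge step plus inner
+// smoothness), assuming the thresholds are already scaled to the current
+// bitdepth; hypothetical helper, illustration only:
+//
+//   bool NeedsFilter4Scalar(int p1, int p0, int q0, int q1,
+//                           int outer_thresh, int inner_thresh) {
+//     const bool outer_ok =
+//         2 * std::abs(p0 - q0) + std::abs(p1 - q1) / 2 <= outer_thresh;
+//     const bool inner_ok =
+//         std::max(std::abs(p1 - p0), std::abs(q1 - q0)) <= inner_thresh;
+//     return outer_ok && inner_ok;
+//   }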
+
+inline void Filter4(const __m128i& qp1, const __m128i& qp0, __m128i* oqp1,
+ __m128i* oqp0, const __m128i& mask, const __m128i& hev,
+ int bitdepth) {
+ const __m128i t4 = _mm_set1_epi16(4);
+ const __m128i t3 = _mm_set1_epi16(3);
+ const __m128i t80 = _mm_set1_epi16(static_cast<int16_t>(1 << (bitdepth - 1)));
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ const __m128i vmin = _mm_subs_epi16(_mm_setzero_si128(), t80);
+ const __m128i vmax = _mm_subs_epi16(t80, t1);
+ const __m128i ps1 = _mm_subs_epi16(qp1, t80);
+ const __m128i ps0 = _mm_subs_epi16(qp0, t80);
+ const __m128i qs0 = _mm_srli_si128(ps0, 8);
+ const __m128i qs1 = _mm_srli_si128(ps1, 8);
+
+ __m128i a = _mm_subs_epi16(ps1, qs1);
+ a = _mm_and_si128(Clamp(vmin, vmax, a), hev);
+
+ const __m128i x = _mm_subs_epi16(qs0, ps0);
+ a = _mm_adds_epi16(a, x);
+ a = _mm_adds_epi16(a, x);
+ a = _mm_adds_epi16(a, x);
+ a = _mm_and_si128(Clamp(vmin, vmax, a), mask);
+
+ const __m128i a1 = AddShift3(a, t4, vmin, vmax);
+ const __m128i a2 = AddShift3(a, t3, vmin, vmax);
+ const __m128i a3 = _mm_andnot_si128(hev, AddShift1(a1, t1));
+
+ const __m128i ops1 = _mm_adds_epi16(ps1, a3);
+ const __m128i ops0 = _mm_adds_epi16(ps0, a2);
+ const __m128i oqs0 = _mm_subs_epi16(qs0, a1);
+ const __m128i oqs1 = _mm_subs_epi16(qs1, a3);
+
+ __m128i oqps1 = _mm_unpacklo_epi64(ops1, oqs1);
+ __m128i oqps0 = _mm_unpacklo_epi64(ops0, oqs0);
+
+ oqps1 = Clamp(vmin, vmax, oqps1);
+ oqps0 = Clamp(vmin, vmax, oqps0);
+
+ *oqp1 = _mm_adds_epi16(oqps1, t80);
+ *oqp0 = _mm_adds_epi16(oqps0, t80);
+}
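+// The arithmetic above, written out as a scalar sketch in the signed domain
+// (after subtracting t80 = 1 << (bitdepth - 1)); Clip() is a hypothetical
+// helper clamping to [-t80, t80 - 1], and saturation details are omitted:
+//
+//   int a = hev ? Clip(ps1 - qs1) : 0;
+//   a = Clip(a + 3 * (qs0 - ps0));
+//   if (!needs_filter) a = 0;
+//   const int a1 = Clip(a + 4) >> 3;         // subtracted from q0
+//   const int a2 = Clip(a + 3) >> 3;         // added to p0
+//   const int a3 = hev ? 0 : (a1 + 1) >> 1;  // added to p1, subtracted from q1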
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal4(void* dest,
+ ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+ const __m128i p1 = LoadLo8(dst - 2 * stride);
+ const __m128i p0 = LoadLo8(dst - 1 * stride);
+ const __m128i qp0 = LoadHi8(p0, dst + 0 * stride);
+ const __m128i qp1 = LoadHi8(p1, dst + 1 * stride);
+ const __m128i q1q0 = _mm_unpackhi_epi64(qp0, qp1);
+ const __m128i p1p0 = _mm_unpacklo_epi64(qp0, qp1);
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ StoreLo8(dst - 2 * stride, oqp1);
+ StoreLo8(dst - 1 * stride, oqp0);
+ StoreHi8(dst + 0 * stride, oqp0);
+ StoreHi8(dst + 1 * stride, oqp1);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical4(void* dest, ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+ const __m128i x0 = LoadLo8(dst - 2 + 0 * stride);
+ const __m128i x1 = LoadLo8(dst - 2 + 1 * stride);
+ const __m128i x2 = LoadLo8(dst - 2 + 2 * stride);
+ const __m128i x3 = LoadLo8(dst - 2 + 3 * stride);
+ // 00 10 01 11 02 12 03 13
+ const __m128i w0 = _mm_unpacklo_epi16(x0, x1);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w1 = _mm_unpacklo_epi16(x2, x3);
+ // 00 10 20 30 01 11 21 31 p0p1
+ const __m128i a = _mm_unpacklo_epi32(w0, w1);
+ const __m128i p1p0 = _mm_shuffle_epi32(a, 0x4e);
+ // 02 12 22 32 03 13 23 33 q1q0
+ const __m128i q1q0 = _mm_unpackhi_epi32(w0, w1);
+ const __m128i qp1 = _mm_unpackhi_epi64(p1p0, q1q0);
+ const __m128i qp0 = _mm_unpacklo_epi64(p1p0, q1q0);
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ // 00 10 01 11 02 12 03 13
+ const __m128i w2 = _mm_unpacklo_epi16(oqp1, oqp0);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w3 = _mm_unpackhi_epi16(oqp0, oqp1);
+ // 00 10 20 30 01 11 21 31
+ const __m128i op0p1 = _mm_unpacklo_epi32(w2, w3);
+ // 02 12 22 32 03 13 23 33
+ const __m128i oq1q0 = _mm_unpackhi_epi32(w2, w3);
+
+ StoreLo8(dst - 2 + 0 * stride, op0p1);
+ StoreHi8(dst - 2 + 1 * stride, op0p1);
+ StoreLo8(dst - 2 + 2 * stride, oq1q0);
+ StoreHi8(dst - 2 + 3 * stride, oq1q0);
+}
+
+//------------------------------------------------------------------------------
+// 5-tap (chroma) filters
+
+inline __m128i CheckOuterThreshF6(const __m128i& qp1, const __m128i& qp0,
+ const __m128i& outer_thresh) {
+ // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh;
+ const __m128i q1q0 = _mm_unpackhi_epi64(qp0, qp1);
+ const __m128i p1p0 = _mm_unpacklo_epi64(qp0, qp1);
+ return CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+}
+
+inline __m128i NeedsFilter6(const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF6(qp1, qp0, outer_thresh);
+ const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq = _mm_max_epu16(abs_qp2mqp1, abs_qp1mqp0);
+ const __m128i inner_mask = _mm_subs_epu16(
+ _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi16(a, zero);
+ return b;
+}
+
+inline __m128i IsFlat3(const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, const __m128i& flat_thresh) {
+ const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq = _mm_max_epu16(abs_pq2mpq0, abs_qp1mqp0);
+ const __m128i flat_mask = _mm_subs_epu16(
+ _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), flat_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_cmpeq_epi16(flat_mask, zero);
+ return a;
+}
+
+inline void Filter6(const __m128i& qp2, const __m128i& qp1, const __m128i& qp0,
+ __m128i* oqp1, __m128i* oqp0) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i qp2_lo = qp2;
+ const __m128i qp1_lo = qp1;
+ const __m128i qp0_lo = qp0;
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+ __m128i f6_lo;
+ f6_lo =
+ _mm_add_epi16(_mm_add_epi16(qp2_lo, four), _mm_add_epi16(qp2_lo, qp2_lo));
+
+ f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp1_lo), qp1_lo);
+
+ f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp0_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p2 * 3 + p1 * 2 + p0 * 2 + q0
+ // q2 * 3 + q1 * 2 + q0 * 2 + p0
+ *oqp1 = _mm_srli_epi16(f6_lo, 3);
+
+ // p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1
+ // q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1
+ f6_lo = FilterAdd2Sub2(f6_lo, pq0_lo, pq1_lo, qp2_lo, qp2_lo);
+ *oqp0 = _mm_srli_epi16(f6_lo, 3);
+}
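+// Scalar reference for the first output (*oqp1), assuming samples are already
+// in 16-bit lanes; hypothetical helper, illustration only:
+//
+//   uint16_t Filter6P1(int p2, int p1, int p0, int q0) {
+//     return static_cast<uint16_t>((3 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3);
+//   }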
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal6(void* dest,
+ ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ const __m128i p2 = LoadLo8(dst - 3 * stride);
+ const __m128i p1 = LoadLo8(dst - 2 * stride);
+ const __m128i p0 = LoadLo8(dst - 1 * stride);
+ const __m128i q0 = LoadLo8(dst + 0 * stride);
+ const __m128i q1 = LoadLo8(dst + 1 * stride);
+ const __m128i q2 = LoadLo8(dst + 2 * stride);
+
+ const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter6(qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ __m128i oqp1_f6;
+ __m128i oqp0_f6;
+
+ Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+ }
+
+ StoreLo8(dst - 2 * stride, oqp1);
+ StoreLo8(dst - 1 * stride, oqp0);
+ StoreHi8(dst + 0 * stride, oqp0);
+ StoreHi8(dst + 1 * stride, oqp1);
+}
+
+inline void Transpose8x4To4x8(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3, __m128i* d0,
+ __m128i* d1, __m128i* d2, __m128i* d3,
+ __m128i* d4, __m128i* d5, __m128i* d6,
+ __m128i* d7) {
+ // input
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // output
+ // 00 10 20 30 xx xx xx xx
+ // 01 11 21 31 xx xx xx xx
+ // 02 12 22 32 xx xx xx xx
+ // 03 13 23 33 xx xx xx xx
+ // 04 14 24 34 xx xx xx xx
+ // 05 15 25 35 xx xx xx xx
+ // 06 16 26 36 xx xx xx xx
+ // 07 17 27 37 xx xx xx xx
+
+ // 00 10 01 11 02 12 03 13
+ const __m128i w0 = _mm_unpacklo_epi16(x0, x1);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w1 = _mm_unpacklo_epi16(x2, x3);
+ // 04 14 05 15 06 16 07 17
+ const __m128i w2 = _mm_unpackhi_epi16(x0, x1);
+ // 24 34 25 35 26 36 27 37
+ const __m128i w3 = _mm_unpackhi_epi16(x2, x3);
+
+ // 00 10 20 30 01 11 21 31
+ const __m128i ww0 = _mm_unpacklo_epi32(w0, w1);
+ // 04 14 24 34 05 15 25 35
+ const __m128i ww1 = _mm_unpacklo_epi32(w2, w3);
+ // 02 12 22 32 03 13 23 33
+ const __m128i ww2 = _mm_unpackhi_epi32(w0, w1);
+ // 06 16 26 36 07 17 27 37
+ const __m128i ww3 = _mm_unpackhi_epi32(w2, w3);
+
+ // 00 10 20 30 xx xx xx xx
+ *d0 = ww0;
+ // 01 11 21 31 xx xx xx xx
+ *d1 = _mm_srli_si128(ww0, 8);
+ // 02 12 22 32 xx xx xx xx
+ *d2 = ww2;
+ // 03 13 23 33 xx xx xx xx
+ *d3 = _mm_srli_si128(ww2, 8);
+ // 04 14 24 34 xx xx xx xx
+ *d4 = ww1;
+ // 05 15 25 35 xx xx xx xx
+ *d5 = _mm_srli_si128(ww1, 8);
+ // 06 16 26 36 xx xx xx xx
+ *d6 = ww3;
+ // 07 17 27 37 xx xx xx xx
+ *d7 = _mm_srli_si128(ww3, 8);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical6(void* dest, ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ __m128i x0 = LoadUnaligned16(dst - 3 + 0 * stride);
+ __m128i x1 = LoadUnaligned16(dst - 3 + 1 * stride);
+ __m128i x2 = LoadUnaligned16(dst - 3 + 2 * stride);
+ __m128i x3 = LoadUnaligned16(dst - 3 + 3 * stride);
+
+ __m128i p2, p1, p0, q0, q1, q2;
+ __m128i z0, z1; // not used
+
+ Transpose8x4To4x8(x0, x1, x2, x3, &p2, &p1, &p0, &q0, &q1, &q2, &z0, &z1);
+
+ const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter6(qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ __m128i oqp1_f6;
+ __m128i oqp0_f6;
+
+ Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+ }
+
+ // 00 10 01 11 02 12 03 13
+ const __m128i w2 = _mm_unpacklo_epi16(oqp1, oqp0);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w3 = _mm_unpackhi_epi16(oqp0, oqp1);
+ // 00 10 20 30 01 11 21 31
+ const __m128i op0p1 = _mm_unpacklo_epi32(w2, w3);
+ // 02 12 22 32 03 13 23 33
+ const __m128i oq1q0 = _mm_unpackhi_epi32(w2, w3);
+
+ StoreLo8(dst - 2 + 0 * stride, op0p1);
+ StoreHi8(dst - 2 + 1 * stride, op0p1);
+ StoreLo8(dst - 2 + 2 * stride, oq1q0);
+ StoreHi8(dst - 2 + 3 * stride, oq1q0);
+}
+
+//------------------------------------------------------------------------------
+// 7-tap filters
+
+inline __m128i NeedsFilter8(const __m128i& qp3, const __m128i& qp2,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF6(qp1, qp0, outer_thresh);
+ const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq_a = _mm_max_epu16(abs_qp2mqp1, abs_qp1mqp0);
+ const __m128i abs_pq3mpq2 = AbsDiff(qp3, qp2);
+ const __m128i max_pq = _mm_max_epu16(max_pq_a, abs_pq3mpq2);
+ const __m128i inner_mask = _mm_subs_epu16(
+ _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi16(a, zero);
+ return b;
+}
+
+inline __m128i IsFlat4(const __m128i& qp3, const __m128i& qp2,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& flat_thresh) {
+ const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq_a = _mm_max_epu16(abs_pq2mpq0, abs_qp1mqp0);
+ const __m128i abs_pq3mpq0 = AbsDiff(qp3, qp0);
+ const __m128i max_pq = _mm_max_epu16(max_pq_a, abs_pq3mpq0);
+ const __m128i flat_mask = _mm_subs_epu16(
+ _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), flat_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_cmpeq_epi16(flat_mask, zero);
+ return a;
+}
+
+inline void Filter8(const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, __m128i* oqp2, __m128i* oqp1,
+ __m128i* oqp0) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i qp3_lo = qp3;
+ const __m128i qp2_lo = qp2;
+ const __m128i qp1_lo = qp1;
+ const __m128i qp0_lo = qp0;
+ const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+ __m128i f8_lo =
+ _mm_add_epi16(_mm_add_epi16(qp3_lo, four), _mm_add_epi16(qp3_lo, qp3_lo));
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp2_lo), qp2_lo);
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp1_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0
+ // q3 + q3 + q3 + 2 * q2 + q1 + q0 + p0
+ *oqp2 = _mm_srli_epi16(f8_lo, 3);
+
+ // p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1
+ // q3 + q3 + q2 + 2 * q1 + q0 + p0 + p1
+ f8_lo = FilterAdd2Sub2(f8_lo, qp1_lo, pq1_lo, qp3_lo, qp2_lo);
+ *oqp1 = _mm_srli_epi16(f8_lo, 3);
+
+ // p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2
+ // q3 + q2 + q1 + 2 * q0 + p0 + p1 + p2
+ f8_lo = FilterAdd2Sub2(f8_lo, qp0_lo, pq2_lo, qp3_lo, qp1_lo);
+ *oqp0 = _mm_srli_epi16(f8_lo, 3);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal8(void* dest,
+ ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ const __m128i p3 = LoadLo8(dst - 4 * stride);
+ const __m128i p2 = LoadLo8(dst - 3 * stride);
+ const __m128i p1 = LoadLo8(dst - 2 * stride);
+ const __m128i p0 = LoadLo8(dst - 1 * stride);
+ const __m128i q0 = LoadLo8(dst + 0 * stride);
+ const __m128i q1 = LoadLo8(dst + 1 * stride);
+ const __m128i q2 = LoadLo8(dst + 2 * stride);
+ const __m128i q3 = LoadLo8(dst + 3 * stride);
+ const __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+ StoreLo8(dst - 3 * stride, oqp2_f8);
+ StoreHi8(dst + 2 * stride, oqp2_f8);
+ }
+
+ StoreLo8(dst - 2 * stride, oqp1);
+ StoreLo8(dst - 1 * stride, oqp0);
+ StoreHi8(dst + 0 * stride, oqp0);
+ StoreHi8(dst + 1 * stride, oqp1);
+}
+
+inline void TransposeLower4x8To8x4(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3,
+ const __m128i& x4, const __m128i& x5,
+ const __m128i& x6, const __m128i& x7,
+ __m128i* d0, __m128i* d1, __m128i* d2,
+ __m128i* d3) {
+ // input
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+ // output
+ // d0 00 10 20 30 40 50 60 70
+ // d1 01 11 21 31 41 51 61 71
+ // d2 02 12 22 32 42 52 62 72
+ // d3 03 13 23 33 43 53 63 73
+
+ // 00 10 01 11 02 12 03 13
+ const __m128i w0 = _mm_unpacklo_epi16(x0, x1);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w1 = _mm_unpacklo_epi16(x2, x3);
+ // 40 50 41 51 42 52 43 53
+ const __m128i w2 = _mm_unpacklo_epi16(x4, x5);
+ // 60 70 61 71 62 72 63 73
+ const __m128i w3 = _mm_unpacklo_epi16(x6, x7);
+
+ // 00 10 20 30 01 11 21 31
+ const __m128i w4 = _mm_unpacklo_epi32(w0, w1);
+ // 40 50 60 70 41 51 61 71
+ const __m128i w5 = _mm_unpacklo_epi32(w2, w3);
+ // 02 12 22 32 03 13 23 33
+ const __m128i w6 = _mm_unpackhi_epi32(w0, w1);
+ // 42 52 62 72 43 53 63 73
+ const __m128i w7 = _mm_unpackhi_epi32(w2, w3);
+
+ // 00 10 20 30 40 50 60 70
+ *d0 = _mm_unpacklo_epi64(w4, w5);
+ // 01 11 21 31 41 51 61 71
+ *d1 = _mm_unpackhi_epi64(w4, w5);
+ // 02 12 22 32 42 52 62 72
+ *d2 = _mm_unpacklo_epi64(w6, w7);
+ // 03 13 23 33 43 53 63 73
+ *d3 = _mm_unpackhi_epi64(w6, w7);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical8(void* dest, ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ __m128i x0 = LoadUnaligned16(dst - 4 + 0 * stride);
+ __m128i x1 = LoadUnaligned16(dst - 4 + 1 * stride);
+ __m128i x2 = LoadUnaligned16(dst - 4 + 2 * stride);
+ __m128i x3 = LoadUnaligned16(dst - 4 + 3 * stride);
+
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+ Transpose8x4To4x8(x0, x1, x2, x3, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+
+ const __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ p2 = oqp2_f8;
+ q2 = _mm_srli_si128(oqp2_f8, 8);
+ }
+
+ p1 = oqp1;
+ p0 = oqp0;
+ q0 = _mm_srli_si128(oqp0, 8);
+ q1 = _mm_srli_si128(oqp1, 8);
+
+ TransposeLower4x8To8x4(p3, p2, p1, p0, q0, q1, q2, q3, &x0, &x1, &x2, &x3);
+
+ StoreUnaligned16(dst - 4 + 0 * stride, x0);
+ StoreUnaligned16(dst - 4 + 1 * stride, x1);
+ StoreUnaligned16(dst - 4 + 2 * stride, x2);
+ StoreUnaligned16(dst - 4 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 13-tap filters
+
+inline void Filter14(const __m128i& qp6, const __m128i& qp5, const __m128i& qp4,
+ const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, __m128i* oqp5, __m128i* oqp4,
+ __m128i* oqp3, __m128i* oqp2, __m128i* oqp1,
+ __m128i* oqp0) {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i qp6_lo = qp6;
+ const __m128i qp5_lo = qp5;
+ const __m128i qp4_lo = qp4;
+ const __m128i qp3_lo = qp3;
+ const __m128i qp2_lo = qp2;
+ const __m128i qp1_lo = qp1;
+ const __m128i qp0_lo = qp0;
+ const __m128i pq5_lo = _mm_shuffle_epi32(qp5_lo, 0x4e);
+ const __m128i pq4_lo = _mm_shuffle_epi32(qp4_lo, 0x4e);
+ const __m128i pq3_lo = _mm_shuffle_epi32(qp3_lo, 0x4e);
+ const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+ __m128i f14_lo =
+ _mm_add_epi16(eight, _mm_sub_epi16(_mm_slli_epi16(qp6_lo, 3), qp6_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp5_lo),
+ _mm_add_epi16(qp5_lo, qp4_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp4_lo),
+ _mm_add_epi16(qp3_lo, qp2_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp1_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0
+ // q6 * 7 + q5 * 2 + q4 * 2 + q3 + q2 + q1 + q0 + p0
+ *oqp5 = _mm_srli_epi16(f14_lo, 4);
+
+ // p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1
+ // q6 * 5 + q5 * 2 + q4 * 2 + q3 * 2 + q2 + q1 + q0 + p0 + p1
+ f14_lo = FilterAdd2Sub2(f14_lo, qp3_lo, pq1_lo, qp6_lo, qp6_lo);
+ *oqp4 = _mm_srli_epi16(f14_lo, 4);
+
+ // p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2
+ // q6 * 4 + q5 + q4 * 2 + q3 * 2 + q2 * 2 + q1 + q0 + p0 + p1 + p2
+ f14_lo = FilterAdd2Sub2(f14_lo, qp2_lo, pq2_lo, qp6_lo, qp5_lo);
+ *oqp3 = _mm_srli_epi16(f14_lo, 4);
+
+ // p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3
+ // q6 * 3 + q5 + q4 + q3 * 2 + q2 * 2 + q1 * 2 + q0 + p0 + p1 + p2 + p3
+ f14_lo = FilterAdd2Sub2(f14_lo, qp1_lo, pq3_lo, qp6_lo, qp4_lo);
+ *oqp2 = _mm_srli_epi16(f14_lo, 4);
+
+ // p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + q0 + q1 + q2 + q3 + q4
+ // q6 * 2 + q5 + q4 + q3 + q2 * 2 + q1 * 2 + q0 * 2 + p0 + p1 + p2 + p3 + p4
+ f14_lo = FilterAdd2Sub2(f14_lo, qp0_lo, pq4_lo, qp6_lo, qp3_lo);
+ *oqp1 = _mm_srli_epi16(f14_lo, 4);
+
+ // p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + q2 + q3 + q4 + q5
+ // q6 + q5 + q4 + q3 + q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1 + p2 + p3 + p4 + p5
+ f14_lo = FilterAdd2Sub2(f14_lo, pq0_lo, pq5_lo, qp6_lo, qp2_lo);
+ *oqp0 = _mm_srli_epi16(f14_lo, 4);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal14(void* dest,
+ ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ const __m128i p3 = LoadLo8(dst - 4 * stride);
+ const __m128i p2 = LoadLo8(dst - 3 * stride);
+ const __m128i p1 = LoadLo8(dst - 2 * stride);
+ const __m128i p0 = LoadLo8(dst - 1 * stride);
+ const __m128i q0 = LoadLo8(dst + 0 * stride);
+ const __m128i q1 = LoadLo8(dst + 1 * stride);
+ const __m128i q2 = LoadLo8(dst + 2 * stride);
+ const __m128i q3 = LoadLo8(dst + 3 * stride);
+ const __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ const __m128i p6 = LoadLo8(dst - 7 * stride);
+ const __m128i p5 = LoadLo8(dst - 6 * stride);
+ const __m128i p4 = LoadLo8(dst - 5 * stride);
+ const __m128i q4 = LoadLo8(dst + 4 * stride);
+ const __m128i q5 = LoadLo8(dst + 5 * stride);
+ const __m128i q6 = LoadLo8(dst + 6 * stride);
+ const __m128i qp6 = _mm_unpacklo_epi64(p6, q6);
+ const __m128i qp5 = _mm_unpacklo_epi64(p5, q5);
+ const __m128i qp4 = _mm_unpacklo_epi64(p4, q4);
+
+ const __m128i v_isflatouter4_mask =
+ IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+ const __m128i v_flat4_mask_lo = _mm_and_si128(v_mask, v_isflatouter4_mask);
+ const __m128i v_flat4_mask =
+ _mm_unpacklo_epi64(v_flat4_mask_lo, v_flat4_mask_lo);
+
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ if (_mm_test_all_zeros(v_flat4_mask,
+ _mm_cmpeq_epi16(v_flat4_mask, v_flat4_mask)) == 0) {
+ __m128i oqp5_f14;
+ __m128i oqp4_f14;
+ __m128i oqp3_f14;
+ __m128i oqp2_f14;
+ __m128i oqp1_f14;
+ __m128i oqp0_f14;
+
+ Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+ &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+ oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+ oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+ oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+ oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+
+ StoreLo8(dst - 6 * stride, oqp5_f14);
+ StoreLo8(dst - 5 * stride, oqp4_f14);
+ StoreLo8(dst - 4 * stride, oqp3_f14);
+
+ StoreHi8(dst + 3 * stride, oqp3_f14);
+ StoreHi8(dst + 4 * stride, oqp4_f14);
+ StoreHi8(dst + 5 * stride, oqp5_f14);
+ }
+
+ StoreLo8(dst - 3 * stride, oqp2_f8);
+ StoreHi8(dst + 2 * stride, oqp2_f8);
+ }
+
+ StoreLo8(dst - 2 * stride, oqp1);
+ StoreLo8(dst - 1 * stride, oqp0);
+ StoreHi8(dst + 0 * stride, oqp0);
+ StoreHi8(dst + 1 * stride, oqp1);
+}
+
+inline void TransposeUpper4x8To8x4(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3,
+ const __m128i& x4, const __m128i& x5,
+ const __m128i& x6, const __m128i& x7,
+ __m128i* d0, __m128i* d1, __m128i* d2,
+ __m128i* d3) {
+ // input
+ // x0 00 01 02 03 xx xx xx xx
+ // x1 10 11 12 13 xx xx xx xx
+ // x2 20 21 22 23 xx xx xx xx
+ // x3 30 31 32 33 xx xx xx xx
+ // x4 40 41 42 43 xx xx xx xx
+ // x5 50 51 52 53 xx xx xx xx
+ // x6 60 61 62 63 xx xx xx xx
+ // x7 70 71 72 73 xx xx xx xx
+ // output
+ // d0 00 10 20 30 40 50 60 70
+ // d1 01 11 21 31 41 51 61 71
+ // d2 02 12 22 32 42 52 62 72
+ // d3 03 13 23 33 43 53 63 73
+
+ // 00 10 01 11 02 12 03 13
+ const __m128i w0 = _mm_unpackhi_epi16(x0, x1);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w1 = _mm_unpackhi_epi16(x2, x3);
+ // 40 50 41 51 42 52 43 53
+ const __m128i w2 = _mm_unpackhi_epi16(x4, x5);
+ // 60 70 61 71 62 72 63 73
+ const __m128i w3 = _mm_unpackhi_epi16(x6, x7);
+
+ // 00 10 20 30 01 11 21 31
+ const __m128i w4 = _mm_unpacklo_epi32(w0, w1);
+ // 40 50 60 70 41 51 61 71
+ const __m128i w5 = _mm_unpacklo_epi32(w2, w3);
+ // 02 12 22 32 03 13 23 33
+ const __m128i w6 = _mm_unpackhi_epi32(w0, w1);
+ // 42 52 62 72 43 53 63 73
+ const __m128i w7 = _mm_unpackhi_epi32(w2, w3);
+
+ // 00 10 20 30 40 50 60 70
+ *d0 = _mm_unpacklo_epi64(w4, w5);
+ // 01 11 21 31 41 51 61 71
+ *d1 = _mm_unpackhi_epi64(w4, w5);
+ // 02 12 22 32 42 52 62 72
+ *d2 = _mm_unpacklo_epi64(w6, w7);
+ // 03 13 23 33 43 53 63 73
+ *d3 = _mm_unpackhi_epi64(w6, w7);
+}
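A sketch of the qp packing that these transposes undo (illustration only, not
part of the patch): throughout this filter a single register carries the p side
of the edge in its low 64 bits and the q side in its high 64 bits, e.g.

  inline __m128i PackQP(const __m128i p_side, const __m128i q_side) {
    return _mm_unpacklo_epi64(p_side, q_side);  // p0..p3 | q0..q3, 16-bit lanes
  }

TransposeLower4x8To8x4() reads the low (p) halves back out as rows for the left
eight columns, while TransposeUpper4x8To8x4() does the same for the high (q)
halves and the right eight columns.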
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical14(void* dest, ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+ //
+ // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
+ // 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
+ // 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
+ // 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
+
+ __m128i x0 = LoadUnaligned16(dst - 8 + 0 * stride);
+ __m128i x1 = LoadUnaligned16(dst - 8 + 1 * stride);
+ __m128i x2 = LoadUnaligned16(dst - 8 + 2 * stride);
+ __m128i x3 = LoadUnaligned16(dst - 8 + 3 * stride);
+
+ __m128i p7, p6, p5, p4, p3, p2, p1, p0;
+ __m128i q7, q6, q5, q4, q3, q2, q1, q0;
+
+ Transpose8x4To4x8(x0, x1, x2, x3, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0);
+
+ x0 = LoadUnaligned16(dst - 8 + 8 + 0 * stride);
+ x1 = LoadUnaligned16(dst - 8 + 8 + 1 * stride);
+ x2 = LoadUnaligned16(dst - 8 + 8 + 2 * stride);
+ x3 = LoadUnaligned16(dst - 8 + 8 + 3 * stride);
+
+ Transpose8x4To4x8(x0, x1, x2, x3, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
+
+ __m128i qp7 = _mm_unpacklo_epi64(p7, q7);
+ __m128i qp6 = _mm_unpacklo_epi64(p6, q6);
+ __m128i qp5 = _mm_unpacklo_epi64(p5, q5);
+ __m128i qp4 = _mm_unpacklo_epi64(p4, q4);
+ __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+ __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ const __m128i v_isflatouter4_mask =
+ IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+ const __m128i v_flat4_mask_lo = _mm_and_si128(v_mask, v_isflatouter4_mask);
+ const __m128i v_flat4_mask =
+ _mm_unpacklo_epi64(v_flat4_mask_lo, v_flat4_mask_lo);
+
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ if (_mm_test_all_zeros(v_flat4_mask,
+ _mm_cmpeq_epi16(v_flat4_mask, v_flat4_mask)) == 0) {
+ __m128i oqp5_f14;
+ __m128i oqp4_f14;
+ __m128i oqp3_f14;
+ __m128i oqp2_f14;
+ __m128i oqp1_f14;
+ __m128i oqp0_f14;
+
+ Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+ &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+ oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+ oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+ oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+ oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+ qp3 = oqp3_f14;
+ qp4 = oqp4_f14;
+ qp5 = oqp5_f14;
+ }
+ qp2 = oqp2_f8;
+ }
+
+ TransposeLower4x8To8x4(qp7, qp6, qp5, qp4, qp3, qp2, oqp1, oqp0, &x0, &x1,
+ &x2, &x3);
+
+ StoreUnaligned16(dst - 8 + 0 * stride, x0);
+ StoreUnaligned16(dst - 8 + 1 * stride, x1);
+ StoreUnaligned16(dst - 8 + 2 * stride, x2);
+ StoreUnaligned16(dst - 8 + 3 * stride, x3);
+
+ TransposeUpper4x8To8x4(oqp0, oqp1, qp2, qp3, qp4, qp5, qp6, qp7, &x0, &x1,
+ &x2, &x3);
+
+ StoreUnaligned16(dst - 8 + 8 + 0 * stride, x0);
+ StoreUnaligned16(dst - 8 + 8 + 1 * stride, x1);
+ StoreUnaligned16(dst - 8 + 8 + 2 * stride, x2);
+ StoreUnaligned16(dst - 8 + 8 + 3 * stride, x3);
+}
+
+using Defs10bpp = LoopFilterFuncs_SSE4_1<kBitdepth10>;
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal4;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal6;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal8;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal14;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical4;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical6;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical8;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical14;
+#endif
+}
+#endif
+} // namespace
+} // namespace high_bitdepth
+
+void LoopFilterInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void LoopFilterInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/loop_filter_sse4.h b/src/dsp/x86/loop_filter_sse4.h
new file mode 100644
index 0000000..4795d8b
--- /dev/null
+++ b/src/dsp/x86/loop_filter_sse4.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_filters, see the defines below for specifics. This
+// function is not thread-safe.
+void LoopFilterInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If SSE4.1 is enabled and the baseline isn't already set by a higher level of
+// optimization, signal that the SSE4.1 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical
+#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical
+#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical
+#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical
+#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical
+#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical
+#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical
+#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical
+#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_
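A hypothetical illustration of the #ifndef guards above (the AVX2 define below
is invented for the example; no AVX2 loop filter exists in this patch): if an
earlier header had already claimed an entry, e.g.

  #define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal LIBGAV1_CPU_AVX2  // hypothetical

then the #ifndef in this header leaves that value untouched and, as I read the
DSP_ENABLED_8BPP_SSE4_1(...) checks in Init8bpp(), the corresponding SSE4.1
function would not be installed into the dispatch table.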
diff --git a/src/dsp/x86/loop_restoration_10bit_avx2.cc b/src/dsp/x86/loop_restoration_10bit_avx2.cc
new file mode 100644
index 0000000..702bdea
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_10bit_avx2.cc
@@ -0,0 +1,592 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline void WienerHorizontalClip(const __m256i s[2],
+ int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit = (offset << 2) - 1;
+ const __m256i offsets = _mm256_set1_epi16(-offset);
+ const __m256i limits = _mm256_set1_epi16(limit - offset);
+ const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsHorizontal - 1));
+ const __m256i sum0 = _mm256_add_epi32(s[0], round);
+ const __m256i sum1 = _mm256_add_epi32(s[1], round);
+ const __m256i rounded_sum0 =
+ _mm256_srai_epi32(sum0, kInterRoundBitsHorizontal);
+ const __m256i rounded_sum1 =
+ _mm256_srai_epi32(sum1, kInterRoundBitsHorizontal);
+ const __m256i rounded_sum = _mm256_packs_epi32(rounded_sum0, rounded_sum1);
+ const __m256i d0 = _mm256_max_epi16(rounded_sum, offsets);
+ const __m256i d1 = _mm256_min_epi16(d0, limits);
+ StoreAligned32(wiener_buffer, d1);
+}
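A worked instance of the clip window above, assuming the usual libgav1
constants (kWienerFilterBits == 7, kInterRoundBitsHorizontal == 3):

  // offset = 1 << (10 + 7 - 3 - 1) = 8192
  // limit  = (8192 << 2) - 1       = 32767
  // so the stored int16_t values lie in [-offset, limit - offset]
  //                                   = [-8192, 24575].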
+
+inline void WienerHorizontalTap7Kernel(const __m256i s[7],
+ const __m256i filter[2],
+ int16_t* const wiener_buffer) {
+ const __m256i s06 = _mm256_add_epi16(s[0], s[6]);
+ const __m256i s15 = _mm256_add_epi16(s[1], s[5]);
+ const __m256i s24 = _mm256_add_epi16(s[2], s[4]);
+ const __m256i ss0 = _mm256_unpacklo_epi16(s06, s15);
+ const __m256i ss1 = _mm256_unpackhi_epi16(s06, s15);
+ const __m256i ss2 = _mm256_unpacklo_epi16(s24, s[3]);
+ const __m256i ss3 = _mm256_unpackhi_epi16(s24, s[3]);
+ __m256i madds[4];
+ madds[0] = _mm256_madd_epi16(ss0, filter[0]);
+ madds[1] = _mm256_madd_epi16(ss1, filter[0]);
+ madds[2] = _mm256_madd_epi16(ss2, filter[1]);
+ madds[3] = _mm256_madd_epi16(ss3, filter[1]);
+ madds[0] = _mm256_add_epi32(madds[0], madds[2]);
+ madds[1] = _mm256_add_epi32(madds[1], madds[3]);
+ WienerHorizontalClip(madds, wiener_buffer);
+}
+
+inline void WienerHorizontalTap5Kernel(const __m256i s[5], const __m256i filter,
+ int16_t* const wiener_buffer) {
+ const __m256i s04 = _mm256_add_epi16(s[0], s[4]);
+ const __m256i s13 = _mm256_add_epi16(s[1], s[3]);
+ const __m256i s2d = _mm256_add_epi16(s[2], s[2]);
+ const __m256i s0m = _mm256_sub_epi16(s04, s2d);
+ const __m256i s1m = _mm256_sub_epi16(s13, s2d);
+ const __m256i ss0 = _mm256_unpacklo_epi16(s0m, s1m);
+ const __m256i ss1 = _mm256_unpackhi_epi16(s0m, s1m);
+ __m256i madds[2];
+ madds[0] = _mm256_madd_epi16(ss0, filter);
+ madds[1] = _mm256_madd_epi16(ss1, filter);
+ const __m256i s2_lo = _mm256_unpacklo_epi16(s[2], _mm256_setzero_si256());
+ const __m256i s2_hi = _mm256_unpackhi_epi16(s[2], _mm256_setzero_si256());
+ const __m256i s2x128_lo = _mm256_slli_epi32(s2_lo, 7);
+ const __m256i s2x128_hi = _mm256_slli_epi32(s2_hi, 7);
+ madds[0] = _mm256_add_epi32(madds[0], s2x128_lo);
+ madds[1] = _mm256_add_epi32(madds[1], s2x128_hi);
+ WienerHorizontalClip(madds, wiener_buffer);
+}
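Why the 5-tap kernel can fold away its centre coefficient (a sketch, assuming
the AV1 Wiener normalization in which the symmetric taps sum to 128 and the two
outer taps are zero here): the centre tap equals 128 - 2 * (c1 + c2), so

  // c1*(s0 + s4) + c2*(s1 + s3) + (128 - 2*(c1 + c2))*s2
  //   == c1*(s0 + s4 - 2*s2) + c2*(s1 + s3 - 2*s2) + 128*s2,
  // i.e. the madd over (s0m, s1m) plus the (s2 << 7) term added above.
  inline int WienerTap5Scalar(const int16_t s[5], const int c1, const int c2) {
    return c1 * (s[0] + s[4] - 2 * s[2]) + c2 * (s[1] + s[3] - 2 * s[2]) +
           128 * s[2];
  }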
+
+inline void WienerHorizontalTap3Kernel(const __m256i s[3], const __m256i filter,
+ int16_t* const wiener_buffer) {
+ const __m256i s02 = _mm256_add_epi16(s[0], s[2]);
+ const __m256i ss0 = _mm256_unpacklo_epi16(s02, s[1]);
+ const __m256i ss1 = _mm256_unpackhi_epi16(s02, s[1]);
+ __m256i madds[2];
+ madds[0] = _mm256_madd_epi16(ss0, filter);
+ madds[1] = _mm256_madd_epi16(ss1, filter);
+ WienerHorizontalClip(madds, wiener_buffer);
+}
+
+inline void WienerHorizontalTap7(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i* const coefficients,
+ int16_t** const wiener_buffer) {
+ __m256i filter[2];
+ filter[0] = _mm256_shuffle_epi32(*coefficients, 0x0);
+ filter[1] = _mm256_shuffle_epi32(*coefficients, 0x55);
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i s[7];
+ s[0] = LoadUnaligned32(src + x + 0);
+ s[1] = LoadUnaligned32(src + x + 1);
+ s[2] = LoadUnaligned32(src + x + 2);
+ s[3] = LoadUnaligned32(src + x + 3);
+ s[4] = LoadUnaligned32(src + x + 4);
+ s[5] = LoadUnaligned32(src + x + 5);
+ s[6] = LoadUnaligned32(src + x + 6);
+ WienerHorizontalTap7Kernel(s, filter, *wiener_buffer + x);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap5(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i* const coefficients,
+ int16_t** const wiener_buffer) {
+ const __m256i filter =
+ _mm256_shuffle_epi8(*coefficients, _mm256_set1_epi32(0x05040302));
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i s[5];
+ s[0] = LoadUnaligned32(src + x + 0);
+ s[1] = LoadUnaligned32(src + x + 1);
+ s[2] = LoadUnaligned32(src + x + 2);
+ s[3] = LoadUnaligned32(src + x + 3);
+ s[4] = LoadUnaligned32(src + x + 4);
+ WienerHorizontalTap5Kernel(s, filter, *wiener_buffer + x);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i* const coefficients,
+ int16_t** const wiener_buffer) {
+ const auto filter = _mm256_shuffle_epi32(*coefficients, 0x55);
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i s[3];
+ s[0] = LoadUnaligned32(src + x + 0);
+ s[1] = LoadUnaligned32(src + x + 1);
+ s[2] = LoadUnaligned32(src + x + 2);
+ WienerHorizontalTap3Kernel(s, filter, *wiener_buffer + x);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap1(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ const __m256i s0 = LoadUnaligned32(src + x);
+ const __m256i d0 = _mm256_slli_epi16(s0, 4);
+ StoreAligned32(*wiener_buffer + x, d0);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
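In this pass-through case all three outer taps are zero, so only the derived
centre tap remains; the plain left shift by 4 keeps the output in the same
fixed-point domain as the filtered cases, assuming the taps sum to 128 and
kInterRoundBitsHorizontal == 3, since (128 * s) >> 3 == s << 4.

  static_assert((128 >> 3) == (1 << 4),
                "pass-through scale matches the filtered-path scale");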
+
+inline __m256i WienerVertical7(const __m256i a[4], const __m256i filter[4]) {
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i madd2 = _mm256_madd_epi16(a[2], filter[2]);
+ const __m256i madd3 = _mm256_madd_epi16(a[3], filter[3]);
+ const __m256i madd01 = _mm256_add_epi32(madd0, madd1);
+ const __m256i madd23 = _mm256_add_epi32(madd2, madd3);
+ const __m256i sum = _mm256_add_epi32(madd01, madd23);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical5(const __m256i a[3], const __m256i filter[3]) {
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i madd2 = _mm256_madd_epi16(a[2], filter[2]);
+ const __m256i madd01 = _mm256_add_epi32(madd0, madd1);
+ const __m256i sum = _mm256_add_epi32(madd01, madd2);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical3(const __m256i a[2], const __m256i filter[2]) {
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i sum = _mm256_add_epi32(madd0, madd1);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVerticalClip(const __m256i s[2]) {
+ const __m256i d = _mm256_packus_epi32(s[0], s[1]);
+ return _mm256_min_epu16(d, _mm256_set1_epi16(1023));
+}
+
+inline __m256i WienerVerticalFilter7(const __m256i a[7],
+ const __m256i filter[2]) {
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m256i b[4], c[2];
+ b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm256_unpacklo_epi16(a[2], a[3]);
+ b[2] = _mm256_unpacklo_epi16(a[4], a[5]);
+ b[3] = _mm256_unpacklo_epi16(a[6], round);
+ c[0] = WienerVertical7(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm256_unpackhi_epi16(a[2], a[3]);
+ b[2] = _mm256_unpackhi_epi16(a[4], a[5]);
+ b[3] = _mm256_unpackhi_epi16(a[6], round);
+ c[1] = WienerVertical7(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalFilter5(const __m256i a[5],
+ const __m256i filter[3]) {
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m256i b[3], c[2];
+ b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm256_unpacklo_epi16(a[2], a[3]);
+ b[2] = _mm256_unpacklo_epi16(a[4], round);
+ c[0] = WienerVertical5(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm256_unpackhi_epi16(a[2], a[3]);
+ b[2] = _mm256_unpackhi_epi16(a[4], round);
+ c[1] = WienerVertical5(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalFilter3(const __m256i a[3],
+ const __m256i filter[2]) {
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m256i b[2], c[2];
+ b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm256_unpacklo_epi16(a[2], round);
+ c[0] = WienerVertical3(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm256_unpackhi_epi16(a[2], round);
+ c[1] = WienerVertical3(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i a[7]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+ a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+ a[6] = LoadAligned32(wiener_buffer + 6 * wiener_stride);
+ return WienerVerticalFilter7(a, filter);
+}
+
+inline __m256i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[3], __m256i a[5]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+ return WienerVerticalFilter5(a, filter);
+}
+
+inline __m256i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i a[3]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i d[2]) {
+ __m256i a[8];
+ d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[7] = LoadAligned32(wiener_buffer + 7 * wiener_stride);
+ d[1] = WienerVerticalFilter7(a + 1, filter);
+}
+
+inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[3], __m256i d[2]) {
+ __m256i a[6];
+ d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+ d[1] = WienerVerticalFilter5(a + 1, filter);
+}
+
+inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i d[2]) {
+ __m256i a[4];
+ d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ d[1] = WienerVerticalFilter3(a + 1, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[4], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+ __m256i filter[4];
+ filter[0] = _mm256_shuffle_epi32(c, 0x0);
+ filter[1] = _mm256_shuffle_epi32(c, 0x55);
+ filter[2] = _mm256_shuffle_epi8(c, _mm256_set1_epi32(0x03020504));
+ filter[3] =
+ _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2];
+ WienerVerticalTap7Kernel2(wiener_buffer + x, width, filter, d);
+ StoreUnaligned32(dst + x, d[0]);
+ StoreUnaligned32(dst + dst_stride + x, d[1]);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[7];
+ const __m256i d =
+ WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+ StoreUnaligned32(dst + x, d);
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[3], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+ __m256i filter[3];
+ filter[0] = _mm256_shuffle_epi32(c, 0x0);
+ filter[1] = _mm256_shuffle_epi8(c, _mm256_set1_epi32(0x03020504));
+ filter[2] =
+ _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2];
+ WienerVerticalTap5Kernel2(wiener_buffer + x, width, filter, d);
+ StoreUnaligned32(dst + x, d[0]);
+ StoreUnaligned32(dst + dst_stride + x, d[1]);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[5];
+ const __m256i d =
+ WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+ StoreUnaligned32(dst + x, d);
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[2], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ __m256i filter[2];
+ filter[0] =
+ _mm256_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+ filter[1] =
+ _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2][2];
+ WienerVerticalTap3Kernel2(wiener_buffer + x, width, filter, d[0]);
+ StoreUnaligned32(dst + x, d[0][0]);
+ StoreUnaligned32(dst + dst_stride + x, d[0][1]);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[3];
+ const __m256i d =
+ WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+ StoreUnaligned32(dst + x, d);
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint16_t* const dst) {
+ const __m256i a = LoadAligned32(wiener_buffer);
+ const __m256i b = _mm256_add_epi16(a, _mm256_set1_epi16(8));
+ const __m256i c = _mm256_srai_epi16(b, 4);
+ const __m256i d = _mm256_max_epi16(c, _mm256_setzero_si256());
+ const __m256i e = _mm256_min_epi16(d, _mm256_set1_epi16(1023));
+ StoreUnaligned32(dst, e);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint16_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ x += 16;
+ } while (x < width);
+ }
+}
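A scalar equivalent of WienerVerticalTap1Kernel() above (sketch only): it
undoes the << 4 applied by the horizontal pass, with rounding, and clamps to
the 10-bit range.

  inline uint16_t WienerVerticalTap1Scalar(const int16_t x) {
    const int y = (x + 8) >> 4;  // round and rescale
    return static_cast<uint16_t>(std::min(std::max(y, 0), 1023));  // 10-bit clamp
  }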
+
+void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info,
+ const void* const source, const void* const top_border,
+ const void* const bottom_border, const ptrdiff_t stride,
+ const int width, const int height,
+ RestorationBuffer* const restoration_buffer,
+ void* const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 16);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 13 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+  // Horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint16_t*>(source);
+ const auto* const top = static_cast<const uint16_t*>(top_border);
+ const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+ const __m128i c =
+ LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+ const __m256i coefficients_horizontal = _mm256_broadcastq_epi64(c);
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
+ wiener_stride, height_extra, &coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
+ wiener_stride, height_extra, &coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
+ wiener_stride, height_extra, &coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
+ wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ }
+
+  // Vertical filtering.
+ // Over-writes up to 15 values.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ auto* dst = static_cast<uint16_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, we can
+    // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical + 1, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical + 2, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
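A worked pass through the row bookkeeping above (assuming kWienerFilterTaps ==
7): number_rows_to_skip is max(vertical leading zeros, 1), so

  // vertical leading zeros 0 or 1: skip 1 row  -> height_horizontal = height + 4, height_extra = 2
  // vertical leading zeros 2:      skip 2 rows -> height_horizontal = height + 2, height_extra = 1
  // vertical leading zeros 3:      skip 3 rows -> height_horizontal = height,     height_extra = 0

which is why the assert bounds height_extra by 2 and why each of the top and
bottom border calls filters height_extra rows.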
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_AVX2(WienerFilter)
+ dsp->loop_restorations[0] = WienerFilter_AVX2;
+#endif
+}
+
+} // namespace
+
+void LoopRestorationInit10bpp_AVX2() { Init10bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !(LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit10bpp_AVX2() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/x86/loop_restoration_10bit_sse4.cc b/src/dsp/x86/loop_restoration_10bit_sse4.cc
new file mode 100644
index 0000000..0598435
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_10bit_sse4.cc
@@ -0,0 +1,551 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline void WienerHorizontalClip(const __m128i s[2],
+ int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit = (offset << 2) - 1;
+ const __m128i offsets = _mm_set1_epi16(-offset);
+ const __m128i limits = _mm_set1_epi16(limit - offset);
+ const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsHorizontal - 1));
+ const __m128i sum0 = _mm_add_epi32(s[0], round);
+ const __m128i sum1 = _mm_add_epi32(s[1], round);
+ const __m128i rounded_sum0 = _mm_srai_epi32(sum0, kInterRoundBitsHorizontal);
+ const __m128i rounded_sum1 = _mm_srai_epi32(sum1, kInterRoundBitsHorizontal);
+ const __m128i rounded_sum = _mm_packs_epi32(rounded_sum0, rounded_sum1);
+ const __m128i d0 = _mm_max_epi16(rounded_sum, offsets);
+ const __m128i d1 = _mm_min_epi16(d0, limits);
+ StoreAligned16(wiener_buffer, d1);
+}
+
+inline void WienerHorizontalTap7(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ __m128i filter[2];
+ filter[0] = _mm_shuffle_epi32(coefficients, 0x0);
+ filter[1] = _mm_shuffle_epi32(coefficients, 0x55);
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[7], madds[4];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ s[3] = LoadUnaligned16(src + x + 3);
+ s[4] = LoadUnaligned16(src + x + 4);
+ s[5] = LoadUnaligned16(src + x + 5);
+ s[6] = LoadUnaligned16(src + x + 6);
+ const __m128i s06 = _mm_add_epi16(s[0], s[6]);
+ const __m128i s15 = _mm_add_epi16(s[1], s[5]);
+ const __m128i s24 = _mm_add_epi16(s[2], s[4]);
+ const __m128i ss0 = _mm_unpacklo_epi16(s06, s15);
+ const __m128i ss1 = _mm_unpackhi_epi16(s06, s15);
+ const __m128i ss2 = _mm_unpacklo_epi16(s24, s[3]);
+ const __m128i ss3 = _mm_unpackhi_epi16(s24, s[3]);
+ madds[0] = _mm_madd_epi16(ss0, filter[0]);
+ madds[1] = _mm_madd_epi16(ss1, filter[0]);
+ madds[2] = _mm_madd_epi16(ss2, filter[1]);
+ madds[3] = _mm_madd_epi16(ss3, filter[1]);
+ madds[0] = _mm_add_epi32(madds[0], madds[2]);
+ madds[1] = _mm_add_epi32(madds[1], madds[3]);
+ WienerHorizontalClip(madds, *wiener_buffer + x);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap5(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const __m128i filter =
+ _mm_shuffle_epi8(coefficients, _mm_set1_epi32(0x05040302));
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[5], madds[2];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ s[3] = LoadUnaligned16(src + x + 3);
+ s[4] = LoadUnaligned16(src + x + 4);
+ const __m128i s04 = _mm_add_epi16(s[0], s[4]);
+ const __m128i s13 = _mm_add_epi16(s[1], s[3]);
+ const __m128i s2d = _mm_add_epi16(s[2], s[2]);
+ const __m128i s0m = _mm_sub_epi16(s04, s2d);
+ const __m128i s1m = _mm_sub_epi16(s13, s2d);
+ const __m128i ss0 = _mm_unpacklo_epi16(s0m, s1m);
+ const __m128i ss1 = _mm_unpackhi_epi16(s0m, s1m);
+ madds[0] = _mm_madd_epi16(ss0, filter);
+ madds[1] = _mm_madd_epi16(ss1, filter);
+ const __m128i s2_lo = _mm_unpacklo_epi16(s[2], _mm_setzero_si128());
+ const __m128i s2_hi = _mm_unpackhi_epi16(s[2], _mm_setzero_si128());
+ const __m128i s2x128_lo = _mm_slli_epi32(s2_lo, 7);
+ const __m128i s2x128_hi = _mm_slli_epi32(s2_hi, 7);
+ madds[0] = _mm_add_epi32(madds[0], s2x128_lo);
+ madds[1] = _mm_add_epi32(madds[1], s2x128_hi);
+ WienerHorizontalClip(madds, *wiener_buffer + x);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const auto filter = _mm_shuffle_epi32(coefficients, 0x55);
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[3], madds[2];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ const __m128i s02 = _mm_add_epi16(s[0], s[2]);
+ const __m128i ss0 = _mm_unpacklo_epi16(s02, s[1]);
+ const __m128i ss1 = _mm_unpackhi_epi16(s02, s[1]);
+ madds[0] = _mm_madd_epi16(ss0, filter);
+ madds[1] = _mm_madd_epi16(ss1, filter);
+ WienerHorizontalClip(madds, *wiener_buffer + x);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap1(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ const __m128i s = LoadUnaligned16(src + x);
+ const __m128i d = _mm_slli_epi16(s, 4);
+ StoreAligned16(*wiener_buffer + x, d);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline __m128i WienerVertical7(const __m128i a[4], const __m128i filter[4]) {
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]);
+ const __m128i madd3 = _mm_madd_epi16(a[3], filter[3]);
+ const __m128i madd01 = _mm_add_epi32(madd0, madd1);
+ const __m128i madd23 = _mm_add_epi32(madd2, madd3);
+ const __m128i sum = _mm_add_epi32(madd01, madd23);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical5(const __m128i a[3], const __m128i filter[3]) {
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]);
+ const __m128i madd01 = _mm_add_epi32(madd0, madd1);
+ const __m128i sum = _mm_add_epi32(madd01, madd2);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical3(const __m128i a[2], const __m128i filter[2]) {
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i sum = _mm_add_epi32(madd0, madd1);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVerticalClip(const __m128i s[2]) {
+ const __m128i d = _mm_packus_epi32(s[0], s[1]);
+ return _mm_min_epu16(d, _mm_set1_epi16(1023));
+}
+
+inline __m128i WienerVerticalFilter7(const __m128i a[7],
+ const __m128i filter[2]) {
+ const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m128i b[4], c[2];
+ b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm_unpacklo_epi16(a[2], a[3]);
+ b[2] = _mm_unpacklo_epi16(a[4], a[5]);
+ b[3] = _mm_unpacklo_epi16(a[6], round);
+ c[0] = WienerVertical7(b, filter);
+ b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm_unpackhi_epi16(a[2], a[3]);
+ b[2] = _mm_unpackhi_epi16(a[4], a[5]);
+ b[3] = _mm_unpackhi_epi16(a[6], round);
+ c[1] = WienerVertical7(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalFilter5(const __m128i a[5],
+ const __m128i filter[3]) {
+ const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m128i b[3], c[2];
+ b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm_unpacklo_epi16(a[2], a[3]);
+ b[2] = _mm_unpacklo_epi16(a[4], round);
+ c[0] = WienerVertical5(b, filter);
+ b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm_unpackhi_epi16(a[2], a[3]);
+ b[2] = _mm_unpackhi_epi16(a[4], round);
+ c[1] = WienerVertical5(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalFilter3(const __m128i a[3],
+ const __m128i filter[2]) {
+ const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m128i b[2], c[2];
+ b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm_unpacklo_epi16(a[2], round);
+ c[0] = WienerVertical3(b, filter);
+ b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm_unpackhi_epi16(a[2], round);
+ c[1] = WienerVertical3(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i a[7]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+ a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
+ a[6] = LoadAligned16(wiener_buffer + 6 * wiener_stride);
+ return WienerVerticalFilter7(a, filter);
+}
+
+inline __m128i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[3], __m128i a[5]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+ return WienerVerticalFilter5(a, filter);
+}
+
+inline __m128i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i a[3]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[4], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i c = LoadLo8(coefficients);
+ __m128i filter[4];
+ filter[0] = _mm_shuffle_epi32(c, 0x0);
+ filter[1] = _mm_shuffle_epi32(c, 0x55);
+ filter[2] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504));
+ filter[3] =
+ _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[8], d[2];
+ d[0] = WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+ a[7] = LoadAligned16(wiener_buffer + x + 7 * width);
+ d[1] = WienerVerticalFilter7(a + 1, filter);
+ StoreAligned16(dst + x, d[0]);
+ StoreAligned16(dst + dst_stride + x, d[1]);
+ x += 8;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[7];
+ const __m128i d =
+ WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+ StoreAligned16(dst + x, d);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[3], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i c = LoadLo8(coefficients);
+ __m128i filter[3];
+ filter[0] = _mm_shuffle_epi32(c, 0x0);
+ filter[1] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504));
+ filter[2] =
+ _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[6], d[2];
+ d[0] = WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+ a[5] = LoadAligned16(wiener_buffer + x + 5 * width);
+ d[1] = WienerVerticalFilter5(a + 1, filter);
+ StoreAligned16(dst + x, d[0]);
+ StoreAligned16(dst + dst_stride + x, d[1]);
+ x += 8;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[5];
+ const __m128i d =
+ WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+ StoreAligned16(dst + x, d);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[2], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ __m128i filter[2];
+ filter[0] = _mm_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+ filter[1] =
+ _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[4], d[2];
+ d[0] = WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+ a[3] = LoadAligned16(wiener_buffer + x + 3 * width);
+ d[1] = WienerVerticalFilter3(a + 1, filter);
+ StoreAligned16(dst + x, d[0]);
+ StoreAligned16(dst + dst_stride + x, d[1]);
+ x += 8;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[3];
+ const __m128i d =
+ WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+ StoreAligned16(dst + x, d);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint16_t* const dst) {
+ const __m128i a = LoadAligned16(wiener_buffer);
+ const __m128i b = _mm_add_epi16(a, _mm_set1_epi16(8));
+ const __m128i c = _mm_srai_epi16(b, 4);
+ const __m128i d = _mm_max_epi16(c, _mm_setzero_si128());
+ const __m128i e = _mm_min_epi16(d, _mm_set1_epi16(1023));
+ StoreAligned16(dst, e);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint16_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+ x += 8;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info,
+ const void* const source, const void* const top_border,
+ const void* const bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height,
+ RestorationBuffer* const restoration_buffer,
+ void* const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 16);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 13 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+  // Horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint16_t*>(source);
+ const auto* const top = static_cast<const uint16_t*>(top_border);
+ const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+ const __m128i coefficients_horizontal =
+ LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
+ wiener_stride, height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
+ wiener_stride, height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
+ wiener_stride, height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
+ wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ }
+
+  // Vertical filtering.
+ // Over-writes up to 15 values.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ auto* dst = static_cast<uint16_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, we can
+    // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical + 1, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical + 2, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(WienerFilter)
+ dsp->loop_restorations[0] = WienerFilter_SSE4_1;
+#else
+ static_cast<void>(WienerFilter_SSE4_1);
+#endif
+}
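The #else branch above appears to exist only to keep the unused-function
warning quiet when the SSE4.1 kernel is compiled but not registered; discarding
the name in a void cast is the usual idiom:

  static_cast<void>(some_unused_symbol);  // hypothetical; marks it intentionally unused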
+
+} // namespace
+
+void LoopRestorationInit10bpp_SSE4_1() { Init10bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !(LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit10bpp_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/x86/loop_restoration_avx2.cc b/src/dsp/x86/loop_restoration_avx2.cc
new file mode 100644
index 0000000..7ae7c90
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_avx2.cc
@@ -0,0 +1,2902 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+inline void WienerHorizontalClip(const __m256i s[2], const __m256i s_3x128,
+ int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit =
+ (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
+ const __m256i offsets = _mm256_set1_epi16(-offset);
+ const __m256i limits = _mm256_set1_epi16(limit - offset);
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsHorizontal - 1));
+ // The sum range here is [-128 * 255, 90 * 255].
+ const __m256i madd = _mm256_add_epi16(s[0], s[1]);
+ const __m256i sum = _mm256_add_epi16(madd, round);
+ const __m256i rounded_sum0 =
+ _mm256_srai_epi16(sum, kInterRoundBitsHorizontal);
+ // Add back scaled down offset correction.
+ const __m256i rounded_sum1 = _mm256_add_epi16(rounded_sum0, s_3x128);
+ const __m256i d0 = _mm256_max_epi16(rounded_sum1, offsets);
+ const __m256i d1 = _mm256_min_epi16(d0, limits);
+ StoreAligned32(wiener_buffer, d1);
+}
+
+// Using _mm256_alignr_epi8() is about 8% faster than loading each shifted
+// window and unpacking it, because the compiler generates redundant code for
+// the load-and-unpack approach.
+inline void WienerHorizontalTap7Kernel(const __m256i s[2],
+ const __m256i filter[4],
+ int16_t* const wiener_buffer) {
+ const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1);
+ const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5);
+ const auto s45 = _mm256_alignr_epi8(s[1], s[0], 9);
+ const auto s67 = _mm256_alignr_epi8(s[1], s[0], 13);
+ __m256i madds[4];
+ madds[0] = _mm256_maddubs_epi16(s01, filter[0]);
+ madds[1] = _mm256_maddubs_epi16(s23, filter[1]);
+ madds[2] = _mm256_maddubs_epi16(s45, filter[2]);
+ madds[3] = _mm256_maddubs_epi16(s67, filter[3]);
+ madds[0] = _mm256_add_epi16(madds[0], madds[2]);
+ madds[1] = _mm256_add_epi16(madds[1], madds[3]);
+ const __m256i s_3x128 = _mm256_slli_epi16(_mm256_srli_epi16(s23, 8),
+ 7 - kInterRoundBitsHorizontal);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap5Kernel(const __m256i s[2],
+ const __m256i filter[3],
+ int16_t* const wiener_buffer) {
+ const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1);
+ const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5);
+ const auto s45 = _mm256_alignr_epi8(s[1], s[0], 9);
+ __m256i madds[3];
+ madds[0] = _mm256_maddubs_epi16(s01, filter[0]);
+ madds[1] = _mm256_maddubs_epi16(s23, filter[1]);
+ madds[2] = _mm256_maddubs_epi16(s45, filter[2]);
+ madds[0] = _mm256_add_epi16(madds[0], madds[2]);
+ const __m256i s_3x128 = _mm256_srli_epi16(_mm256_slli_epi16(s23, 8),
+ kInterRoundBitsHorizontal + 1);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap3Kernel(const __m256i s[2],
+ const __m256i filter[2],
+ int16_t* const wiener_buffer) {
+ const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1);
+ const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5);
+ __m256i madds[2];
+ madds[0] = _mm256_maddubs_epi16(s01, filter[0]);
+ madds[1] = _mm256_maddubs_epi16(s23, filter[1]);
+ const __m256i s_3x128 = _mm256_slli_epi16(_mm256_srli_epi16(s01, 8),
+ 7 - kInterRoundBitsHorizontal);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i coefficients,
+ int16_t** const wiener_buffer) {
+ __m256i filter[4];
+ filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0100));
+ filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302));
+ filter[2] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0102));
+ filter[3] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x8000));
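+ // Each filter[i] broadcasts one pair of adjacent 8-bit taps so that
+ // _mm256_maddubs_epi16() applies two taps to two neighboring (duplicated)
+ // source bytes at once. The pairs are roughly (c0, c1), (c2, c3), (c2, c1)
+ // and (c0, 0), which together form the symmetric 7-tap kernel
+ // c0 c1 c2 c3 c2 c1 c0; the 0x80 shuffle index zeroes the unused slot.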
+ for (int y = height; y != 0; --y) {
+ __m256i s = LoadUnaligned32(src);
+ __m256i ss[4];
+ ss[0] = _mm256_unpacklo_epi8(s, s);
+ ptrdiff_t x = 0;
+ do {
+ ss[1] = _mm256_unpackhi_epi8(s, s);
+ s = LoadUnaligned32(src + x + 32);
+ ss[3] = _mm256_unpacklo_epi8(s, s);
+ ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21);
+ WienerHorizontalTap7Kernel(ss + 0, filter, *wiener_buffer + x + 0);
+ WienerHorizontalTap7Kernel(ss + 1, filter, *wiener_buffer + x + 16);
+ ss[0] = ss[3];
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i coefficients,
+ int16_t** const wiener_buffer) {
+ __m256i filter[3];
+ filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0201));
+ filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0203));
+ filter[2] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x8001));
+ for (int y = height; y != 0; --y) {
+ __m256i s = LoadUnaligned32(src);
+ __m256i ss[4];
+ ss[0] = _mm256_unpacklo_epi8(s, s);
+ ptrdiff_t x = 0;
+ do {
+ ss[1] = _mm256_unpackhi_epi8(s, s);
+ s = LoadUnaligned32(src + x + 32);
+ ss[3] = _mm256_unpacklo_epi8(s, s);
+ ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21);
+ WienerHorizontalTap5Kernel(ss + 0, filter, *wiener_buffer + x + 0);
+ WienerHorizontalTap5Kernel(ss + 1, filter, *wiener_buffer + x + 16);
+ ss[0] = ss[3];
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i coefficients,
+ int16_t** const wiener_buffer) {
+ __m256i filter[2];
+ filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302));
+ filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x8002));
+ for (int y = height; y != 0; --y) {
+ __m256i s = LoadUnaligned32(src);
+ __m256i ss[4];
+ ss[0] = _mm256_unpacklo_epi8(s, s);
+ ptrdiff_t x = 0;
+ do {
+ ss[1] = _mm256_unpackhi_epi8(s, s);
+ s = LoadUnaligned32(src + x + 32);
+ ss[3] = _mm256_unpacklo_epi8(s, s);
+ ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21);
+ WienerHorizontalTap3Kernel(ss + 0, filter, *wiener_buffer + x + 0);
+ WienerHorizontalTap3Kernel(ss + 1, filter, *wiener_buffer + x + 16);
+ ss[0] = ss[3];
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
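+// When only the center tap remains it is effectively 128 (the Wiener taps sum
+// to 128), so for 8bpp the horizontal pass below reduces to
+// (s * 128) >> kInterRoundBitsHorizontal, i.e. s << 4, and needs no clipping.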
+inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ const __m256i s = LoadUnaligned32(src + x);
+ const __m256i s0 = _mm256_unpacklo_epi8(s, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi8(s, _mm256_setzero_si256());
+ __m256i d[2];
+ d[0] = _mm256_slli_epi16(s0, 4);
+ d[1] = _mm256_slli_epi16(s1, 4);
+ StoreAligned64(*wiener_buffer + x, d);
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline __m256i WienerVertical7(const __m256i a[2], const __m256i filter[2]) {
+ const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i sum0 = _mm256_add_epi32(round, madd0);
+ const __m256i sum1 = _mm256_add_epi32(sum0, madd1);
+ return _mm256_srai_epi32(sum1, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical5(const __m256i a[2], const __m256i filter[2]) {
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i sum = _mm256_add_epi32(madd0, madd1);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical3(const __m256i a, const __m256i filter) {
+ const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+ const __m256i madd = _mm256_madd_epi16(a, filter);
+ const __m256i sum = _mm256_add_epi32(round, madd);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVerticalFilter7(const __m256i a[7],
+ const __m256i filter[2]) {
+ __m256i b[2];
+ const __m256i a06 = _mm256_add_epi16(a[0], a[6]);
+ const __m256i a15 = _mm256_add_epi16(a[1], a[5]);
+ const __m256i a24 = _mm256_add_epi16(a[2], a[4]);
+ b[0] = _mm256_unpacklo_epi16(a06, a15);
+ b[1] = _mm256_unpacklo_epi16(a24, a[3]);
+ const __m256i sum0 = WienerVertical7(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a06, a15);
+ b[1] = _mm256_unpackhi_epi16(a24, a[3]);
+ const __m256i sum1 = WienerVertical7(b, filter);
+ return _mm256_packs_epi32(sum0, sum1);
+}
+
+inline __m256i WienerVerticalFilter5(const __m256i a[5],
+ const __m256i filter[2]) {
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m256i b[2];
+ const __m256i a04 = _mm256_add_epi16(a[0], a[4]);
+ const __m256i a13 = _mm256_add_epi16(a[1], a[3]);
+ b[0] = _mm256_unpacklo_epi16(a04, a13);
+ b[1] = _mm256_unpacklo_epi16(a[2], round);
+ const __m256i sum0 = WienerVertical5(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a04, a13);
+ b[1] = _mm256_unpackhi_epi16(a[2], round);
+ const __m256i sum1 = WienerVertical5(b, filter);
+ return _mm256_packs_epi32(sum0, sum1);
+}
+
+inline __m256i WienerVerticalFilter3(const __m256i a[3], const __m256i filter) {
+ __m256i b;
+ const __m256i a02 = _mm256_add_epi16(a[0], a[2]);
+ b = _mm256_unpacklo_epi16(a02, a[1]);
+ const __m256i sum0 = WienerVertical3(b, filter);
+ b = _mm256_unpackhi_epi16(a02, a[1]);
+ const __m256i sum1 = WienerVertical3(b, filter);
+ return _mm256_packs_epi32(sum0, sum1);
+}
+
+inline __m256i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i a[7]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+ a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+ a[6] = LoadAligned32(wiener_buffer + 6 * wiener_stride);
+ return WienerVerticalFilter7(a, filter);
+}
+
+inline __m256i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i a[5]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+ return WienerVerticalFilter5(a, filter);
+}
+
+inline __m256i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter, __m256i a[3]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i d[2]) {
+ __m256i a[8];
+ d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[7] = LoadAligned32(wiener_buffer + 7 * wiener_stride);
+ d[1] = WienerVerticalFilter7(a + 1, filter);
+}
+
+inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i d[2]) {
+ __m256i a[6];
+ d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+ d[1] = WienerVerticalFilter5(a + 1, filter);
+}
+
+inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter, __m256i d[2]) {
+ __m256i a[4];
+ d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ d[1] = WienerVerticalFilter3(a + 1, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[4], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+ __m256i filter[2];
+ filter[0] = _mm256_shuffle_epi32(c, 0x0);
+ filter[1] = _mm256_shuffle_epi32(c, 0x55);
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2][2];
+ WienerVerticalTap7Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap7Kernel2(wiener_buffer + x + 16, width, filter, d[1]);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0]));
+ StoreUnaligned32(dst + dst_stride + x,
+ _mm256_packus_epi16(d[0][1], d[1][1]));
+ x += 32;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[7];
+ const __m256i d0 =
+ WienerVerticalTap7Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m256i d1 =
+ WienerVerticalTap7Kernel(wiener_buffer + x + 16, width, filter, a);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ x += 32;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[3], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i c = _mm256_broadcastd_epi32(Load4(coefficients));
+ __m256i filter[2];
+ filter[0] = _mm256_shuffle_epi32(c, 0);
+ filter[1] =
+ _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[2]));
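+ // filter[1] pairs the center tap with a multiplier of 1, so the madd in
+ // WienerVertical5() both applies the center tap and adds the rounding
+ // constant that WienerVerticalFilter5() interleaves into its second operand.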
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2][2];
+ WienerVerticalTap5Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap5Kernel2(wiener_buffer + x + 16, width, filter, d[1]);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0]));
+ StoreUnaligned32(dst + dst_stride + x,
+ _mm256_packus_epi16(d[0][1], d[1][1]));
+ x += 32;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[5];
+ const __m256i d0 =
+ WienerVerticalTap5Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m256i d1 =
+ WienerVerticalTap5Kernel(wiener_buffer + x + 16, width, filter, a);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ x += 32;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[2], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i filter =
+ _mm256_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2][2];
+ WienerVerticalTap3Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap3Kernel2(wiener_buffer + x + 16, width, filter, d[1]);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0]));
+ StoreUnaligned32(dst + dst_stride + x,
+ _mm256_packus_epi16(d[0][1], d[1][1]));
+ x += 32;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[3];
+ const __m256i d0 =
+ WienerVerticalTap3Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m256i d1 =
+ WienerVerticalTap3Kernel(wiener_buffer + x + 16, width, filter, a);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ x += 32;
+ } while (x < width);
+ }
+}
+
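+// With only the center tap (128) left, the vertical pass is
+// (a * 128 + (1 << (kInterRoundBitsVertical - 1))) >> kInterRoundBitsVertical,
+// which for 8bpp simplifies to (a + 8) >> 4 as computed below.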
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint8_t* const dst) {
+ const __m256i a0 = LoadAligned32(wiener_buffer + 0);
+ const __m256i a1 = LoadAligned32(wiener_buffer + 16);
+ const __m256i b0 = _mm256_add_epi16(a0, _mm256_set1_epi16(8));
+ const __m256i b1 = _mm256_add_epi16(a1, _mm256_set1_epi16(8));
+ const __m256i c0 = _mm256_srai_epi16(b0, 4);
+ const __m256i c1 = _mm256_srai_epi16(b1, 4);
+ const __m256i d = _mm256_packus_epi16(c0, c1);
+ StoreUnaligned32(dst, d);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+ x += 32;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ x += 32;
+ } while (x < width);
+ }
+}
+
+void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info,
+ const void* const source, const void* const top_border,
+ const void* const bottom_border, const ptrdiff_t stride,
+ const int width, const int height,
+ RestorationBuffer* const restoration_buffer,
+ void* const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 32);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 13 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+ // horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* const top = static_cast<const uint8_t*>(top_border);
+ const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
+ const __m128i c =
+ LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+ // In order to keep the horizontal pass intermediate values within 16 bits we
+ // offset |filter[3]| by 128. The 128 offset will be added back in the loop.
+ __m128i c_horizontal =
+ _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0));
+ c_horizontal = _mm_packs_epi16(c_horizontal, c_horizontal);
+ const __m256i coefficients_horizontal = _mm256_broadcastd_epi32(c_horizontal);
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
+ wiener_stride, height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
+ wiener_stride, height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
+ wiener_stride, height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
+ wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ }
+
+ // vertical filtering.
+ // Over-writes up to 15 values.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ auto* dst = static_cast<uint8_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+ // Because the top row of |source| is a duplicate of the second row, and the
+ // bottom row of |source| is a duplicate of the row above it, we can duplicate
+ // the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical + 1, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical + 2, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+constexpr int kSumOffset = 24;
+
+// Each SIMD load over-reads (SIMD register width in bytes) - (width % 16) -
+// 2 * padding pixels, where the padding is 3 pixels for Pass 1 and 2 pixels
+// for Pass 2. The SIMD register width is 16 bytes for SSE4.1 and 32 bytes for
+// AVX2.
+constexpr int kOverreadInBytesPass1_128 = 10;
+constexpr int kOverreadInBytesPass2_128 = 12;
+constexpr int kOverreadInBytesPass1_256 = kOverreadInBytesPass1_128 + 16;
+constexpr int kOverreadInBytesPass2_256 = kOverreadInBytesPass2_128 + 16;
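+// For example, kOverreadInBytesPass1_128 == 16 - 2 * 3 and
+// kOverreadInBytesPass2_128 == 16 - 2 * 2; the _256 variants add 16 for the
+// extra bytes of a 32-byte load.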
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned32x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m256i dst[2]) {
+ dst[0] = LoadAligned32(src[0] + x);
+ dst[1] = LoadAligned32(src[1] + x);
+}
+
+inline void LoadAligned32x2U16Msan(const uint16_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[2]) {
+ dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+ dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+ dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned32x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m256i dst[3]) {
+ dst[0] = LoadAligned32(src[0] + x);
+ dst[1] = LoadAligned32(src[1] + x);
+ dst[2] = LoadAligned32(src[2] + x);
+}
+
+inline void LoadAligned32x3U16Msan(const uint16_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[3]) {
+ dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+ dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+ dst[2] = LoadAligned32Msan(src[2] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+ dst[0] = LoadAligned16(src + 0);
+ dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m256i dst[2][2]) {
+ LoadAligned64(src[0] + x, dst[0]);
+ LoadAligned64(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32Msan(const uint32_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[2][2]) {
+ LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+ LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+ LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m256i dst[3][2]) {
+ LoadAligned64(src[0] + x, dst[0]);
+ LoadAligned64(src[1] + x, dst[1]);
+ LoadAligned64(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32Msan(const uint32_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[3][2]) {
+ LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+ LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+ LoadAligned64Msan(src[2] + x, sizeof(**src) * (x + 16 - border), dst[2]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 4, src[1]);
+}
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers may generate very inefficient code for them, and
+// the whole decoder can end up about 15% slower.
+
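+// The Vaddl*/Vaddw*/Vmull*/Vrshr* helpers below mirror the corresponding NEON
+// idioms (widening add, widening multiply, rounding shift), implemented here
+// with unpacks against zero and plain SSE/AVX add, madd and shift
+// instructions.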
+inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(s0, s1);
+}
+
+inline __m256i VaddlLo8(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpacklo_epi8(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(s0, s1);
+}
+
+inline __m256i VaddlHi8(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpackhi_epi8(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(s0, s1);
+}
+
+inline __m256i VaddlLo16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+ return _mm256_add_epi32(s0, s1);
+}
+
+inline __m128i VaddlHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(s0, s1);
+}
+
+inline __m256i VaddlHi16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+ return _mm256_add_epi32(s0, s1);
+}
+
+inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(src0, s1);
+}
+
+inline __m256i VaddwLo8(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(src0, s1);
+}
+
+inline __m256i VaddwHi8(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(src0, s1);
+}
+
+inline __m256i VaddwLo16(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+ return _mm256_add_epi32(src0, s1);
+}
+
+inline __m128i VaddwHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(src0, s1);
+}
+
+inline __m256i VaddwHi16(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+ return _mm256_add_epi32(src0, s1);
+}
+
+// Using VgetLane16() can save a sign extension instruction.
+template <int n>
+inline int VgetLane16(__m256i src) {
+ return _mm256_extract_epi16(src, n);
+}
+
+template <int n>
+inline int VgetLane8(__m256i src) {
+ return _mm256_extract_epi8(src, n);
+}
+
+inline __m256i VmullNLo8(const __m256i src0, const int src1) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m256i VmullNHi8(const __m256i src0, const int src1) {
+ const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullLo16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullHi16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m256i VrshrS32(const __m256i src0, const int src1) {
+ const __m256i sum =
+ _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+ return _mm256_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srli_epi32(sum, src1);
+}
+
+inline __m256i VrshrU32(const __m256i src0, const int src1) {
+ const __m256i sum =
+ _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+ return _mm256_srli_epi32(sum, src1);
+}
+
+inline __m128i SquareLo8(const __m128i src) {
+ const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128());
+ return _mm_mullo_epi16(s, s);
+}
+
+inline __m256i SquareLo8(const __m256i src) {
+ const __m256i s = _mm256_unpacklo_epi8(src, _mm256_setzero_si256());
+ return _mm256_mullo_epi16(s, s);
+}
+
+inline __m128i SquareHi8(const __m128i src) {
+ const __m128i s = _mm_unpackhi_epi8(src, _mm_setzero_si128());
+ return _mm_mullo_epi16(s, s);
+}
+
+inline __m256i SquareHi8(const __m256i src) {
+ const __m256i s = _mm256_unpackhi_epi8(src, _mm256_setzero_si256());
+ return _mm256_mullo_epi16(s, s);
+}
+
+inline void Prepare3Lo8(const __m128i src, __m128i dst[3]) {
+ dst[0] = src;
+ dst[1] = _mm_srli_si128(src, 1);
+ dst[2] = _mm_srli_si128(src, 2);
+}
+
+inline void Prepare3_8(const __m256i src[2], __m256i dst[3]) {
+ dst[0] = _mm256_alignr_epi8(src[1], src[0], 0);
+ dst[1] = _mm256_alignr_epi8(src[1], src[0], 1);
+ dst[2] = _mm256_alignr_epi8(src[1], src[0], 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare3_16(const __m256i src[2], __m256i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm256_alignr_epi8(src[1], src[0], 2);
+ dst[2] = _mm256_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare5Lo8(const __m128i src, __m128i dst[5]) {
+ dst[0] = src;
+ dst[1] = _mm_srli_si128(src, 1);
+ dst[2] = _mm_srli_si128(src, 2);
+ dst[3] = _mm_srli_si128(src, 3);
+ dst[4] = _mm_srli_si128(src, 4);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_16(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+ dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_16(const __m256i src[2], __m256i dst[5]) {
+ Prepare3_16(src, dst);
+ dst[3] = _mm256_alignr_epi8(src[1], src[0], 6);
+ dst[4] = _mm256_alignr_epi8(src[1], src[0], 8);
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi16(src0, src1);
+ return _mm_add_epi16(sum, src2);
+}
+
+inline __m256i Sum3_16(const __m256i src0, const __m256i src1,
+ const __m256i src2) {
+ const __m256i sum = _mm256_add_epi16(src0, src1);
+ return _mm256_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m256i Sum3_16(const __m256i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi32(src0, src1);
+ return _mm_add_epi32(sum, src2);
+}
+
+inline __m256i Sum3_32(const __m256i src0, const __m256i src1,
+ const __m256i src2) {
+ const __m256i sum = _mm256_add_epi32(src0, src1);
+ return _mm256_add_epi32(sum, src2);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline void Sum3_32(const __m256i src[3][2], __m256i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m128i Sum3WLo16(const __m128i src[3]) {
+ const __m128i sum = VaddlLo8(src[0], src[1]);
+ return VaddwLo8(sum, src[2]);
+}
+
+inline __m256i Sum3WLo16(const __m256i src[3]) {
+ const __m256i sum = VaddlLo8(src[0], src[1]);
+ return VaddwLo8(sum, src[2]);
+}
+
+inline __m256i Sum3WHi16(const __m256i src[3]) {
+ const __m256i sum = VaddlHi8(src[0], src[1]);
+ return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum3WLo32(const __m128i src[3]) {
+ const __m128i sum = VaddlLo16(src[0], src[1]);
+ return VaddwLo16(sum, src[2]);
+}
+
+inline __m256i Sum3WLo32(const __m256i src[3]) {
+ const __m256i sum = VaddlLo16(src[0], src[1]);
+ return VaddwLo16(sum, src[2]);
+}
+
+inline __m128i Sum3WHi32(const __m128i src[3]) {
+ const __m128i sum = VaddlHi16(src[0], src[1]);
+ return VaddwHi16(sum, src[2]);
+}
+
+inline __m256i Sum3WHi32(const __m256i src[3]) {
+ const __m256i sum = VaddlHi16(src[0], src[1]);
+ return VaddwHi16(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+ const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+ const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m256i Sum5_16(const __m256i src[5]) {
+ const __m256i sum01 = _mm256_add_epi16(src[0], src[1]);
+ const __m256i sum23 = _mm256_add_epi16(src[2], src[3]);
+ const __m256i sum = _mm256_add_epi16(sum01, sum23);
+ return _mm256_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+ const __m128i* const src2, const __m128i* const src3,
+ const __m128i* const src4) {
+ const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+ const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+ const __m128i sum = _mm_add_epi32(sum01, sum23);
+ return _mm_add_epi32(sum, *src4);
+}
+
+inline __m256i Sum5_32(const __m256i* const src0, const __m256i* const src1,
+ const __m256i* const src2, const __m256i* const src3,
+ const __m256i* const src4) {
+ const __m256i sum01 = _mm256_add_epi32(*src0, *src1);
+ const __m256i sum23 = _mm256_add_epi32(*src2, *src3);
+ const __m256i sum = _mm256_add_epi32(sum01, sum23);
+ return _mm256_add_epi32(sum, *src4);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline void Sum5_32(const __m256i src[5][2], __m256i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum5WLo16(const __m128i src[5]) {
+ const __m128i sum01 = VaddlLo8(src[0], src[1]);
+ const __m128i sum23 = VaddlLo8(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return VaddwLo8(sum, src[4]);
+}
+
+inline __m256i Sum5WLo16(const __m256i src[5]) {
+ const __m256i sum01 = VaddlLo8(src[0], src[1]);
+ const __m256i sum23 = VaddlLo8(src[2], src[3]);
+ const __m256i sum = _mm256_add_epi16(sum01, sum23);
+ return VaddwLo8(sum, src[4]);
+}
+
+inline __m256i Sum5WHi16(const __m256i src[5]) {
+ const __m256i sum01 = VaddlHi8(src[0], src[1]);
+ const __m256i sum23 = VaddlHi8(src[2], src[3]);
+ const __m256i sum = _mm256_add_epi16(sum01, sum23);
+ return VaddwHi8(sum, src[4]);
+}
+
+inline __m128i Sum3Horizontal(const __m128i src) {
+ __m128i s[3];
+ Prepare3Lo8(src, s);
+ return Sum3WLo16(s);
+}
+
+inline void Sum3Horizontal(const uint8_t* const src,
+ const ptrdiff_t over_read_in_bytes, __m256i dst[2]) {
+ __m256i s[3];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2);
+ dst[0] = Sum3WLo16(s);
+ dst[1] = Sum3WHi16(s);
+}
+
+inline void Sum3WHorizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum3WLo32(s);
+ dst[1] = Sum3WHi32(s);
+}
+
+inline void Sum3WHorizontal(const __m256i src[2], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum3WLo32(s);
+ dst[1] = Sum3WHi32(s);
+}
+
+inline __m128i Sum5Horizontal(const __m128i src) {
+ __m128i s[5];
+ Prepare5Lo8(src, s);
+ return Sum5WLo16(s);
+}
+
+inline void Sum5Horizontal(const uint8_t* const src,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i* const dst0, __m256i* const dst1) {
+ __m256i s[5];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2);
+ s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 3);
+ s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 4);
+ *dst0 = Sum5WLo16(s);
+ *dst1 = Sum5WHi16(s);
+}
+
+inline void Sum5WHorizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ const __m128i sum01_lo = VaddlLo16(s[0], s[1]);
+ const __m128i sum23_lo = VaddlLo16(s[2], s[3]);
+ const __m128i sum0123_lo = _mm_add_epi32(sum01_lo, sum23_lo);
+ dst[0] = VaddwLo16(sum0123_lo, s[4]);
+ const __m128i sum01_hi = VaddlHi16(s[0], s[1]);
+ const __m128i sum23_hi = VaddlHi16(s[2], s[3]);
+ const __m128i sum0123_hi = _mm_add_epi32(sum01_hi, sum23_hi);
+ dst[1] = VaddwHi16(sum0123_hi, s[4]);
+}
+
+inline void Sum5WHorizontal(const __m256i src[2], __m256i dst[2]) {
+ __m256i s[5];
+ Prepare5_16(src, s);
+ const __m256i sum01_lo = VaddlLo16(s[0], s[1]);
+ const __m256i sum23_lo = VaddlLo16(s[2], s[3]);
+ const __m256i sum0123_lo = _mm256_add_epi32(sum01_lo, sum23_lo);
+ dst[0] = VaddwLo16(sum0123_lo, s[4]);
+ const __m256i sum01_hi = VaddlHi16(s[0], s[1]);
+ const __m256i sum23_hi = VaddlHi16(s[2], s[3]);
+ const __m256i sum0123_hi = _mm256_add_epi32(sum01_hi, sum23_hi);
+ dst[1] = VaddwHi16(sum0123_hi, s[4]);
+}
+
+void SumHorizontalLo(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = VaddlLo16(src[0], src[4]);
+ *row_sq3 = Sum3WLo32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalLo(const __m256i src[5], __m256i* const row_sq3,
+ __m256i* const row_sq5) {
+ const __m256i sum04 = VaddlLo16(src[0], src[4]);
+ *row_sq3 = Sum3WLo32(src + 1);
+ *row_sq5 = _mm256_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalHi(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = VaddlHi16(src[0], src[4]);
+ *row_sq3 = Sum3WHi32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalHi(const __m256i src[5], __m256i* const row_sq3,
+ __m256i* const row_sq5) {
+ const __m256i sum04 = VaddlHi16(src[0], src[4]);
+ *row_sq3 = Sum3WHi32(src + 1);
+ *row_sq5 = _mm256_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalLo(const __m128i src, __m128i* const row3,
+ __m128i* const row5) {
+ __m128i s[5];
+ Prepare5Lo8(src, s);
+ const __m128i sum04 = VaddlLo8(s[0], s[4]);
+ *row3 = Sum3WLo16(s + 1);
+ *row5 = _mm_add_epi16(sum04, *row3);
+}
+
+inline void SumHorizontal(const uint8_t* const src,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i* const row3_0, __m256i* const row3_1,
+ __m256i* const row5_0, __m256i* const row5_1) {
+ __m256i s[5];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2);
+ s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 3);
+ s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 4);
+ const __m256i sum04_lo = VaddlLo8(s[0], s[4]);
+ const __m256i sum04_hi = VaddlHi8(s[0], s[4]);
+ *row3_0 = Sum3WLo16(s + 1);
+ *row3_1 = Sum3WHi16(s + 1);
+ *row5_0 = _mm256_add_epi16(sum04_lo, *row3_0);
+ *row5_1 = _mm256_add_epi16(sum04_hi, *row3_1);
+}
+
+inline void SumHorizontal(const __m128i src[2], __m128i* const row_sq3_0,
+ __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+ __m128i* const row_sq5_1) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+ SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
+
+inline void SumHorizontal(const __m256i src[2], __m256i* const row_sq3_0,
+ __m256i* const row_sq3_1, __m256i* const row_sq5_0,
+ __m256i* const row_sq5_1) {
+ __m256i s[5];
+ Prepare5_16(src, s);
+ SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+ SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
+
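+// Sum343*: 3 * (s0 + s1 + s2) + s1 == 3 * s0 + 4 * s1 + 3 * s2, the 3-4-3
+// weighting implied by the name.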
+inline __m256i Sum343Lo(const __m256i ma3[3]) {
+ const __m256i sum = Sum3WLo16(ma3);
+ const __m256i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343Hi(const __m256i ma3[3]) {
+ const __m256i sum = Sum3WHi16(ma3);
+ const __m256i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343WLo(const __m256i src[3]) {
+ const __m256i sum = Sum3WLo32(src);
+ const __m256i sum3 = Sum3_32(sum, sum, sum);
+ return VaddwLo16(sum3, src[1]);
+}
+
+inline __m256i Sum343WHi(const __m256i src[3]) {
+ const __m256i sum = Sum3WHi32(src);
+ const __m256i sum3 = Sum3_32(sum, sum, sum);
+ return VaddwHi16(sum3, src[1]);
+}
+
+inline void Sum343W(const __m256i src[2], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum343WLo(s);
+ dst[1] = Sum343WHi(s);
+}
+
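+// Sum565*: 5 * (s0 + s1 + s2) + s1 == 5 * s0 + 6 * s1 + 5 * s2, the 5-6-5
+// weighting implied by the name.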
+inline __m256i Sum565Lo(const __m256i src[3]) {
+ const __m256i sum = Sum3WLo16(src);
+ const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+ const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+ return VaddwLo8(sum5, src[1]);
+}
+
+inline __m256i Sum565Hi(const __m256i src[3]) {
+ const __m256i sum = Sum3WHi16(src);
+ const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+ const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+ return VaddwHi8(sum5, src[1]);
+}
+
+inline __m256i Sum565WLo(const __m256i src[3]) {
+ const __m256i sum = Sum3WLo32(src);
+ const __m256i sum4 = _mm256_slli_epi32(sum, 2);
+ const __m256i sum5 = _mm256_add_epi32(sum4, sum);
+ return VaddwLo16(sum5, src[1]);
+}
+
+inline __m256i Sum565WHi(const __m256i src[3]) {
+ const __m256i sum = Sum3WHi32(src);
+ const __m256i sum4 = _mm256_slli_epi32(sum, 2);
+ const __m256i sum5 = _mm256_add_epi32(sum4, sum);
+ return VaddwHi16(sum5, src[1]);
+}
+
+inline void Sum565W(const __m256i src[2], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum565WLo(s);
+ dst[1] = Sum565WHi(s);
+}
+
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ int y = 2;
+ do {
+ const __m128i s0 =
+ LoadUnaligned16Msan(src, kOverreadInBytesPass1_128 - width);
+ __m128i sq_128[2];
+ __m256i sq[3];
+ __m128i s3, s5, sq3[2], sq5[2];
+ sq_128[0] = SquareLo8(s0);
+ sq_128[1] = SquareHi8(s0);
+ SumHorizontalLo(s0, &s3, &s5);
+ StoreAligned16(sum3, s3);
+ StoreAligned16(sum5, s5);
+ SumHorizontal(sq_128, &sq3[0], &sq3[1], &sq5[0], &sq5[1]);
+ StoreAligned32U32(square_sum3, sq3);
+ StoreAligned32U32(square_sum5, sq5);
+ src += 8;
+ sum3 += 8;
+ sum5 += 8;
+ square_sum3 += 8;
+ square_sum5 += 8;
+ sq[0] = SetrM128i(sq_128[1], sq_128[1]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m256i row3[2], row5[2], row_sq3[2], row_sq5[2];
+ const __m256i s = LoadUnaligned32Msan(
+ src + 8, sum_width - x + 16 + kOverreadInBytesPass1_256 - width);
+ sq[1] = SquareLo8(s);
+ sq[2] = SquareHi8(s);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ SumHorizontal(src, sum_width - x + 8 + kOverreadInBytesPass1_256 - width,
+ &row3[0], &row3[1], &row5[0], &row5[1]);
+ StoreAligned64(sum3, row3);
+ StoreAligned64(sum5, row5);
+ SumHorizontal(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+ StoreAligned64(square_sum3 + 0, row_sq3);
+ StoreAligned64(square_sum5 + 0, row_sq5);
+ SumHorizontal(sq + 1, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+ StoreAligned64(square_sum3 + 16, row_sq3);
+ StoreAligned64(square_sum5 + 16, row_sq5);
+ sq[0] = sq[2];
+ src += 32;
+ sum3 += 32;
+ sum5 += 32;
+ square_sum3 += 32;
+ square_sum5 += 32;
+ x -= 32;
+ } while (x != 0);
+ src += src_stride - sum_width - 8;
+ sum3 += sum_stride - sum_width - 8;
+ sum5 += sum_stride - sum_width - 8;
+ square_sum3 += sum_stride - sum_width - 8;
+ square_sum5 += sum_stride - sum_width - 8;
+ } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sums,
+ uint32_t* square_sums) {
+ static_assert(size == 3 || size == 5, "");
+ int kOverreadInBytes_128, kOverreadInBytes_256;
+ if (size == 3) {
+ kOverreadInBytes_128 = kOverreadInBytesPass2_128;
+ kOverreadInBytes_256 = kOverreadInBytesPass2_256;
+ } else {
+ kOverreadInBytes_128 = kOverreadInBytesPass1_128;
+ kOverreadInBytes_256 = kOverreadInBytesPass1_256;
+ }
+ int y = 2;
+ do {
+ const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytes_128 - width);
+ __m128i ss, sq_128[2], sqs[2];
+ __m256i sq[3];
+ sq_128[0] = SquareLo8(s);
+ sq_128[1] = SquareHi8(s);
+ if (size == 3) {
+ ss = Sum3Horizontal(s);
+ Sum3WHorizontal(sq_128, sqs);
+ } else {
+ ss = Sum5Horizontal(s);
+ Sum5WHorizontal(sq_128, sqs);
+ }
+ StoreAligned16(sums, ss);
+ StoreAligned32U32(square_sums, sqs);
+ src += 8;
+ sums += 8;
+ square_sums += 8;
+ sq[0] = SetrM128i(sq_128[1], sq_128[1]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m256i row[2], row_sq[4];
+ const __m256i s = LoadUnaligned32Msan(
+ src + 8, sum_width - x + 16 + kOverreadInBytes_256 - width);
+ sq[1] = SquareLo8(s);
+ sq[2] = SquareHi8(s);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ if (size == 3) {
+ Sum3Horizontal(src, sum_width - x + 8 + kOverreadInBytes_256 - width,
+ row);
+ Sum3WHorizontal(sq + 0, row_sq + 0);
+ Sum3WHorizontal(sq + 1, row_sq + 2);
+ } else {
+ Sum5Horizontal(src, sum_width - x + 8 + kOverreadInBytes_256 - width,
+ &row[0], &row[1]);
+ Sum5WHorizontal(sq + 0, row_sq + 0);
+ Sum5WHorizontal(sq + 1, row_sq + 2);
+ }
+ StoreAligned64(sums, row);
+ StoreAligned64(square_sums + 0, row_sq + 0);
+ StoreAligned64(square_sums + 16, row_sq + 2);
+ sq[0] = sq[2];
+ src += 32;
+ sums += 32;
+ square_sums += 32;
+ x -= 32;
+ } while (x != 0);
+ src += src_stride - sum_width - 8;
+ sums += sum_stride - sum_width - 8;
+ square_sums += sum_stride - sum_width - 8;
+ } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m128i dxd = _mm_madd_epi16(sum, sum);
+ // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
+ // Some compilers could do this for us but we make this explicit.
+ // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
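+ // n == 9: a * 9 == a + (a << 3); n == 25: a * 25 == a + (a << 3) + (a << 4).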
+ __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+ const __m128i sub = _mm_sub_epi32(axn, dxd);
+ const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+ const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
+ const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
+ const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+ const __m128i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+ return _mm_packus_epi32(z0, z1);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m256i dxd = _mm256_madd_epi16(sum, sum);
+ // _mm256_mullo_epi32() has high latency. Using shifts and additions instead.
+ // Some compilers could do this for us but we make this explicit.
+ // return _mm256_mullo_epi32(sum_sq, _mm256_set1_epi32(n));
+ __m256i axn = _mm256_add_epi32(sum_sq, _mm256_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm256_add_epi32(axn, _mm256_slli_epi32(sum_sq, 4));
+ const __m256i sub = _mm256_sub_epi32(axn, dxd);
+ const __m256i p = _mm256_max_epi32(sub, _mm256_setzero_si256());
+ const __m256i pxs = _mm256_mullo_epi32(p, _mm256_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m256i sum_lo = _mm256_unpacklo_epi16(sum, _mm256_setzero_si256());
+ const __m256i sum_hi = _mm256_unpackhi_epi16(sum, _mm256_setzero_si256());
+ const __m256i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+ const __m256i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+ return _mm256_packus_epi32(z0, z1);
+}
+
+template <int n>
+inline __m128i CalculateB(const __m128i sum, const __m128i ma) {
+ static_assert(n == 9 || n == 25, "");
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
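+ // With kSgrProjReciprocalBits == 12, one_over_n is 164 for n == 25 and 455
+ // for n == 9, matching the table-lookup comments below.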
+ const __m128i m0 = VmullLo16(ma, sum);
+ const __m128i m1 = VmullHi16(ma, sum);
+ const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+ const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+ const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+ const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+ return _mm_packus_epi32(b_lo, b_hi);
+}
+
+template <int n>
+inline __m256i CalculateB(const __m256i sum, const __m256i ma) {
+ static_assert(n == 9 || n == 25, "");
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+ const __m256i m0 = VmullLo16(ma, sum);
+ const __m256i m1 = VmullHi16(ma, sum);
+ const __m256i m2 = _mm256_mullo_epi32(m0, _mm256_set1_epi32(one_over_n));
+ const __m256i m3 = _mm256_mullo_epi32(m1, _mm256_set1_epi32(one_over_n));
+ const __m256i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+ const __m256i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+ return _mm256_packus_epi32(b_lo, b_hi);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex5(const __m256i s5[5], const __m256i sq5[5][2],
+ const uint32_t scale, __m256i* const sum,
+ __m256i* const index) {
+ __m256i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m256i s3[3], const __m256i sq3[3][2],
+ const uint32_t scale, __m256i* const sum,
+ __m256i* const index) {
+ __m256i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+ __m128i* const ma, __m128i* const b) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i idx = _mm_packus_epi16(index, index);
+ // The store and load are not actually emitted: the compiler keeps |temp| in
+ // a 64-bit general-purpose register, which is faster than using
+ // _mm_extract_epi8().
+ uint8_t temp[8];
+ StoreLo8(temp, idx);
+ *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], 1);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], 2);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], 3);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], 4);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], 5);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], 6);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], 7);
+ // b = ma * b * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m128i maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ *b = CalculateB<n>(sum, maq);
+}
+
+// Repeat the first 48 elements in kSgrMaLookup with a period of 16.
+alignas(32) constexpr uint8_t kSgrMaLookupAvx2[96] = {
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+ 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8,
+ 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8,
+ 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5,
+ 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5};
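+// The repetition matters because _mm256_shuffle_epi8() only shuffles within
+// each 128-bit lane: duplicating every 16-byte chunk into both lanes lets one
+// 32-byte load act as the same 16-entry lookup table in both lanes.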
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// to get value 0 as the shuffle result. The most significant bit 1 comes
+// either from the comparison instruction or from the sign bit of the index.
+inline __m256i ShuffleIndex(const __m256i table, const __m256i index) {
+ __m256i mask;
+ mask = _mm256_cmpgt_epi8(index, _mm256_set1_epi8(15));
+ mask = _mm256_or_si256(mask, index);
+ return _mm256_shuffle_epi8(table, mask);
+}
+
+inline __m256i AdjustValue(const __m256i value, const __m256i index,
+ const int threshold) {
+ const __m256i thresholds = _mm256_set1_epi8(threshold - 128);
+ const __m256i offset = _mm256_cmpgt_epi8(index, thresholds);
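+ // offset is 0 or -1 per byte, so the add decrements exactly the bytes whose
+ // index exceeds the threshold.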
+ return _mm256_add_epi8(value, offset);
+}
+
+template <int n>
+inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2],
+ __m256i ma[3], __m256i b[2]) {
+ static_assert(n == 9 || n == 25, "");
+ // Use table lookup to read elements whose indices are less than 48.
+ const __m256i c0 = LoadAligned32(kSgrMaLookupAvx2 + 0 * 32);
+ const __m256i c1 = LoadAligned32(kSgrMaLookupAvx2 + 1 * 32);
+ const __m256i c2 = LoadAligned32(kSgrMaLookupAvx2 + 2 * 32);
+ const __m256i indices = _mm256_packus_epi16(index[0], index[1]);
+ __m256i idx, mas;
+ // Clip idx to 127 to apply signed comparison instructions.
+ idx = _mm256_min_epu8(indices, _mm256_set1_epi8(127));
+ // Elements whose indices are 48 or larger are left as 0 by the shuffles
+ // below.
+ // Get shuffle results for indices in range [0, 15].
+ mas = ShuffleIndex(c0, idx);
+ // Get shuffle results for indices in range [16, 31].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+ const __m256i res1 = ShuffleIndex(c1, idx);
+ // Use OR instruction to combine shuffle results together.
+ mas = _mm256_or_si256(mas, res1);
+ // Get shuffle results for indices in range [32, 47].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+ const __m256i res2 = ShuffleIndex(c2, idx);
+ mas = _mm256_or_si256(mas, res2);
+
+ // For elements whose indices are larger than 47, the lookup values change
+ // only rarely as the index increases, so we use comparison and arithmetic
+ // operations to calculate them.
+ // Add -128 to apply signed comparison instructions.
+ idx = _mm256_add_epi8(indices, _mm256_set1_epi8(-128));
+ // Elements whose indices are larger than 47 (still 0 here) are set to 5.
+ mas = _mm256_max_epu8(mas, _mm256_set1_epi8(5));
+ mas = AdjustValue(mas, idx, 55); // 55 is the last index whose value is 5.
+ mas = AdjustValue(mas, idx, 72); // 72 is the last index whose value is 4.
+ mas = AdjustValue(mas, idx, 101); // 101 is the last index whose value is 3.
+ mas = AdjustValue(mas, idx, 169); // 169 is the last index whose value is 2.
+ mas = AdjustValue(mas, idx, 254); // 254 is the last index whose value is 1.
+
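+  // _mm256_permute2x128_si256(a, b, 0x21) returns the upper 128-bit lane of
+  // |a| as the lower lane of the result and the lower lane of |b| as the
+  // upper lane; it is used throughout this file to stitch adjacent blocks.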
+ ma[2] = _mm256_permute4x64_epi64(mas, 0x93); // 32-39 8-15 16-23 24-31
+ ma[0] = _mm256_blend_epi32(ma[0], ma[2], 0xfc); // 0-7 8-15 16-23 24-31
+ ma[1] = _mm256_permute2x128_si256(ma[0], ma[2], 0x21);
+
+ // b = ma * b * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m256i maq0 = _mm256_unpackhi_epi8(ma[0], _mm256_setzero_si256());
+ const __m256i maq1 = _mm256_unpacklo_epi8(ma[1], _mm256_setzero_si256());
+ b[0] = CalculateB<n>(sum[0], maq0);
+ b[1] = CalculateB<n>(sum[1], maq1);
+}
+
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i* const b) {
+ __m128i sum, index;
+ CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+ LookupIntermediate<25>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i* const b) {
+ __m128i sum, index;
+ CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+ LookupIntermediate<9>(sum, index, ma, b);
+}
+
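+// Stores the 4,4,4 and 3,4,3 weighted sums of three adjacent |b3| values:
+// |sum_b444| = 4 * (b[0] + b[1] + b[2]) and
+// |sum_b343| = 3 * b[0] + 4 * b[1] + 3 * b[2],
+// which is presumably where the 343/444 naming in this file comes from.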
+inline void Store343_444(const __m256i b3[2], const ptrdiff_t x,
+ __m256i sum_b343[2], __m256i sum_b444[2],
+ uint32_t* const b343, uint32_t* const b444) {
+ __m256i b[3], sum_b111[2];
+ Prepare3_16(b3, b);
+ sum_b111[0] = Sum3WLo32(b);
+ sum_b111[1] = Sum3WHi32(b);
+ sum_b444[0] = _mm256_slli_epi32(sum_b111[0], 2);
+ sum_b444[1] = _mm256_slli_epi32(sum_b111[1], 2);
+ StoreAligned64(b444 + x, sum_b444);
+ sum_b343[0] = _mm256_sub_epi32(sum_b444[0], sum_b111[0]);
+ sum_b343[1] = _mm256_sub_epi32(sum_b444[1], sum_b111[1]);
+ sum_b343[0] = VaddwLo16(sum_b343[0], b[1]);
+ sum_b343[1] = VaddwHi16(sum_b343[1], b[1]);
+ StoreAligned64(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i* const sum_ma444, __m256i sum_b343[2],
+ __m256i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m256i sum_ma111 = Sum3WLo16(ma3);
+ *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+ StoreAligned32(ma444 + x, *sum_ma444);
+ const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+ StoreAligned32(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i* const sum_ma444, __m256i sum_b343[2],
+ __m256i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m256i sum_ma111 = Sum3WHi16(ma3);
+ *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+ StoreAligned32(ma444 + x, *sum_ma444);
+ const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+ StoreAligned32(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma444, sum_b444[2];
+ Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma444, sum_b444[2];
+ Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma343, sum_b343[2];
+ Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma343, sum_b343[2];
+ Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ const __m128i s[2][3], const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s5[2][5], sq5[5][2];
+ sq[0][1] = SquareHi8(s[0][0]);
+ sq[1][1] = SquareHi8(s[1][0]);
+ s5[0][3] = Sum5Horizontal(s[0][0]);
+ StoreAligned16(sum5[3], s5[0][3]);
+ s5[0][4] = Sum5Horizontal(s[1][0]);
+ StoreAligned16(sum5[4], s5[0][4]);
+ Sum5WHorizontal(sq[0], sq5[3]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ Sum5WHorizontal(sq[1], sq5[4]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x3U16(sum5, 0, s5[0]);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const uint8_t* const src0, const uint8_t* const src1,
+ const ptrdiff_t over_read_in_bytes, const ptrdiff_t sum_width,
+ const ptrdiff_t x, const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m256i sq[2][3], __m256i ma[3],
+ __m256i b[3]) {
+ const __m256i s0 = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 8);
+ const __m256i s1 = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 8);
+ __m256i s5[2][5], sq5[5][2], sum[2], index[2];
+ sq[0][1] = SquareLo8(s0);
+ sq[0][2] = SquareHi8(s0);
+ sq[1][1] = SquareLo8(s1);
+ sq[1][2] = SquareHi8(s1);
+ sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+ sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+ Sum5Horizontal(src0, over_read_in_bytes, &s5[0][3], &s5[1][3]);
+ Sum5Horizontal(src1, over_read_in_bytes, &s5[0][4], &s5[1][4]);
+ StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+ StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+ Sum5WHorizontal(sq[0], sq5[3]);
+ StoreAligned64(square_sum5[3] + x, sq5[3]);
+ Sum5WHorizontal(sq[1], sq5[4]);
+ StoreAligned64(square_sum5[4] + x, sq5[4]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+ Sum5WHorizontal(sq[0] + 1, sq5[3]);
+ StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+ Sum5WHorizontal(sq[1] + 1, sq5[4]);
+ StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+ CalculateIntermediate<25>(sum, index, ma, b + 1);
+ b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ const __m128i s, const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s5[5], sq5[5][2];
+ sq[1] = SquareHi8(s);
+ s5[3] = s5[4] = Sum5Horizontal(s);
+ Sum5WHorizontal(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+ const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t sum_width, const ptrdiff_t x, const uint32_t scale,
+ const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+ __m256i sq[3], __m256i ma[3], __m256i b[3]) {
+ const __m256i s = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+ __m256i s5[2][5], sq5[5][2], sum[2], index[2];
+ sq[1] = SquareLo8(s);
+ sq[2] = SquareHi8(s);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ Sum5Horizontal(src, over_read_in_bytes, &s5[0][3], &s5[1][3]);
+ s5[0][4] = s5[0][3];
+ s5[1][4] = s5[1][3];
+ Sum5WHorizontal(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+ Sum5WHorizontal(sq + 1, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+ CalculateIntermediate<25>(sum, index, ma, b + 1);
+ b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ const __m128i s, const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s3[3], sq3[3][2];
+ sq[1] = SquareHi8(s);
+ s3[2] = Sum3Horizontal(s);
+ StoreAligned16(sum3[2], s3[2]);
+ Sum3WHorizontal(sq, sq3[2]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t x, const ptrdiff_t sum_width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], __m256i sq[3],
+ __m256i ma[3], __m256i b[3]) {
+ const __m256i s = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+ __m256i s3[4], sq3[3][2], sum[2], index[2];
+ sq[1] = SquareLo8(s);
+ sq[2] = SquareHi8(s);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ Sum3Horizontal(src, over_read_in_bytes, s3 + 2);
+ StoreAligned64(sum3[2] + x, s3 + 2);
+ Sum3WHorizontal(sq + 0, sq3[2]);
+ StoreAligned64(square_sum3[2] + x, sq3[2]);
+ LoadAligned32x2U16(sum3, x, s3);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+ Sum3WHorizontal(sq + 1, sq3[2]);
+ StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3 + 1);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+ CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+ CalculateIntermediate<9>(sum, index, ma, b + 1);
+ b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ const __m128i s[2], const uint16_t scales[2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], __m128i sq[2][2], __m128i ma3[2],
+ __m128i b3[2], __m128i* const ma5, __m128i* const b5) {
+ __m128i s3[4], s5[5], sq3[4][2], sq5[5][2];
+ sq[0][1] = SquareHi8(s[0]);
+ sq[1][1] = SquareHi8(s[1]);
+ SumHorizontalLo(s[0], &s3[2], &s5[3]);
+ SumHorizontalLo(s[1], &s3[3], &s5[4]);
+ StoreAligned16(sum3[2], s3[2]);
+ StoreAligned16(sum3[3], s3[3]);
+ StoreAligned16(sum5[3], s5[3]);
+ StoreAligned16(sum5[4], s5[4]);
+ SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3], sq3[3]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+  // Note: in the SSE4_1 version, CalculateIntermediate() is called in place of
+  // the slow LookupIntermediate() when 16 intermediate data points are
+  // calculated at once. For AVX2, however, the compiler generates even slower
+  // code for that approach, so we keep using CalculateIntermediate3().
+ CalculateIntermediate3(s3 + 0, sq3 + 0, scales[1], &ma3[0], &b3[0]);
+ CalculateIntermediate3(s3 + 1, sq3 + 1, scales[1], &ma3[1], &b3[1]);
+ CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1,
+ const ptrdiff_t over_read_in_bytes, const ptrdiff_t x,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, __m256i sq[2][3], __m256i ma3[2][3],
+ __m256i b3[2][5], __m256i ma5[3], __m256i b5[5]) {
+ const __m256i s0 = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 8);
+ const __m256i s1 = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 8);
+ __m256i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sq3t[4][2], sq5t[5][2],
+ sum_3[2][2], index_3[2][2], sum_5[2], index_5[2];
+ sq[0][1] = SquareLo8(s0);
+ sq[0][2] = SquareHi8(s0);
+ sq[1][1] = SquareLo8(s1);
+ sq[1][2] = SquareHi8(s1);
+ sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+ sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+ SumHorizontal(src0, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+ &s5[1][3]);
+ SumHorizontal(src1, over_read_in_bytes, &s3[0][3], &s3[1][3], &s5[0][4],
+ &s5[1][4]);
+ StoreAligned32(sum3[2] + x + 0, s3[0][2]);
+ StoreAligned32(sum3[2] + x + 16, s3[1][2]);
+ StoreAligned32(sum3[3] + x + 0, s3[0][3]);
+ StoreAligned32(sum3[3] + x + 16, s3[1][3]);
+ StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+ StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+ SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned64(square_sum3[2] + x, sq3[2]);
+ StoreAligned64(square_sum5[3] + x, sq5[3]);
+ StoreAligned64(square_sum3[3] + x, sq3[3]);
+ StoreAligned64(square_sum5[4] + x, sq5[4]);
+ LoadAligned32x2U16(sum3, x, s3[0]);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0][0], &index_3[0][0]);
+ CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum_3[1][0],
+ &index_3[1][0]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+ SumHorizontal(sq[0] + 1, &sq3t[2][0], &sq3t[2][1], &sq5t[3][0], &sq5t[3][1]);
+ SumHorizontal(sq[1] + 1, &sq3t[3][0], &sq3t[3][1], &sq5t[4][0], &sq5t[4][1]);
+ StoreAligned64(square_sum3[2] + x + 16, sq3t[2]);
+ StoreAligned64(square_sum5[3] + x + 16, sq5t[3]);
+ StoreAligned64(square_sum3[3] + x + 16, sq3t[3]);
+ StoreAligned64(square_sum5[4] + x + 16, sq5t[4]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3t);
+ CalculateSumAndIndex3(s3[1], sq3t, scales[1], &sum_3[0][1], &index_3[0][1]);
+ CalculateSumAndIndex3(s3[1] + 1, sq3t + 1, scales[1], &sum_3[1][1],
+ &index_3[1][1]);
+ CalculateIntermediate<9>(sum_3[0], index_3[0], ma3[0], b3[0] + 1);
+ CalculateIntermediate<9>(sum_3[1], index_3[1], ma3[1], b3[1] + 1);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5t);
+ CalculateSumAndIndex5(s5[1], sq5t, scales[0], &sum_5[1], &index_5[1]);
+ CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1);
+ b3[0][0] = _mm256_permute2x128_si256(b3[0][0], b3[0][2], 0x21);
+ b3[1][0] = _mm256_permute2x128_si256(b3[1][0], b3[1][2], 0x21);
+ b5[0] = _mm256_permute2x128_si256(b5[0], b5[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ const __m128i s, const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma3,
+ __m128i* const ma5, __m128i* const b3, __m128i* const b5) {
+ __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+ sq[1] = SquareHi8(s);
+ SumHorizontalLo(s, &s3[2], &s5[3]);
+ SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, 0, s5);
+ s5[4] = s5[3];
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+ const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t sum_width, const ptrdiff_t x, const uint16_t scales[2],
+ const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+ __m256i sq[6], __m256i ma3[2], __m256i ma5[2], __m256i b3[5],
+ __m256i b5[5]) {
+ const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+ __m256i s3[2][3], s5[2][5], sq3[4][2], sq3t[4][2], sq5[5][2], sq5t[5][2],
+ sum_3[2], index_3[2], sum_5[2], index_5[2];
+ sq[1] = SquareLo8(s0);
+ sq[2] = SquareHi8(s0);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ SumHorizontal(src, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+ &s5[1][3]);
+ SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned32x2U16(sum3, x, s3[0]);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0], &index_3[0]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ s5[0][4] = s5[0][3];
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+ SumHorizontal(sq + 1, &sq3t[2][0], &sq3t[2][1], &sq5t[3][0], &sq5t[3][1]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3t);
+ CalculateSumAndIndex3(s3[1], sq3t, scales[1], &sum_3[1], &index_3[1]);
+ CalculateIntermediate<9>(sum_3, index_3, ma3, b3 + 1);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ s5[1][4] = s5[1][3];
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5t);
+ sq5t[4][0] = sq5t[3][0];
+ sq5t[4][1] = sq5t[3][1];
+ CalculateSumAndIndex5(s5[1], sq5t, scales[0], &sum_5[1], &index_5[1]);
+ CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1);
+ b3[0] = _mm256_permute2x128_si256(b3[0], b3[2], 0x21);
+ b5[0] = _mm256_permute2x128_si256(b5[0], b5[2], 0x21);
+}
+
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+ const uint8_t* const src1, const int width,
+ const uint32_t scale,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* ma565,
+ uint32_t* b565) {
+ __m128i ma0, b0, s[2][3], sq_128[2][2];
+ __m256i mas[3], sq[2][3], bs[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+ sq_128[0][0] = SquareLo8(s[0][0]);
+ sq_128[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, &b0);
+ sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+ sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma5[3], ma[2], b[4];
+ BoxFilterPreProcess5(src0 + x + 8, src1 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma5);
+ ma[0] = Sum565Lo(ma5);
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned64(ma565, ma);
+ Sum565W(bs + 0, b + 0);
+ Sum565W(bs + 1, b + 2);
+ StoreAligned64(b565, b + 0);
+ StoreAligned64(b565 + 16, b + 2);
+ sq[0][0] = sq[0][2];
+ sq[1][0] = sq[1][2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ ma565 += 32;
+ b565 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+ const uint8_t* const src, const int width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+ uint32_t* b444) {
+ __m128i ma0, sq_128[2], b0;
+ __m256i mas[3], sq[3], bs[3];
+ const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytesPass2_128 - width);
+ sq_128[0] = SquareLo8(s);
+ BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq_128, &ma0, &b0);
+ sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma3[3];
+ BoxFilterPreProcess3(src + x + 8, x + 8 + kOverreadInBytesPass2_256 - width,
+ x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ Prepare3_8(mas, ma3);
+ if (calculate444) { // NOLINT(readability-simplify-boolean-expr)
+ Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444Hi(ma3, bs + 1, 16, ma343, ma444, b343, b444);
+ ma444 += 32;
+ b444 += 32;
+ } else {
+ __m256i ma[2], b[4];
+ ma[0] = Sum343Lo(ma3);
+ ma[1] = Sum343Hi(ma3);
+ StoreAligned64(ma343, ma);
+ Sum343W(bs + 0, b + 0);
+ Sum343W(bs + 1, b + 2);
+ StoreAligned64(b343 + 0, b + 0);
+ StoreAligned64(b343 + 16, b + 2);
+ }
+ sq[0] = sq[2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ ma343 += 32;
+ b343 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1, const int width,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[2], uint16_t* ma565, uint32_t* const b343[4],
+ uint32_t* const b444[2], uint32_t* b565) {
+ __m128i s[2], ma3_128[2], ma5_0, sq_128[2][2], b3_128[2], b5_0;
+ __m256i ma3[2][3], ma5[3], sq[2][3], b3[2][5], b5[5];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ s[1] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+ sq_128[0][0] = SquareLo8(s[0]);
+ sq_128[1][0] = SquareLo8(s[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+ ma3_128, b3_128, &ma5_0, &b5_0);
+ sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+ sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+ ma3[0][0] = SetrM128i(ma3_128[0], ma3_128[0]);
+ ma3[1][0] = SetrM128i(ma3_128[1], ma3_128[1]);
+ ma5[0] = SetrM128i(ma5_0, ma5_0);
+ b3[0][0] = SetrM128i(b3_128[0], b3_128[0]);
+ b3[1][0] = SetrM128i(b3_128[1], b3_128[1]);
+ b5[0] = SetrM128i(b5_0, b5_0);
+
+ int x = 0;
+ do {
+ __m256i ma[2], b[4], ma3x[3], ma5x[3];
+ BoxFilterPreProcess(src0 + x + 8, src1 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width, x + 8,
+ scales, sum3, sum5, square_sum3, square_sum5, sum_width,
+ sq, ma3, b3, ma5, b5);
+ Prepare3_8(ma3[0], ma3x);
+ ma[0] = Sum343Lo(ma3x);
+ ma[1] = Sum343Hi(ma3x);
+ StoreAligned64(ma343[0] + x, ma);
+ Sum343W(b3[0], b);
+ StoreAligned64(b343[0] + x, b);
+ Sum565W(b5, b);
+ StoreAligned64(b565, b);
+ Prepare3_8(ma3[1], ma3x);
+ Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444[0], b343[1], b444[0]);
+ Store343_444Hi(ma3x, b3[1] + 1, x + 16, ma343[1], ma444[0], b343[1],
+ b444[0]);
+ Prepare3_8(ma5, ma5x);
+ ma[0] = Sum565Lo(ma5x);
+ ma[1] = Sum565Hi(ma5x);
+ StoreAligned64(ma565, ma);
+ Sum343W(b3[0] + 1, b);
+ StoreAligned64(b343[0] + x + 16, b);
+ Sum565W(b5 + 1, b);
+ StoreAligned64(b565 + 16, b);
+ sq[0][0] = sq[0][2];
+ sq[1][0] = sq[1][2];
+ ma3[0][0] = ma3[0][2];
+ ma3[1][0] = ma3[1][2];
+ ma5[0] = ma5[2];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ ma565 += 32;
+ b565 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+template <int shift>
+inline __m256i FilterOutput(const __m256i ma_x_src, const __m256i b) {
+ // ma: 255 * 32 = 8160 (13 bits)
+ // b: 65088 * 32 = 2082816 (21 bits)
+ // v: b - ma * 255 (22 bits)
+ const __m256i v = _mm256_sub_epi32(b, ma_x_src);
+ // kSgrProjSgrBits = 8
+ // kSgrProjRestoreBits = 4
+ // shift = 4 or 5
+ // v >> 8 or 9 (13 bits)
+ return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m256i CalculateFilteredOutput(const __m256i src, const __m256i ma,
+ const __m256i b[2]) {
+ const __m256i ma_x_src_lo = VmullLo16(ma, src);
+ const __m256i ma_x_src_hi = VmullHi16(ma, src);
+ const __m256i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+ const __m256i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+ return _mm256_packs_epi32(dst_lo, dst_hi); // 13 bits
+}
+
+inline __m256i CalculateFilteredOutputPass1(const __m256i src, __m256i ma[2],
+ __m256i b[2][2]) {
+ const __m256i ma_sum = _mm256_add_epi16(ma[0], ma[1]);
+ __m256i b_sum[2];
+ b_sum[0] = _mm256_add_epi32(b[0][0], b[1][0]);
+ b_sum[1] = _mm256_add_epi32(b[0][1], b[1][1]);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i CalculateFilteredOutputPass2(const __m256i src, __m256i ma[3],
+ __m256i b[3][2]) {
+ const __m256i ma_sum = Sum3_16(ma);
+ __m256i b_sum[2];
+ Sum3_32(b, b_sum);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i SelfGuidedFinal(const __m256i src, const __m256i v[2]) {
+ const __m256i v_lo =
+ VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m256i v_hi =
+ VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m256i vv = _mm256_packs_epi32(v_lo, v_hi);
+ return _mm256_add_epi16(src, vv);
+}
+
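+// Each 32-bit lane of |w0_w2| packs w0 in its low 16 bits and w2 in its high
+// 16 bits; after interleaving the two filter outputs as 16-bit pairs,
+// _mm256_madd_epi16() yields w0 * filter[0] + w2 * filter[1] per 32-bit lane.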
+inline __m256i SelfGuidedDoubleMultiplier(const __m256i src,
+ const __m256i filter[2], const int w0,
+ const int w2) {
+ __m256i v[2];
+ const __m256i w0_w2 =
+ _mm256_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+ const __m256i f_lo = _mm256_unpacklo_epi16(filter[0], filter[1]);
+ const __m256i f_hi = _mm256_unpackhi_epi16(filter[0], filter[1]);
+ v[0] = _mm256_madd_epi16(w0_w2, f_lo);
+ v[1] = _mm256_madd_epi16(w0_w2, f_hi);
+ return SelfGuidedFinal(src, v);
+}
+
+inline __m256i SelfGuidedSingleMultiplier(const __m256i src,
+ const __m256i filter, const int w0) {
+ // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+ __m256i v[2];
+ v[0] = VmullNLo8(filter, w0);
+ v[1] = VmullNHi8(filter, w0);
+ return SelfGuidedFinal(src, v);
+}
+
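+// Filters two output rows per call for the first (radius 2) pass. The first
+// output row combines the previous and the newly computed ma565/b565 sums,
+// while the second output row reuses only the new sums, with a smaller shift.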
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+ const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+ uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i ma0, b0, s[2][3], sq_128[2][2];
+ __m256i mas[3], sq[2][3], bs[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+ sq_128[0][0] = SquareLo8(s[0][0]);
+ sq_128[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, &b0);
+ sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+ sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma[3], ma3[3], b[2][2][2];
+ BoxFilterPreProcess5(src0 + x + 8, src1 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma3);
+ ma[1] = Sum565Lo(ma3);
+ ma[2] = Sum565Hi(ma3);
+ StoreAligned64(ma565[1] + x, ma + 1);
+ Sum565W(bs + 0, b[0][1]);
+ Sum565W(bs + 1, b[1][1]);
+ StoreAligned64(b565[1] + x + 0, b[0][1]);
+ StoreAligned64(b565[1] + x + 16, b[1][1]);
+ const __m256i sr0 = LoadUnaligned32(src + x);
+ const __m256i sr1 = LoadUnaligned32(src + stride + x);
+ const __m256i sr0_lo = _mm256_unpacklo_epi8(sr0, _mm256_setzero_si256());
+ const __m256i sr1_lo = _mm256_unpacklo_epi8(sr1, _mm256_setzero_si256());
+ ma[0] = LoadAligned32(ma565[0] + x);
+ LoadAligned64(b565[0] + x, b[0][0]);
+ const __m256i p00 = CalculateFilteredOutputPass1(sr0_lo, ma, b[0]);
+ const __m256i p01 = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[0][1]);
+ const __m256i d00 = SelfGuidedSingleMultiplier(sr0_lo, p00, w0);
+ const __m256i d10 = SelfGuidedSingleMultiplier(sr1_lo, p01, w0);
+ const __m256i sr0_hi = _mm256_unpackhi_epi8(sr0, _mm256_setzero_si256());
+ const __m256i sr1_hi = _mm256_unpackhi_epi8(sr1, _mm256_setzero_si256());
+ ma[1] = LoadAligned32(ma565[0] + x + 16);
+ LoadAligned64(b565[0] + x + 16, b[1][0]);
+ const __m256i p10 = CalculateFilteredOutputPass1(sr0_hi, ma + 1, b[1]);
+ const __m256i p11 = CalculateFilteredOutput<4>(sr1_hi, ma[2], b[1][1]);
+ const __m256i d01 = SelfGuidedSingleMultiplier(sr0_hi, p10, w0);
+ const __m256i d11 = SelfGuidedSingleMultiplier(sr1_hi, p11, w0);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d00, d01));
+ StoreUnaligned32(dst + stride + x, _mm256_packus_epi16(d10, d11));
+ sq[0][0] = sq[0][2];
+ sq[1][0] = sq[1][2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ x += 32;
+ } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+ uint32_t* b565, uint8_t* const dst) {
+ const __m128i s0 =
+ LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ __m128i ma0, b0, sq_128[2];
+ __m256i mas[3], sq[3], bs[3];
+ sq_128[0] = SquareLo8(s0);
+ BoxFilterPreProcess5LastRowLo(s0, scale, sum5, square_sum5, sq_128, &ma0,
+ &b0);
+ sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma[3], ma5[3], b[2][2];
+ BoxFilterPreProcess5LastRow(
+ src0 + x + 8, x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ ma[2] = Sum565Hi(ma5);
+ Sum565W(bs + 0, b[1]);
+ const __m256i sr = LoadUnaligned32(src + x);
+ const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+ const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+ ma[0] = LoadAligned32(ma565);
+ LoadAligned64(b565 + 0, b[0]);
+ const __m256i p0 = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[1] = LoadAligned32(ma565 + 16);
+ LoadAligned64(b565 + 16, b[0]);
+ Sum565W(bs + 1, b[1]);
+ const __m256i p1 = CalculateFilteredOutputPass1(sr_hi, ma + 1, b);
+ const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ sq[0] = sq[2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ ma565 += 32;
+ b565 += 32;
+ x += 32;
+ } while (x < width);
+}
+
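+// Filters one output row per call for the second (radius 1) pass, summing a
+// previous 343 row, a previous 444 row and the newly stored 343 sums inside
+// CalculateFilteredOutputPass2().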
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+ uint32_t* const b444[2], uint8_t* const dst) {
+ const __m128i s0 =
+ LoadUnaligned16Msan(src0, kOverreadInBytesPass2_128 - width);
+ __m128i ma0, b0, sq_128[2];
+ __m256i mas[3], sq[3], bs[3];
+ sq_128[0] = SquareLo8(s0);
+ BoxFilterPreProcess3Lo(s0, scale, sum3, square_sum3, sq_128, &ma0, &b0);
+ sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma[4], b[4][2], ma3[3];
+ BoxFilterPreProcess3(src0 + x + 8,
+ x + 8 + kOverreadInBytesPass2_256 - width, x + 8,
+ sum_width, scale, sum3, square_sum3, sq, mas, bs);
+ Prepare3_8(mas, ma3);
+ Store343_444Lo(ma3, bs + 0, x + 0, &ma[2], b[2], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ Store343_444Hi(ma3, bs + 1, x + 16, &ma[3], b[3], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const __m256i sr = LoadUnaligned32(src + x);
+ const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+ const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+ ma[0] = LoadAligned32(ma343[0] + x);
+ ma[1] = LoadAligned32(ma444[0] + x);
+ LoadAligned64(b343[0] + x, b[0]);
+ LoadAligned64(b444[0] + x, b[1]);
+ const __m256i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ ma[1] = LoadAligned32(ma343[0] + x + 16);
+ ma[2] = LoadAligned32(ma444[0] + x + 16);
+ LoadAligned64(b343[0] + x + 16, b[1]);
+ LoadAligned64(b444[0] + x + 16, b[2]);
+ const __m256i p1 = CalculateFilteredOutputPass2(sr_hi, ma + 1, b + 1);
+ const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ sq[0] = sq[2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ x += 32;
+ } while (x < width);
+}
+
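+// Runs both passes over two rows at a time when both radii are non-zero and
+// blends the two pass outputs with SelfGuidedDoubleMultiplier().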
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+ uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i s[2], ma3_128[2], ma5_0, sq_128[2][2], b3_128[2], b5_0;
+ __m256i ma3[2][3], ma5[3], sq[2][3], b3[2][5], b5[5];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ s[1] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+ sq_128[0][0] = SquareLo8(s[0]);
+ sq_128[1][0] = SquareLo8(s[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+ ma3_128, b3_128, &ma5_0, &b5_0);
+ sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+ sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+ ma3[0][0] = SetrM128i(ma3_128[0], ma3_128[0]);
+ ma3[1][0] = SetrM128i(ma3_128[1], ma3_128[1]);
+ ma5[0] = SetrM128i(ma5_0, ma5_0);
+ b3[0][0] = SetrM128i(b3_128[0], b3_128[0]);
+ b3[1][0] = SetrM128i(b3_128[1], b3_128[1]);
+ b5[0] = SetrM128i(b5_0, b5_0);
+
+ int x = 0;
+ do {
+ __m256i ma[3][3], mat[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
+ BoxFilterPreProcess(src0 + x + 8, src1 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width, x + 8,
+ scales, sum3, sum5, square_sum3, square_sum5, sum_width,
+ sq, ma3, b3, ma5, b5);
+ Prepare3_8(ma3[0], ma3x[0]);
+ Prepare3_8(ma3[1], ma3x[1]);
+ Prepare3_8(ma5, ma5x);
+ Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+ ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+ b343[3], b444[2]);
+ ma[0][1] = Sum565Lo(ma5x);
+ ma[0][2] = Sum565Hi(ma5x);
+ mat[0][1] = ma[0][2];
+ StoreAligned64(ma565[1] + x, ma[0] + 1);
+ Sum565W(b5, b[0][1]);
+ StoreAligned64(b565[1] + x, b[0][1]);
+ const __m256i sr0 = LoadUnaligned32(src + x);
+ const __m256i sr1 = LoadUnaligned32(src + stride + x);
+ const __m256i sr0_lo = _mm256_unpacklo_epi8(sr0, _mm256_setzero_si256());
+ const __m256i sr1_lo = _mm256_unpacklo_epi8(sr1, _mm256_setzero_si256());
+ ma[0][0] = LoadAligned32(ma565[0] + x);
+ LoadAligned64(b565[0] + x, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned32(ma343[0] + x);
+ ma[1][1] = LoadAligned32(ma444[0] + x);
+ LoadAligned64(b343[0] + x, b[1][0]);
+ LoadAligned64(b444[0] + x, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+ const __m256i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+ ma[2][0] = LoadAligned32(ma343[1] + x);
+ LoadAligned64(b343[1] + x, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+ const __m256i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+ Sum565W(b5 + 1, b[0][1]);
+ StoreAligned64(b565[1] + x + 16, b[0][1]);
+ Store343_444Hi(ma3x[0], b3[0] + 1, x + 16, &mat[1][2], &mat[2][1], b[1][2],
+ b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Hi(ma3x[1], b3[1] + 1, x + 16, &mat[2][2], b[2][2], ma343[3],
+ ma444[2], b343[3], b444[2]);
+ const __m256i sr0_hi = _mm256_unpackhi_epi8(sr0, _mm256_setzero_si256());
+ const __m256i sr1_hi = _mm256_unpackhi_epi8(sr1, _mm256_setzero_si256());
+ mat[0][0] = LoadAligned32(ma565[0] + x + 16);
+ LoadAligned64(b565[0] + x + 16, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_hi, mat[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_hi, mat[0][1], b[0][1]);
+ mat[1][0] = LoadAligned32(ma343[0] + x + 16);
+ mat[1][1] = LoadAligned32(ma444[0] + x + 16);
+ LoadAligned64(b343[0] + x + 16, b[1][0]);
+ LoadAligned64(b444[0] + x + 16, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_hi, mat[1], b[1]);
+ const __m256i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+ mat[2][0] = LoadAligned32(ma343[1] + x + 16);
+ LoadAligned64(b343[1] + x + 16, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_hi, mat[2], b[2]);
+ const __m256i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d00, d01));
+ StoreUnaligned32(dst + stride + x, _mm256_packus_epi16(d10, d11));
+ sq[0][0] = sq[0][2];
+ sq[1][0] = sq[1][2];
+ ma3[0][0] = ma3[0][2];
+ ma3[1][0] = ma3[1][2];
+ ma5[0] = ma5[2];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ x += 32;
+ } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+ const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343[4], uint16_t* const ma444[3],
+ uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
+ uint32_t* const b565[2], uint8_t* const dst) {
+ const __m128i s0 =
+ LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ __m128i ma3_0, ma5_0, b3_0, b5_0, sq_128[2];
+ __m256i ma3[3], ma5[3], sq[3], b3[3], b5[3];
+ sq_128[0] = SquareLo8(s0);
+ BoxFilterPreProcessLastRowLo(s0, scales, sum3, sum5, square_sum3, square_sum5,
+ sq_128, &ma3_0, &ma5_0, &b3_0, &b5_0);
+ sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+ ma3[0] = SetrM128i(ma3_0, ma3_0);
+ ma5[0] = SetrM128i(ma5_0, ma5_0);
+ b3[0] = SetrM128i(b3_0, b3_0);
+ b5[0] = SetrM128i(b5_0, b5_0);
+
+ int x = 0;
+ do {
+ __m256i ma[3], mat[3], b[3][2], p[2], ma3x[3], ma5x[3];
+ BoxFilterPreProcessLastRow(src0 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width,
+ sum_width, x + 8, scales, sum3, sum5,
+ square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+ Prepare3_8(ma3, ma3x);
+ Prepare3_8(ma5, ma5x);
+ ma[1] = Sum565Lo(ma5x);
+ Sum565W(b5, b[1]);
+ ma[2] = Sum343Lo(ma3x);
+ Sum343W(b3, b[2]);
+ const __m256i sr = LoadUnaligned32(src + x);
+ const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+ ma[0] = LoadAligned32(ma565[0] + x);
+ LoadAligned64(b565[0] + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[0] = LoadAligned32(ma343[0] + x);
+ ma[1] = LoadAligned32(ma444[0] + x);
+ LoadAligned64(b343[0] + x, b[0]);
+ LoadAligned64(b444[0] + x, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ const __m256i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+ mat[1] = Sum565Hi(ma5x);
+ Sum565W(b5 + 1, b[1]);
+ mat[2] = Sum343Hi(ma3x);
+ Sum343W(b3 + 1, b[2]);
+ const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+ mat[0] = LoadAligned32(ma565[0] + x + 16);
+ LoadAligned64(b565[0] + x + 16, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_hi, mat, b);
+ mat[0] = LoadAligned32(ma343[0] + x + 16);
+ mat[1] = LoadAligned32(ma444[0] + x + 16);
+ LoadAligned64(b343[0] + x + 16, b[0]);
+ LoadAligned64(b444[0] + x + 16, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_hi, mat, b);
+ const __m256i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ sq[0] = sq[2];
+ ma3[0] = ma3[2];
+ ma5[0] = ma5[2];
+ b3[0] = b3[2];
+ b5[0] = b5[2];
+ x += 32;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+ const RestorationUnitInfo& restoration_info, const uint8_t* src,
+ const uint8_t* const top_border, const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3 + kSumOffset;
+ square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum(top_border, stride, width, sum_stride, temp_stride, sum3[0], sum5[1],
+ square_sum3[0], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+ square_sum5, sum_width, ma343, ma444, ma565[0], b343,
+ b444, b565[0]);
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
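+  // The main loop below filters two rows per iteration. The row sum and the
+  // intermediate ma/b buffers act as ring buffers and are rotated by two rows
+  // at each step so that previously computed rows are reused.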
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+ square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+ b444, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxFilterLastRow(src + 3, bottom_border + stride, width, sum_width, scales,
+ w0, w2, sum3, sum5, square_sum3, square_sum5, ma343, ma444,
+ ma565, b343, b444, b565, dst);
+ }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src,
+ const uint8_t* const top_border,
+ const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height, SgrBuffer* const sgr_buffer,
+ uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<5>(top_border, stride, width, sum_stride, temp_stride, sum5[1],
+ square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+ ma565[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+ square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+ sum_width, scale, w0, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxFilterPass1LastRow(src, bottom_border + stride, width, sum_width, scale,
+ w0, sum5, square_sum5, ma565[0], b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src,
+ const uint8_t* const top_border,
+ const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height, SgrBuffer* const sgr_buffer,
+ uint8_t* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3 + kSumOffset;
+ square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<3>(top_border, stride, width, sum_stride, temp_stride, sum3[0],
+ square_sum3[0]);
+ BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+ sum_width, ma343[0], nullptr, b343[0],
+ nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const uint8_t* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += stride;
+ }
+ BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+ ma343[1], ma444[0], b343[1], b444[0]);
+
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ int y = std::min(height, 2);
+ src += 2;
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
+
+// If |width| is not a multiple of 8, up to 7 more pixels are written to |dest|
+// at the end of each row. It is safe to overwrite the output as it will not be
+// part of the visible frame.
+void SelfGuidedFilter_AVX2(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const void* const top_border, const void* const bottom_border,
+ const ptrdiff_t stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* top = static_cast<const uint8_t*>(top_border);
+ const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+ auto* const dst = static_cast<uint8_t*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3,
+ stride, width, height, sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2,
+ stride, width, height, sgr_buffer, dst);
+ } else {
+ BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride,
+ width, height, sgr_buffer, dst);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_AVX2(WienerFilter)
+ dsp->loop_restorations[0] = WienerFilter_AVX2;
+#endif
+#if DSP_ENABLED_8BPP_AVX2(SelfGuidedFilter)
+ dsp->loop_restorations[1] = SelfGuidedFilter_AVX2;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void LoopRestorationInit_AVX2() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_AVX2
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit_AVX2() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_AVX2
diff --git a/src/dsp/x86/loop_restoration_avx2.h b/src/dsp/x86/loop_restoration_avx2.h
new file mode 100644
index 0000000..d80227c
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_avx2.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_restorations, see the defines below for specifics.
+// These functions are not thread-safe.
+void LoopRestorationInit_AVX2();
+void LoopRestorationInit10bpp_AVX2();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If avx2 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the avx2 implementation should be used.
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_WienerFilter
+#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_AVX2
+#endif
+
+#endif // LIBGAV1_TARGETING_AVX2
+
+#endif // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
diff --git a/src/dsp/x86/loop_restoration_sse4.cc b/src/dsp/x86/loop_restoration_sse4.cc
new file mode 100644
index 0000000..24f5ad2
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_sse4.cc
@@ -0,0 +1,2549 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+inline void WienerHorizontalClip(const __m128i s[2], const __m128i s_3x128,
+ int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit =
+ (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
+ const __m128i offsets = _mm_set1_epi16(-offset);
+ const __m128i limits = _mm_set1_epi16(limit - offset);
+ // The sum range here is [-128 * 255 + 4, 90 * 255 + 4].
+ const __m128i sum = _mm_add_epi16(s[0], s[1]);
+ const __m128i rounded_sum0 = _mm_srai_epi16(sum, kInterRoundBitsHorizontal);
+ // Add back scaled down offset correction.
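+  // |s_3x128| is the center sample multiplied by 128 and scaled down by the
+  // horizontal rounding shift; presumably the 128 was removed from the center
+  // tap so that the packed taps fit the signed range of _mm_maddubs_epi16().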
+ const __m128i rounded_sum1 = _mm_add_epi16(rounded_sum0, s_3x128);
+ const __m128i d0 = _mm_max_epi16(rounded_sum1, offsets);
+ const __m128i d1 = _mm_min_epi16(d0, limits);
+ StoreAligned16(wiener_buffer, d1);
+}
+
+inline void WienerHorizontalTap7Kernel(const __m128i s[4],
+ const __m128i filter[4],
+ int16_t* const wiener_buffer) {
+ __m128i madds[4];
+ madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
+ madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
+ madds[2] = _mm_maddubs_epi16(s[2], filter[2]);
+ madds[3] = _mm_maddubs_epi16(s[3], filter[3]);
+ madds[0] = _mm_add_epi16(madds[0], madds[2]);
+ madds[1] = _mm_add_epi16(madds[1], madds[3]);
+ const __m128i s_3x128 =
+ _mm_slli_epi16(_mm_srli_epi16(s[1], 8), 7 - kInterRoundBitsHorizontal);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap5Kernel(const __m128i s[5],
+ const __m128i filter[3],
+ int16_t* const wiener_buffer) {
+ __m128i madds[3];
+ madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
+ madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
+ madds[2] = _mm_maddubs_epi16(s[2], filter[2]);
+ madds[0] = _mm_add_epi16(madds[0], madds[2]);
+ const __m128i s_3x128 =
+ _mm_srli_epi16(_mm_slli_epi16(s[1], 8), kInterRoundBitsHorizontal + 1);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap3Kernel(const __m128i s[2],
+ const __m128i filter[2],
+ int16_t* const wiener_buffer) {
+ __m128i madds[2];
+ madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
+ madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
+ const __m128i s_3x128 =
+ _mm_slli_epi16(_mm_srli_epi16(s[0], 8), 7 - kInterRoundBitsHorizontal);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+// Loading all the shifted source vectors and unpacking is about 7% faster
+// than using _mm_alignr_epi8().
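+// Each s[k] below holds the source shifted by k bytes; unpacking adjacent
+// pairs interleaves the pixels so that _mm_maddubs_epi16() can apply two
+// filter taps per 16-bit lane.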
+inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int coefficient0,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
+ __m128i filter[4];
+ filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0200));
+ filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
+ filter[2] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0204));
+ filter[3] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient0));
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[7], ss[4];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ s[3] = LoadUnaligned16(src + x + 3);
+ s[4] = LoadUnaligned16(src + x + 4);
+ s[5] = LoadUnaligned16(src + x + 5);
+ s[6] = LoadUnaligned16(src + x + 6);
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ ss[3] = _mm_unpacklo_epi8(s[6], round);
+ WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 0);
+ ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+ ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
+ ss[2] = _mm_unpackhi_epi8(s[4], s[5]);
+ ss[3] = _mm_unpackhi_epi8(s[6], round);
+ WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 8);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int coefficient1,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
+ __m128i filter[3];
+ filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0402));
+ filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0406));
+ filter[2] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient1));
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[5], ss[3];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ s[3] = LoadUnaligned16(src + x + 3);
+ s[4] = LoadUnaligned16(src + x + 4);
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ ss[2] = _mm_unpacklo_epi8(s[4], round);
+ WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 0);
+ ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+ ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
+ ss[2] = _mm_unpackhi_epi8(s[4], round);
+ WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 8);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int coefficient2,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
+ __m128i filter[2];
+ filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
+ filter[1] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient2));
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[3], ss[2];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ ss[1] = _mm_unpacklo_epi8(s[2], round);
+ WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 0);
+ ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+ ss[1] = _mm_unpackhi_epi8(s[2], round);
+ WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 8);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
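+// When only the center tap (128) is nonzero, the filter is a scaled copy, so
+// the source is shifted left by 4 to match the intermediate scale of the
+// other horizontal paths (128 >> kInterRoundBitsHorizontal == 16, assuming
+// kInterRoundBitsHorizontal == 3).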
+inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ const __m128i s = LoadUnaligned16(src + x);
+ const __m128i s0 = _mm_unpacklo_epi8(s, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi8(s, _mm_setzero_si128());
+ const __m128i d0 = _mm_slli_epi16(s0, 4);
+ const __m128i d1 = _mm_slli_epi16(s1, 4);
+ StoreAligned16(*wiener_buffer + x + 0, d0);
+ StoreAligned16(*wiener_buffer + x + 8, d1);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline __m128i WienerVertical7(const __m128i a[2], const __m128i filter[2]) {
+ const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i sum0 = _mm_add_epi32(round, madd0);
+ const __m128i sum1 = _mm_add_epi32(sum0, madd1);
+ return _mm_srai_epi32(sum1, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical5(const __m128i a[2], const __m128i filter[2]) {
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i sum = _mm_add_epi32(madd0, madd1);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical3(const __m128i a, const __m128i filter) {
+ const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+ const __m128i madd = _mm_madd_epi16(a, filter);
+ const __m128i sum = _mm_add_epi32(round, madd);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVerticalFilter7(const __m128i a[7],
+ const __m128i filter[2]) {
+ __m128i b[2];
+ const __m128i a06 = _mm_add_epi16(a[0], a[6]);
+ const __m128i a15 = _mm_add_epi16(a[1], a[5]);
+ const __m128i a24 = _mm_add_epi16(a[2], a[4]);
+ b[0] = _mm_unpacklo_epi16(a06, a15);
+ b[1] = _mm_unpacklo_epi16(a24, a[3]);
+ const __m128i sum0 = WienerVertical7(b, filter);
+ b[0] = _mm_unpackhi_epi16(a06, a15);
+ b[1] = _mm_unpackhi_epi16(a24, a[3]);
+ const __m128i sum1 = WienerVertical7(b, filter);
+ return _mm_packs_epi32(sum0, sum1);
+}
+
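+// The vertical rounding constant is interleaved with the center tap here and
+// multiplied by the 1 packed into the upper half of filter[1] (see
+// WienerVerticalTap5() below), so WienerVertical5() needs no explicit
+// rounding add.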
+inline __m128i WienerVerticalFilter5(const __m128i a[5],
+ const __m128i filter[2]) {
+ const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m128i b[2];
+ const __m128i a04 = _mm_add_epi16(a[0], a[4]);
+ const __m128i a13 = _mm_add_epi16(a[1], a[3]);
+ b[0] = _mm_unpacklo_epi16(a04, a13);
+ b[1] = _mm_unpacklo_epi16(a[2], round);
+ const __m128i sum0 = WienerVertical5(b, filter);
+ b[0] = _mm_unpackhi_epi16(a04, a13);
+ b[1] = _mm_unpackhi_epi16(a[2], round);
+ const __m128i sum1 = WienerVertical5(b, filter);
+ return _mm_packs_epi32(sum0, sum1);
+}
+
+inline __m128i WienerVerticalFilter3(const __m128i a[3], const __m128i filter) {
+ __m128i b;
+ const __m128i a02 = _mm_add_epi16(a[0], a[2]);
+ b = _mm_unpacklo_epi16(a02, a[1]);
+ const __m128i sum0 = WienerVertical3(b, filter);
+ b = _mm_unpackhi_epi16(a02, a[1]);
+ const __m128i sum1 = WienerVertical3(b, filter);
+ return _mm_packs_epi32(sum0, sum1);
+}
+
+inline __m128i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i a[7]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+ a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
+ a[6] = LoadAligned16(wiener_buffer + 6 * wiener_stride);
+ return WienerVerticalFilter7(a, filter);
+}
+
+inline __m128i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i a[5]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+ return WienerVerticalFilter5(a, filter);
+}
+
+inline __m128i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter, __m128i a[3]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i d[2]) {
+ __m128i a[8];
+ d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[7] = LoadAligned16(wiener_buffer + 7 * wiener_stride);
+ d[1] = WienerVerticalFilter7(a + 1, filter);
+}
+
+inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i d[2]) {
+ __m128i a[6];
+ d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
+ d[1] = WienerVerticalFilter5(a + 1, filter);
+}
+
+inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter, __m128i d[2]) {
+ __m128i a[4];
+ d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ d[1] = WienerVerticalFilter3(a + 1, filter);
+}
+
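+// The vertical passes below write two output rows per loop iteration (the
+// *Kernel2 helpers load one extra |wiener_buffer| row and reuse the rest) and
+// fall back to a single-row pass when |height| is odd.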
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[4], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i c = LoadLo8(coefficients);
+ __m128i filter[2];
+ filter[0] = _mm_shuffle_epi32(c, 0x0);
+ filter[1] = _mm_shuffle_epi32(c, 0x55);
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i d[2][2];
+ WienerVerticalTap7Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap7Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
+ StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
+ StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[7];
+ const __m128i d0 =
+ WienerVerticalTap7Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m128i d1 =
+ WienerVerticalTap7Kernel(wiener_buffer + x + 8, width, filter, a);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[3], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i c = Load4(coefficients);
+ __m128i filter[2];
+ filter[0] = _mm_shuffle_epi32(c, 0);
+ filter[1] =
+ _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[2]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i d[2][2];
+ WienerVerticalTap5Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap5Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
+ StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
+ StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[5];
+ const __m128i d0 =
+ WienerVerticalTap5Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m128i d1 =
+ WienerVerticalTap5Kernel(wiener_buffer + x + 8, width, filter, a);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[2], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i filter =
+ _mm_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i d[2][2];
+ WienerVerticalTap3Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap3Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
+ StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
+ StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[3];
+ const __m128i d0 =
+ WienerVerticalTap3Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m128i d1 =
+ WienerVerticalTap3Kernel(wiener_buffer + x + 8, width, filter, a);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint8_t* const dst) {
+ const __m128i a0 = LoadAligned16(wiener_buffer + 0);
+ const __m128i a1 = LoadAligned16(wiener_buffer + 8);
+ const __m128i b0 = _mm_add_epi16(a0, _mm_set1_epi16(8));
+ const __m128i b1 = _mm_add_epi16(a1, _mm_set1_epi16(8));
+ const __m128i c0 = _mm_srai_epi16(b0, 4);
+ const __m128i c1 = _mm_srai_epi16(b1, 4);
+ const __m128i d = _mm_packus_epi16(c0, c1);
+ StoreAligned16(dst, d);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ x += 16;
+ } while (x < width);
+ }
+}
+
+void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info,
+ const void* const source, const void* const top_border,
+ const void* const bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height,
+ RestorationBuffer* const restoration_buffer,
+ void* const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 16);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 13 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+ // horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* const top = static_cast<const uint8_t*>(top_border);
+ const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
+ const int16_t* const filter_horizontal =
+ restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
+ const __m128i c = LoadLo8(filter_horizontal);
+ // In order to keep the horizontal pass intermediate values within 16 bits we
+ // offset |filter[3]| by 128. The 128 offset will be added back in the loop.
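+  // Concretely, the center tap is applied as (coefficient - 128) and the
+  // missing 128 * pixel term is restored after the rounding shift via the
+  // |s_3x128| argument of WienerHorizontalClip().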
+ const __m128i coefficients_horizontal =
+ _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0));
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
+ wiener_stride, height_extra, filter_horizontal[0],
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ filter_horizontal[0], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ filter_horizontal[0], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
+ wiener_stride, height_extra, filter_horizontal[1],
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ filter_horizontal[1], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ filter_horizontal[1], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
+ wiener_stride, height_extra, filter_horizontal[2],
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ filter_horizontal[2], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ filter_horizontal[2], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
+ wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ }
+
+ // vertical filtering.
+ // Over-writes up to 15 values.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ auto* dst = static_cast<uint8_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, we can
+    // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical + 1, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical + 2, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+// SIMD overreads 16 - (width % 16) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
+constexpr int kOverreadInBytesPass1 = 10;
+constexpr int kOverreadInBytesPass2 = 12;
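+// In the worst case (width a multiple of 16) this is 16 - 2 * 3 = 10 bytes
+// for Pass 1 and 16 - 2 * 2 = 12 bytes for Pass 2, hence the constants above.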
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned16x2U16Msan(const uint16_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+ dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+ dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned16x3U16Msan(const uint16_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+ dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+ dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+ dst[0] = LoadAligned16(src + 0);
+ dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x,
+ const ptrdiff_t border, __m128i dst[2]) {
+ dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border));
+ dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border));
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned32x2U32Msan(const uint32_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[2][2]) {
+ LoadAligned32U32Msan(src[0], x, border, dst[0]);
+ LoadAligned32U32Msan(src[1], x, border, dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+ LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned32x3U32Msan(const uint32_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[3][2]) {
+ LoadAligned32U32Msan(src[0], x, border, dst[0]);
+ LoadAligned32U32Msan(src[1], x, border, dst[1]);
+ LoadAligned32U32Msan(src[2], x, border, dst[2]);
+}
+
+inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 8, src[1]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 4, src[1]);
+}
+
+inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) {
+ StoreAligned32U32(dst + 0, src + 0);
+ StoreAligned32U32(dst + 8, src + 2);
+}
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers may generate super inefficient code and the whole
+// decoder could be 15% slower.
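+// The Vaddl*/Vaddw*/Vmull* helpers below mirror the ARM NEON intrinsics they
+// are named after, widening via an unpack with zero before the add or
+// multiply; VrshrS32()/VrshrU32() are rounding right shifts.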
+
+inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlHi8(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi8(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(s0, s1);
+}
+
+inline __m128i VaddlHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(s0, s1);
+}
+
+inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwHi8(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(src0, s1);
+}
+
+inline __m128i VaddwHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(src0, s1);
+}
+
+inline __m128i VmullNLo8(const __m128i src0, const int src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullNHi8(const __m128i src0, const int src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VrshrS32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srli_epi32(sum, src1);
+}
+
+inline __m128i SquareLo8(const __m128i src) {
+ const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128());
+ return _mm_mullo_epi16(s, s);
+}
+
+inline __m128i SquareHi8(const __m128i src) {
+ const __m128i s = _mm_unpackhi_epi8(src, _mm_setzero_si128());
+ return _mm_mullo_epi16(s, s);
+}
+
+inline void Prepare3Lo8(const __m128i src, __m128i dst[3]) {
+ dst[0] = src;
+ dst[1] = _mm_srli_si128(src, 1);
+ dst[2] = _mm_srli_si128(src, 2);
+}
+
+template <int offset>
+inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+ dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare5Lo8(const __m128i src, __m128i dst[5]) {
+ dst[0] = src;
+ dst[1] = _mm_srli_si128(src, 1);
+ dst[2] = _mm_srli_si128(src, 2);
+ dst[3] = _mm_srli_si128(src, 3);
+ dst[4] = _mm_srli_si128(src, 4);
+}
+
+template <int offset>
+inline void Prepare5_8(const __m128i src[2], __m128i dst[5]) {
+ dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+ dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], offset + 3);
+ dst[4] = _mm_alignr_epi8(src[1], src[0], offset + 4);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_16(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+ dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi16(src0, src1);
+ return _mm_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi32(src0, src1);
+ return _mm_add_epi32(sum, src2);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m128i Sum3WLo16(const __m128i src[3]) {
+ const __m128i sum = VaddlLo8(src[0], src[1]);
+ return VaddwLo8(sum, src[2]);
+}
+
+inline __m128i Sum3WHi16(const __m128i src[3]) {
+ const __m128i sum = VaddlHi8(src[0], src[1]);
+ return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum3WLo32(const __m128i src[3]) {
+ const __m128i sum = VaddlLo16(src[0], src[1]);
+ return VaddwLo16(sum, src[2]);
+}
+
+inline __m128i Sum3WHi32(const __m128i src[3]) {
+ const __m128i sum = VaddlHi16(src[0], src[1]);
+ return VaddwHi16(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+ const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+ const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+ const __m128i* const src2, const __m128i* const src3,
+ const __m128i* const src4) {
+ const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+ const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+ const __m128i sum = _mm_add_epi32(sum01, sum23);
+ return _mm_add_epi32(sum, *src4);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum5WLo16(const __m128i src[5]) {
+ const __m128i sum01 = VaddlLo8(src[0], src[1]);
+ const __m128i sum23 = VaddlLo8(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return VaddwLo8(sum, src[4]);
+}
+
+inline __m128i Sum5WHi16(const __m128i src[5]) {
+ const __m128i sum01 = VaddlHi8(src[0], src[1]);
+ const __m128i sum23 = VaddlHi8(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return VaddwHi8(sum, src[4]);
+}
+
+inline __m128i Sum3Horizontal(const __m128i src) {
+ __m128i s[3];
+ Prepare3Lo8(src, s);
+ return Sum3WLo16(s);
+}
+
+template <int offset>
+inline void Sum3Horizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_8<offset>(src, s);
+ dst[0] = Sum3WLo16(s);
+ dst[1] = Sum3WHi16(s);
+}
+
+inline void Sum3WHorizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum3WLo32(s);
+ dst[1] = Sum3WHi32(s);
+}
+
+inline __m128i Sum5Horizontal(const __m128i src) {
+ __m128i s[5];
+ Prepare5Lo8(src, s);
+ return Sum5WLo16(s);
+}
+
+template <int offset>
+inline void Sum5Horizontal(const __m128i src[2], __m128i* const dst0,
+ __m128i* const dst1) {
+ __m128i s[5];
+ Prepare5_8<offset>(src, s);
+ *dst0 = Sum5WLo16(s);
+ *dst1 = Sum5WHi16(s);
+}
+
+inline void Sum5WHorizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ const __m128i sum01_lo = VaddlLo16(s[0], s[1]);
+ const __m128i sum23_lo = VaddlLo16(s[2], s[3]);
+ const __m128i sum0123_lo = _mm_add_epi32(sum01_lo, sum23_lo);
+ dst[0] = VaddwLo16(sum0123_lo, s[4]);
+ const __m128i sum01_hi = VaddlHi16(s[0], s[1]);
+ const __m128i sum23_hi = VaddlHi16(s[2], s[3]);
+ const __m128i sum0123_hi = _mm_add_epi32(sum01_hi, sum23_hi);
+ dst[1] = VaddwHi16(sum0123_hi, s[4]);
+}
+
+void SumHorizontalLo(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = VaddlLo16(src[0], src[4]);
+ *row_sq3 = Sum3WLo32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalHi(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = VaddlHi16(src[0], src[4]);
+ *row_sq3 = Sum3WHi32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalLo(const __m128i src, __m128i* const row3,
+ __m128i* const row5) {
+ __m128i s[5];
+ Prepare5Lo8(src, s);
+ const __m128i sum04 = VaddlLo8(s[0], s[4]);
+ *row3 = Sum3WLo16(s + 1);
+ *row5 = _mm_add_epi16(sum04, *row3);
+}
+
+template <int offset>
+void SumHorizontal(const __m128i src[2], __m128i* const row3_0,
+ __m128i* const row3_1, __m128i* const row5_0,
+ __m128i* const row5_1) {
+ __m128i s[5];
+ Prepare5_8<offset>(src, s);
+ const __m128i sum04_lo = VaddlLo8(s[0], s[4]);
+ const __m128i sum04_hi = VaddlHi8(s[0], s[4]);
+ *row3_0 = Sum3WLo16(s + 1);
+ *row3_1 = Sum3WHi16(s + 1);
+ *row5_0 = _mm_add_epi16(sum04_lo, *row3_0);
+ *row5_1 = _mm_add_epi16(sum04_hi, *row3_1);
+}
+
+inline void SumHorizontal(const __m128i src[2], __m128i* const row_sq3_0,
+ __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+ __m128i* const row_sq5_1) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+ SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
+
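+// The Sum343* and Sum565* helpers compute the weighted cross sums used by the
+// self-guided filter: for inputs a, b, c they return 3 * (a + b + c) + b =
+// 3a + 4b + 3c and 5 * (a + b + c) + b = 5a + 6b + 5c respectively.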
+inline __m128i Sum343Lo(const __m128i ma3[3]) {
+ const __m128i sum = Sum3WLo16(ma3);
+ const __m128i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343Hi(const __m128i ma3[3]) {
+ const __m128i sum = Sum3WHi16(ma3);
+ const __m128i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343WLo(const __m128i src[3]) {
+ const __m128i sum = Sum3WLo32(src);
+ const __m128i sum3 = Sum3_32(sum, sum, sum);
+ return VaddwLo16(sum3, src[1]);
+}
+
+inline __m128i Sum343WHi(const __m128i src[3]) {
+ const __m128i sum = Sum3WHi32(src);
+ const __m128i sum3 = Sum3_32(sum, sum, sum);
+ return VaddwHi16(sum3, src[1]);
+}
+
+inline void Sum343W(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum343WLo(s);
+ dst[1] = Sum343WHi(s);
+}
+
+inline __m128i Sum565Lo(const __m128i src[3]) {
+ const __m128i sum = Sum3WLo16(src);
+ const __m128i sum4 = _mm_slli_epi16(sum, 2);
+ const __m128i sum5 = _mm_add_epi16(sum4, sum);
+ return VaddwLo8(sum5, src[1]);
+}
+
+inline __m128i Sum565Hi(const __m128i src[3]) {
+ const __m128i sum = Sum3WHi16(src);
+ const __m128i sum4 = _mm_slli_epi16(sum, 2);
+ const __m128i sum5 = _mm_add_epi16(sum4, sum);
+ return VaddwHi8(sum5, src[1]);
+}
+
+inline __m128i Sum565WLo(const __m128i src[3]) {
+ const __m128i sum = Sum3WLo32(src);
+ const __m128i sum4 = _mm_slli_epi32(sum, 2);
+ const __m128i sum5 = _mm_add_epi32(sum4, sum);
+ return VaddwLo16(sum5, src[1]);
+}
+
+inline __m128i Sum565WHi(const __m128i src[3]) {
+ const __m128i sum = Sum3WHi32(src);
+ const __m128i sum4 = _mm_slli_epi32(sum, 2);
+ const __m128i sum5 = _mm_add_epi32(sum4, sum);
+ return VaddwHi16(sum5, src[1]);
+}
+
+inline void Sum565W(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum565WLo(s);
+ dst[1] = Sum565WHi(s);
+}
+
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ int y = 2;
+ do {
+ __m128i s[2], sq[3];
+ s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass1 - width);
+ sq[0] = SquareLo8(s[0]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m128i row3[2], row5[2], row_sq3[2], row_sq5[2];
+ x -= 16;
+ src += 16;
+ s[1] = LoadUnaligned16Msan(src,
+ sum_width - x + kOverreadInBytesPass1 - width);
+ sq[1] = SquareHi8(s[0]);
+ sq[2] = SquareLo8(s[1]);
+ SumHorizontal<0>(s, &row3[0], &row3[1], &row5[0], &row5[1]);
+ StoreAligned32U16(sum3, row3);
+ StoreAligned32U16(sum5, row5);
+ SumHorizontal(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+ StoreAligned32U32(square_sum3 + 0, row_sq3);
+ StoreAligned32U32(square_sum5 + 0, row_sq5);
+ SumHorizontal(sq + 1, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+ StoreAligned32U32(square_sum3 + 8, row_sq3);
+ StoreAligned32U32(square_sum5 + 8, row_sq5);
+ s[0] = s[1];
+ sq[0] = sq[2];
+ sum3 += 16;
+ sum5 += 16;
+ square_sum3 += 16;
+ square_sum5 += 16;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sum3 += sum_stride - sum_width;
+ sum5 += sum_stride - sum_width;
+ square_sum3 += sum_stride - sum_width;
+ square_sum5 += sum_stride - sum_width;
+ } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sums,
+ uint32_t* square_sums) {
+ static_assert(size == 3 || size == 5, "");
+ constexpr int kOverreadInBytes =
+ (size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2;
+ int y = 2;
+ do {
+ __m128i s[2], sq[3];
+ s[0] = LoadUnaligned16Msan(src, kOverreadInBytes - width);
+ sq[0] = SquareLo8(s[0]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m128i row[2], row_sq[4];
+ x -= 16;
+ src += 16;
+ s[1] = LoadUnaligned16Msan(src, sum_width - x + kOverreadInBytes - width);
+ sq[1] = SquareHi8(s[0]);
+ sq[2] = SquareLo8(s[1]);
+ if (size == 3) {
+ Sum3Horizontal<0>(s, row);
+ Sum3WHorizontal(sq + 0, row_sq + 0);
+ Sum3WHorizontal(sq + 1, row_sq + 2);
+ } else {
+ Sum5Horizontal<0>(s, &row[0], &row[1]);
+ Sum5WHorizontal(sq + 0, row_sq + 0);
+ Sum5WHorizontal(sq + 1, row_sq + 2);
+ }
+ StoreAligned32U16(sums, row);
+ StoreAligned64U32(square_sums, row_sq);
+ s[0] = s[1];
+ sq[0] = sq[2];
+ sums += 16;
+ square_sums += 16;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sums += sum_stride - sum_width;
+ square_sums += sum_stride - sum_width;
+ } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m128i dxd = _mm_madd_epi16(sum, sum);
+  // _mm_mullo_epi32() has high latency. Use shifts and additions instead.
+ // Some compilers could do this for us but we make this explicit.
+ // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
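+  // 9 * x == (x << 3) + x; for n == 25 an extra (x << 4) is added since
+  // 25 * x == 9 * x + 16 * x.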
+ __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+ const __m128i sub = _mm_sub_epi32(axn, dxd);
+ const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+ const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
+ const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
+ const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+ const __m128i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+ return _mm_packus_epi32(z0, z1);
+}
+
+template <int n>
+inline __m128i CalculateB(const __m128i sum, const __m128i ma) {
+ static_assert(n == 9 || n == 25, "");
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
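+  // With kSgrProjReciprocalBits == 12, |one_over_n| is 164 for n == 25 and 455
+  // for n == 9.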
+ const __m128i m0 = VmullLo16(ma, sum);
+ const __m128i m1 = VmullHi16(ma, sum);
+ const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+ const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+ const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+ const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+ return _mm_packus_epi32(b_lo, b_hi);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n, int offset>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+ __m128i* const ma, __m128i* const b) {
+ static_assert(n == 9 || n == 25, "");
+ static_assert(offset == 0 || offset == 8, "");
+ const __m128i idx = _mm_packus_epi16(index, index);
+  // The values are not actually stored and reloaded; the compiler keeps them
+  // in a 64-bit general-purpose register, which is faster than using
+  // _mm_extract_epi8().
+ uint8_t temp[8];
+ StoreLo8(temp, idx);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7);
+ // b = ma * b * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ __m128i maq;
+ if (offset == 0) {
+ maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ } else {
+ maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+ }
+ *b = CalculateB<n>(sum, maq);
+}
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// to get value 0 as the shuffle result. The most significant bit 1 comes
+// either from the comparison instruction or from the sign bit of the index.
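+// For example, a control byte of 20 is greater than 15, so the comparison
+// supplies the high bit and _mm_shuffle_epi8() zeroes that lane; a control
+// byte of -6 (0xfa) already has its sign bit set and is zeroed the same way.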
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
+ __m128i mask;
+ mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
+ mask = _mm_or_si128(mask, index);
+ return _mm_shuffle_epi8(table, mask);
+}
+
+inline __m128i AdjustValue(const __m128i value, const __m128i index,
+ const int threshold) {
+ const __m128i thresholds = _mm_set1_epi8(threshold - 128);
+ const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
+ return _mm_add_epi8(value, offset);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+ __m128i* const ma, __m128i* const b0,
+ __m128i* const b1) {
+  // Use a table lookup to read elements whose indices are less than 48.
+ const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
+ const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
+ const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
+ const __m128i indices = _mm_packus_epi16(index[0], index[1]);
+ __m128i idx;
+  // Clip idx to 127 to apply signed comparison instructions.
+ idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
+  // Elements whose indices are larger than 47 are set to 0 by the shuffles;
+  // elements in [0, 47] are read from the table.
+ // Get shuffle results for indices in range [0, 15].
+ *ma = ShuffleIndex(c0, idx);
+ // Get shuffle results for indices in range [16, 31].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res1 = ShuffleIndex(c1, idx);
+ // Use OR instruction to combine shuffle results together.
+ *ma = _mm_or_si128(*ma, res1);
+ // Get shuffle results for indices in range [32, 47].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res2 = ShuffleIndex(c2, idx);
+ *ma = _mm_or_si128(*ma, res2);
+
+  // For elements whose indices are larger than 47, the values change only
+  // rarely as the index increases, so we use comparison and arithmetic
+  // operations to calculate them.
+  // Add -128 to apply signed comparison instructions.
+ idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
+ *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
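+  // _mm_cmpgt_epi8() yields -1 in lanes where the index exceeds the
+  // threshold, so each AdjustValue() call below decrements |ma| by 1 once the
+  // index passes that threshold, stepping the value from 5 down to 1 (and to
+  // 0 for index 255).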
+  *ma = AdjustValue(*ma, idx, 55);   // 55 is the last index whose value is 5.
+  *ma = AdjustValue(*ma, idx, 72);   // 72 is the last index whose value is 4.
+  *ma = AdjustValue(*ma, idx, 101);  // 101 is the last index whose value is 3.
+  *ma = AdjustValue(*ma, idx, 169);  // 169 is the last index whose value is 2.
+  *ma = AdjustValue(*ma, idx, 254);  // 254 is the last index whose value is 1.
+
+ // b = ma * b * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ *b0 = CalculateB<9>(sum[0], maq0);
+ const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+ *b1 = CalculateB<9>(sum[1], maq1);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+ __m128i ma[2], __m128i b[2]) {
+ __m128i mas;
+ CalculateIntermediate(sum, index, &mas, &b[0], &b[1]);
+ ma[0] = _mm_unpacklo_epi64(ma[0], mas);
+ ma[1] = _mm_srli_si128(mas, 8);
+}
+
+// Note: Replacing the slow LookupIntermediate() with CalculateIntermediate()
+// when calculating 16 intermediate data points has been tried, but the
+// compiler generated even slower code.
+template <int offset>
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i* const b) {
+ static_assert(offset == 0 || offset == 8, "");
+ __m128i sum, index;
+ CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+ LookupIntermediate<25, offset>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i* const b) {
+ __m128i sum, index;
+ CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+ LookupIntermediate<9, 0>(sum, index, ma, b);
+}
+
+inline void Store343_444(const __m128i b3[2], const ptrdiff_t x,
+ __m128i sum_b343[2], __m128i sum_b444[2],
+ uint32_t* const b343, uint32_t* const b444) {
+ __m128i b[3], sum_b111[2];
+ Prepare3_16(b3, b);
+ sum_b111[0] = Sum3WLo32(b);
+ sum_b111[1] = Sum3WHi32(b);
+ sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2);
+ sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2);
+ StoreAligned32U32(b444 + x, sum_b444);
+ sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]);
+ sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]);
+ sum_b343[0] = VaddwLo16(sum_b343[0], b[1]);
+ sum_b343[1] = VaddwHi16(sum_b343[1], b[1]);
+ StoreAligned32U32(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i* const sum_ma444, __m128i sum_b343[2],
+ __m128i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m128i sum_ma111 = Sum3WLo16(ma3);
+ *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+ StoreAligned16(ma444 + x, *sum_ma444);
+ const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+ StoreAligned16(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i* const sum_ma444, __m128i sum_b343[2],
+ __m128i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m128i sum_ma111 = Sum3WHi16(ma3);
+ *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+ StoreAligned16(ma444 + x, *sum_ma444);
+ const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+ StoreAligned16(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma444, sum_b444[2];
+ Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma444, sum_b444[2];
+ Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma343, sum_b343[2];
+ Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma343, sum_b343[2];
+ Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ const __m128i s[2][2], const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s5[2][5], sq5[5][2];
+ sq[0][1] = SquareHi8(s[0][0]);
+ sq[1][1] = SquareHi8(s[1][0]);
+ s5[0][3] = Sum5Horizontal(s[0][0]);
+ StoreAligned16(sum5[3], s5[0][3]);
+ s5[0][4] = Sum5Horizontal(s[1][0]);
+ StoreAligned16(sum5[4], s5[0][4]);
+ Sum5WHorizontal(sq[0], sq5[3]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ Sum5WHorizontal(sq[1], sq5[4]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x3U16(sum5, 0, s5[0]);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const __m128i s[2][2], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma[2],
+ __m128i b[3]) {
+ __m128i s5[2][5], sq5[5][2];
+ sq[0][2] = SquareLo8(s[0][1]);
+ sq[1][2] = SquareLo8(s[1][1]);
+ Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]);
+ StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+ Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]);
+ StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+ Sum5WHorizontal(sq[0] + 1, sq5[3]);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ Sum5WHorizontal(sq[1] + 1, sq5[4]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]);
+
+ sq[0][3] = SquareHi8(s[0][1]);
+ sq[1][3] = SquareHi8(s[1][1]);
+ Sum5WHorizontal(sq[0] + 2, sq5[3]);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ Sum5WHorizontal(sq[1] + 2, sq5[4]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ const __m128i s, const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s5[5], sq5[5][2];
+ sq[1] = SquareHi8(s);
+ s5[3] = s5[4] = Sum5Horizontal(s);
+ Sum5WHorizontal(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+ const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma[2],
+ __m128i b[3]) {
+ __m128i s5[2][5], sq5[5][2];
+ sq[2] = SquareLo8(s[1]);
+ Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]);
+ s5[0][4] = s5[0][3];
+ s5[1][4] = s5[1][3];
+ Sum5WHorizontal(sq + 1, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]);
+
+ sq[3] = SquareHi8(s[1]);
+ Sum5WHorizontal(sq + 2, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ const __m128i s, const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s3[3], sq3[3][2];
+ sq[1] = SquareHi8(s);
+ s3[2] = Sum3Horizontal(s);
+ StoreAligned16(sum3[2], s3[2]);
+ Sum3WHorizontal(sq, sq3[2]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const __m128i s[2], const ptrdiff_t x, const ptrdiff_t sum_width,
+ const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[4], __m128i ma[2],
+ __m128i b[3]) {
+ __m128i s3[4], sq3[3][2], sum[2], index[2];
+ sq[2] = SquareLo8(s[1]);
+ Sum3Horizontal<8>(s, s3 + 2);
+ StoreAligned32U16(sum3[2] + x, s3 + 2);
+ Sum3WHorizontal(sq + 1, sq3[2]);
+ StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]);
+ LoadAligned16x2U16(sum3, x, s3);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+ sq[3] = SquareHi8(s[1]);
+ Sum3WHorizontal(sq + 2, sq3[2]);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, ma, b + 1);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ const __m128i s[2][2], const uint16_t scales[2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma3[2][2],
+ __m128i b3[2][3], __m128i* const ma5, __m128i* const b5) {
+ __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
+ sq[0][1] = SquareHi8(s[0][0]);
+ sq[1][1] = SquareHi8(s[1][0]);
+ SumHorizontalLo(s[0][0], &s3[2], &s5[3]);
+ SumHorizontalLo(s[1][0], &s3[3], &s5[4]);
+ StoreAligned16(sum3[2], s3[2]);
+ StoreAligned16(sum3[3], s3[3]);
+ StoreAligned16(sum5[3], s5[3]);
+ StoreAligned16(sum5[4], s5[4]);
+ SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3], sq3[3]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
+ CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, &ma3[0][0], &b3[0][0], &b3[1][0]);
+ ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const __m128i s[2][2], const ptrdiff_t x, const uint16_t scales[2],
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, __m128i sq[2][4], __m128i ma3[2][2],
+ __m128i b3[2][3], __m128i ma5[2], __m128i b5[3]) {
+ __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2];
+ SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ StoreAligned16(sum3[2] + x + 0, s3[0][2]);
+ StoreAligned16(sum3[2] + x + 8, s3[1][2]);
+ StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+ SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+ StoreAligned16(sum3[3] + x + 0, s3[0][3]);
+ StoreAligned16(sum3[3] + x + 8, s3[1][3]);
+ StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+ sq[0][2] = SquareLo8(s[0][1]);
+ sq[1][2] = SquareLo8(s[1][1]);
+ SumHorizontal(sq[0] + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ SumHorizontal(sq[1] + 1, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3] + x, sq3[3]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]);
+ CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0],
+ &index[1][0]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[1]);
+
+ sq[0][3] = SquareHi8(s[0][1]);
+ sq[1][3] = SquareHi8(s[1][1]);
+ SumHorizontal(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ SumHorizontal(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]);
+ CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1],
+ &index[1][1]);
+ CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 1);
+ CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 1);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[2]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ const __m128i s, const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma3,
+ __m128i* const ma5, __m128i* const b3, __m128i* const b5) {
+ __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+ sq[1] = SquareHi8(s);
+ SumHorizontalLo(s, &s3[2], &s5[3]);
+ SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, 0, s5);
+ s5[4] = s5[3];
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+ const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma3[2],
+ __m128i ma5[2], __m128i b3[3], __m128i b5[3]) {
+ __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2];
+ sq[2] = SquareLo8(s[1]);
+ SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ SumHorizontal(sq + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ s5[0][4] = s5[0][3];
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 1);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
+
+ sq[3] = SquareHi8(s[1]);
+ SumHorizontal(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ s5[1][4] = s5[1][3];
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 2);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, ma3, b3 + 1);
+}
+
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+ const uint8_t* const src1, const int width,
+ const uint32_t scale,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* ma565,
+ uint32_t* b565) {
+ __m128i s[2][2], mas[2], sq[2][4], bs[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ __m128i ma5[3], ma[2], b[4];
+ s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+ bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[0] = Sum565Lo(ma5);
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned32U16(ma565, ma);
+ Sum565W(bs + 0, b + 0);
+ Sum565W(bs + 1, b + 2);
+ StoreAligned64U32(b565, b);
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+ const uint8_t* const src, const int width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+ uint32_t* b444) {
+ __m128i s[2], mas[2], sq[4], bs[3];
+ s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass2 - width);
+ sq[0] = SquareLo8(s[0]);
+ BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ s[1] = LoadUnaligned16Msan(src + x + 16,
+ x + 16 + kOverreadInBytesPass2 - width);
+ BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ __m128i ma3[3];
+ Prepare3_8<0>(mas, ma3);
+ if (calculate444) { // NOLINT(readability-simplify-boolean-expr)
+ Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444Hi(ma3, bs + 1, 8, ma343, ma444, b343, b444);
+ ma444 += 16;
+ b444 += 16;
+ } else {
+ __m128i ma[2], b[4];
+ ma[0] = Sum343Lo(ma3);
+ ma[1] = Sum343Hi(ma3);
+ StoreAligned32U16(ma343, ma);
+ Sum343W(bs + 0, b + 0);
+ Sum343W(bs + 1, b + 2);
+ StoreAligned64U32(b343, b);
+ }
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma343 += 16;
+ b343 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1, const int width,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[2], uint16_t* ma565, uint32_t* const b343[4],
+ uint32_t* const b444[2], uint32_t* b565) {
+ __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], &b5[0]);
+
+ int x = 0;
+ do {
+ __m128i ma[2], b[4], ma3x[3], ma5x[3];
+ s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sum_width, sq, ma3, b3, ma5, b5);
+
+ Prepare3_8<0>(ma3[0], ma3x);
+ ma[0] = Sum343Lo(ma3x);
+ ma[1] = Sum343Hi(ma3x);
+ StoreAligned32U16(ma343[0] + x, ma);
+ Sum343W(b3[0] + 0, b + 0);
+ Sum343W(b3[0] + 1, b + 2);
+ StoreAligned64U32(b343[0] + x, b);
+ Sum565W(b5 + 0, b + 0);
+ Sum565W(b5 + 1, b + 2);
+ StoreAligned64U32(b565, b);
+ Prepare3_8<0>(ma3[1], ma3x);
+ Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444[0], b343[1], b444[0]);
+ Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444[0], b343[1],
+ b444[0]);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[0] = Sum565Lo(ma5x);
+ ma[1] = Sum565Hi(ma5x);
+ StoreAligned32U16(ma565, ma);
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+template <int shift>
+inline __m128i FilterOutput(const __m128i ma_x_src, const __m128i b) {
+ // ma: 255 * 32 = 8160 (13 bits)
+ // b: 65088 * 32 = 2082816 (21 bits)
+ // v: b - ma * 255 (22 bits)
+ const __m128i v = _mm_sub_epi32(b, ma_x_src);
+ // kSgrProjSgrBits = 8
+ // kSgrProjRestoreBits = 4
+ // shift = 4 or 5
+ // v >> 8 or 9 (13 bits)
+ return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
+ const __m128i b[2]) {
+ const __m128i ma_x_src_lo = VmullLo16(ma, src);
+ const __m128i ma_x_src_hi = VmullHi16(ma, src);
+ const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+ const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+ return _mm_packs_epi32(dst_lo, dst_hi); // 13 bits
+}
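A scalar sketch (not part of this patch, helper name illustrative) of what FilterOutput and CalculateFilteredOutput compute per pixel, using the constants noted in the comments above (kSgrProjSgrBits = 8, kSgrProjRestoreBits = 4):

// Illustrative per-pixel equivalent of FilterOutput<shift>() followed by the
// saturating 16-bit pack in CalculateFilteredOutput<shift>().
inline int SgrFilterOutputScalar(int src, int ma, int b, int shift) {
  const int v = b - ma * src;             // fits in 22 bits per the comments
  const int total_shift = 8 + shift - 4;  // kSgrProjSgrBits + shift - kSgrProjRestoreBits
  // Rounding right shift, matching VrshrS32().
  return (v + (1 << (total_shift - 1))) >> total_shift;
}

As used below, shift is 5 in the Pass1/Pass2 helpers, where two or three rows of ma/b sums are added first, and 4 where a single row's 565 sums are used directly.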
+
+inline __m128i CalculateFilteredOutputPass1(const __m128i src, __m128i ma[2],
+ __m128i b[2][2]) {
+ const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
+ __m128i b_sum[2];
+ b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
+ b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i CalculateFilteredOutputPass2(const __m128i src, __m128i ma[3],
+ __m128i b[3][2]) {
+ const __m128i ma_sum = Sum3_16(ma);
+ __m128i b_sum[2];
+ Sum3_32(b, b_sum);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) {
+ const __m128i v_lo =
+ VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m128i v_hi =
+ VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m128i vv = _mm_packs_epi32(v_lo, v_hi);
+ return _mm_add_epi16(src, vv);
+}
+
+inline __m128i SelfGuidedDoubleMultiplier(const __m128i src,
+ const __m128i filter[2], const int w0,
+ const int w2) {
+ __m128i v[2];
+ const __m128i w0_w2 = _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+ const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]);
+ const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]);
+ v[0] = _mm_madd_epi16(w0_w2, f_lo);
+ v[1] = _mm_madd_epi16(w0_w2, f_hi);
+ return SelfGuidedFinal(src, v);
+}
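The w0/w2 packing above is built for _mm_madd_epi16: the low half of each 32-bit lane holds w0 and the high half holds w2, and the two filter outputs are interleaved the same way, so one madd yields w0 * filter0 + w2 * filter1 per pixel. A hedged scalar reading, assuming kSgrProjPrecisionBits is 7 as elsewhere in libgav1 and ignoring the intermediate 16-bit saturation done by _mm_packs_epi32:

// Illustrative scalar equivalent of SelfGuidedDoubleMultiplier() plus
// SelfGuidedFinal() for one pixel (hypothetical helper, not in this patch).
inline int SelfGuidedDoubleScalar(int src, int filter0, int filter1, int w0,
                                  int w2) {
  const int v = w0 * filter0 + w2 * filter1;  // what _mm_madd_epi16 produces
  const int shift = 4 + 7;  // kSgrProjRestoreBits + kSgrProjPrecisionBits
  return src + ((v + (1 << (shift - 1))) >> shift);
}

SelfGuidedSingleMultiplier below is the same idea with a single weight, using widening multiplies instead of madd.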
+
+inline __m128i SelfGuidedSingleMultiplier(const __m128i src,
+ const __m128i filter, const int w0) {
+ // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+ __m128i v[2];
+ v[0] = VmullNLo8(filter, w0);
+ v[1] = VmullNHi8(filter, w0);
+ return SelfGuidedFinal(src, v);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+ const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+ uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i s[2][2], mas[2], sq[2][4], bs[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ __m128i ma[2], ma3[3], b[2][2], sr[2], p[2];
+ s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+ bs);
+ Prepare3_8<0>(mas, ma3);
+ ma[1] = Sum565Lo(ma3);
+ StoreAligned16(ma565[1] + x, ma[1]);
+ Sum565W(bs, b[1]);
+ StoreAligned32U32(b565[1] + x, b[1]);
+ sr[0] = LoadAligned16(src + x);
+ sr[1] = LoadAligned16(src + stride + x);
+ const __m128i sr0_lo = _mm_unpacklo_epi8(sr[0], _mm_setzero_si128());
+ const __m128i sr1_lo = _mm_unpacklo_epi8(sr[1], _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma565[0] + x);
+ LoadAligned32U32(b565[0] + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+ p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+ const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
+ const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
+
+ ma[1] = Sum565Hi(ma3);
+ StoreAligned16(ma565[1] + x + 8, ma[1]);
+ Sum565W(bs + 1, b[1]);
+ StoreAligned32U32(b565[1] + x + 8, b[1]);
+ const __m128i sr0_hi = _mm_unpackhi_epi8(sr[0], _mm_setzero_si128());
+ const __m128i sr1_hi = _mm_unpackhi_epi8(sr[1], _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma565[0] + x + 8);
+ LoadAligned32U32(b565[0] + x + 8, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b);
+ p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]);
+ const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0);
+ StoreAligned16(dst + x, _mm_packus_epi16(d00, d01));
+ const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0);
+ StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11));
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+ uint32_t* b565, uint8_t* const dst) {
+ __m128i s[2], mas[2], sq[4], bs[3];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ sq[0] = SquareLo8(s[0]);
+ BoxFilterPreProcess5LastRowLo(s[0], scale, sum5, square_sum5, sq, &mas[0],
+ &bs[0]);
+
+ int x = 0;
+ do {
+ __m128i ma[2], ma5[3], b[2][2];
+ s[1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5,
+ sq, mas, bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ Sum565W(bs, b[1]);
+ ma[0] = LoadAligned16(ma565);
+ LoadAligned32U32(b565, b[0]);
+ const __m128i sr = LoadAligned16(src + x);
+ const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
+ __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0);
+
+ ma[1] = Sum565Hi(ma5);
+ Sum565W(bs + 1, b[1]);
+ ma[0] = LoadAligned16(ma565 + 8);
+ LoadAligned32U32(b565 + 8, b[0]);
+ const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
+ p = CalculateFilteredOutputPass1(sr_hi, ma, b);
+ const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+ uint32_t* const b444[2], uint8_t* const dst) {
+ __m128i s[2], mas[2], sq[4], bs[3];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass2 - width);
+ sq[0] = SquareLo8(s[0]);
+ BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ s[1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass2 - width);
+ BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ __m128i ma[3], b[3][2], ma3[3];
+ Prepare3_8<0>(mas, ma3);
+ Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2],
+ b444[1]);
+ const __m128i sr = LoadAligned16(src + x);
+ const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma343[0] + x);
+ ma[1] = LoadAligned16(ma444[0] + x);
+ LoadAligned32U32(b343[0] + x, b[0]);
+ LoadAligned32U32(b444[0] + x, b[1]);
+ const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+
+ Store343_444Hi(ma3, bs + 1, x + 8, &ma[2], b[2], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma343[0] + x + 8);
+ ma[1] = LoadAligned16(ma444[0] + x + 8);
+ LoadAligned32U32(b343[0] + x + 8, b[0]);
+ LoadAligned32U32(b444[0] + x + 8, b[1]);
+ const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b);
+ const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+ uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], &b5[0]);
+
+ int x = 0;
+ do {
+ __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
+ s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sum_width, sq, ma3, b3, ma5, b5);
+ Prepare3_8<0>(ma3[0], ma3x[0]);
+ Prepare3_8<0>(ma3[1], ma3x[1]);
+ Prepare3_8<0>(ma5, ma5x);
+ Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+ ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+ b343[3], b444[2]);
+ ma[0][1] = Sum565Lo(ma5x);
+ StoreAligned16(ma565[1] + x, ma[0][1]);
+ Sum565W(b5, b[0][1]);
+ StoreAligned32U32(b565[1] + x, b[0][1]);
+ const __m128i sr0 = LoadAligned16(src + x);
+ const __m128i sr1 = LoadAligned16(src + stride + x);
+ const __m128i sr0_lo = _mm_unpacklo_epi8(sr0, _mm_setzero_si128());
+ const __m128i sr1_lo = _mm_unpacklo_epi8(sr1, _mm_setzero_si128());
+ ma[0][0] = LoadAligned16(ma565[0] + x);
+ LoadAligned32U32(b565[0] + x, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned16(ma343[0] + x);
+ ma[1][1] = LoadAligned16(ma444[0] + x);
+ LoadAligned32U32(b343[0] + x, b[1][0]);
+ LoadAligned32U32(b444[0] + x, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+ const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+ ma[2][0] = LoadAligned16(ma343[1] + x);
+ LoadAligned32U32(b343[1] + x, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+ const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+ Store343_444Hi(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], b[1][2],
+ b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Hi(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], b[2][2], ma343[3],
+ ma444[2], b343[3], b444[2]);
+ ma[0][1] = Sum565Hi(ma5x);
+ StoreAligned16(ma565[1] + x + 8, ma[0][1]);
+ Sum565W(b5 + 1, b[0][1]);
+ StoreAligned32U32(b565[1] + x + 8, b[0][1]);
+ const __m128i sr0_hi = _mm_unpackhi_epi8(sr0, _mm_setzero_si128());
+ const __m128i sr1_hi = _mm_unpackhi_epi8(sr1, _mm_setzero_si128());
+ ma[0][0] = LoadAligned16(ma565[0] + x + 8);
+ LoadAligned32U32(b565[0] + x + 8, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned16(ma343[0] + x + 8);
+ ma[1][1] = LoadAligned16(ma444[0] + x + 8);
+ LoadAligned32U32(b343[0] + x + 8, b[1][0]);
+ LoadAligned32U32(b444[0] + x + 8, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]);
+ const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+ StoreAligned16(dst + x, _mm_packus_epi16(d00, d01));
+ ma[2][0] = LoadAligned16(ma343[1] + x + 8);
+ LoadAligned32U32(b343[1] + x + 8, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]);
+ const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+ StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11));
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+ const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343[4], uint16_t* const ma444[3],
+ uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
+ uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i s[2], ma3[2], ma5[2], sq[4], b3[3], b5[3], ma[3], b[3][2];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ sq[0] = SquareLo8(s[0]);
+ BoxFilterPreProcessLastRowLo(s[0], scales, sum3, sum5, square_sum3,
+ square_sum5, sq, &ma3[0], &ma5[0], &b3[0],
+ &b5[0]);
+
+ int x = 0;
+ do {
+ __m128i ma3x[3], ma5x[3], p[2];
+ s[1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5,
+ square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+ Prepare3_8<0>(ma3, ma3x);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[1] = Sum565Lo(ma5x);
+ Sum565W(b5, b[1]);
+ ma[2] = Sum343Lo(ma3x);
+ Sum343W(b3, b[2]);
+ const __m128i sr = LoadAligned16(src + x);
+ const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma565[0] + x);
+ LoadAligned32U32(b565[0] + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[0] = LoadAligned16(ma343[0] + x);
+ ma[1] = LoadAligned16(ma444[0] + x);
+ LoadAligned32U32(b343[0] + x, b[0]);
+ LoadAligned32U32(b444[0] + x, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+ ma[1] = Sum565Hi(ma5x);
+ Sum565W(b5 + 1, b[1]);
+ ma[2] = Sum343Hi(ma3x);
+ Sum343W(b3 + 1, b[2]);
+ const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma565[0] + x + 8);
+ LoadAligned32U32(b565[0] + x + 8, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
+ ma[0] = LoadAligned16(ma343[0] + x + 8);
+ ma[1] = LoadAligned16(ma444[0] + x + 8);
+ LoadAligned32U32(b343[0] + x + 8, b[0]);
+ LoadAligned32U32(b444[0] + x + 8, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
+ const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ ma3[0] = ma3[1];
+ ma5[0] = ma5[1];
+ b3[0] = b3[2];
+ b5[0] = b5[2];
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+ const RestorationUnitInfo& restoration_info, const uint8_t* src,
+ const uint8_t* const top_border, const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum(top_border, stride, width, sum_stride, sum_width, sum3[0], sum5[1],
+ square_sum3[0], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+ square_sum5, sum_width, ma343, ma444, ma565[0], b343,
+ b444, b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+ square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+ b444, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxFilterLastRow(src + 3, bottom_border + stride, width, sum_width, scales,
+ w0, w2, sum3, sum5, square_sum3, square_sum5, ma343, ma444,
+ ma565, b343, b444, b565, dst);
+ }
+}
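The row buffers set up above are managed as small ring buffers of row pointers. Circulate4PointersBy2 and Circulate5PointersBy2 come from libgav1's common utilities and, judging from their use here, rotate the pointer array left by two so the two oldest rows are recycled for the next pair of incoming rows. A hypothetical generic version, shown only to illustrate the assumed behavior:

// Rotate an array of N row pointers left by two; the two oldest rows move to
// the end so their storage is reused (sketch of the assumed behavior, not the
// actual libgav1 helper).
template <typename T, int N>
inline void CirculatePointersBy2(T* (&p)[N]) {
  T* const t0 = p[0];
  T* const t1 = p[1];
  for (int i = 0; i < N - 2; ++i) p[i] = p[i + 2];
  p[N - 2] = t0;
  p[N - 1] = t1;
}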
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src,
+ const uint8_t* const top_border,
+ const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height, SgrBuffer* const sgr_buffer,
+ uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<5>(top_border, stride, width, sum_stride, sum_width, sum5[1],
+ square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+ ma565[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+ square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+ sum_width, scale, w0, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxFilterPass1LastRow(src, bottom_border + stride, width, sum_width, scale,
+ w0, sum5, square_sum5, ma565[0], b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src,
+ const uint8_t* const top_border,
+ const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height, SgrBuffer* const sgr_buffer,
+ uint8_t* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<3>(top_border, stride, width, sum_stride, sum_width, sum3[0],
+ square_sum3[0]);
+ BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+ sum_width, ma343[0], nullptr, b343[0],
+ nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const uint8_t* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += stride;
+ }
+ BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+ ma343[1], ma444[0], b343[1], b444[0]);
+
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ int y = std::min(height, 2);
+ src += 2;
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
+
+// If |width| is not a multiple of 16, up to 15 more pixels are written to
+// |dest| at the end of each row, since each loop iteration above processes 16
+// pixels. It is safe to overwrite the output as it will not be part of the
+// visible frame.
+void SelfGuidedFilter_SSE4_1(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const void* const top_border, const void* const bottom_border,
+ const ptrdiff_t stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* top = static_cast<const uint8_t*>(top_border);
+ const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+ auto* const dst = static_cast<uint8_t*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3,
+ stride, width, height, sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2,
+ stride, width, height, sgr_buffer, dst);
+ } else {
+ BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride,
+ width, height, sgr_buffer, dst);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_8BPP_SSE4_1(WienerFilter)
+ dsp->loop_restorations[0] = WienerFilter_SSE4_1;
+#else
+ static_cast<void>(WienerFilter_SSE4_1);
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(SelfGuidedFilter)
+ dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1;
+#else
+ static_cast<void>(SelfGuidedFilter_SSE4_1);
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void LoopRestorationInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/loop_restoration_sse4.h b/src/dsp/x86/loop_restoration_sse4.h
new file mode 100644
index 0000000..65b2b11
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_sse4.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_restorations, see the defines below for specifics.
+// These functions are not thread-safe.
+void LoopRestorationInit_SSE4_1();
+void LoopRestorationInit10bpp_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_WienerFilter
+#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
diff --git a/src/dsp/x86/mask_blend_sse4.cc b/src/dsp/x86/mask_blend_sse4.cc
new file mode 100644
index 0000000..d8036be
--- /dev/null
+++ b/src/dsp/x86/mask_blend_sse4.cc
@@ -0,0 +1,447 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/mask_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Width can only be 4 when it is subsampled from a block of width 8, hence
+// subsampling_x is always 1 when this function is called.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) {
+ if (subsampling_x == 1) {
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask));
+ const __m128i mask_val_1 =
+ _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y)));
+ __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+ if (subsampling_y == 1) {
+ const __m128i next_mask_val_0 =
+ _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride));
+ const __m128i next_mask_val_1 =
+ _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride * 3));
+ subsampled_mask = _mm_add_epi16(
+ subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
+ }
+ return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
+ }
+ const __m128i mask_val_0 = Load4(mask);
+ const __m128i mask_val_1 = Load4(mask + mask_stride);
+ return _mm_cvtepu8_epi16(
+ _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
+}
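When the mask is subsampled, GetMask4x2 (and GetMask8 below) average a 2x1 or 2x2 group of mask bytes with rounding: the hadd adds horizontal pairs, the optional second row is added on top, and RightShiftWithRounding_U16 divides by 2 or 4. A scalar sketch for one output element (hypothetical helper, not in this patch):

// Average one 2x1 (or 2x2 when subsampling_y == 1) group of mask values with
// rounding, as the intrinsics above do for several outputs at a time.
inline int SubsampleMaskValue(const uint8_t* mask, ptrdiff_t stride,
                              int output_x, int subsampling_y) {
  int sum = mask[2 * output_x] + mask[2 * output_x + 1];
  if (subsampling_y == 1) {
    sum += mask[stride + 2 * output_x] + mask[stride + 2 * output_x + 1];
  }
  const int shift = 1 + subsampling_y;         // divide by 2 or 4
  return (sum + (1 << (shift - 1))) >> shift;  // RightShiftWithRounding
}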
+
+// This function returns a 16-bit packed mask to fit in _mm_madd_epi16.
+// 16 bits is also the narrowest width for which a horizontal add (hadd)
+// exists, but without subsampling an extra widening conversion from 8 bits is
+// required.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask8(const uint8_t* mask, ptrdiff_t stride) {
+ if (subsampling_x == 1) {
+ const __m128i row_vals = LoadUnaligned16(mask);
+
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+ const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+ __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+
+ if (subsampling_y == 1) {
+ const __m128i next_row_vals = LoadUnaligned16(mask + stride);
+ const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals);
+ const __m128i next_mask_val_1 =
+ _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8));
+ subsampled_mask = _mm_add_epi16(
+ subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
+ }
+ return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ const __m128i mask_val = LoadLo8(mask);
+ return _mm_cvtepu8_epi16(mask_val);
+}
+
+// This version returns 8-bit packed values to fit in _mm_maddubs_epi16 because,
+// when is_inter_intra is true, the prediction values are brought to 8-bit
+// packing as well.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetInterIntraMask8(const uint8_t* mask, ptrdiff_t stride) {
+ if (subsampling_x == 1) {
+ const __m128i row_vals = LoadUnaligned16(mask);
+
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+ const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+ __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+
+ if (subsampling_y == 1) {
+ const __m128i next_row_vals = LoadUnaligned16(mask + stride);
+ const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals);
+ const __m128i next_mask_val_1 =
+ _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8));
+ subsampled_mask = _mm_add_epi16(
+ subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
+ }
+ const __m128i ret =
+ RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
+ return _mm_packus_epi16(ret, ret);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ // Unfortunately there is no shift operation for 8-bit packing, or else we
+ // could return everything with 8-bit packing.
+ const __m128i mask_val = LoadLo8(mask);
+ return mask_val;
+}
+
+inline void WriteMaskBlendLine4x2(const int16_t* const pred_0,
+ const int16_t* const pred_1,
+ const __m128i pred_mask_0,
+ const __m128i pred_mask_1, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i pred_val_0 = LoadAligned16(pred_0);
+ const __m128i pred_val_1 = LoadAligned16(pred_1);
+ const __m128i mask_lo = _mm_unpacklo_epi16(pred_mask_0, pred_mask_1);
+ const __m128i mask_hi = _mm_unpackhi_epi16(pred_mask_0, pred_mask_1);
+ const __m128i pred_lo = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+ const __m128i pred_hi = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+
+ // int res = (mask_value * prediction_0[x] +
+ // (64 - mask_value) * prediction_1[x]) >> 6;
+ const __m128i compound_pred_lo = _mm_madd_epi16(pred_lo, mask_lo);
+ const __m128i compound_pred_hi = _mm_madd_epi16(pred_hi, mask_hi);
+ const __m128i compound_pred = _mm_packus_epi32(
+ _mm_srli_epi32(compound_pred_lo, 6), _mm_srli_epi32(compound_pred_hi, 6));
+
+ // dst[x] = static_cast<Pixel>(
+ // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+ // (1 << kBitdepth8) - 1));
+ const __m128i result = RightShiftWithRounding_S16(compound_pred, 4);
+ const __m128i res = _mm_packus_epi16(result, result);
+ Store4(dst, res);
+ Store4(dst + dst_stride, _mm_srli_si128(res, 4));
+}
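The interleaving above lets a single _mm_madd_epi16 evaluate the commented formula: each 32-bit lane receives mask * pred_0 + (64 - mask) * pred_1. A per-pixel scalar sketch of the whole line, ignoring the intermediate 16-bit saturation of _mm_packus_epi32 and writing out the final clamp that _mm_packus_epi16 provides (hypothetical helper, assumes <algorithm> and <cstdint>):

// Scalar equivalent of one blended pixel in WriteMaskBlendLine4x2().
inline uint8_t MaskBlendPixel(int pred_0, int pred_1, int mask_value) {
  const int res = (mask_value * pred_0 + (64 - mask_value) * pred_1) >> 6;
  const int rounded = (res + 8) >> 4;  // RightShiftWithRounding(res, 4)
  return static_cast<uint8_t>(std::min(std::max(rounded, 0), 255));
}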
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4x4_SSE4(const int16_t* pred_0, const int16_t* pred_1,
+ const uint8_t* mask,
+ const ptrdiff_t mask_stride, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i mask_inverter = _mm_set1_epi16(64);
+ __m128i pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4xH_SSE4(const int16_t* pred_0, const int16_t* pred_1,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ const uint8_t* mask = mask_ptr;
+ if (height == 4) {
+ MaskBlending4x4_SSE4<subsampling_x, subsampling_y>(
+ pred_0, pred_1, mask, mask_stride, dst, dst_stride);
+ return;
+ }
+ const __m128i mask_inverter = _mm_set1_epi16(64);
+ int y = 0;
+ do {
+ __m128i pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+ y += 8;
+ } while (y < height);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend_SSE4(const void* prediction_0, const void* prediction_1,
+ const ptrdiff_t /*prediction_stride_1*/,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride, const int width,
+ const int height, void* dest,
+ const ptrdiff_t dst_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ const ptrdiff_t pred_stride_0 = width;
+ const ptrdiff_t pred_stride_1 = width;
+ if (width == 4) {
+ MaskBlending4xH_SSE4<subsampling_x, subsampling_y>(
+ pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride);
+ return;
+ }
+ const uint8_t* mask = mask_ptr;
+ const __m128i mask_inverter = _mm_set1_epi16(64);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride);
+ // 64 - mask
+ const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ const __m128i mask_lo = _mm_unpacklo_epi16(pred_mask_0, pred_mask_1);
+ const __m128i mask_hi = _mm_unpackhi_epi16(pred_mask_0, pred_mask_1);
+
+ const __m128i pred_val_0 = LoadAligned16(pred_0 + x);
+ const __m128i pred_val_1 = LoadAligned16(pred_1 + x);
+ const __m128i pred_lo = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+ const __m128i pred_hi = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+ // int res = (mask_value * prediction_0[x] +
+ // (64 - mask_value) * prediction_1[x]) >> 6;
+ const __m128i compound_pred_lo = _mm_madd_epi16(pred_lo, mask_lo);
+ const __m128i compound_pred_hi = _mm_madd_epi16(pred_hi, mask_hi);
+
+ const __m128i res = _mm_packus_epi32(_mm_srli_epi32(compound_pred_lo, 6),
+ _mm_srli_epi32(compound_pred_hi, 6));
+ // dst[x] = static_cast<Pixel>(
+ // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+ // (1 << kBitdepth8) - 1));
+ const __m128i result = RightShiftWithRounding_S16(res, 4);
+ StoreLo8(dst + x, _mm_packus_epi16(result, result));
+
+ x += 8;
+ } while (x < width);
+ dst += dst_stride;
+ pred_0 += pred_stride_0;
+ pred_1 += pred_stride_1;
+ mask += mask_stride << subsampling_y;
+ } while (++y < height);
+}
+
+inline void InterIntraWriteMaskBlendLine8bpp4x2(const uint8_t* const pred_0,
+ uint8_t* const pred_1,
+ const ptrdiff_t pred_stride_1,
+ const __m128i pred_mask_0,
+ const __m128i pred_mask_1) {
+ const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);
+
+ const __m128i pred_val_0 = LoadLo8(pred_0);
+ // TODO(b/150326556): One load.
+ __m128i pred_val_1 = Load4(pred_1);
+ pred_val_1 = _mm_or_si128(_mm_slli_si128(Load4(pred_1 + pred_stride_1), 4),
+ pred_val_1);
+ const __m128i pred = _mm_unpacklo_epi8(pred_val_0, pred_val_1);
+ // int res = (mask_value * prediction_1[x] +
+ // (64 - mask_value) * prediction_0[x]) >> 6;
+ const __m128i compound_pred = _mm_maddubs_epi16(pred, pred_mask);
+ const __m128i result = RightShiftWithRounding_U16(compound_pred, 6);
+ const __m128i res = _mm_packus_epi16(result, result);
+
+ Store4(pred_1, res);
+ Store4(pred_1 + pred_stride_1, _mm_srli_si128(res, 4));
+}
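In the inter-intra path the predictions stay 8-bit, so _mm_maddubs_epi16 performs the weighted sum in one step; as the comment notes, the mask weights prediction_1 and (64 - mask) weights prediction_0, and the result only needs a rounded shift by 6. A scalar reading (hypothetical helper, not in this patch):

// Scalar equivalent of one blended pixel of
// InterIntraWriteMaskBlendLine8bpp4x2().
inline uint8_t InterIntraBlendPixel(uint8_t pred_0, uint8_t pred_1,
                                    int mask_value) {
  const int res = mask_value * pred_1 + (64 - mask_value) * pred_0;
  return static_cast<uint8_t>((res + 32) >> 6);  // RightShiftWithRounding_U16(res, 6)
}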
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4x4_SSE4(const uint8_t* pred_0,
+ uint8_t* pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8_t* mask,
+ const ptrdiff_t mask_stride) {
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const __m128i pred_mask_u16_first =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ mask += mask_stride << (1 + subsampling_y);
+ const __m128i pred_mask_u16_second =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ mask += mask_stride << (1 + subsampling_y);
+ __m128i pred_mask_1 =
+ _mm_packus_epi16(pred_mask_u16_first, pred_mask_u16_second);
+ __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
+ InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1);
+ pred_0 += 4 << 1;
+ pred_1 += pred_stride_1 << 1;
+
+ pred_mask_1 = _mm_srli_si128(pred_mask_1, 8);
+ pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
+ InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4xH_SSE4(const uint8_t* pred_0,
+ uint8_t* pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride,
+ const int height) {
+ const uint8_t* mask = mask_ptr;
+ if (height == 4) {
+ InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride);
+ return;
+ }
+ int y = 0;
+ do {
+ InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride);
+ pred_0 += 4 << 2;
+ pred_1 += pred_stride_1 << 2;
+ mask += mask_stride << (2 + subsampling_y);
+
+ InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride);
+ pred_0 += 4 << 2;
+ pred_1 += pred_stride_1 << 2;
+ mask += mask_stride << (2 + subsampling_y);
+ y += 8;
+ } while (y < height);
+}
+
+template <int subsampling_x, int subsampling_y>
+void InterIntraMaskBlend8bpp_SSE4(const uint8_t* prediction_0,
+ uint8_t* prediction_1,
+ const ptrdiff_t prediction_stride_1,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride, const int width,
+ const int height) {
+ if (width == 4) {
+ InterIntraMaskBlending8bpp4xH_SSE4<subsampling_x, subsampling_y>(
+ prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
+ height);
+ return;
+ }
+ const uint8_t* mask = mask_ptr;
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const __m128i pred_mask_1 =
+ GetInterIntraMask8<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride);
+ // 64 - mask
+ const __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
+ const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);
+
+ const __m128i pred_val_0 = LoadLo8(prediction_0 + x);
+ const __m128i pred_val_1 = LoadLo8(prediction_1 + x);
+ const __m128i pred = _mm_unpacklo_epi8(pred_val_0, pred_val_1);
+ // int res = (mask_value * prediction_1[x] +
+ // (64 - mask_value) * prediction_0[x]) >> 6;
+ const __m128i compound_pred = _mm_maddubs_epi16(pred, pred_mask);
+ const __m128i result = RightShiftWithRounding_U16(compound_pred, 6);
+ const __m128i res = _mm_packus_epi16(result, result);
+
+ StoreLo8(prediction_1 + x, res);
+
+ x += 8;
+ } while (x < width);
+ prediction_0 += width;
+ prediction_1 += prediction_stride_1;
+ mask += mask_stride << subsampling_y;
+ } while (++y < height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend444)
+ dsp->mask_blend[0][0] = MaskBlend_SSE4<0, 0>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend422)
+ dsp->mask_blend[1][0] = MaskBlend_SSE4<1, 0>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend420)
+ dsp->mask_blend[2][0] = MaskBlend_SSE4<1, 1>;
+#endif
+ // The is_inter_intra index of mask_blend[][] is replaced by
+ // inter_intra_mask_blend_8bpp[] in 8-bit.
+#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp444)
+ dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4<0, 0>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp422)
+ dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4<1, 0>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp420)
+ dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4<1, 1>;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void MaskBlendInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void MaskBlendInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/mask_blend_sse4.h b/src/dsp/x86/mask_blend_sse4.h
new file mode 100644
index 0000000..52b0b5c
--- /dev/null
+++ b/src/dsp/x86/mask_blend_sse4.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mask_blend and Dsp::inter_intra_mask_blend_8bpp. This
+// function is not thread-safe.
+void MaskBlendInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend444
+#define LIBGAV1_Dsp8bpp_MaskBlend444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend422
+#define LIBGAV1_Dsp8bpp_MaskBlend422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend420
+#define LIBGAV1_Dsp8bpp_MaskBlend420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_
diff --git a/src/dsp/x86/motion_field_projection_sse4.cc b/src/dsp/x86/motion_field_projection_sse4.cc
new file mode 100644
index 0000000..c506941
--- /dev/null
+++ b/src/dsp/x86/motion_field_projection_sse4.cc
@@ -0,0 +1,397 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline __m128i LoadDivision(const __m128i division_table,
+ const __m128i reference_offset) {
+ const __m128i kOne = _mm_set1_epi16(0x0100);
+ const __m128i t = _mm_add_epi8(reference_offset, reference_offset);
+ const __m128i tt = _mm_unpacklo_epi8(t, t);
+ const __m128i idx = _mm_add_epi8(tt, kOne);
+ return _mm_shuffle_epi8(division_table, idx);
+}
+
+inline __m128i MvProjection(const __m128i mv, const __m128i denominator,
+ const int numerator) {
+ const __m128i m0 = _mm_madd_epi16(mv, denominator);
+ const __m128i m = _mm_mullo_epi32(m0, _mm_set1_epi32(numerator));
+ // Add the sign (0 or -1) to round towards zero.
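+  // For example, m = -8192 becomes (-8192 - 1 + 8192) >> 14 = -1, mirroring
+  // m = 8192 -> (8192 + 0 + 8192) >> 14 = 1, so positive and negative values
+  // round symmetrically.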
+ const __m128i sign = _mm_srai_epi32(m, 31);
+ const __m128i add_sign = _mm_add_epi32(m, sign);
+ const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13));
+ return _mm_srai_epi32(sum, 14);
+}
+
+inline __m128i MvProjectionClip(const __m128i mv, const __m128i denominator,
+ const int numerator) {
+ const __m128i mv0 = _mm_unpacklo_epi16(mv, _mm_setzero_si128());
+ const __m128i mv1 = _mm_unpackhi_epi16(mv, _mm_setzero_si128());
+ const __m128i denorm0 = _mm_unpacklo_epi16(denominator, _mm_setzero_si128());
+ const __m128i denorm1 = _mm_unpackhi_epi16(denominator, _mm_setzero_si128());
+ const __m128i s0 = MvProjection(mv0, denorm0, numerator);
+ const __m128i s1 = MvProjection(mv1, denorm1, numerator);
+ const __m128i projection = _mm_packs_epi32(s0, s1);
+ const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp);
+ const __m128i projection_mv_clamp_negative =
+ _mm_set1_epi16(-kProjectionMvClamp);
+ const __m128i clamp = _mm_min_epi16(projection, projection_mv_clamp);
+ return _mm_max_epi16(clamp, projection_mv_clamp_negative);
+}
+
+inline __m128i Project_SSE4_1(const __m128i delta, const __m128i dst_sign) {
+ // Add 63 to negative delta so that it shifts towards zero.
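+  // delta_sign is 0 or 0xffff in each lane, so the logical shift by 10 gives
+  // 0 or 63. Adding that before the arithmetic shift by 6 makes the division
+  // truncate toward zero, and the xor/subtract with dst_sign below
+  // conditionally negates the result when dst_sign is -1.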
+ const __m128i delta_sign = _mm_srai_epi16(delta, 15);
+ const __m128i delta_sign_63 = _mm_srli_epi16(delta_sign, 10);
+ const __m128i delta_adjust = _mm_add_epi16(delta, delta_sign_63);
+ const __m128i offset0 = _mm_srai_epi16(delta_adjust, 6);
+ const __m128i offset1 = _mm_xor_si128(offset0, dst_sign);
+ return _mm_sub_epi16(offset1, dst_sign);
+}
+
+inline void GetPosition(
+ const __m128i division_table, const MotionVector* const mv,
+ const int numerator, const int x8_start, const int x8_end, const int x8,
+ const __m128i& r_offsets, const __m128i& source_reference_type8,
+ const __m128i& skip_r, const __m128i& y8_floor8, const __m128i& y8_ceiling8,
+ const __m128i& d_sign, const int delta, __m128i* const r,
+ __m128i* const position_xy, int64_t* const skip_64, __m128i mvs[2]) {
+ const auto* const mv_int = reinterpret_cast<const int32_t*>(mv + x8);
+ *r = _mm_shuffle_epi8(r_offsets, source_reference_type8);
+ const __m128i denorm = LoadDivision(division_table, source_reference_type8);
+ __m128i projection_mv[2];
+ mvs[0] = LoadUnaligned16(mv_int + 0);
+ mvs[1] = LoadUnaligned16(mv_int + 4);
+  // Deinterleave the x and y components.
+ const __m128i kShuffle =
+ _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+ const __m128i mv0 = _mm_shuffle_epi8(mvs[0], kShuffle);
+ const __m128i mv1 = _mm_shuffle_epi8(mvs[1], kShuffle);
+ const __m128i mv_y = _mm_unpacklo_epi64(mv0, mv1);
+ const __m128i mv_x = _mm_unpackhi_epi64(mv0, mv1);
+ // numerator could be 0.
+ projection_mv[0] = MvProjectionClip(mv_y, denorm, numerator);
+ projection_mv[1] = MvProjectionClip(mv_x, denorm, numerator);
+ // Do not update the motion vector if the block position is not valid or
+ // if position_x8 is outside the current range of x8_start and x8_end.
+ // Note that position_y8 will always be within the range of y8_start and
+ // y8_end.
+ // After subtracting the base, valid projections are within 8-bit.
+ const __m128i position_y = Project_SSE4_1(projection_mv[0], d_sign);
+ const __m128i position_x = Project_SSE4_1(projection_mv[1], d_sign);
+ const __m128i positions = _mm_packs_epi16(position_x, position_y);
+ const __m128i k01234567 =
+ _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0);
+ *position_xy = _mm_add_epi8(positions, k01234567);
+ const int x8_floor = std::max(
+ x8_start - x8, delta - kProjectionMvMaxHorizontalOffset); // [-8, 8]
+ const int x8_ceiling =
+ std::min(x8_end - x8, delta + 8 + kProjectionMvMaxHorizontalOffset) -
+ 1; // [-1, 15]
+ const __m128i x8_floor8 = _mm_set1_epi8(x8_floor);
+ const __m128i x8_ceiling8 = _mm_set1_epi8(x8_ceiling);
+ const __m128i floor_xy = _mm_unpacklo_epi64(x8_floor8, y8_floor8);
+ const __m128i ceiling_xy = _mm_unpacklo_epi64(x8_ceiling8, y8_ceiling8);
+ const __m128i underflow = _mm_cmplt_epi8(*position_xy, floor_xy);
+ const __m128i overflow = _mm_cmpgt_epi8(*position_xy, ceiling_xy);
+ const __m128i out = _mm_or_si128(underflow, overflow);
+ const __m128i skip_low = _mm_or_si128(skip_r, out);
+ const __m128i skip = _mm_or_si128(skip_low, _mm_srli_si128(out, 8));
+ StoreLo8(skip_64, skip);
+}
+
+template <int idx>
+inline void Store(const __m128i position, const __m128i reference_offset,
+ const __m128i mv, int8_t* dst_reference_offset,
+ MotionVector* dst_mv) {
+ const ptrdiff_t offset =
+ static_cast<int16_t>(_mm_extract_epi16(position, idx));
+ if ((idx & 3) == 0) {
+ dst_mv[offset].mv32 = _mm_cvtsi128_si32(mv);
+ } else {
+ dst_mv[offset].mv32 = _mm_extract_epi32(mv, idx & 3);
+ }
+ dst_reference_offset[offset] = _mm_extract_epi8(reference_offset, idx);
+}
+
+template <int idx>
+inline void CheckStore(const int8_t* skips, const __m128i position,
+ const __m128i reference_offset, const __m128i mv,
+ int8_t* dst_reference_offset, MotionVector* dst_mv) {
+ if (skips[idx] == 0) {
+ Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv);
+ }
+}
+
+// 7.9.2.
+void MotionFieldProjectionKernel_SSE4_1(
+ const ReferenceInfo& reference_info,
+ const int reference_to_current_with_sign, const int dst_sign,
+ const int y8_start, const int y8_end, const int x8_start, const int x8_end,
+ TemporalMotionField* const motion_field) {
+ const ptrdiff_t stride = motion_field->mv.columns();
+ // The column range has to be offset by kProjectionMvMaxHorizontalOffset since
+ // coordinates in that range could end up being position_x8 because of
+ // projection.
+ const int adjusted_x8_start =
+ std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
+ const int adjusted_x8_end = std::min(
+ x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
+ const int adjusted_x8_end8 = adjusted_x8_end & ~7;
+ const int leftover = adjusted_x8_end - adjusted_x8_end8;
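+  // adjusted_x8_end8 is adjusted_x8_end rounded down to a multiple of 8. The
+  // main loop below handles [adjusted_x8_start, adjusted_x8_end8) with full
+  // 8-wide vectors; the remaining |leftover| (0 to 7) columns are handled
+  // after it.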
+ const int8_t* const reference_offsets =
+ reference_info.relative_distance_to.data();
+ const bool* const skip_references = reference_info.skip_references.data();
+ const int16_t* const projection_divisions =
+ reference_info.projection_divisions.data();
+ const ReferenceFrameType* source_reference_types =
+ &reference_info.motion_field_reference_frame[y8_start][0];
+ const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
+ int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
+ MotionVector* dst_mv = motion_field->mv[y8_start];
+ const __m128i d_sign = _mm_set1_epi16(dst_sign);
+
+ static_assert(sizeof(int8_t) == sizeof(bool), "");
+ static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), "");
+ static_assert(sizeof(int32_t) == sizeof(MotionVector), "");
+ assert(dst_sign == 0 || dst_sign == -1);
+ assert(stride == motion_field->reference_offset.columns());
+ assert((y8_start & 7) == 0);
+ assert((adjusted_x8_start & 7) == 0);
+  // The final position calculation is represented with int16_t. A valid
+  // position_y8 is at most 7 rows from its base, and the horizontal offset is
+  // at most |stride - 1|, which gives the assertion below. This means the
+  // optimization works for frame widths up to 32K pixels (each position is an
+  // 8x8 block).
+ assert(8 * stride <= 32768);
+ const __m128i skip_reference = LoadLo8(skip_references);
+ const __m128i r_offsets = LoadLo8(reference_offsets);
+ const __m128i division_table = LoadUnaligned16(projection_divisions);
+
+ int y8 = y8_start;
+ do {
+ const int y8_floor = (y8 & ~7) - y8; // [-7, 0]
+ const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8) - 1; // [0, 7]
+ const __m128i y8_floor8 = _mm_set1_epi8(y8_floor);
+ const __m128i y8_ceiling8 = _mm_set1_epi8(y8_ceiling);
+ int x8;
+
+ for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) {
+ const __m128i source_reference_type8 =
+ LoadLo8(source_reference_types + x8);
+ const __m128i skip_r =
+ _mm_shuffle_epi8(skip_reference, source_reference_type8);
+ int64_t early_skip;
+ StoreLo8(&early_skip, skip_r);
+ // Early termination #1 if all are skips. Chance is typically ~30-40%.
+ if (early_skip == -1) continue;
+ int64_t skip_64;
+ __m128i r, position_xy, mvs[2];
+ GetPosition(division_table, mv, reference_to_current_with_sign, x8_start,
+ x8_end, x8, r_offsets, source_reference_type8, skip_r,
+ y8_floor8, y8_ceiling8, d_sign, 0, &r, &position_xy, &skip_64,
+ mvs);
+ // Early termination #2 if all are skips.
+ // Chance is typically ~15-25% after Early termination #1.
+ if (skip_64 == -1) continue;
+ const __m128i p_y = _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8));
+ const __m128i p_x = _mm_cvtepi8_epi16(position_xy);
+ const __m128i p_y_offset = _mm_mullo_epi16(p_y, _mm_set1_epi16(stride));
+ const __m128i pos = _mm_add_epi16(p_y_offset, p_x);
+ const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8));
+ if (skip_64 == 0) {
+ // Store all. Chance is typically ~70-85% after Early termination #2.
+ Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ } else {
+ // Check and store each.
+ // Chance is typically ~15-30% after Early termination #2.
+ // The compiler is smart enough to not create the local buffer skips[].
+ int8_t skips[8];
+ memcpy(skips, &skip_64, sizeof(skips));
+ CheckStore<0>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ }
+ }
+
+    // The following leftover processing cannot be moved out of the do...while
+    // loop. Doing so may change the order in which results are stored to the
+    // same position.
+ if (leftover > 0) {
+ // Use SIMD only when leftover is at least 4, and there are at least 8
+ // elements in a row.
+ if (leftover >= 4 && adjusted_x8_start < adjusted_x8_end8) {
+ // Process the last 8 elements to avoid loading invalid memory. Some
+ // elements may have been processed in the above loop, which is OK.
+ const int delta = 8 - leftover;
+ x8 = adjusted_x8_end - 8;
+ const __m128i source_reference_type8 =
+ LoadLo8(source_reference_types + x8);
+ const __m128i skip_r =
+ _mm_shuffle_epi8(skip_reference, source_reference_type8);
+ int64_t early_skip;
+ StoreLo8(&early_skip, skip_r);
+ // Early termination #1 if all are skips.
+ if (early_skip != -1) {
+ int64_t skip_64;
+ __m128i r, position_xy, mvs[2];
+ GetPosition(division_table, mv, reference_to_current_with_sign,
+ x8_start, x8_end, x8, r_offsets, source_reference_type8,
+ skip_r, y8_floor8, y8_ceiling8, d_sign, delta, &r,
+ &position_xy, &skip_64, mvs);
+ // Early termination #2 if all are skips.
+ if (skip_64 != -1) {
+ const __m128i p_y =
+ _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8));
+ const __m128i p_x = _mm_cvtepi8_epi16(position_xy);
+ const __m128i p_y_offset =
+ _mm_mullo_epi16(p_y, _mm_set1_epi16(stride));
+ const __m128i pos = _mm_add_epi16(p_y_offset, p_x);
+ const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8));
+ // Store up to 7 elements since leftover is at most 7.
+ if (skip_64 == 0) {
+ // Store all.
+ Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ } else {
+ // Check and store each.
+ // The compiler is smart enough to not create the local buffer
+ // skips[].
+ int8_t skips[8];
+ memcpy(skips, &skip_64, sizeof(skips));
+ CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ }
+ }
+ }
+ } else {
+ for (; x8 < adjusted_x8_end; ++x8) {
+ const int source_reference_type = source_reference_types[x8];
+ if (skip_references[source_reference_type]) continue;
+ MotionVector projection_mv;
+ // reference_to_current_with_sign could be 0.
+ GetMvProjection(mv[x8], reference_to_current_with_sign,
+ projection_divisions[source_reference_type],
+ &projection_mv);
+ // Do not update the motion vector if the block position is not valid
+ // or if position_x8 is outside the current range of x8_start and
+ // x8_end. Note that position_y8 will always be within the range of
+ // y8_start and y8_end.
+ const int position_y8 = Project(0, projection_mv.mv[0], dst_sign);
+ if (position_y8 < y8_floor || position_y8 > y8_ceiling) continue;
+ const int x8_base = x8 & ~7;
+ const int x8_floor =
+ std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset);
+ const int x8_ceiling =
+ std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset);
+ const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign);
+ if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue;
+ dst_mv[position_y8 * stride + position_x8] = mv[x8];
+ dst_reference_offset[position_y8 * stride + position_x8] =
+ reference_offsets[source_reference_type];
+ }
+ }
+ }
+
+ source_reference_types += stride;
+ mv += stride;
+ dst_reference_offset += stride;
+ dst_mv += stride;
+ } while (++y8 < y8_end);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1;
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1;
+}
+#endif
+
+} // namespace
+
+void MotionFieldProjectionInit_SSE4_1() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void MotionFieldProjectionInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/motion_field_projection_sse4.h b/src/dsp/x86/motion_field_projection_sse4.h
new file mode 100644
index 0000000..c05422c
--- /dev/null
+++ b/src/dsp/x86/motion_field_projection_sse4.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::motion_field_projection_kernel. This function is not
+// thread-safe.
+void MotionFieldProjectionInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel
+#define LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
diff --git a/src/dsp/x86/motion_vector_search_sse4.cc b/src/dsp/x86/motion_vector_search_sse4.cc
new file mode 100644
index 0000000..e9cdd4c
--- /dev/null
+++ b/src/dsp/x86/motion_vector_search_sse4.cc
@@ -0,0 +1,262 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
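+// A 32-bit copy of kProjectionMvDivisionLookup, so that entries can be
+// inserted directly into epi32 lanes in MvProjectionSingleClip().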
+constexpr int kProjectionMvDivisionLookup_32bit[kMaxFrameDistance + 1] = {
+ 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638,
+ 1489, 1365, 1260, 1170, 1092, 1024, 963, 910, 862, 819, 780,
+ 744, 712, 682, 655, 630, 606, 585, 564, 546, 528};
+
+inline __m128i MvProjection(const __m128i mv, const __m128i denominator,
+ const __m128i numerator) {
+ const __m128i m0 = _mm_madd_epi16(mv, denominator);
+ const __m128i m = _mm_mullo_epi32(m0, numerator);
+ // Add the sign (0 or -1) to round towards zero.
+ const __m128i sign = _mm_srai_epi32(m, 31);
+ const __m128i add_sign = _mm_add_epi32(m, sign);
+ const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13));
+ return _mm_srai_epi32(sum, 14);
+}
+
+inline __m128i MvProjectionClip(const __m128i mvs[2],
+ const __m128i denominators[2],
+ const __m128i numerator) {
+ const __m128i s0 = MvProjection(mvs[0], denominators[0], numerator);
+ const __m128i s1 = MvProjection(mvs[1], denominators[1], numerator);
+ const __m128i mv = _mm_packs_epi32(s0, s1);
+ const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp);
+ const __m128i projection_mv_clamp_negative =
+ _mm_set1_epi16(-kProjectionMvClamp);
+ const __m128i clamp = _mm_min_epi16(mv, projection_mv_clamp);
+ return _mm_max_epi16(clamp, projection_mv_clamp_negative);
+}
+
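+// Processes two temporal MVs per call. Each temporal MV is projected against
+// both entries of |reference_offsets|, yielding one CompoundMotionVector per
+// temporal MV, i.e. two compound candidates per call.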
+inline __m128i MvProjectionCompoundClip(
+ const MotionVector* const temporal_mvs,
+ const int8_t temporal_reference_offsets[2],
+ const int reference_offsets[2]) {
+ const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
+ const __m128i temporal_mv = LoadLo8(tmvs);
+ const __m128i temporal_mv_0 = _mm_cvtepu16_epi32(temporal_mv);
+ __m128i mvs[2], denominators[2];
+ mvs[0] = _mm_unpacklo_epi64(temporal_mv_0, temporal_mv_0);
+ mvs[1] = _mm_unpackhi_epi64(temporal_mv_0, temporal_mv_0);
+ denominators[0] = _mm_set1_epi32(
+ kProjectionMvDivisionLookup[temporal_reference_offsets[0]]);
+ denominators[1] = _mm_set1_epi32(
+ kProjectionMvDivisionLookup[temporal_reference_offsets[1]]);
+ const __m128i offsets = LoadLo8(reference_offsets);
+ const __m128i numerator = _mm_unpacklo_epi32(offsets, offsets);
+ return MvProjectionClip(mvs, denominators, numerator);
+}
+
+inline __m128i MvProjectionSingleClip(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets,
+ const int reference_offset) {
+ const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs);
+ const __m128i temporal_mv = LoadAligned16(tmvs);
+ __m128i lookup = _mm_cvtsi32_si128(
+ kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[0]]);
+ lookup = _mm_insert_epi32(
+ lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[1]],
+ 1);
+ lookup = _mm_insert_epi32(
+ lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[2]],
+ 2);
+ lookup = _mm_insert_epi32(
+ lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[3]],
+ 3);
+ __m128i mvs[2], denominators[2];
+ mvs[0] = _mm_unpacklo_epi16(temporal_mv, _mm_setzero_si128());
+ mvs[1] = _mm_unpackhi_epi16(temporal_mv, _mm_setzero_si128());
+ denominators[0] = _mm_unpacklo_epi32(lookup, lookup);
+ denominators[1] = _mm_unpackhi_epi32(lookup, lookup);
+ const __m128i numerator = _mm_set1_epi32(reference_offset);
+ return MvProjectionClip(mvs, denominators, numerator);
+}
+
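+// Rounds odd MV components one unit toward zero so that every component
+// becomes even: subtracting the sign (0 or -1) adds 1 to negative values
+// before the low bit is cleared.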
+inline void LowPrecision(const __m128i mv, void* const candidate_mvs) {
+ const __m128i kRoundDownMask = _mm_set1_epi16(~1);
+ const __m128i sign = _mm_srai_epi16(mv, 15);
+ const __m128i sub_sign = _mm_sub_epi16(mv, sign);
+ const __m128i d = _mm_and_si128(sub_sign, kRoundDownMask);
+ StoreAligned16(candidate_mvs, d);
+}
+
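+// Rounds each MV component to a multiple of 8, moving it by at most 4 units
+// and breaking exact ties toward zero. This is equivalent to rounding |mv| + 3
+// down to a multiple of 8 and then restoring the sign.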
+inline void ForceInteger(const __m128i mv, void* const candidate_mvs) {
+ const __m128i kRoundDownMask = _mm_set1_epi16(~7);
+ const __m128i sign = _mm_srai_epi16(mv, 15);
+ const __m128i mv1 = _mm_add_epi16(mv, _mm_set1_epi16(3));
+ const __m128i mv2 = _mm_sub_epi16(mv1, sign);
+ const __m128i mv3 = _mm_and_si128(mv2, kRoundDownMask);
+ StoreAligned16(candidate_mvs, mv3);
+}
+
+void MvProjectionCompoundLowPrecision_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true and is skipped
+  // here. To help the compiler, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One more element could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionCompoundClip(
+ temporal_mvs + i, temporal_reference_offsets + i, offsets);
+ LowPrecision(mv, candidate_mvs + i);
+ i += 2;
+ } while (i < count);
+}
+
+void MvProjectionCompoundForceInteger_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true and is skipped
+  // here. To help the compiler, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One more element could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionCompoundClip(
+ temporal_mvs + i, temporal_reference_offsets + i, offsets);
+ ForceInteger(mv, candidate_mvs + i);
+ i += 2;
+ } while (i < count);
+}
+
+void MvProjectionCompoundHighPrecision_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true and is skipped
+  // here. To help the compiler, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One more element could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionCompoundClip(
+ temporal_mvs + i, temporal_reference_offsets + i, offsets);
+ StoreAligned16(candidate_mvs + i, mv);
+ i += 2;
+ } while (i < count);
+}
+
+void MvProjectionSingleLowPrecision_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ // Up to three more elements could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionSingleClip(
+ temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
+ LowPrecision(mv, candidate_mvs + i);
+ i += 4;
+ } while (i < count);
+}
+
+void MvProjectionSingleForceInteger_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ // Up to three more elements could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionSingleClip(
+ temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
+ ForceInteger(mv, candidate_mvs + i);
+ i += 4;
+ } while (i < count);
+}
+
+void MvProjectionSingleHighPrecision_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ // Up to three more elements could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionSingleClip(
+ temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
+ StoreAligned16(candidate_mvs + i, mv);
+ i += 4;
+ } while (i < count);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1;
+ dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1;
+ dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1;
+ dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1;
+ dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1;
+ dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1;
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1;
+ dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1;
+ dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1;
+ dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1;
+ dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1;
+ dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1;
+}
+#endif
+
+} // namespace
+
+void MotionVectorSearchInit_SSE4_1() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void MotionVectorSearchInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/motion_vector_search_sse4.h b/src/dsp/x86/motion_vector_search_sse4.h
new file mode 100644
index 0000000..d65b392
--- /dev/null
+++ b/src/dsp/x86/motion_vector_search_sse4.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This
+// function is not thread-safe.
+void MotionVectorSearchInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MotionVectorSearch
+#define LIBGAV1_Dsp8bpp_MotionVectorSearch LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_
diff --git a/src/dsp/x86/obmc_sse4.cc b/src/dsp/x86/obmc_sse4.cc
new file mode 100644
index 0000000..3a1d1fd
--- /dev/null
+++ b/src/dsp/x86/obmc_sse4.cc
@@ -0,0 +1,329 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
+inline void OverlapBlendFromLeft2xH_SSE4_1(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+ const __m128i mask_val = _mm_shufflelo_epi16(Load4(kObmcMask), 0);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
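+  // For each pixel, _mm_maddubs_epi16(terms, masks) below computes
+  // pred * mask + obmc_pred * (64 - mask); RightShiftWithRounding_U16 then
+  // adds 32 and shifts right by 6 to complete the blend.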
+ int y = height;
+ do {
+ const __m128i pred_val = Load2x2(pred, pred + prediction_stride);
+ const __m128i obmc_pred_val =
+ Load2x2(obmc_pred, obmc_pred + obmc_prediction_stride);
+
+ const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+ const __m128i packed_result = _mm_packus_epi16(result, result);
+ Store2(pred, packed_result);
+ pred += prediction_stride;
+ const int16_t second_row_result = _mm_extract_epi16(packed_result, 1);
+ memcpy(pred, &second_row_result, sizeof(second_row_result));
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
+ y -= 2;
+ } while (y != 0);
+}
+
+inline void OverlapBlendFromLeft4xH_SSE4_1(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+ const __m128i mask_val = Load4(kObmcMask + 2);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ // Duplicate first half of vector.
+ const __m128i masks =
+ _mm_shuffle_epi32(_mm_unpacklo_epi8(mask_val, obmc_mask_val), 0x44);
+ int y = height;
+ do {
+ const __m128i pred_val0 = Load4(pred);
+ const __m128i obmc_pred_val0 = Load4(obmc_pred);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ // Place the second row of each source in the second four bytes.
+ const __m128i pred_val =
+ _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
+ const __m128i obmc_pred_val = _mm_alignr_epi8(
+ Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
+ const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+ const __m128i packed_result = _mm_packus_epi16(result, result);
+ Store4(pred - prediction_stride, packed_result);
+ const int second_row_result = _mm_extract_epi32(packed_result, 1);
+ memcpy(pred, &second_row_result, sizeof(second_row_result));
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ y -= 2;
+ } while (y != 0);
+}
+
+inline void OverlapBlendFromLeft8xH_SSE4_1(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const __m128i mask_val = LoadLo8(kObmcMask + 6);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ int y = height;
+ do {
+ const __m128i pred_val = LoadLo8(pred);
+ const __m128i obmc_pred_val = LoadLo8(obmc_pred);
+ const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+
+ StoreLo8(pred, _mm_packus_epi16(result, result));
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (--y != 0);
+}
+
+void OverlapBlendFromLeft_SSE4_1(void* const prediction,
+ const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint8_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+
+ if (width == 2) {
+ OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+ if (width == 4) {
+ OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+ if (width == 8) {
+ OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const uint8_t* mask = kObmcMask + width - 2;
+ int x = 0;
+ do {
+ pred = static_cast<uint8_t*>(prediction) + x;
+ obmc_pred = static_cast<const uint8_t*>(obmc_prediction) + x;
+ const __m128i mask_val = LoadUnaligned16(mask + x);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks_lo = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ const __m128i masks_hi = _mm_unpackhi_epi8(mask_val, obmc_mask_val);
+
+ int y = 0;
+ do {
+ const __m128i pred_val = LoadUnaligned16(pred);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+ const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result_lo =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks_lo), 6);
+ const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+ const __m128i result_hi =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks_hi), 6);
+ StoreUnaligned16(pred, _mm_packus_epi16(result_lo, result_hi));
+
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (++y < height);
+ x += 16;
+ } while (x < width);
+}
+
+inline void OverlapBlendFromTop4xH_SSE4_1(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const __m128i mask_inverter = _mm_set1_epi16(64);
+ const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
+ const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);
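+  // mask_preinverter holds the byte pair (1, -1) in every 16-bit lane.
+  // _mm_sign_epi8 below negates every second mask byte, so subtracting the
+  // result from mask_inverter's (64, 0) byte pairs produces interleaved
+  // (64 - mask, mask) values for _mm_maddubs_epi16 against (obmc_pred, pred).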
+
+ const uint8_t* mask = kObmcMask + height - 2;
+ const int compute_height = height - (height >> 2);
+ int y = 0;
+ do {
+ // First mask in the first half, second mask in the second half.
+ const __m128i mask_val = _mm_shuffle_epi8(
+ _mm_cvtsi32_si128(*reinterpret_cast<const uint16_t*>(mask + y)),
+ mask_shuffler);
+ const __m128i masks =
+ _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
+ const __m128i pred_val0 = Load4(pred);
+
+ const __m128i obmc_pred_val0 = Load4(obmc_pred);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ const __m128i pred_val =
+ _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
+ const __m128i obmc_pred_val = _mm_alignr_epi8(
+ Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
+ const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val);
+ const __m128i result =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+
+ const __m128i packed_result = _mm_packus_epi16(result, result);
+ Store4(pred - prediction_stride, packed_result);
+ Store4(pred, _mm_srli_si128(packed_result, 4));
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ y += 2;
+ } while (y < compute_height);
+}
+
+inline void OverlapBlendFromTop8xH_SSE4_1(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const uint8_t* mask = kObmcMask + height - 2;
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const int compute_height = height - (height >> 2);
+ int y = compute_height;
+ do {
+ const __m128i mask_val = _mm_set1_epi8(mask[compute_height - y]);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ const __m128i pred_val = LoadLo8(pred);
+ const __m128i obmc_pred_val = LoadLo8(obmc_pred);
+ const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+
+ StoreLo8(pred, _mm_packus_epi16(result, result));
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (--y != 0);
+}
+
+void OverlapBlendFromTop_SSE4_1(void* const prediction,
+ const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint8_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+
+ if (width <= 4) {
+ OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+ if (width == 8) {
+ OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+
+ // Stop when mask value becomes 64.
+ const int compute_height = height - (height >> 2);
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ int y = 0;
+ const uint8_t* mask = kObmcMask + height - 2;
+ do {
+ const __m128i mask_val = _mm_set1_epi8(mask[y]);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ int x = 0;
+ do {
+ const __m128i pred_val = LoadUnaligned16(pred + x);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x);
+ const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result_lo =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6);
+ const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+ const __m128i result_hi =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6);
+ StoreUnaligned16(pred + x, _mm_packus_epi16(result_lo, result_hi));
+ x += 16;
+ } while (x < width);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (++y < compute_height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(ObmcVertical)
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(ObmcHorizontal)
+ dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_SSE4_1;
+#endif
+}
+
+} // namespace
+
+void ObmcInit_SSE4_1() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void ObmcInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/obmc_sse4.h b/src/dsp/x86/obmc_sse4.h
new file mode 100644
index 0000000..bd8b416
--- /dev/null
+++ b/src/dsp/x86/obmc_sse4.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::obmc_blend[]. This function is not thread-safe.
+void ObmcInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_ObmcVertical
+#define LIBGAV1_Dsp8bpp_ObmcVertical LIBGAV1_CPU_SSE4_1
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ObmcHorizontal
+#define LIBGAV1_Dsp8bpp_ObmcHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_
diff --git a/src/dsp/x86/super_res_sse4.cc b/src/dsp/x86/super_res_sse4.cc
new file mode 100644
index 0000000..b2bdfd2
--- /dev/null
+++ b/src/dsp/x86/super_res_sse4.cc
@@ -0,0 +1,166 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/super_res.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Upscale_Filter as defined in AV1 Section 7.16
+// Negated so that the coefficients fit in 8 bits.
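+// The unnegated center tap is 128, which does not fit in int8_t; the negated
+// taps all lie in [-128, 20], so _mm_maddubs_epi16(src, filter) can be used.
+// SuperRes_SSE4_1 later computes (1 << (kFilterBits - 1)) - sum, undoing the
+// negation and adding the rounding offset in a single step.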
+alignas(16) const int8_t
+ kNegativeUpscaleFilter[kSuperResFilterShifts][kSuperResFilterTaps] = {
+ {0, 0, 0, -128, 0, 0, 0, 0}, {0, 0, 1, -128, -2, 1, 0, 0},
+ {0, -1, 3, -127, -4, 2, -1, 0}, {0, -1, 4, -127, -6, 3, -1, 0},
+ {0, -2, 6, -126, -8, 3, -1, 0}, {0, -2, 7, -125, -11, 4, -1, 0},
+ {1, -2, 8, -125, -13, 5, -2, 0}, {1, -3, 9, -124, -15, 6, -2, 0},
+ {1, -3, 10, -123, -18, 6, -2, 1}, {1, -3, 11, -122, -20, 7, -3, 1},
+ {1, -4, 12, -121, -22, 8, -3, 1}, {1, -4, 13, -120, -25, 9, -3, 1},
+ {1, -4, 14, -118, -28, 9, -3, 1}, {1, -4, 15, -117, -30, 10, -4, 1},
+ {1, -5, 16, -116, -32, 11, -4, 1}, {1, -5, 16, -114, -35, 12, -4, 1},
+ {1, -5, 17, -112, -38, 12, -4, 1}, {1, -5, 18, -111, -40, 13, -5, 1},
+ {1, -5, 18, -109, -43, 14, -5, 1}, {1, -6, 19, -107, -45, 14, -5, 1},
+ {1, -6, 19, -105, -48, 15, -5, 1}, {1, -6, 19, -103, -51, 16, -5, 1},
+ {1, -6, 20, -101, -53, 16, -6, 1}, {1, -6, 20, -99, -56, 17, -6, 1},
+ {1, -6, 20, -97, -58, 17, -6, 1}, {1, -6, 20, -95, -61, 18, -6, 1},
+ {2, -7, 20, -93, -64, 18, -6, 2}, {2, -7, 20, -91, -66, 19, -6, 1},
+ {2, -7, 20, -88, -69, 19, -6, 1}, {2, -7, 20, -86, -71, 19, -6, 1},
+ {2, -7, 20, -84, -74, 20, -7, 2}, {2, -7, 20, -81, -76, 20, -7, 1},
+ {2, -7, 20, -79, -79, 20, -7, 2}, {1, -7, 20, -76, -81, 20, -7, 2},
+ {2, -7, 20, -74, -84, 20, -7, 2}, {1, -6, 19, -71, -86, 20, -7, 2},
+ {1, -6, 19, -69, -88, 20, -7, 2}, {1, -6, 19, -66, -91, 20, -7, 2},
+ {2, -6, 18, -64, -93, 20, -7, 2}, {1, -6, 18, -61, -95, 20, -6, 1},
+ {1, -6, 17, -58, -97, 20, -6, 1}, {1, -6, 17, -56, -99, 20, -6, 1},
+ {1, -6, 16, -53, -101, 20, -6, 1}, {1, -5, 16, -51, -103, 19, -6, 1},
+ {1, -5, 15, -48, -105, 19, -6, 1}, {1, -5, 14, -45, -107, 19, -6, 1},
+ {1, -5, 14, -43, -109, 18, -5, 1}, {1, -5, 13, -40, -111, 18, -5, 1},
+ {1, -4, 12, -38, -112, 17, -5, 1}, {1, -4, 12, -35, -114, 16, -5, 1},
+ {1, -4, 11, -32, -116, 16, -5, 1}, {1, -4, 10, -30, -117, 15, -4, 1},
+ {1, -3, 9, -28, -118, 14, -4, 1}, {1, -3, 9, -25, -120, 13, -4, 1},
+ {1, -3, 8, -22, -121, 12, -4, 1}, {1, -3, 7, -20, -122, 11, -3, 1},
+ {1, -2, 6, -18, -123, 10, -3, 1}, {0, -2, 6, -15, -124, 9, -3, 1},
+ {0, -2, 5, -13, -125, 8, -2, 1}, {0, -1, 4, -11, -125, 7, -2, 0},
+ {0, -1, 3, -8, -126, 6, -2, 0}, {0, -1, 3, -6, -127, 4, -1, 0},
+ {0, -1, 2, -4, -127, 3, -1, 0}, {0, 0, 1, -2, -128, 1, 0, 0},
+};
+
+void SuperResCoefficients_SSE4_1(const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const coefficients) {
+ auto* dst = static_cast<uint8_t*>(coefficients);
+ int subpixel_x = initial_subpixel_x;
+ int x = RightShiftWithCeiling(upscaled_width, 4);
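+  // Each outer iteration emits the filters for 16 output pixels: eight aligned
+  // 16-byte stores, each packing the 8-tap filters of two consecutive pixels,
+  // matching the order in which SuperRes_SSE4_1 reloads them.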
+ do {
+ for (int i = 0; i < 8; ++i, dst += 16) {
+ int remainder = subpixel_x & kSuperResScaleMask;
+ __m128i filter =
+ LoadLo8(kNegativeUpscaleFilter[remainder >> kSuperResExtraBits]);
+ subpixel_x += step;
+ remainder = subpixel_x & kSuperResScaleMask;
+ filter = LoadHi8(filter,
+ kNegativeUpscaleFilter[remainder >> kSuperResExtraBits]);
+ subpixel_x += step;
+ StoreAligned16(dst, filter);
+ }
+ } while (--x != 0);
+}
+
+void SuperRes_SSE4_1(const void* const coefficients, void* const source,
+ const ptrdiff_t stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const dest) {
+ auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = height;
+ do {
+ const auto* filter = static_cast<const uint8_t*>(coefficients);
+ uint8_t* dst_ptr = dst;
+ ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+ int subpixel_x = initial_subpixel_x;
+    // The code below calculates up to 15 extra upscaled pixels which will
+    // over-read up to 15 downscaled pixels at the end of each row.
+    // kSuperResHorizontalBorder accounts for this.
+ int x = RightShiftWithCeiling(upscaled_width, 4);
+ do {
+ __m128i weighted_src[8];
+ for (int i = 0; i < 8; ++i, filter += 16) {
+ __m128i s = LoadLo8(&src[subpixel_x >> kSuperResScaleBits]);
+ subpixel_x += step;
+ s = LoadHi8(s, &src[subpixel_x >> kSuperResScaleBits]);
+ subpixel_x += step;
+ const __m128i f = LoadAligned16(filter);
+ weighted_src[i] = _mm_maddubs_epi16(s, f);
+ }
+
+ __m128i a[4];
+ a[0] = _mm_hadd_epi16(weighted_src[0], weighted_src[1]);
+ a[1] = _mm_hadd_epi16(weighted_src[2], weighted_src[3]);
+ a[2] = _mm_hadd_epi16(weighted_src[4], weighted_src[5]);
+ a[3] = _mm_hadd_epi16(weighted_src[6], weighted_src[7]);
+ Transpose2x16_U16(a, a);
+ a[0] = _mm_adds_epi16(a[0], a[1]);
+ a[1] = _mm_adds_epi16(a[2], a[3]);
+ const __m128i rounding = _mm_set1_epi16(1 << (kFilterBits - 1));
+ a[0] = _mm_subs_epi16(rounding, a[0]);
+ a[1] = _mm_subs_epi16(rounding, a[1]);
+ a[0] = _mm_srai_epi16(a[0], kFilterBits);
+ a[1] = _mm_srai_epi16(a[1], kFilterBits);
+ StoreAligned16(dst_ptr, _mm_packus_epi16(a[0], a[1]));
+ dst_ptr += 16;
+ } while (--x != 0);
+ src += stride;
+ dst += stride;
+ } while (--y != 0);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+ dsp->super_res_coefficients = SuperResCoefficients_SSE4_1;
+ dsp->super_res = SuperRes_SSE4_1;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void SuperResInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void SuperResInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/super_res_sse4.h b/src/dsp/x86/super_res_sse4.h
new file mode 100644
index 0000000..aef5147
--- /dev/null
+++ b/src/dsp/x86/super_res_sse4.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::super_res_coefficients and Dsp::super_res. This function
+// is not thread-safe.
+void SuperResInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_SuperRes
+#define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_
diff --git a/src/dsp/x86/transpose_sse4.h b/src/dsp/x86/transpose_sse4.h
new file mode 100644
index 0000000..208b301
--- /dev/null
+++ b/src/dsp/x86/transpose_sse4.h
@@ -0,0 +1,307 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <emmintrin.h>
+
+namespace libgav1 {
+namespace dsp {
+
+LIBGAV1_ALWAYS_INLINE void Transpose2x16_U16(const __m128i* const in,
+ __m128i* const out) {
+ // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 10 11 20 21 30 31
+  // in[1]: 40 41 50 51 60 61 70 71
+  // in[2]: 80 81 90 91 a0 a1 b0 b1
+  // in[3]: c0 c1 d0 d1 e0 e1 f0 f1
+ // to:
+ // a0: 00 40 01 41 10 50 11 51
+ // a1: 20 60 21 61 30 70 31 71
+ // a2: 80 c0 81 c1 90 d0 91 d1
+ // a3: a0 e0 a1 e1 b0 f0 b1 f1
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a3 = _mm_unpackhi_epi16(in[2], in[3]);
+ // b0: 00 20 40 60 01 21 41 61
+ // b1: 10 30 50 70 11 31 51 71
+ // b2: 80 a0 c0 e0 81 a1 c1 e1
+ // b3: 90 b0 d0 f0 91 b1 d1 f1
+ const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+ const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
+ const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
+ const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 80 90 a0 b0 c0 d0 e0 f0
+ // out[3]: 81 91 a1 b1 c1 d1 e1 f1
+ out[0] = _mm_unpacklo_epi16(b0, b1);
+ out[1] = _mm_unpackhi_epi16(b0, b1);
+ out[2] = _mm_unpacklo_epi16(b2, b3);
+ out[3] = _mm_unpackhi_epi16(b2, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE __m128i Transpose4x4_U8(const __m128i* const in) {
+ // Unpack 8 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+
+  // Unpack 16 bit elements resulting in:
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ return _mm_unpacklo_epi16(a0, a1);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose8x8To4x16_U8(const __m128i* const in,
+ __m128i* out) {
+ // Unpack 8 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
+
+ // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi16(a2, a3);
+ const __m128i b2 = _mm_unpackhi_epi16(a0, a1);
+ const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+
+ // out[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // out[1]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // out[2]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // out[3]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ out[0] = _mm_unpacklo_epi32(b0, b1);
+ out[1] = _mm_unpackhi_epi32(b0, b1);
+ out[2] = _mm_unpacklo_epi32(b2, b3);
+ out[3] = _mm_unpackhi_epi32(b2, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x4_U16(const __m128i* in, __m128i* out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 XX XX XX XX
+ // in[1]: 10 11 12 13 XX XX XX XX
+ // in[2]: 20 21 22 23 XX XX XX XX
+ // in[3]: 30 31 32 33 XX XX XX XX
+ // to:
+ // ba: 00 10 01 11 02 12 03 13
+ // dc: 20 30 21 31 22 32 23 33
+ const __m128i ba = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i dc = _mm_unpacklo_epi16(in[2], in[3]);
+ // Unpack 32 bit elements resulting in:
+ // dcba_lo: 00 10 20 30 01 11 21 31
+ // dcba_hi: 02 12 22 32 03 13 23 33
+ const __m128i dcba_lo = _mm_unpacklo_epi32(ba, dc);
+ const __m128i dcba_hi = _mm_unpackhi_epi32(ba, dc);
+ // Assign or shift right by 8 bytes resulting in:
+ // out[0]: 00 10 20 30 01 11 21 31
+ // out[1]: 01 11 21 31 XX XX XX XX
+ // out[2]: 02 12 22 32 03 13 23 33
+ // out[3]: 03 13 23 33 XX XX XX XX
+ out[0] = dcba_lo;
+ out[1] = _mm_srli_si128(dcba_lo, 8);
+ out[2] = dcba_hi;
+ out[3] = _mm_srli_si128(dcba_hi, 8);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x8To8x4_U16(const __m128i* in,
+ __m128i* out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 XX XX XX XX
+ // in[1]: 10 11 12 13 XX XX XX XX
+ // in[2]: 20 21 22 23 XX XX XX XX
+ // in[3]: 30 31 32 33 XX XX XX XX
+ // in[4]: 40 41 42 43 XX XX XX XX
+ // in[5]: 50 51 52 53 XX XX XX XX
+ // in[6]: 60 61 62 63 XX XX XX XX
+ // in[7]: 70 71 72 73 XX XX XX XX
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 02 12 22 32 03 13 23 33
+ // b3: 42 52 62 72 43 53 63 73
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b2, b3);
+ out[3] = _mm_unpackhi_epi64(b2, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose8x4To4x8_U16(const __m128i* in,
+ __m128i* out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a4: 04 14 05 15 06 16 07 17
+ // a5: 24 34 25 35 26 36 27 37
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b2: 04 14 24 34 05 15 25 35
+ // b4: 02 12 22 32 03 13 23 33
+ // b6: 06 16 26 36 07 17 27 37
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+ const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 XX XX XX XX
+ // out[1]: 01 11 21 31 XX XX XX XX
+ // out[2]: 02 12 22 32 XX XX XX XX
+ // out[3]: 03 13 23 33 XX XX XX XX
+ // out[4]: 04 14 24 34 XX XX XX XX
+ // out[5]: 05 15 25 35 XX XX XX XX
+ // out[6]: 06 16 26 36 XX XX XX XX
+ // out[7]: 07 17 27 37 XX XX XX XX
+ const __m128i zeros = _mm_setzero_si128();
+ out[0] = _mm_unpacklo_epi64(b0, zeros);
+ out[1] = _mm_unpackhi_epi64(b0, zeros);
+ out[2] = _mm_unpacklo_epi64(b4, zeros);
+ out[3] = _mm_unpackhi_epi64(b4, zeros);
+ out[4] = _mm_unpacklo_epi64(b2, zeros);
+ out[5] = _mm_unpackhi_epi64(b2, zeros);
+ out[6] = _mm_unpacklo_epi64(b6, zeros);
+ out[7] = _mm_unpackhi_epi64(b6, zeros);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose8x8_U16(const __m128i* const in,
+ __m128i* const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ // a4: 04 14 05 15 06 16 07 17
+ // a5: 24 34 25 35 26 36 27 37
+ // a6: 44 54 45 55 46 56 47 57
+ // a7: 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 04 14 24 34 05 15 25 35
+ // b3: 44 54 64 74 45 55 65 75
+ // b4: 02 12 22 32 03 13 23 33
+ // b5: 42 52 62 72 43 53 63 73
+ // b6: 06 16 26 36 07 17 27 37
+ // b7: 46 56 66 76 47 57 67 77
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+ const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
+ const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
+ const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+ const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ // out[4]: 04 14 24 34 44 54 64 74
+ // out[5]: 05 15 25 35 45 55 65 75
+ // out[6]: 06 16 26 36 46 56 66 76
+ // out[7]: 07 17 27 37 47 57 67 77
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b4, b5);
+ out[3] = _mm_unpackhi_epi64(b4, b5);
+ out[4] = _mm_unpacklo_epi64(b2, b3);
+ out[5] = _mm_unpackhi_epi64(b2, b3);
+ out[6] = _mm_unpacklo_epi64(b6, b7);
+ out[7] = _mm_unpackhi_epi64(b6, b7);
+}
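+// Illustrative sketch of how these helpers are typically used (not part of
+// the library API; LoadUnaligned16() is the load helper from common_sse4.h):
+//   int16_t block[8][8];
+//   __m128i rows[8];
+//   for (int i = 0; i < 8; ++i) rows[i] = LoadUnaligned16(block[i]);
+//   Transpose8x8_U16(rows, rows);  // rows[i] now holds column i of |block|.
+// In-place calls are safe because every input register is read before any
+// output register is written.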
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+#endif // LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_
diff --git a/src/dsp/x86/warp_sse4.cc b/src/dsp/x86/warp_sse4.cc
new file mode 100644
index 0000000..43279ab
--- /dev/null
+++ b/src/dsp/x86/warp_sse4.cc
@@ -0,0 +1,525 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/warp.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Number of extra bits of precision in warped filtering.
+constexpr int kWarpedDiffPrecisionBits = 10;
+
+// This assumes the two filters contain filter[x] and filter[x+2].
+inline __m128i AccumulateFilter(const __m128i sum, const __m128i filter_0,
+ const __m128i filter_1,
+ const __m128i& src_window) {
+ const __m128i filter_taps = _mm_unpacklo_epi8(filter_0, filter_1);
+ const __m128i src =
+ _mm_unpacklo_epi8(src_window, _mm_srli_si128(src_window, 2));
+ return _mm_add_epi16(sum, _mm_maddubs_epi16(src, filter_taps));
+}
+
+constexpr int kFirstPassOffset = 1 << 14;
+constexpr int kOffsetRemoval =
+ (kFirstPassOffset >> kInterRoundBitsHorizontal) * 128;
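+// kOffsetRemoval compensates for kFirstPassOffset in the vertical pass: after
+// the horizontal rounding shift each intermediate sample carries an offset of
+// kFirstPassOffset >> kInterRoundBitsHorizontal, and the vertical filter taps
+// sum to 1 << kFilterBits (128), hence the multiply by 128.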
+
+// Applies the horizontal filter to one source row and stores the result in
+// |intermediate_result_row|. |intermediate_result_row| is a row in the 15x8
+// |intermediate_result| two-dimensional array.
+inline void HorizontalFilter(const int sx4, const int16_t alpha,
+ const __m128i src_row,
+ int16_t intermediate_result_row[8]) {
+ int sx = sx4 - MultiplyBy4(alpha);
+ __m128i filter[8];
+ for (__m128i& f : filter) {
+ const int offset = RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ f = LoadLo8(kWarpedFilters8[offset]);
+ sx += alpha;
+ }
+ Transpose8x8To4x16_U8(filter, filter);
+ // |filter| now contains two filters per register.
+ // Staggered combinations allow us to take advantage of _mm_maddubs_epi16
+ // without overflowing the sign bit. The sign bit is hit only where two taps
+ // paired in a single madd add up to more than 128. This is only possible with
+ // two adjacent "inner" taps. Therefore, pairing odd with odd and even with
+ // even guarantees safety. |sum| is given a negative offset to allow for large
+ // intermediate values.
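+  // For example, the first call below pairs taps 0 and 2: each 16-bit lane of
+  // the madd holds src[x] * tap0(x) + src[x + 2] * tap2(x), where tapk(x) is
+  // tap k of the filter for output pixel x; the shifted calls then cover taps
+  // 1/3, 4/6 and 5/7.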
+ // k = 0, 2.
+ __m128i src_row_window = src_row;
+ __m128i sum = _mm_set1_epi16(-kFirstPassOffset);
+ sum = AccumulateFilter(sum, filter[0], filter[1], src_row_window);
+
+ // k = 1, 3.
+ src_row_window = _mm_srli_si128(src_row_window, 1);
+ sum = AccumulateFilter(sum, _mm_srli_si128(filter[0], 8),
+ _mm_srli_si128(filter[1], 8), src_row_window);
+ // k = 4, 6.
+ src_row_window = _mm_srli_si128(src_row_window, 3);
+ sum = AccumulateFilter(sum, filter[2], filter[3], src_row_window);
+
+ // k = 5, 7.
+ src_row_window = _mm_srli_si128(src_row_window, 1);
+ sum = AccumulateFilter(sum, _mm_srli_si128(filter[2], 8),
+ _mm_srli_si128(filter[3], 8), src_row_window);
+
+ sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal);
+ StoreUnaligned16(intermediate_result_row, sum);
+}
+
+template <bool is_compound>
+inline void WriteVerticalFilter(const __m128i filter[8],
+ const int16_t intermediate_result[15][8], int y,
+ void* dst_row) {
+ constexpr int kRoundBitsVertical =
+ is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
+ __m128i sum_low = _mm_set1_epi32(kOffsetRemoval);
+ __m128i sum_high = sum_low;
+ for (int k = 0; k < 8; k += 2) {
+ const __m128i filters_low = _mm_unpacklo_epi16(filter[k], filter[k + 1]);
+ const __m128i filters_high = _mm_unpackhi_epi16(filter[k], filter[k + 1]);
+ const __m128i intermediate_0 = LoadUnaligned16(intermediate_result[y + k]);
+ const __m128i intermediate_1 =
+ LoadUnaligned16(intermediate_result[y + k + 1]);
+ const __m128i intermediate_low =
+ _mm_unpacklo_epi16(intermediate_0, intermediate_1);
+ const __m128i intermediate_high =
+ _mm_unpackhi_epi16(intermediate_0, intermediate_1);
+
+ const __m128i product_low = _mm_madd_epi16(filters_low, intermediate_low);
+ const __m128i product_high =
+ _mm_madd_epi16(filters_high, intermediate_high);
+ sum_low = _mm_add_epi32(sum_low, product_low);
+ sum_high = _mm_add_epi32(sum_high, product_high);
+ }
+ sum_low = RightShiftWithRounding_S32(sum_low, kRoundBitsVertical);
+ sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical);
+ if (is_compound) {
+ const __m128i sum = _mm_packs_epi32(sum_low, sum_high);
+ StoreUnaligned16(static_cast<int16_t*>(dst_row), sum);
+ } else {
+ const __m128i sum = _mm_packus_epi32(sum_low, sum_high);
+ StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum));
+ }
+}
+
+template <bool is_compound>
+inline void WriteVerticalFilter(const __m128i filter[8],
+ const int16_t* intermediate_result_column,
+ void* dst_row) {
+ constexpr int kRoundBitsVertical =
+ is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
+ __m128i sum_low = _mm_setzero_si128();
+ __m128i sum_high = _mm_setzero_si128();
+ for (int k = 0; k < 8; k += 2) {
+ const __m128i filters_low = _mm_unpacklo_epi16(filter[k], filter[k + 1]);
+ const __m128i filters_high = _mm_unpackhi_epi16(filter[k], filter[k + 1]);
+ // Equivalent to unpacking two vectors made by duplicating int16_t values.
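+    // Each 32-bit lane thus holds the pair {intermediate_result_column[k],
+    // intermediate_result_column[k + 1]}, so the madds below compute
+    // tap k * row k + tap k+1 * row k+1 for every output column at once.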
+ const __m128i intermediate =
+ _mm_set1_epi32((intermediate_result_column[k + 1] << 16) |
+ intermediate_result_column[k]);
+ const __m128i product_low = _mm_madd_epi16(filters_low, intermediate);
+ const __m128i product_high = _mm_madd_epi16(filters_high, intermediate);
+ sum_low = _mm_add_epi32(sum_low, product_low);
+ sum_high = _mm_add_epi32(sum_high, product_high);
+ }
+ sum_low = RightShiftWithRounding_S32(sum_low, kRoundBitsVertical);
+ sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical);
+ if (is_compound) {
+ const __m128i sum = _mm_packs_epi32(sum_low, sum_high);
+ StoreUnaligned16(static_cast<int16_t*>(dst_row), sum);
+ } else {
+ const __m128i sum = _mm_packus_epi32(sum_low, sum_high);
+ StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum));
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma,
+ int delta, DestType* dest_row,
+ ptrdiff_t dest_stride) {
+ int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+ __m128i filter[8];
+ for (__m128i& f : filter) {
+ const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ f = LoadUnaligned16(kWarpedFilters[offset]);
+ sy += gamma;
+ }
+ Transpose8x8_U16(filter, filter);
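+    // After the transpose, filter[k] holds tap k of the vertical filter for
+    // all 8 output columns (before it, filter[x] held the 8 taps of column x),
+    // so WriteVerticalFilter can madd tap pairs across the whole row.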
+ WriteVerticalFilter<is_compound>(filter, source, y, dest_row);
+ dest_row += dest_stride;
+ sy4 += delta;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void VerticalFilter(const int16_t* source_cols, int y4, int gamma,
+ int delta, DestType* dest_row,
+ ptrdiff_t dest_stride) {
+ int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+ __m128i filter[8];
+ for (__m128i& f : filter) {
+ const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ f = LoadUnaligned16(kWarpedFilters[offset]);
+ sy += gamma;
+ }
+ Transpose8x8_U16(filter, filter);
+ WriteVerticalFilter<is_compound>(filter, &source_cols[y], dest_row);
+ dest_row += dest_stride;
+ sy4 += delta;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion1(const uint8_t* src, ptrdiff_t source_stride,
+ int source_width, int source_height, int ix4, int iy4,
+ DestType* dst_row, ptrdiff_t dest_stride) {
+ // Region 1
+ // Points to the left or right border of the first row of |src|.
+ const uint8_t* first_row_border =
+ (ix4 + 7 <= 0) ? src : src + source_width - 1;
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ // Region 1.
+ // Every sample used to calculate the prediction block has the same
+ // value. So the whole prediction block has the same value.
+ const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const uint8_t row_border_pixel = first_row_border[row * source_stride];
+
+ if (is_compound) {
+ const __m128i sum =
+ _mm_set1_epi16(row_border_pixel << (kInterRoundBitsVertical -
+ kInterRoundBitsCompoundVertical));
+ StoreUnaligned16(dst_row, sum);
+ } else {
+ memset(dst_row, row_border_pixel, 8);
+ }
+ const DestType* const first_dst_row = dst_row;
+ dst_row += dest_stride;
+ for (int y = 1; y < 8; ++y) {
+ memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row));
+ dst_row += dest_stride;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion2(const uint8_t* src, ptrdiff_t source_stride,
+ int source_width, int y4, int ix4, int iy4, int gamma,
+ int delta, int16_t intermediate_result_column[15],
+ DestType* dst_row, ptrdiff_t dest_stride) {
+ // Region 2.
+ // Points to the left or right border of the first row of |src|.
+ const uint8_t* first_row_border =
+ (ix4 + 7 <= 0) ? src : src + source_width - 1;
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+
+ // Region 2.
+ // Horizontal filter.
+ // The input values in this region are generated by extending the border
+ // which makes them identical in the horizontal direction. This
+ // computation could be inlined in the vertical pass but most
+ // implementations will need a transpose of some sort.
+ // It is not necessary to use the offset values here because the
+ // horizontal pass is a simple shift and the vertical pass will always
+ // require using 32 bits.
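+  // Since all 8 taps see the same border sample, the filter reduces to
+  // pixel * (1 << kFilterBits); after the horizontal rounding shift that is
+  // the plain shift by kFilterBits - kInterRoundBitsHorizontal used below.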
+ for (int y = -7; y < 8; ++y) {
+ // We may over-read up to 13 pixels above the top source row, or up
+ // to 13 pixels below the bottom source row. This is proved in
+ // warp.cc.
+ const int row = iy4 + y;
+ int sum = first_row_border[row * source_stride];
+ sum <<= (kFilterBits - kInterRoundBitsHorizontal);
+ intermediate_result_column[y + 7] = sum;
+ }
+ // Region 2 vertical filter.
+ VerticalFilter<is_compound, DestType>(intermediate_result_column, y4, gamma,
+ delta, dst_row, dest_stride);
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion3(const uint8_t* src, ptrdiff_t source_stride,
+ int source_height, int alpha, int beta, int x4, int ix4,
+ int iy4, int16_t intermediate_result[15][8]) {
+ // Region 3
+ // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ // Horizontal filter.
+ const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const uint8_t* const src_row = src + row * source_stride;
+ // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+ // read but is ignored.
+ //
+ // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+ // bytes after src_row[source_width - 1]. We assume the source frame
+ // has left and right borders of at least 13 bytes that extend the
+ // frame boundary pixels. We also assume there is at least one extra
+ // padding byte after the right border of the last source row.
+ const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]);
+ int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
+ sx4 += beta;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion4(const uint8_t* src, ptrdiff_t source_stride, int alpha,
+ int beta, int x4, int ix4, int iy4,
+ int16_t intermediate_result[15][8]) {
+ // Region 4.
+ // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ // Horizontal filter.
+ int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ // We may over-read up to 13 pixels above the top source row, or up
+ // to 13 pixels below the bottom source row. This is proved in
+ // warp.cc.
+ const int row = iy4 + y;
+ const uint8_t* const src_row = src + row * source_stride;
+ // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+ // read but is ignored.
+ //
+ // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+ // bytes after src_row[source_width - 1]. We assume the source frame
+ // has left and right borders of at least 13 bytes that extend the
+ // frame boundary pixels. We also assume there is at least one extra
+ // padding byte after the right border of the last source row.
+ const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]);
+ HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
+ sx4 += beta;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void HandleWarpBlock(const uint8_t* src, ptrdiff_t source_stride,
+ int source_width, int source_height,
+ const int* warp_params, int subsampling_x,
+ int subsampling_y, int src_x, int src_y,
+ int16_t alpha, int16_t beta, int16_t gamma,
+ int16_t delta, DestType* dst_row,
+ ptrdiff_t dest_stride) {
+ union {
+ // Intermediate_result is the output of the horizontal filtering and
+ // rounding. The range is within 13 (= bitdepth + kFilterBits + 1 -
+ // kInterRoundBitsHorizontal) bits (unsigned). We use the signed int16_t
+ // type so that we can start with a negative offset and restore it on the
+ // final filter sum.
+ int16_t intermediate_result[15][8]; // 15 rows, 8 columns.
+ // In the simple special cases where the samples in each row are all the
+ // same, store one sample per row in a column vector.
+ int16_t intermediate_result_column[15];
+ };
+
+ const int dst_x =
+ src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
+ const int dst_y =
+ src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
+ const int x4 = dst_x >> subsampling_x;
+ const int y4 = dst_y >> subsampling_y;
+ const int ix4 = x4 >> kWarpedModelPrecisionBits;
+ const int iy4 = y4 >> kWarpedModelPrecisionBits;
+ // A prediction block may fall outside the frame's boundaries. If a
+ // prediction block is calculated using only samples outside the frame's
+ // boundary, the filtering can be simplified. We can divide the plane
+ // into several regions and handle them differently.
+ //
+ // | |
+ // 1 | 3 | 1
+ // | |
+ // -------+-----------+-------
+ // |***********|
+ // 2 |*****4*****| 2
+ // |***********|
+ // -------+-----------+-------
+ // | |
+ // 1 | 3 | 1
+ // | |
+ //
+ // At the center, region 4 represents the frame and is the general case.
+ //
+ // In regions 1 and 2, the prediction block is outside the frame's
+ // boundary horizontally. Therefore the horizontal filtering can be
+ // simplified. Furthermore, in the region 1 (at the four corners), the
+ // prediction is outside the frame's boundary both horizontally and
+ // vertically, so we get a constant prediction block.
+ //
+ // In region 3, the prediction block is outside the frame's boundary
+ // vertically. Unfortunately because we apply the horizontal filters
+ // first, by the time we apply the vertical filters, they no longer see
+ // simple inputs. So the only simplification is that all the rows are
+ // the same, but we still need to apply all the horizontal and vertical
+ // filters.
+
+ // Check for two simple special cases, where the horizontal filter can
+ // be significantly simplified.
+ //
+ // In general, for each row, the horizontal filter is calculated as
+ // follows:
+ // for (int x = -4; x < 4; ++x) {
+ // const int offset = ...;
+ // int sum = first_pass_offset;
+ // for (int k = 0; k < 8; ++k) {
+ // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
+ // sum += kWarpedFilters[offset][k] * src_row[column];
+ // }
+ // ...
+ // }
+ // The column index before clipping, ix4 + x + k - 3, varies in the range
+ // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
+ // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
+ // border index (source_width - 1 or 0, respectively). Then for each x,
+ // the inner for loop of the horizontal filter is reduced to multiplying
+ // the border pixel by the sum of the filter coefficients.
+ if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
+ if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) {
+ // Outside the frame in both directions. One repeated value.
+ WarpRegion1<is_compound, DestType>(src, source_stride, source_width,
+ source_height, ix4, iy4, dst_row,
+ dest_stride);
+ return;
+ }
+ // Outside the frame horizontally. Rows repeated.
+ WarpRegion2<is_compound, DestType>(
+ src, source_stride, source_width, y4, ix4, iy4, gamma, delta,
+ intermediate_result_column, dst_row, dest_stride);
+ return;
+ }
+
+ if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) {
+ // Outside the frame vertically.
+ WarpRegion3<is_compound, DestType>(src, source_stride, source_height, alpha,
+ beta, x4, ix4, iy4, intermediate_result);
+ } else {
+ // Inside the frame.
+ WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta, x4, ix4,
+ iy4, intermediate_result);
+ }
+ // Region 3 and 4 vertical filter.
+ VerticalFilter<is_compound, DestType>(intermediate_result, y4, gamma, delta,
+ dst_row, dest_stride);
+}
+
+template <bool is_compound>
+void Warp_SSE4_1(const void* source, ptrdiff_t source_stride, int source_width,
+ int source_height, const int* warp_params, int subsampling_x,
+ int subsampling_y, int block_start_x, int block_start_y,
+ int block_width, int block_height, int16_t alpha, int16_t beta,
+ int16_t gamma, int16_t delta, void* dest,
+ ptrdiff_t dest_stride) {
+ const auto* const src = static_cast<const uint8_t*>(source);
+ using DestType =
+ typename std::conditional<is_compound, int16_t, uint8_t>::type;
+ auto* dst = static_cast<DestType*>(dest);
+
+ // Warp process applies for each 8x8 block.
+ assert(block_width >= 8);
+ assert(block_height >= 8);
+ const int block_end_x = block_start_x + block_width;
+ const int block_end_y = block_start_y + block_height;
+
+ const int start_x = block_start_x;
+ const int start_y = block_start_y;
+ int src_x = (start_x + 4) << subsampling_x;
+ int src_y = (start_y + 4) << subsampling_y;
+ const int end_x = (block_end_x + 4) << subsampling_x;
+ const int end_y = (block_end_y + 4) << subsampling_y;
+ do {
+ DestType* dst_row = dst;
+ src_x = (start_x + 4) << subsampling_x;
+ do {
+ HandleWarpBlock<is_compound, DestType>(
+ src, source_stride, source_width, source_height, warp_params,
+ subsampling_x, subsampling_y, src_x, src_y, alpha, beta, gamma, delta,
+ dst_row, dest_stride);
+ src_x += (8 << subsampling_x);
+ dst_row += 8;
+ } while (src_x < end_x);
+ dst += 8 * dest_stride;
+ src_y += (8 << subsampling_y);
+ } while (src_y < end_y);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->warp = Warp_SSE4_1</*is_compound=*/false>;
+ dsp->warp_compound = Warp_SSE4_1</*is_compound=*/true>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void WarpInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void WarpInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/warp_sse4.h b/src/dsp/x86/warp_sse4.h
new file mode 100644
index 0000000..a2dc5ca
--- /dev/null
+++ b/src/dsp/x86/warp_sse4.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::warp and Dsp::warp_compound. This function is not
+// thread-safe.
+void WarpInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_Warp
+#define LIBGAV1_Dsp8bpp_Warp LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WarpCompound
+#define LIBGAV1_Dsp8bpp_WarpCompound LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_
diff --git a/src/dsp/x86/weight_mask_sse4.cc b/src/dsp/x86/weight_mask_sse4.cc
new file mode 100644
index 0000000..dfd5662
--- /dev/null
+++ b/src/dsp/x86/weight_mask_sse4.cc
@@ -0,0 +1,464 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/x86/weight_mask_sse4.h"
+
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+constexpr int kRoundingBits8bpp = 4;
+
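+// Computes 8 mask values from a pair of 16-bit predictions: roughly
+// min(64, 38 + (RightShiftWithRounding(|pred_0 - pred_1|, kRoundingBits8bpp)
+// >> 4)), or 64 minus that value when |mask_is_inverse| is set.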
+template <bool mask_is_inverse>
+inline void WeightMask8_SSE4(const int16_t* prediction_0,
+ const int16_t* prediction_1, uint8_t* mask) {
+ const __m128i pred_0 = LoadAligned16(prediction_0);
+ const __m128i pred_1 = LoadAligned16(prediction_1);
+ const __m128i difference = RightShiftWithRounding_U16(
+ _mm_abs_epi16(_mm_sub_epi16(pred_0, pred_1)), kRoundingBits8bpp);
+ const __m128i scaled_difference = _mm_srli_epi16(difference, 4);
+ const __m128i difference_offset = _mm_set1_epi8(38);
+ const __m128i adjusted_difference =
+ _mm_adds_epu8(_mm_packus_epi16(scaled_difference, scaled_difference),
+ difference_offset);
+ const __m128i mask_ceiling = _mm_set1_epi8(64);
+ const __m128i mask_value = _mm_min_epi8(adjusted_difference, mask_ceiling);
+ if (mask_is_inverse) {
+ const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value);
+ StoreLo8(mask, inverted_mask_value);
+ } else {
+ StoreLo8(mask, mask_value);
+ }
+}
+
+#define WEIGHT8_WITHOUT_STRIDE \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask)
+
+#define WEIGHT8_AND_STRIDE \
+ WEIGHT8_WITHOUT_STRIDE; \
+ pred_0 += 8; \
+ pred_1 += 8; \
+ mask += mask_stride
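+// Each *_AND_STRIDE invocation emits one row of mask values and advances the
+// prediction and mask pointers; the last row of a block uses *_WITHOUT_STRIDE,
+// i.e. without advancing past the block.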
+
+template <bool mask_is_inverse>
+void WeightMask8x8_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y = 0;
+ do {
+ WEIGHT8_AND_STRIDE;
+ } while (++y < 7);
+ WEIGHT8_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x16_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT8_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x32_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_WITHOUT_STRIDE;
+}
+
+#define WEIGHT16_WITHOUT_STRIDE \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8)
+
+#define WEIGHT16_AND_STRIDE \
+ WEIGHT16_WITHOUT_STRIDE; \
+ pred_0 += 16; \
+ pred_1 += 16; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask16x8_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ } while (++y < 7);
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x16_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x32_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x64_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ } while (++y3 < 21);
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+#define WEIGHT32_WITHOUT_STRIDE \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24)
+
+#define WEIGHT32_AND_STRIDE \
+ WEIGHT32_WITHOUT_STRIDE; \
+ pred_0 += 32; \
+ pred_1 += 32; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask32x8_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x16_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x32_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x64_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ } while (++y3 < 21);
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+#define WEIGHT64_WITHOUT_STRIDE \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 32, pred_1 + 32, mask + 32); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 40, pred_1 + 40, mask + 40); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 48, pred_1 + 48, mask + 48); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 56, pred_1 + 56, mask + 56)
+
+#define WEIGHT64_AND_STRIDE \
+ WEIGHT64_WITHOUT_STRIDE; \
+ pred_0 += 64; \
+ pred_1 += 64; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask64x16_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x32_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x64_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y3 < 21);
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x128_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y3 < 42);
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x64_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
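+  // Each 128-wide row is written as two 64-wide halves: after the first half
+  // |mask| advances by 64, and after the second by mask_stride - 64, which
+  // lands at the start of the next mask row.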
+ do {
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+ } while (++y3 < 21);
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x128_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+ do {
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+ } while (++y3 < 42);
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
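+// The last index of Dsp::weight_mask selects the regular mask ([0]) or the
+// inverted mask ([1]), matching the |mask_is_inverse| template argument.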
+#define INIT_WEIGHT_MASK_8BPP(width, height, w_index, h_index) \
+ dsp->weight_mask[w_index][h_index][0] = \
+ WeightMask##width##x##height##_SSE4<0>; \
+ dsp->weight_mask[w_index][h_index][1] = WeightMask##width##x##height##_SSE4<1>
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ INIT_WEIGHT_MASK_8BPP(8, 8, 0, 0);
+ INIT_WEIGHT_MASK_8BPP(8, 16, 0, 1);
+ INIT_WEIGHT_MASK_8BPP(8, 32, 0, 2);
+ INIT_WEIGHT_MASK_8BPP(16, 8, 1, 0);
+ INIT_WEIGHT_MASK_8BPP(16, 16, 1, 1);
+ INIT_WEIGHT_MASK_8BPP(16, 32, 1, 2);
+ INIT_WEIGHT_MASK_8BPP(16, 64, 1, 3);
+ INIT_WEIGHT_MASK_8BPP(32, 8, 2, 0);
+ INIT_WEIGHT_MASK_8BPP(32, 16, 2, 1);
+ INIT_WEIGHT_MASK_8BPP(32, 32, 2, 2);
+ INIT_WEIGHT_MASK_8BPP(32, 64, 2, 3);
+ INIT_WEIGHT_MASK_8BPP(64, 16, 3, 1);
+ INIT_WEIGHT_MASK_8BPP(64, 32, 3, 2);
+ INIT_WEIGHT_MASK_8BPP(64, 64, 3, 3);
+ INIT_WEIGHT_MASK_8BPP(64, 128, 3, 4);
+ INIT_WEIGHT_MASK_8BPP(128, 64, 4, 3);
+ INIT_WEIGHT_MASK_8BPP(128, 128, 4, 4);
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void WeightMaskInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void WeightMaskInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/weight_mask_sse4.h b/src/dsp/x86/weight_mask_sse4.h
new file mode 100644
index 0000000..07636b7
--- /dev/null
+++ b/src/dsp/x86/weight_mask_sse4.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_WEIGHT_MASK_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_WEIGHT_MASK_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::weight_mask. This function is not thread-safe.
+void WeightMaskInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x8
+#define LIBGAV1_Dsp8bpp_WeightMask_8x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x16
+#define LIBGAV1_Dsp8bpp_WeightMask_8x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x32
+#define LIBGAV1_Dsp8bpp_WeightMask_8x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x8
+#define LIBGAV1_Dsp8bpp_WeightMask_16x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x16
+#define LIBGAV1_Dsp8bpp_WeightMask_16x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x32
+#define LIBGAV1_Dsp8bpp_WeightMask_16x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x64
+#define LIBGAV1_Dsp8bpp_WeightMask_16x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x8
+#define LIBGAV1_Dsp8bpp_WeightMask_32x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x16
+#define LIBGAV1_Dsp8bpp_WeightMask_32x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x32
+#define LIBGAV1_Dsp8bpp_WeightMask_32x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x64
+#define LIBGAV1_Dsp8bpp_WeightMask_32x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x16
+#define LIBGAV1_Dsp8bpp_WeightMask_64x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x32
+#define LIBGAV1_Dsp8bpp_WeightMask_64x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x64
+#define LIBGAV1_Dsp8bpp_WeightMask_64x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x128
+#define LIBGAV1_Dsp8bpp_WeightMask_64x128 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x64
+#define LIBGAV1_Dsp8bpp_WeightMask_128x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x128
+#define LIBGAV1_Dsp8bpp_WeightMask_128x128 LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_WEIGHT_MASK_SSE4_H_
diff --git a/src/film_grain.cc b/src/film_grain.cc
new file mode 100644
index 0000000..dac37b5
--- /dev/null
+++ b/src/film_grain.cc
@@ -0,0 +1,817 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/film_grain.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <new>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/threadpool.h"
+
+namespace libgav1 {
+
+namespace {
+
+// The kGaussianSequence array contains random samples from a Gaussian
+// distribution with zero mean and standard deviation of about 512 clipped to
+// the range of [-2048, 2047] (representable by a signed integer using 12 bits
+// of precision) and rounded to the nearest multiple of 4.
+//
+// Note: It is important that every element in the kGaussianSequence array be
+// less than 2040, so that RightShiftWithRounding(kGaussianSequence[i], 4) is
+// less than 128 for bitdepth=8 (GrainType=int8_t).
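+// For example, RightShiftWithRounding(2036, 4) = (2036 + 8) >> 4 = 127, which
+// still fits in int8_t, while a value of 2040 would yield (2040 + 8) >> 4 =
+// 128 and overflow it.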
+constexpr int16_t kGaussianSequence[/*2048*/] = {
+ 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820,
+ 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800,
+ 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588,
+ -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368,
+ 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4,
+ 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396,
+ 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740,
+ 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292,
+ 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532,
+ 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704,
+ 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96,
+ -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244,
+ 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136,
+ 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676,
+ -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400,
+ -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844,
+ -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96,
+ -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356,
+ 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280,
+ 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808,
+ 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228,
+ -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136,
+ -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264,
+ -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388,
+ 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500,
+ 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384,
+ 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220,
+ -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148,
+ 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572,
+ -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516,
+ 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916,
+ -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492,
+ 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560,
+ -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108,
+ -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516,
+ -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88,
+ -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196,
+ -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864,
+ 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920,
+ 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564,
+ -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876,
+ -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244,
+ 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184,
+ 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364,
+ -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72,
+ 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24,
+ 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4,
+ -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120,
+ 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108,
+ -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296,
+ 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336,
+ -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164,
+ -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264,
+ 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536,
+ -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296,
+ -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696,
+ 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204,
+ 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212,
+ -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40,
+ 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384,
+ 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8,
+ 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704,
+ -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348,
+ -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592,
+ -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420,
+ 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220,
+ -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208,
+ -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544,
+ -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288,
+ -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240,
+ -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132,
+ 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16,
+ -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044,
+ -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732,
+ 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460,
+ -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52,
+ -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104,
+ -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460,
+ 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716,
+ -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960,
+ 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476,
+ 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692,
+ 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352,
+ -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144,
+ -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44,
+ 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356,
+ 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452,
+ -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552,
+ -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264,
+ -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448,
+ -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588,
+ 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464,
+ 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216,
+ 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132,
+ 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412,
+ 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48,
+ 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196,
+ 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48,
+ -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292,
+ 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32,
+ -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012,
+ -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120,
+ -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56,
+ 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416,
+ -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404,
+ -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92,
+ 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904,
+ 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728,
+ 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584,
+ 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48,
+ 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180,
+ 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528,
+ 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364,
+ -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260,
+ -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324,
+ -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64,
+ 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120,
+ -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168,
+ -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888,
+ 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588,
+ -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484,
+ 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580,
+ 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392,
+ 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80,
+ -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688,
+ 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4,
+ -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300,
+ 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444,
+ 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192,
+ 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160,
+ 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188,
+ -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404,
+ -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400,
+ 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92,
+ -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824,
+ 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620,
+ 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720,
+ 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620,
+ -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508,
+ -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736,
+ 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836,
+ 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180,
+ 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140,
+ -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32,
+ -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916,
+ 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368,
+ -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380,
+ -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572,
+ -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864,
+ 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908,
+ -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84,
+ 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396,
+ -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360,
+ 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928,
+ -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288,
+ 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196,
+ 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504,
+ 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272,
+ 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344,
+ -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208,
+ -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156,
+ -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240,
+ -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432,
+ 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244,
+ 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584,
+ 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24,
+ 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300,
+ -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416,
+ 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380,
+ -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384,
+ 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88,
+ 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876,
+ -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320,
+ -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88,
+ -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196,
+ -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120,
+ 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664,
+ -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0,
+ -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264,
+ -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288,
+ -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56,
+ 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148,
+ 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156,
+ -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144,
+ -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148,
+ 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944,
+ 428, -484};
+static_assert(sizeof(kGaussianSequence) / sizeof(kGaussianSequence[0]) == 2048,
+ "");
+
+// The number of rows in a contiguous group computed by a single worker thread
+// before checking for the next available group.
+constexpr int kFrameChunkHeight = 8;
+
+// |width| and |height| refer to the plane, not the frame, meaning any
+// subsampling should be applied by the caller.
+template <typename Pixel>
+inline void CopyImagePlane(const uint8_t* source_plane, ptrdiff_t source_stride,
+ int width, int height, uint8_t* dest_plane,
+ ptrdiff_t dest_stride) {
+ // If it's the same buffer there's nothing to do.
+ if (source_plane == dest_plane) return;
+
+ int y = 0;
+ do {
+ memcpy(dest_plane, source_plane, width * sizeof(Pixel));
+ source_plane += source_stride;
+ dest_plane += dest_stride;
+ } while (++y < height);
+}
+
+} // namespace
+
+template <int bitdepth>
+FilmGrain<bitdepth>::FilmGrain(const FilmGrainParams& params,
+ bool is_monochrome,
+ bool color_matrix_is_identity, int subsampling_x,
+ int subsampling_y, int width, int height,
+ ThreadPool* thread_pool)
+ : params_(params),
+ is_monochrome_(is_monochrome),
+ color_matrix_is_identity_(color_matrix_is_identity),
+ subsampling_x_(subsampling_x),
+ subsampling_y_(subsampling_y),
+ width_(width),
+ height_(height),
+ template_uv_width_((subsampling_x != 0) ? kMinChromaWidth
+ : kMaxChromaWidth),
+ template_uv_height_((subsampling_y != 0) ? kMinChromaHeight
+ : kMaxChromaHeight),
+ thread_pool_(thread_pool) {}
+
+template <int bitdepth>
+bool FilmGrain<bitdepth>::Init() {
+ // Section 7.18.3.3. Generate grain process.
+ const dsp::Dsp& dsp = *dsp::GetDspTable(bitdepth);
+ // If params_.num_y_points is 0, luma_grain_ will never be read, so we don't
+ // need to generate it.
+ const bool use_luma = params_.num_y_points > 0;
+ if (use_luma) {
+ GenerateLumaGrain(params_, luma_grain_);
+ // If params_.auto_regression_coeff_lag is 0, the filter is the identity
+ // filter and therefore can be skipped.
+ if (params_.auto_regression_coeff_lag > 0) {
+ dsp.film_grain
+ .luma_auto_regression[params_.auto_regression_coeff_lag - 1](
+ params_, luma_grain_);
+ }
+ } else {
+ // Have AddressSanitizer warn if luma_grain_ is used.
+ ASAN_POISON_MEMORY_REGION(luma_grain_, sizeof(luma_grain_));
+ }
+ if (!is_monochrome_) {
+ GenerateChromaGrains(params_, template_uv_width_, template_uv_height_,
+ u_grain_, v_grain_);
+ if (params_.auto_regression_coeff_lag > 0 || use_luma) {
+ dsp.film_grain.chroma_auto_regression[static_cast<int>(
+ use_luma)][params_.auto_regression_coeff_lag](
+ params_, luma_grain_, subsampling_x_, subsampling_y_, u_grain_,
+ v_grain_);
+ }
+ }
+
+ // Section 7.18.3.4. Scaling lookup initialization process.
+
+ // Initialize scaling_lut_y_. If params_.num_y_points > 0, scaling_lut_y_
+ // is used for the Y plane. If params_.chroma_scaling_from_luma is true,
+ // scaling_lut_u_ and scaling_lut_v_ are the same as scaling_lut_y_ and are
+ // set up as aliases. So we need to initialize scaling_lut_y_ under these
+ // two conditions.
+ //
+ // Note: Although it does not seem to make sense, there are test vectors
+ // with chroma_scaling_from_luma=true and params_.num_y_points=0.
+ if (use_luma || params_.chroma_scaling_from_luma) {
+ dsp.film_grain.initialize_scaling_lut(
+ params_.num_y_points, params_.point_y_value, params_.point_y_scaling,
+ scaling_lut_y_);
+ } else {
+ ASAN_POISON_MEMORY_REGION(scaling_lut_y_, sizeof(scaling_lut_y_));
+ }
+ if (!is_monochrome_) {
+ if (params_.chroma_scaling_from_luma) {
+ scaling_lut_u_ = scaling_lut_y_;
+ scaling_lut_v_ = scaling_lut_y_;
+ } else if (params_.num_u_points > 0 || params_.num_v_points > 0) {
+ const size_t buffer_size =
+ (kScalingLookupTableSize + kScalingLookupTablePadding) *
+ (static_cast<int>(params_.num_u_points > 0) +
+ static_cast<int>(params_.num_v_points > 0));
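+      // For example, with scaling points for both U and V, the buffer below
+      // holds two consecutive tables: scaling_lut_u_ gets the first and
+      // scaling_lut_v_ the second.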
+ scaling_lut_chroma_buffer_.reset(new (std::nothrow) uint8_t[buffer_size]);
+ if (scaling_lut_chroma_buffer_ == nullptr) return false;
+
+ uint8_t* buffer = scaling_lut_chroma_buffer_.get();
+ if (params_.num_u_points > 0) {
+ scaling_lut_u_ = buffer;
+ dsp.film_grain.initialize_scaling_lut(
+ params_.num_u_points, params_.point_u_value,
+ params_.point_u_scaling, scaling_lut_u_);
+ buffer += kScalingLookupTableSize + kScalingLookupTablePadding;
+ }
+ if (params_.num_v_points > 0) {
+ scaling_lut_v_ = buffer;
+ dsp.film_grain.initialize_scaling_lut(
+ params_.num_v_points, params_.point_v_value,
+ params_.point_v_scaling, scaling_lut_v_);
+ }
+ }
+ }
+ return true;
+}
+
+template <int bitdepth>
+void FilmGrain<bitdepth>::GenerateLumaGrain(const FilmGrainParams& params,
+ GrainType* luma_grain) {
+ // If params.num_y_points is equal to 0, Section 7.18.3.3 specifies we set
+ // the luma_grain array to all zeros. But the Note at the end of Section
+ // 7.18.3.3 says luma_grain "will never be read in this case". So we don't
+ // call GenerateLumaGrain if params.num_y_points is equal to 0.
+ assert(params.num_y_points > 0);
+ const int shift = 12 - bitdepth + params.grain_scale_shift;
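+  // For example, at bitdepth 10 with grain_scale_shift 0, shift is 2, so each
+  // Gaussian sample is divided by 4 with rounding.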
+ uint16_t seed = params.grain_seed;
+ GrainType* luma_grain_row = luma_grain;
+ for (int y = 0; y < kLumaHeight; ++y) {
+ for (int x = 0; x < kLumaWidth; ++x) {
+ luma_grain_row[x] = RightShiftWithRounding(
+ kGaussianSequence[GetFilmGrainRandomNumber(11, &seed)], shift);
+ }
+ luma_grain_row += kLumaWidth;
+ }
+}
+
+template <int bitdepth>
+void FilmGrain<bitdepth>::GenerateChromaGrains(const FilmGrainParams& params,
+ int chroma_width,
+ int chroma_height,
+ GrainType* u_grain,
+ GrainType* v_grain) {
+ const int shift = 12 - bitdepth + params.grain_scale_shift;
+ if (params.num_u_points == 0 && !params.chroma_scaling_from_luma) {
+ memset(u_grain, 0, chroma_height * chroma_width * sizeof(*u_grain));
+ } else {
+ uint16_t seed = params.grain_seed ^ 0xb524;
+ GrainType* u_grain_row = u_grain;
+ assert(chroma_width > 0);
+ assert(chroma_height > 0);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ u_grain_row[x] = RightShiftWithRounding(
+ kGaussianSequence[GetFilmGrainRandomNumber(11, &seed)], shift);
+ } while (++x < chroma_width);
+
+ u_grain_row += chroma_width;
+ } while (++y < chroma_height);
+ }
+ if (params.num_v_points == 0 && !params.chroma_scaling_from_luma) {
+ memset(v_grain, 0, chroma_height * chroma_width * sizeof(*v_grain));
+ } else {
+ GrainType* v_grain_row = v_grain;
+ uint16_t seed = params.grain_seed ^ 0x49d8;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ v_grain_row[x] = RightShiftWithRounding(
+ kGaussianSequence[GetFilmGrainRandomNumber(11, &seed)], shift);
+ } while (++x < chroma_width);
+
+ v_grain_row += chroma_width;
+ } while (++y < chroma_height);
+ }
+}
+
+template <int bitdepth>
+bool FilmGrain<bitdepth>::AllocateNoiseStripes() {
+ const int half_height = DivideBy2(height_ + 1);
+ assert(half_height > 0);
+ // ceil(half_height / 16.0)
+ const int max_luma_num = DivideBy16(half_height + 15);
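+  // For example, a 1080-row frame gives half_height = 540 and
+  // max_luma_num = (540 + 15) >> 4 = 34 noise stripes per plane.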
+ constexpr int kNoiseStripeHeight = 34;
+ size_t noise_buffer_size = kNoiseStripePadding;
+ if (params_.num_y_points > 0) {
+ noise_buffer_size += max_luma_num * kNoiseStripeHeight * width_;
+ }
+ if (!is_monochrome_) {
+ noise_buffer_size += 2 * max_luma_num *
+ (kNoiseStripeHeight >> subsampling_y_) *
+ SubsampledValue(width_, subsampling_x_);
+ }
+ noise_buffer_.reset(new (std::nothrow) GrainType[noise_buffer_size]);
+ if (noise_buffer_ == nullptr) return false;
+ GrainType* noise_buffer = noise_buffer_.get();
+ if (params_.num_y_points > 0) {
+ noise_stripes_[kPlaneY].Reset(max_luma_num, kNoiseStripeHeight * width_,
+ noise_buffer);
+ noise_buffer += max_luma_num * kNoiseStripeHeight * width_;
+ }
+ if (!is_monochrome_) {
+ noise_stripes_[kPlaneU].Reset(max_luma_num,
+ (kNoiseStripeHeight >> subsampling_y_) *
+ SubsampledValue(width_, subsampling_x_),
+ noise_buffer);
+ noise_buffer += max_luma_num * (kNoiseStripeHeight >> subsampling_y_) *
+ SubsampledValue(width_, subsampling_x_);
+ noise_stripes_[kPlaneV].Reset(max_luma_num,
+ (kNoiseStripeHeight >> subsampling_y_) *
+ SubsampledValue(width_, subsampling_x_),
+ noise_buffer);
+ }
+ return true;
+}
+
+template <int bitdepth>
+bool FilmGrain<bitdepth>::AllocateNoiseImage() {
+ if (params_.num_y_points > 0 &&
+ !noise_image_[kPlaneY].Reset(height_, width_ + kNoiseImagePadding,
+ /*zero_initialize=*/false)) {
+ return false;
+ }
+ if (!is_monochrome_) {
+ if (!noise_image_[kPlaneU].Reset(
+ (height_ + subsampling_y_) >> subsampling_y_,
+ ((width_ + subsampling_x_) >> subsampling_x_) + kNoiseImagePadding,
+ /*zero_initialize=*/false)) {
+ return false;
+ }
+ if (!noise_image_[kPlaneV].Reset(
+ (height_ + subsampling_y_) >> subsampling_y_,
+ ((width_ + subsampling_x_) >> subsampling_x_) + kNoiseImagePadding,
+ /*zero_initialize=*/false)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+// Uses |stripe_start_offset| (derived from the overlap flag by the caller) to
+// skip rows that are covered by the overlap computation.
+template <int bitdepth>
+void FilmGrain<bitdepth>::ConstructNoiseImage(
+ const Array2DView<GrainType>* noise_stripes, int width, int height,
+ int subsampling_x, int subsampling_y, int stripe_start_offset,
+ Array2D<GrainType>* noise_image) {
+ const int plane_width = (width + subsampling_x) >> subsampling_x;
+ const int plane_height = (height + subsampling_y) >> subsampling_y;
+ const int stripe_height = 32 >> subsampling_y;
+ const int stripe_mask = stripe_height - 1;
+ int y = 0;
+ // |luma_num| = y >> (5 - |subsampling_y|). Hence |luma_num| == 0 for all y
+ // below 32 (or below 16 when |subsampling_y| is 1).
+ const GrainType* first_noise_stripe = (*noise_stripes)[0];
+ do {
+ memcpy((*noise_image)[y], first_noise_stripe + y * plane_width,
+ plane_width * sizeof(first_noise_stripe[0]));
+ } while (++y < std::min(stripe_height, plane_height));
+ // End special iterations for luma_num == 0.
+
+ int luma_num = 1;
+ for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
+ const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+ int i = stripe_start_offset;
+ do {
+ memcpy((*noise_image)[y + i], noise_stripe + i * plane_width,
+ plane_width * sizeof(noise_stripe[0]));
+ } while (++i < stripe_height);
+ }
+
+ // If there is a partial stripe, copy any rows beyond the overlap rows.
+ const int remaining_height = plane_height - y;
+ if (remaining_height > stripe_start_offset) {
+ assert(luma_num < noise_stripes->rows());
+ const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+ int i = stripe_start_offset;
+ do {
+ memcpy((*noise_image)[y + i], noise_stripe + i * plane_width,
+ plane_width * sizeof(noise_stripe[0]));
+ } while (++i < remaining_height);
+ }
+}
+
+template <int bitdepth>
+void FilmGrain<bitdepth>::BlendNoiseChromaWorker(
+ const dsp::Dsp& dsp, const Plane* planes, int num_planes,
+ std::atomic<int>* job_counter, int min_value, int max_chroma,
+ const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
+ const uint8_t* source_plane_u, const uint8_t* source_plane_v,
+ ptrdiff_t source_stride_uv, uint8_t* dest_plane_u, uint8_t* dest_plane_v,
+ ptrdiff_t dest_stride_uv) {
+ assert(num_planes > 0);
+ const int full_jobs_per_plane = height_ / kFrameChunkHeight;
+ const int remainder_job_height = height_ & (kFrameChunkHeight - 1);
+ const int total_full_jobs = full_jobs_per_plane * num_planes;
+ // If the frame height is not a multiple of kFrameChunkHeight, one job with
+ // a smaller number of rows is necessary at the end of each plane.
+ const int total_jobs =
+ total_full_jobs + ((remainder_job_height == 0) ? 0 : num_planes);
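+  // For example, height_ = 100 with both chroma planes active gives 12 full
+  // 8-row jobs per plane plus one 4-row remainder job per plane, so
+  // total_jobs = 26.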
+ int job_index;
+ // Each job corresponds to a slice of kFrameChunkHeight rows in the luma
+ // plane. dsp->blend_noise_chroma handles subsampling.
+ // This loop body handles a slice of one of the active planes, alternating
+ // between them, so that threads working on consecutive jobs keep the same
+ // region of the luma source in working memory.
+ while ((job_index = job_counter->fetch_add(1, std::memory_order_relaxed)) <
+ total_jobs) {
+ const Plane plane = planes[job_index % num_planes];
+ const int slice_index = job_index / num_planes;
+ const int start_height = slice_index * kFrameChunkHeight;
+ const int job_height = std::min(height_ - start_height, kFrameChunkHeight);
+
+ const auto* source_cursor_y = reinterpret_cast<const Pixel*>(
+ source_plane_y + start_height * source_stride_y);
+ const uint8_t* scaling_lut_uv;
+ const uint8_t* source_plane_uv;
+ uint8_t* dest_plane_uv;
+
+ if (plane == kPlaneU) {
+ scaling_lut_uv = scaling_lut_u_;
+ source_plane_uv = source_plane_u;
+ dest_plane_uv = dest_plane_u;
+ } else {
+ assert(plane == kPlaneV);
+ scaling_lut_uv = scaling_lut_v_;
+ source_plane_uv = source_plane_v;
+ dest_plane_uv = dest_plane_v;
+ }
+ const auto* source_cursor_uv = reinterpret_cast<const Pixel*>(
+ source_plane_uv + (start_height >> subsampling_y_) * source_stride_uv);
+ auto* dest_cursor_uv = reinterpret_cast<Pixel*>(
+ dest_plane_uv + (start_height >> subsampling_y_) * dest_stride_uv);
+ dsp.film_grain.blend_noise_chroma[params_.chroma_scaling_from_luma](
+ plane, params_, noise_image_, min_value, max_chroma, width_, job_height,
+ start_height, subsampling_x_, subsampling_y_, scaling_lut_uv,
+ source_cursor_y, source_stride_y, source_cursor_uv, source_stride_uv,
+ dest_cursor_uv, dest_stride_uv);
+ }
+}
+
+template <int bitdepth>
+void FilmGrain<bitdepth>::BlendNoiseLumaWorker(
+ const dsp::Dsp& dsp, std::atomic<int>* job_counter, int min_value,
+ int max_luma, const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
+ uint8_t* dest_plane_y, ptrdiff_t dest_stride_y) {
+ const int total_full_jobs = height_ / kFrameChunkHeight;
+ const int remainder_job_height = height_ & (kFrameChunkHeight - 1);
+ const int total_jobs =
+ total_full_jobs + static_cast<int>(remainder_job_height > 0);
+ int job_index;
+ // Each job is some number of rows in a plane.
+ while ((job_index = job_counter->fetch_add(1, std::memory_order_relaxed)) <
+ total_jobs) {
+ const int start_height = job_index * kFrameChunkHeight;
+ const int job_height = std::min(height_ - start_height, kFrameChunkHeight);
+
+ const auto* source_cursor_y = reinterpret_cast<const Pixel*>(
+ source_plane_y + start_height * source_stride_y);
+ auto* dest_cursor_y =
+ reinterpret_cast<Pixel*>(dest_plane_y + start_height * dest_stride_y);
+ dsp.film_grain.blend_noise_luma(
+ noise_image_, min_value, max_luma, params_.chroma_scaling, width_,
+ job_height, start_height, scaling_lut_y_, source_cursor_y,
+ source_stride_y, dest_cursor_y, dest_stride_y);
+ }
+}
+
+template <int bitdepth>
+bool FilmGrain<bitdepth>::AddNoise(
+ const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
+ const uint8_t* source_plane_u, const uint8_t* source_plane_v,
+ ptrdiff_t source_stride_uv, uint8_t* dest_plane_y, ptrdiff_t dest_stride_y,
+ uint8_t* dest_plane_u, uint8_t* dest_plane_v, ptrdiff_t dest_stride_uv) {
+ if (!Init()) {
+ LIBGAV1_DLOG(ERROR, "Init() failed.");
+ return false;
+ }
+ if (!AllocateNoiseStripes()) {
+ LIBGAV1_DLOG(ERROR, "AllocateNoiseStripes() failed.");
+ return false;
+ }
+
+ const dsp::Dsp& dsp = *dsp::GetDspTable(bitdepth);
+ const bool use_luma = params_.num_y_points > 0;
+
+ // Construct noise stripes.
+ if (use_luma) {
+ // The luma plane is never subsampled.
+ dsp.film_grain
+ .construct_noise_stripes[static_cast<int>(params_.overlap_flag)](
+ luma_grain_, params_.grain_seed, width_, height_,
+ /*subsampling_x=*/0, /*subsampling_y=*/0, &noise_stripes_[kPlaneY]);
+ }
+ if (!is_monochrome_) {
+ dsp.film_grain
+ .construct_noise_stripes[static_cast<int>(params_.overlap_flag)](
+ u_grain_, params_.grain_seed, width_, height_, subsampling_x_,
+ subsampling_y_, &noise_stripes_[kPlaneU]);
+ dsp.film_grain
+ .construct_noise_stripes[static_cast<int>(params_.overlap_flag)](
+ v_grain_, params_.grain_seed, width_, height_, subsampling_x_,
+ subsampling_y_, &noise_stripes_[kPlaneV]);
+ }
+
+ if (!AllocateNoiseImage()) {
+ LIBGAV1_DLOG(ERROR, "AllocateNoiseImage() failed.");
+ return false;
+ }
+
+ // Construct noise image.
+ if (use_luma) {
+ ConstructNoiseImage(
+ &noise_stripes_[kPlaneY], width_, height_, /*subsampling_x=*/0,
+ /*subsampling_y=*/0, static_cast<int>(params_.overlap_flag) << 1,
+ &noise_image_[kPlaneY]);
+ if (params_.overlap_flag) {
+ dsp.film_grain.construct_noise_image_overlap(
+ &noise_stripes_[kPlaneY], width_, height_, /*subsampling_x=*/0,
+ /*subsampling_y=*/0, &noise_image_[kPlaneY]);
+ }
+ }
+ if (!is_monochrome_) {
+ ConstructNoiseImage(&noise_stripes_[kPlaneU], width_, height_,
+ subsampling_x_, subsampling_y_,
+ static_cast<int>(params_.overlap_flag)
+ << (1 - subsampling_y_),
+ &noise_image_[kPlaneU]);
+ ConstructNoiseImage(&noise_stripes_[kPlaneV], width_, height_,
+ subsampling_x_, subsampling_y_,
+ static_cast<int>(params_.overlap_flag)
+ << (1 - subsampling_y_),
+ &noise_image_[kPlaneV]);
+ if (params_.overlap_flag) {
+ dsp.film_grain.construct_noise_image_overlap(
+ &noise_stripes_[kPlaneU], width_, height_, subsampling_x_,
+ subsampling_y_, &noise_image_[kPlaneU]);
+ dsp.film_grain.construct_noise_image_overlap(
+ &noise_stripes_[kPlaneV], width_, height_, subsampling_x_,
+ subsampling_y_, &noise_image_[kPlaneV]);
+ }
+ }
+
+ // Blend noise image.
+ int min_value;
+ int max_luma;
+ int max_chroma;
+ if (params_.clip_to_restricted_range) {
+ min_value = 16 << (bitdepth - 8);
+ max_luma = 235 << (bitdepth - 8);
+ if (color_matrix_is_identity_) {
+ max_chroma = max_luma;
+ } else {
+ max_chroma = 240 << (bitdepth - 8);
+ }
+ } else {
+ min_value = 0;
+ max_luma = (256 << (bitdepth - 8)) - 1;
+ max_chroma = max_luma;
+ }
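+  // For example, at bitdepth 10 with clip_to_restricted_range set,
+  // min_value = 64, max_luma = 940, and max_chroma = 960 (or 940 when the
+  // color matrix is the identity); without clipping, max_luma = max_chroma =
+  // 1023.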
+
+ // Handle all chroma planes first because luma source may be altered in place.
+ if (!is_monochrome_) {
+ // A plain array is used here because a Vector cannot be captured by copy in
+ // the lambda that is scheduled on the thread pool.
+ Plane planes_to_blend[2];
+ int num_planes = 0;
+ if (params_.chroma_scaling_from_luma) {
+ // Both noise planes are computed from the luma scaling lookup table.
+ planes_to_blend[num_planes++] = kPlaneU;
+ planes_to_blend[num_planes++] = kPlaneV;
+ } else {
+ const int height_uv = SubsampledValue(height_, subsampling_y_);
+ const int width_uv = SubsampledValue(width_, subsampling_x_);
+
+ // Noise is applied according to a lookup table defined by piecewise
+ // linear "points." If the lookup table is empty, that corresponds to
+ // outputting zero noise.
+ if (params_.num_u_points == 0) {
+ CopyImagePlane<Pixel>(source_plane_u, source_stride_uv, width_uv,
+ height_uv, dest_plane_u, dest_stride_uv);
+ } else {
+ planes_to_blend[num_planes++] = kPlaneU;
+ }
+ if (params_.num_v_points == 0) {
+ CopyImagePlane<Pixel>(source_plane_v, source_stride_uv, width_uv,
+ height_uv, dest_plane_v, dest_stride_uv);
+ } else {
+ planes_to_blend[num_planes++] = kPlaneV;
+ }
+ }
+ if (thread_pool_ != nullptr && num_planes > 0) {
+ const int num_workers = thread_pool_->num_threads();
+ BlockingCounter pending_workers(num_workers);
+ std::atomic<int> job_counter(0);
+ for (int i = 0; i < num_workers; ++i) {
+ thread_pool_->Schedule([this, dsp, &pending_workers, &planes_to_blend,
+ num_planes, &job_counter, min_value, max_chroma,
+ source_plane_y, source_stride_y, source_plane_u,
+ source_plane_v, source_stride_uv, dest_plane_u,
+ dest_plane_v, dest_stride_uv]() {
+ BlendNoiseChromaWorker(dsp, planes_to_blend, num_planes, &job_counter,
+ min_value, max_chroma, source_plane_y,
+ source_stride_y, source_plane_u,
+ source_plane_v, source_stride_uv, dest_plane_u,
+ dest_plane_v, dest_stride_uv);
+ pending_workers.Decrement();
+ });
+ }
+ BlendNoiseChromaWorker(
+ dsp, planes_to_blend, num_planes, &job_counter, min_value, max_chroma,
+ source_plane_y, source_stride_y, source_plane_u, source_plane_v,
+ source_stride_uv, dest_plane_u, dest_plane_v, dest_stride_uv);
+
+ pending_workers.Wait();
+ } else {
+ // Single threaded.
+ if (params_.num_u_points > 0 || params_.chroma_scaling_from_luma) {
+ dsp.film_grain.blend_noise_chroma[params_.chroma_scaling_from_luma](
+ kPlaneU, params_, noise_image_, min_value, max_chroma, width_,
+ height_, /*start_height=*/0, subsampling_x_, subsampling_y_,
+ scaling_lut_u_, source_plane_y, source_stride_y, source_plane_u,
+ source_stride_uv, dest_plane_u, dest_stride_uv);
+ }
+ if (params_.num_v_points > 0 || params_.chroma_scaling_from_luma) {
+ dsp.film_grain.blend_noise_chroma[params_.chroma_scaling_from_luma](
+ kPlaneV, params_, noise_image_, min_value, max_chroma, width_,
+ height_, /*start_height=*/0, subsampling_x_, subsampling_y_,
+ scaling_lut_v_, source_plane_y, source_stride_y, source_plane_v,
+ source_stride_uv, dest_plane_v, dest_stride_uv);
+ }
+ }
+ }
+ if (use_luma) {
+ if (thread_pool_ != nullptr) {
+ const int num_workers = thread_pool_->num_threads();
+ BlockingCounter pending_workers(num_workers);
+ std::atomic<int> job_counter(0);
+ for (int i = 0; i < num_workers; ++i) {
+ thread_pool_->Schedule(
+ [this, dsp, &pending_workers, &job_counter, min_value, max_luma,
+ source_plane_y, source_stride_y, dest_plane_y, dest_stride_y]() {
+ BlendNoiseLumaWorker(dsp, &job_counter, min_value, max_luma,
+ source_plane_y, source_stride_y,
+ dest_plane_y, dest_stride_y);
+ pending_workers.Decrement();
+ });
+ }
+
+ BlendNoiseLumaWorker(dsp, &job_counter, min_value, max_luma,
+ source_plane_y, source_stride_y, dest_plane_y,
+ dest_stride_y);
+ pending_workers.Wait();
+ } else {
+ dsp.film_grain.blend_noise_luma(
+ noise_image_, min_value, max_luma, params_.chroma_scaling, width_,
+ height_, /*start_height=*/0, scaling_lut_y_, source_plane_y,
+ source_stride_y, dest_plane_y, dest_stride_y);
+ }
+ } else {
+ CopyImagePlane<Pixel>(source_plane_y, source_stride_y, width_, height_,
+ dest_plane_y, dest_stride_y);
+ }
+
+ return true;
+}
+
+// Explicit instantiations.
+template class FilmGrain<8>;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template class FilmGrain<10>;
+#endif
+
+} // namespace libgav1
diff --git a/src/film_grain.h b/src/film_grain.h
new file mode 100644
index 0000000..b588f6d
--- /dev/null
+++ b/src/film_grain.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_FILM_GRAIN_H_
+#define LIBGAV1_SRC_FILM_GRAIN_H_
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/threadpool.h"
+#include "src/utils/types.h"
+#include "src/utils/vector.h"
+
+namespace libgav1 {
+
+// Film grain synthesis function signature. Section 7.18.3.
+// This function generates film grain noise and blends the noise with the
+// decoded frame.
+// |source_plane_y|, |source_plane_u|, and |source_plane_v| are the plane
+// buffers of the decoded frame. They are blended with the film grain noise and
+// written to |dest_plane_y|, |dest_plane_u|, and |dest_plane_v| as final
+// output for display. |source_plane_p| and |dest_plane_p| (where p is y, u, or
+// v) may point to the same buffer, in which case the film grain noise is added
+// in place.
+// |film_grain_params| are parameters read from frame header.
+// If |is_monochrome| is true, only the Y plane needs to be processed.
+// |color_matrix_is_identity| is true if the matrix_coefficients field in the
+// sequence header's color config is MC_IDENTITY.
+// |width| is the upscaled width of the frame.
+// |height| is the frame height.
+// |subsampling_x| and |subsampling_y| are subsamplings for UV planes, not used
+// if |is_monochrome| is true.
+// Returns true on success, or false on failure (e.g., out of memory).
+using FilmGrainSynthesisFunc = bool (*)(
+ const void* source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_u, ptrdiff_t source_stride_u,
+ const void* source_plane_v, ptrdiff_t source_stride_v,
+ const FilmGrainParams& film_grain_params, bool is_monochrome,
+ bool color_matrix_is_identity, int width, int height, int subsampling_x,
+ int subsampling_y, void* dest_plane_y, ptrdiff_t dest_stride_y,
+ void* dest_plane_u, ptrdiff_t dest_stride_u, void* dest_plane_v,
+ ptrdiff_t dest_stride_v);
+
+// Section 7.18.3.5. Add noise synthesis process.
+template <int bitdepth>
+class FilmGrain {
+ public:
+ using GrainType =
+ typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+
+ FilmGrain(const FilmGrainParams& params, bool is_monochrome,
+ bool color_matrix_is_identity, int subsampling_x, int subsampling_y,
+ int width, int height, ThreadPool* thread_pool);
+
+ // Note: These static methods are declared public so that the unit tests can
+ // call them.
+
+ static void GenerateLumaGrain(const FilmGrainParams& params,
+ GrainType* luma_grain);
+
+ // Generates white noise arrays u_grain and v_grain chroma_width samples wide
+ // and chroma_height samples high.
+ static void GenerateChromaGrains(const FilmGrainParams& params,
+ int chroma_width, int chroma_height,
+ GrainType* u_grain, GrainType* v_grain);
+
+ // Copies rows from |noise_stripes| to |noise_image|, skipping rows that are
+ // subject to overlap.
+ static void ConstructNoiseImage(const Array2DView<GrainType>* noise_stripes,
+ int width, int height, int subsampling_x,
+ int subsampling_y, int stripe_start_offset,
+ Array2D<GrainType>* noise_image);
+
+ // Combines the film grain with the image data.
+ bool AddNoise(const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
+ const uint8_t* source_plane_u, const uint8_t* source_plane_v,
+ ptrdiff_t source_stride_uv, uint8_t* dest_plane_y,
+ ptrdiff_t dest_stride_y, uint8_t* dest_plane_u,
+ uint8_t* dest_plane_v, ptrdiff_t dest_stride_uv);
+
+ private:
+ using Pixel =
+ typename std::conditional<bitdepth == 8, uint8_t, uint16_t>::type;
+
+ bool Init();
+
+ // Allocates noise_stripes_.
+ bool AllocateNoiseStripes();
+
+ bool AllocateNoiseImage();
+
+ void BlendNoiseChromaWorker(const dsp::Dsp& dsp, const Plane* planes,
+ int num_planes, std::atomic<int>* job_counter,
+ int min_value, int max_chroma,
+ const uint8_t* source_plane_y,
+ ptrdiff_t source_stride_y,
+ const uint8_t* source_plane_u,
+ const uint8_t* source_plane_v,
+ ptrdiff_t source_stride_uv, uint8_t* dest_plane_u,
+ uint8_t* dest_plane_v, ptrdiff_t dest_stride_uv);
+
+ void BlendNoiseLumaWorker(const dsp::Dsp& dsp, std::atomic<int>* job_counter,
+ int min_value, int max_luma,
+ const uint8_t* source_plane_y,
+ ptrdiff_t source_stride_y, uint8_t* dest_plane_y,
+ ptrdiff_t dest_stride_y);
+
+ const FilmGrainParams& params_;
+ const bool is_monochrome_;
+ const bool color_matrix_is_identity_;
+ const int subsampling_x_;
+ const int subsampling_y_;
+ // Frame width and height.
+ const int width_;
+ const int height_;
+ // Section 7.18.3.3, Dimensions of the noise templates for chroma, which are
+ // known as CbGrain and CrGrain.
+ // These templates are used to construct the noise image for each plane by
+ // copying 32x32 blocks with pseudorandom offsets, into "noise stripes."
+ // The luma noise template, known as the LumaGrain array, is an 82x73 block.
+ // Under subsampling, the chroma templates shrink to 44 samples wide and 38
+ // samples high.
+ // For more details see:
+ // A. Norkin and N. Birkbeck, "Film Grain Synthesis for AV1 Video Codec," 2018
+ // Data Compression Conference, Snowbird, UT, 2018, pp. 3-12.
+ const int template_uv_width_;
+ const int template_uv_height_;
+ // LumaGrain. The luma_grain array contains white noise generated for luma.
+ // The array size is fixed but subject to further optimization for SIMD.
+ GrainType luma_grain_[kLumaHeight * kLumaWidth];
+ // CbGrain and CrGrain. The maximum size of the u_grain and v_grain arrays is
+ // kMaxChromaHeight * kMaxChromaWidth. The actual size is
+ // template_uv_height_ * template_uv_width_.
+ GrainType u_grain_[kMaxChromaHeight * kMaxChromaWidth];
+ GrainType v_grain_[kMaxChromaHeight * kMaxChromaWidth];
+ // Scaling lookup tables.
+ uint8_t scaling_lut_y_[kScalingLookupTableSize + kScalingLookupTablePadding];
+ uint8_t* scaling_lut_u_ = nullptr;
+ uint8_t* scaling_lut_v_ = nullptr;
+ // If allocated, this buffer holds one scaling lookup table of
+ // kScalingLookupTableSize + kScalingLookupTablePadding bytes for each chroma
+ // plane that has scaling points, and scaling_lut_u_ and scaling_lut_v_ point
+ // into it. When chroma scaling is derived from luma, this buffer is not
+ // allocated and scaling_lut_u_ and scaling_lut_v_ alias scaling_lut_y_.
+ std::unique_ptr<uint8_t[]> scaling_lut_chroma_buffer_;
+
+ // A two-dimensional array of noise data for each plane. Generated for each 32
+ // luma sample high stripe of the image. The first dimension is called
+ // luma_num. The second dimension is the size of one noise stripe.
+ //
+ // Each row of the Array2DView noise_stripes_[plane] is a conceptually
+ // two-dimensional array of |GrainType|s. The two-dimensional array of
+ // |GrainType|s is flattened into a one-dimensional buffer in this
+ // implementation.
+ //
+ // noise_stripes_[kPlaneY][luma_num] is an array that has 34 rows and
+ // |width_| columns and contains noise for the luma component.
+ //
+ // noise_stripes_[kPlaneU][luma_num] or noise_stripes_[kPlaneV][luma_num]
+ // is an array that has (34 >> subsampling_y_) rows and
+ // SubsampledValue(width_, subsampling_x_) columns and contains noise for the
+ // chroma components.
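+ // For example, for a 1920-wide luma plane, each noise_stripes_[kPlaneY][n]
+ // holds 34 * 1920 GrainType values.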
+ Array2DView<GrainType> noise_stripes_[kMaxPlanes];
+ // Owns the memory that the elements of noise_stripes_ point to.
+ std::unique_ptr<GrainType[]> noise_buffer_;
+
+ Array2D<GrainType> noise_image_[kMaxPlanes];
+ ThreadPool* const thread_pool_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_FILM_GRAIN_H_
diff --git a/src/frame_buffer.cc b/src/frame_buffer.cc
new file mode 100644
index 0000000..50c7756
--- /dev/null
+++ b/src/frame_buffer.cc
@@ -0,0 +1,151 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/frame_buffer.h"
+
+#include <cstdint>
+
+#include "src/frame_buffer_utils.h"
+#include "src/utils/common.h"
+
+extern "C" {
+
+Libgav1StatusCode Libgav1ComputeFrameBufferInfo(
+ int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+ int left_border, int right_border, int top_border, int bottom_border,
+ int stride_alignment, Libgav1FrameBufferInfo* info) {
+ switch (bitdepth) {
+ case 8:
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ case 10:
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ case 12:
+#endif
+ break;
+ default:
+ return kLibgav1StatusInvalidArgument;
+ }
+ switch (image_format) {
+ case kLibgav1ImageFormatYuv420:
+ case kLibgav1ImageFormatYuv422:
+ case kLibgav1ImageFormatYuv444:
+ case kLibgav1ImageFormatMonochrome400:
+ break;
+ default:
+ return kLibgav1StatusInvalidArgument;
+ }
+ // All int arguments must be nonnegative. Borders must be a multiple of 2.
+ // |stride_alignment| must be a power of 2.
+ if ((width | height | left_border | right_border | top_border |
+ bottom_border | stride_alignment) < 0 ||
+ ((left_border | right_border | top_border | bottom_border) & 1) != 0 ||
+ (stride_alignment & (stride_alignment - 1)) != 0 || info == nullptr) {
+ return kLibgav1StatusInvalidArgument;
+ }
+
+ bool is_monochrome;
+ int8_t subsampling_x;
+ int8_t subsampling_y;
+ libgav1::DecomposeImageFormat(image_format, &is_monochrome, &subsampling_x,
+ &subsampling_y);
+
+ // Calculate y_stride (in bytes). It is padded to a multiple of
+ // |stride_alignment| bytes.
+ int y_stride = width + left_border + right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) y_stride *= sizeof(uint16_t);
+#endif
+ y_stride = libgav1::Align(y_stride, stride_alignment);
+ // Size of the Y buffer in bytes.
+ const uint64_t y_buffer_size =
+ (height + top_border + bottom_border) * static_cast<uint64_t>(y_stride) +
+ (stride_alignment - 1);
+
+ const int uv_width =
+ is_monochrome ? 0 : libgav1::SubsampledValue(width, subsampling_x);
+ const int uv_height =
+ is_monochrome ? 0 : libgav1::SubsampledValue(height, subsampling_y);
+ const int uv_left_border = is_monochrome ? 0 : left_border >> subsampling_x;
+ const int uv_right_border = is_monochrome ? 0 : right_border >> subsampling_x;
+ const int uv_top_border = is_monochrome ? 0 : top_border >> subsampling_y;
+ const int uv_bottom_border =
+ is_monochrome ? 0 : bottom_border >> subsampling_y;
+
+ // Calculate uv_stride (in bytes). It is padded to a multiple of
+ // |stride_alignment| bytes.
+ int uv_stride = uv_width + uv_left_border + uv_right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) uv_stride *= sizeof(uint16_t);
+#endif
+ uv_stride = libgav1::Align(uv_stride, stride_alignment);
+ // Size of the U or V buffer in bytes.
+ const uint64_t uv_buffer_size =
+ is_monochrome ? 0
+ : (uv_height + uv_top_border + uv_bottom_border) *
+ static_cast<uint64_t>(uv_stride) +
+ (stride_alignment - 1);
+
+ // Check if it is safe to cast y_buffer_size and uv_buffer_size to size_t.
+ if (y_buffer_size > SIZE_MAX || uv_buffer_size > SIZE_MAX) {
+ return kLibgav1StatusInvalidArgument;
+ }
+
+ int left_border_bytes = left_border;
+ int uv_left_border_bytes = uv_left_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) {
+ left_border_bytes *= sizeof(uint16_t);
+ uv_left_border_bytes *= sizeof(uint16_t);
+ }
+#endif
+
+ info->y_stride = y_stride;
+ info->uv_stride = uv_stride;
+ info->y_buffer_size = static_cast<size_t>(y_buffer_size);
+ info->uv_buffer_size = static_cast<size_t>(uv_buffer_size);
+ info->y_plane_offset = top_border * y_stride + left_border_bytes;
+ info->uv_plane_offset = uv_top_border * uv_stride + uv_left_border_bytes;
+ info->stride_alignment = stride_alignment;
+ return kLibgav1StatusOk;
+}
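+
+// Illustrative example (not part of the upstream source): for an 8-bit
+// 1280x720 YUV 4:2:0 frame with 8-pixel borders on every side and
+// stride_alignment 16, the function above yields y_stride = 1296,
+// uv_stride = Align(640 + 4 + 4, 16) = 656, and
+// y_buffer_size = (720 + 16) * 1296 + 15 bytes.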
+
+Libgav1StatusCode Libgav1SetFrameBuffer(const Libgav1FrameBufferInfo* info,
+ uint8_t* y_buffer, uint8_t* u_buffer,
+ uint8_t* v_buffer,
+ void* buffer_private_data,
+ Libgav1FrameBuffer* frame_buffer) {
+ if (info == nullptr ||
+ (info->uv_buffer_size == 0 &&
+ (u_buffer != nullptr || v_buffer != nullptr)) ||
+ frame_buffer == nullptr) {
+ return kLibgav1StatusInvalidArgument;
+ }
+ if (y_buffer == nullptr || (info->uv_buffer_size != 0 &&
+ (u_buffer == nullptr || v_buffer == nullptr))) {
+ return kLibgav1StatusOutOfMemory;
+ }
+ frame_buffer->plane[0] = libgav1::AlignAddr(y_buffer + info->y_plane_offset,
+ info->stride_alignment);
+ frame_buffer->plane[1] = libgav1::AlignAddr(u_buffer + info->uv_plane_offset,
+ info->stride_alignment);
+ frame_buffer->plane[2] = libgav1::AlignAddr(v_buffer + info->uv_plane_offset,
+ info->stride_alignment);
+ frame_buffer->stride[0] = info->y_stride;
+ frame_buffer->stride[1] = frame_buffer->stride[2] = info->uv_stride;
+ frame_buffer->private_data = buffer_private_data;
+ return kLibgav1StatusOk;
+}
+
+} // extern "C"
diff --git a/src/frame_buffer_utils.h b/src/frame_buffer_utils.h
new file mode 100644
index 0000000..d41437e
--- /dev/null
+++ b/src/frame_buffer_utils.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_FRAME_BUFFER_UTILS_H_
+#define LIBGAV1_SRC_FRAME_BUFFER_UTILS_H_
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/gav1/decoder_buffer.h"
+
+namespace libgav1 {
+
+// The following table is from Section 6.4.2 of the spec.
+//
+// subsampling_x subsampling_y mono_chrome Description
+// -----------------------------------------------------------
+// 0 0 0 YUV 4:4:4
+// 1 0 0 YUV 4:2:2
+// 1 1 0 YUV 4:2:0
+// 1 1 1 Monochrome 4:0:0
+
+inline Libgav1ImageFormat ComposeImageFormat(bool is_monochrome,
+ int8_t subsampling_x,
+ int8_t subsampling_y) {
+ Libgav1ImageFormat image_format;
+ if (subsampling_x == 0) {
+ assert(subsampling_y == 0 && !is_monochrome);
+ image_format = kLibgav1ImageFormatYuv444;
+ } else if (subsampling_y == 0) {
+ assert(!is_monochrome);
+ image_format = kLibgav1ImageFormatYuv422;
+ } else if (!is_monochrome) {
+ image_format = kLibgav1ImageFormatYuv420;
+ } else {
+ image_format = kLibgav1ImageFormatMonochrome400;
+ }
+ return image_format;
+}
+
+inline void DecomposeImageFormat(Libgav1ImageFormat image_format,
+ bool* is_monochrome, int8_t* subsampling_x,
+ int8_t* subsampling_y) {
+ *is_monochrome = false;
+ *subsampling_x = 1;
+ *subsampling_y = 1;
+ switch (image_format) {
+ case kLibgav1ImageFormatYuv420:
+ break;
+ case kLibgav1ImageFormatYuv422:
+ *subsampling_y = 0;
+ break;
+ case kLibgav1ImageFormatYuv444:
+ *subsampling_x = *subsampling_y = 0;
+ break;
+ default:
+ assert(image_format == kLibgav1ImageFormatMonochrome400);
+ *is_monochrome = true;
+ break;
+ }
+}
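+
+// For example, kLibgav1ImageFormatYuv420 decomposes to is_monochrome = false,
+// subsampling_x = 1, subsampling_y = 1, and ComposeImageFormat() maps that
+// triple back to kLibgav1ImageFormatYuv420.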
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_FRAME_BUFFER_UTILS_H_
diff --git a/src/frame_scratch_buffer.h b/src/frame_scratch_buffer.h
new file mode 100644
index 0000000..90c3bb8
--- /dev/null
+++ b/src/frame_scratch_buffer.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_
+#define LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_
+
+#include <condition_variable> // NOLINT (unapproved c++11 header)
+#include <cstdint>
+#include <memory>
+#include <mutex> // NOLINT (unapproved c++11 header)
+
+#include "src/loop_restoration_info.h"
+#include "src/residual_buffer_pool.h"
+#include "src/symbol_decoder_context.h"
+#include "src/threading_strategy.h"
+#include "src/tile_scratch_buffer.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/memory.h"
+#include "src/utils/stack.h"
+#include "src/utils/types.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+// Buffer used to store the unfiltered pixels that are necessary for decoding
+// the next superblock row (for the intra prediction process).
+using IntraPredictionBuffer =
+ std::array<AlignedDynamicBuffer<uint8_t, kMaxAlignment>, kMaxPlanes>;
+
+// Buffer to facilitate decoding a frame. This struct is used only within
+// DecoderImpl::DecodeTiles().
+struct FrameScratchBuffer {
+ LoopRestorationInfo loop_restoration_info;
+ Array2D<int16_t> cdef_index;
+ Array2D<TransformSize> inter_transform_sizes;
+ BlockParametersHolder block_parameters_holder;
+ TemporalMotionField motion_field;
+ SymbolDecoderContext symbol_decoder_context;
+ std::unique_ptr<ResidualBufferPool> residual_buffer_pool;
+ // Buffer used to store the cdef borders. This buffer will store 4 rows for
+ // every 64x64 block (4 rows for every 32x32 for chroma with subsampling). The
+ // indices of the rows that are stored are specified in |kCdefBorderRows|.
+ YuvBuffer cdef_border;
+ AlignedDynamicBuffer<uint8_t, 16> superres_coefficients[kNumPlaneTypes];
+ // Buffer used to temporarily store the input row for applying SuperRes.
+ YuvBuffer superres_line_buffer;
+ // Buffer used to store the loop restoration borders. This buffer will store 4
+ // rows for every 64x64 block (4 rows for every 32x32 for chroma with
+ // subsampling). The indices of the rows that are stored are specified in
+ // |kLoopRestorationBorderRows|.
+ YuvBuffer loop_restoration_border;
+ // The size of this dynamic buffer is |tile_rows|.
+ DynamicBuffer<IntraPredictionBuffer> intra_prediction_buffers;
+ TileScratchBufferPool tile_scratch_buffer_pool;
+ ThreadingStrategy threading_strategy;
+ std::mutex superblock_row_mutex;
+ // The size of this buffer is the number of superblock rows.
+ // |superblock_row_progress[i]| is incremented whenever a tile finishes
+ // decoding the superblock row at index i. If the count reaches tile_columns,
+ // then |superblock_row_progress_condvar[i]| is notified.
+ DynamicBuffer<int> superblock_row_progress
+ LIBGAV1_GUARDED_BY(superblock_row_mutex);
+ // The size of this buffer is the number of superblock rows. Used to wait for
+ // |superblock_row_progress[i]| to reach tile_columns.
+ DynamicBuffer<std::condition_variable> superblock_row_progress_condvar;
+ // Used to signal tile decoding failure in the combined multithreading mode.
+ bool tile_decoding_failed LIBGAV1_GUARDED_BY(superblock_row_mutex);
+};
+
+class FrameScratchBufferPool {
+ public:
+ std::unique_ptr<FrameScratchBuffer> Get() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ if (!buffers_.Empty()) {
+ return buffers_.Pop();
+ }
+ lock.unlock();
+ std::unique_ptr<FrameScratchBuffer> scratch_buffer(new (std::nothrow)
+ FrameScratchBuffer);
+ return scratch_buffer;
+ }
+
+ void Release(std::unique_ptr<FrameScratchBuffer> scratch_buffer) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ buffers_.Push(std::move(scratch_buffer));
+ }
+
+ private:
+ std::mutex mutex_;
+ Stack<std::unique_ptr<FrameScratchBuffer>, kMaxThreads> buffers_
+ LIBGAV1_GUARDED_BY(mutex_);
+};
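+
+// Typical usage (illustrative sketch, not from the upstream source), where
+// |pool| is a FrameScratchBufferPool:
+//   std::unique_ptr<FrameScratchBuffer> buffer = pool.Get();
+//   if (buffer == nullptr) return kStatusOutOfMemory;
+//   ... decode one frame using |buffer| ...
+//   pool.Release(std::move(buffer));
+// so that the allocation is reused for subsequent frames.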
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_
diff --git a/src/gav1/decoder.h b/src/gav1/decoder.h
new file mode 100644
index 0000000..da08da9
--- /dev/null
+++ b/src/gav1/decoder.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_DECODER_H_
+#define LIBGAV1_SRC_GAV1_DECODER_H_
+
+#if defined(__cplusplus)
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#else
+#include <stddef.h>
+#include <stdint.h>
+#endif // defined(__cplusplus)
+
+// IWYU pragma: begin_exports
+#include "gav1/decoder_buffer.h"
+#include "gav1/decoder_settings.h"
+#include "gav1/frame_buffer.h"
+#include "gav1/status_code.h"
+#include "gav1/symbol_visibility.h"
+#include "gav1/version.h"
+// IWYU pragma: end_exports
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+struct Libgav1Decoder;
+typedef struct Libgav1Decoder Libgav1Decoder;
+
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1DecoderCreate(
+ const Libgav1DecoderSettings* settings, Libgav1Decoder** decoder_out);
+
+LIBGAV1_PUBLIC void Libgav1DecoderDestroy(Libgav1Decoder* decoder);
+
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1DecoderEnqueueFrame(
+ Libgav1Decoder* decoder, const uint8_t* data, size_t size,
+ int64_t user_private_data, void* buffer_private_data);
+
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1DecoderDequeueFrame(
+ Libgav1Decoder* decoder, const Libgav1DecoderBuffer** out_ptr);
+
+LIBGAV1_PUBLIC Libgav1StatusCode
+Libgav1DecoderSignalEOS(Libgav1Decoder* decoder);
+
+LIBGAV1_PUBLIC int Libgav1DecoderGetMaxBitdepth(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+
+namespace libgav1 {
+
+// Forward declaration.
+class DecoderImpl;
+
+class LIBGAV1_PUBLIC Decoder {
+ public:
+ Decoder();
+ ~Decoder();
+
+ // Init must be called exactly once per instance. Subsequent calls will do
+ // nothing. If |settings| is nullptr, the decoder will be initialized with
+ // default settings. Returns kStatusOk on success, an error status otherwise.
+ StatusCode Init(const DecoderSettings* settings);
+
+ // Enqueues a compressed frame to be decoded.
+ //
+ // This function returns:
+ // * kStatusOk on success
+ // * kStatusTryAgain if the decoder queue is full
+ // * an error status otherwise.
+ //
+ // |user_private_data| may be used to associate application specific private
+ // data with the compressed frame. It will be copied to the user_private_data
+ // field of the DecoderBuffer returned by the corresponding |DequeueFrame()|
+ // call.
+ //
+ // NOTE: |EnqueueFrame()| does not copy the data. Therefore, after a
+ // successful |EnqueueFrame()| call, the caller must keep the |data| buffer
+ // alive until:
+ // 1) If |settings_.release_input_buffer| is not nullptr, then |data| buffer
+ // must be kept alive until release_input_buffer is called with the
+ // |buffer_private_data| passed into this EnqueueFrame call.
+ // 2) If |settings_.release_input_buffer| is nullptr, then |data| buffer must
+ // be kept alive until the corresponding DequeueFrame() call is completed.
+ //
+ // If the call to |EnqueueFrame()| is not successful, then libgav1 will not
+ // hold any references to the |data| buffer. |settings_.release_input_buffer|
+ // callback will not be called in that case.
+ StatusCode EnqueueFrame(const uint8_t* data, size_t size,
+ int64_t user_private_data, void* buffer_private_data);
+
+ // Dequeues a decompressed frame. If there are enqueued compressed frames,
+ // decodes one and sets |*out_ptr| to the last displayable frame in the
+ // compressed frame. If there are no displayable frames available, sets
+ // |*out_ptr| to nullptr.
+ //
+ // Returns kStatusOk on success. Returns kStatusNothingToDequeue if there are
+ // no enqueued frames (in this case out_ptr will always be set to nullptr).
+ // Returns one of the other error statuses if there is an error.
+ //
+ // If |settings_.blocking_dequeue| is false and the decoder is operating in
+ // frame parallel mode (|settings_.frame_parallel| is true and the video
+ // stream passes the decoder's heuristics for enabling frame parallel mode),
+ // then this call will return kStatusTryAgain if an enqueued frame is not yet
+ // decoded (it is a non blocking call in this case). In all other cases, this
+ // call will block until an enqueued frame has been decoded.
+ StatusCode DequeueFrame(const DecoderBuffer** out_ptr);
+
+ // Signals the end of stream.
+ //
+ // In non-frame-parallel mode, this function will release all the frames held
+ // by the decoder. If the frame buffers were allocated by libgav1, then the
+ // pointer obtained by the prior DequeueFrame call will no longer be valid. If
+ // the frame buffers were allocated by the application, then any references
+ // that libgav1 is holding on to will be released.
+ //
+ // Once this function returns successfully, the decoder state will be reset
+ // and the decoder is ready to start decoding a new coded video sequence.
+ StatusCode SignalEOS();
+
+ // Returns the maximum bitdepth that is supported by this decoder.
+ static int GetMaxBitdepth();
+
+ private:
+ DecoderSettings settings_;
+ // The object is initialized if and only if impl_ != nullptr.
+ std::unique_ptr<DecoderImpl> impl_;
+};
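+
+// A minimal decode-loop sketch (illustrative only, not part of the upstream
+// header). HaveInput(), GetNextCompressedFrame(), and ConsumeFrame() stand in
+// for application code, and the compressed |data| is assumed to stay alive
+// until the corresponding DequeueFrame() call returns:
+//
+//   libgav1::Decoder decoder;
+//   libgav1::DecoderSettings settings;  // default settings
+//   if (decoder.Init(&settings) != libgav1::kStatusOk) return;
+//   while (HaveInput()) {
+//     const uint8_t* data;
+//     size_t size;
+//     GetNextCompressedFrame(&data, &size);
+//     if (decoder.EnqueueFrame(data, size, /*user_private_data=*/0,
+//                              /*buffer_private_data=*/nullptr) !=
+//         libgav1::kStatusOk) {
+//       break;
+//     }
+//     const libgav1::DecoderBuffer* frame;
+//     if (decoder.DequeueFrame(&frame) == libgav1::kStatusOk &&
+//         frame != nullptr) {
+//       ConsumeFrame(*frame);
+//     }
+//   }
+//   decoder.SignalEOS();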
+
+} // namespace libgav1
+#endif // defined(__cplusplus)
+
+#endif // LIBGAV1_SRC_GAV1_DECODER_H_
diff --git a/src/gav1/decoder_buffer.h b/src/gav1/decoder_buffer.h
new file mode 100644
index 0000000..37bcb29
--- /dev/null
+++ b/src/gav1/decoder_buffer.h
@@ -0,0 +1,279 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_DECODER_BUFFER_H_
+#define LIBGAV1_SRC_GAV1_DECODER_BUFFER_H_
+
+#if defined(__cplusplus)
+#include <cstdint>
+#else
+#include <stdint.h>
+#endif // defined(__cplusplus)
+
+#include "gav1/symbol_visibility.h"
+
+// All the declarations in this file are part of the public ABI.
+
+// The documentation for the enum values in this file can be found in Section
+// 6.4.2 of the AV1 spec.
+
+typedef enum Libgav1ChromaSamplePosition {
+ kLibgav1ChromaSamplePositionUnknown,
+ kLibgav1ChromaSamplePositionVertical,
+ kLibgav1ChromaSamplePositionColocated,
+ kLibgav1ChromaSamplePositionReserved
+} Libgav1ChromaSamplePosition;
+
+typedef enum Libgav1ImageFormat {
+ kLibgav1ImageFormatYuv420,
+ kLibgav1ImageFormatYuv422,
+ kLibgav1ImageFormatYuv444,
+ kLibgav1ImageFormatMonochrome400
+} Libgav1ImageFormat;
+
+typedef enum Libgav1ColorPrimary {
+ // 0 is reserved.
+ kLibgav1ColorPrimaryBt709 = 1,
+ kLibgav1ColorPrimaryUnspecified,
+ // 3 is reserved.
+ kLibgav1ColorPrimaryBt470M = 4,
+ kLibgav1ColorPrimaryBt470Bg,
+ kLibgav1ColorPrimaryBt601,
+ kLibgav1ColorPrimarySmpte240,
+ kLibgav1ColorPrimaryGenericFilm,
+ kLibgav1ColorPrimaryBt2020,
+ kLibgav1ColorPrimaryXyz,
+ kLibgav1ColorPrimarySmpte431,
+ kLibgav1ColorPrimarySmpte432,
+ // 13-21 are reserved.
+ kLibgav1ColorPrimaryEbu3213 = 22,
+ // 23-254 are reserved.
+ kLibgav1MaxColorPrimaries = 255
+} Libgav1ColorPrimary;
+
+typedef enum Libgav1TransferCharacteristics {
+ // 0 is reserved.
+ kLibgav1TransferCharacteristicsBt709 = 1,
+ kLibgav1TransferCharacteristicsUnspecified,
+ // 3 is reserved.
+ kLibgav1TransferCharacteristicsBt470M = 4,
+ kLibgav1TransferCharacteristicsBt470Bg,
+ kLibgav1TransferCharacteristicsBt601,
+ kLibgav1TransferCharacteristicsSmpte240,
+ kLibgav1TransferCharacteristicsLinear,
+ kLibgav1TransferCharacteristicsLog100,
+ kLibgav1TransferCharacteristicsLog100Sqrt10,
+ kLibgav1TransferCharacteristicsIec61966,
+ kLibgav1TransferCharacteristicsBt1361,
+ kLibgav1TransferCharacteristicsSrgb,
+ kLibgav1TransferCharacteristicsBt2020TenBit,
+ kLibgav1TransferCharacteristicsBt2020TwelveBit,
+ kLibgav1TransferCharacteristicsSmpte2084,
+ kLibgav1TransferCharacteristicsSmpte428,
+ kLibgav1TransferCharacteristicsHlg,
+ // 19-254 are reserved.
+ kLibgav1MaxTransferCharacteristics = 255
+} Libgav1TransferCharacteristics;
+
+typedef enum Libgav1MatrixCoefficients {
+ kLibgav1MatrixCoefficientsIdentity,
+ kLibgav1MatrixCoefficientsBt709,
+ kLibgav1MatrixCoefficientsUnspecified,
+ // 3 is reserved.
+ kLibgav1MatrixCoefficientsFcc = 4,
+ kLibgav1MatrixCoefficientsBt470BG,
+ kLibgav1MatrixCoefficientsBt601,
+ kLibgav1MatrixCoefficientsSmpte240,
+ kLibgav1MatrixCoefficientsSmpteYcgco,
+ kLibgav1MatrixCoefficientsBt2020Ncl,
+ kLibgav1MatrixCoefficientsBt2020Cl,
+ kLibgav1MatrixCoefficientsSmpte2085,
+ kLibgav1MatrixCoefficientsChromatNcl,
+ kLibgav1MatrixCoefficientsChromatCl,
+ kLibgav1MatrixCoefficientsIctcp,
+ // 15-254 are reserved.
+ kLibgav1MaxMatrixCoefficients = 255
+} Libgav1MatrixCoefficients;
+
+typedef enum Libgav1ColorRange {
+  // The color ranges are scaled by value << (bitdepth - 8) for 10-bit and
+  // 12-bit streams.
+ kLibgav1ColorRangeStudio, // Y [16..235], UV [16..240]
+ kLibgav1ColorRangeFull // YUV/RGB [0..255]
+} Libgav1ColorRange;
+
+typedef struct Libgav1DecoderBuffer {
+#if defined(__cplusplus)
+ LIBGAV1_PUBLIC int NumPlanes() const {
+ return (image_format == kLibgav1ImageFormatMonochrome400) ? 1 : 3;
+ }
+#endif // defined(__cplusplus)
+
+ Libgav1ChromaSamplePosition chroma_sample_position;
+ Libgav1ImageFormat image_format;
+ Libgav1ColorRange color_range;
+ Libgav1ColorPrimary color_primary;
+ Libgav1TransferCharacteristics transfer_characteristics;
+ Libgav1MatrixCoefficients matrix_coefficients;
+
+ // Image storage dimensions.
+ // NOTE: These fields are named w and h in vpx_image_t and aom_image_t.
+ // uint32_t width; // Stored image width.
+ // uint32_t height; // Stored image height.
+ int bitdepth; // Stored image bitdepth.
+
+ // Image display dimensions.
+ // NOTES:
+ // 1. These fields are named d_w and d_h in vpx_image_t and aom_image_t.
+ // 2. libvpx and libaom clients use d_w and d_h much more often than w and h.
+ // 3. These fields can just be stored for the Y plane and the clients can
+ // calculate the values for the U and V planes if the image format or
+ // subsampling is exposed.
+ int displayed_width[3]; // Displayed image width.
+ int displayed_height[3]; // Displayed image height.
+
+ int stride[3];
+ uint8_t* plane[3];
+
+ // Spatial id of this frame.
+ int spatial_id;
+ // Temporal id of this frame.
+ int temporal_id;
+
+ // The |user_private_data| argument passed to Decoder::EnqueueFrame().
+ int64_t user_private_data;
+ // The |private_data| field of FrameBuffer. Set by the get frame buffer
+ // callback when it allocates a frame buffer.
+ void* buffer_private_data;
+} Libgav1DecoderBuffer;
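+
+// A minimal sketch of how a consumer might walk the planes of a decoded
+// buffer. ConsumeRow() is a hypothetical application function; note that
+// |stride| is in bytes while |displayed_width| is in pixels, so the consumer
+// must account for 2-byte pixels when |bitdepth| is greater than 8:
+//
+//   void ConsumePlanes(const Libgav1DecoderBuffer& buffer) {
+//     for (int p = 0; p < buffer.NumPlanes(); ++p) {
+//       const uint8_t* row = buffer.plane[p];
+//       for (int y = 0; y < buffer.displayed_height[p]; ++y) {
+//         ConsumeRow(row, buffer.displayed_width[p], buffer.bitdepth);
+//         row += buffer.stride[p];
+//       }
+//     }
+//   }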
+
+#if defined(__cplusplus)
+namespace libgav1 {
+
+using ChromaSamplePosition = Libgav1ChromaSamplePosition;
+constexpr ChromaSamplePosition kChromaSamplePositionUnknown =
+ kLibgav1ChromaSamplePositionUnknown;
+constexpr ChromaSamplePosition kChromaSamplePositionVertical =
+ kLibgav1ChromaSamplePositionVertical;
+constexpr ChromaSamplePosition kChromaSamplePositionColocated =
+ kLibgav1ChromaSamplePositionColocated;
+constexpr ChromaSamplePosition kChromaSamplePositionReserved =
+ kLibgav1ChromaSamplePositionReserved;
+
+using ImageFormat = Libgav1ImageFormat;
+constexpr ImageFormat kImageFormatYuv420 = kLibgav1ImageFormatYuv420;
+constexpr ImageFormat kImageFormatYuv422 = kLibgav1ImageFormatYuv422;
+constexpr ImageFormat kImageFormatYuv444 = kLibgav1ImageFormatYuv444;
+constexpr ImageFormat kImageFormatMonochrome400 =
+ kLibgav1ImageFormatMonochrome400;
+
+using ColorPrimary = Libgav1ColorPrimary;
+constexpr ColorPrimary kColorPrimaryBt709 = kLibgav1ColorPrimaryBt709;
+constexpr ColorPrimary kColorPrimaryUnspecified =
+ kLibgav1ColorPrimaryUnspecified;
+constexpr ColorPrimary kColorPrimaryBt470M = kLibgav1ColorPrimaryBt470M;
+constexpr ColorPrimary kColorPrimaryBt470Bg = kLibgav1ColorPrimaryBt470Bg;
+constexpr ColorPrimary kColorPrimaryBt601 = kLibgav1ColorPrimaryBt601;
+constexpr ColorPrimary kColorPrimarySmpte240 = kLibgav1ColorPrimarySmpte240;
+constexpr ColorPrimary kColorPrimaryGenericFilm =
+ kLibgav1ColorPrimaryGenericFilm;
+constexpr ColorPrimary kColorPrimaryBt2020 = kLibgav1ColorPrimaryBt2020;
+constexpr ColorPrimary kColorPrimaryXyz = kLibgav1ColorPrimaryXyz;
+constexpr ColorPrimary kColorPrimarySmpte431 = kLibgav1ColorPrimarySmpte431;
+constexpr ColorPrimary kColorPrimarySmpte432 = kLibgav1ColorPrimarySmpte432;
+constexpr ColorPrimary kColorPrimaryEbu3213 = kLibgav1ColorPrimaryEbu3213;
+constexpr ColorPrimary kMaxColorPrimaries = kLibgav1MaxColorPrimaries;
+
+using TransferCharacteristics = Libgav1TransferCharacteristics;
+constexpr TransferCharacteristics kTransferCharacteristicsBt709 =
+ kLibgav1TransferCharacteristicsBt709;
+constexpr TransferCharacteristics kTransferCharacteristicsUnspecified =
+ kLibgav1TransferCharacteristicsUnspecified;
+constexpr TransferCharacteristics kTransferCharacteristicsBt470M =
+ kLibgav1TransferCharacteristicsBt470M;
+constexpr TransferCharacteristics kTransferCharacteristicsBt470Bg =
+ kLibgav1TransferCharacteristicsBt470Bg;
+constexpr TransferCharacteristics kTransferCharacteristicsBt601 =
+ kLibgav1TransferCharacteristicsBt601;
+constexpr TransferCharacteristics kTransferCharacteristicsSmpte240 =
+ kLibgav1TransferCharacteristicsSmpte240;
+constexpr TransferCharacteristics kTransferCharacteristicsLinear =
+ kLibgav1TransferCharacteristicsLinear;
+constexpr TransferCharacteristics kTransferCharacteristicsLog100 =
+ kLibgav1TransferCharacteristicsLog100;
+constexpr TransferCharacteristics kTransferCharacteristicsLog100Sqrt10 =
+ kLibgav1TransferCharacteristicsLog100Sqrt10;
+constexpr TransferCharacteristics kTransferCharacteristicsIec61966 =
+ kLibgav1TransferCharacteristicsIec61966;
+constexpr TransferCharacteristics kTransferCharacteristicsBt1361 =
+ kLibgav1TransferCharacteristicsBt1361;
+constexpr TransferCharacteristics kTransferCharacteristicsSrgb =
+ kLibgav1TransferCharacteristicsSrgb;
+constexpr TransferCharacteristics kTransferCharacteristicsBt2020TenBit =
+ kLibgav1TransferCharacteristicsBt2020TenBit;
+constexpr TransferCharacteristics kTransferCharacteristicsBt2020TwelveBit =
+ kLibgav1TransferCharacteristicsBt2020TwelveBit;
+constexpr TransferCharacteristics kTransferCharacteristicsSmpte2084 =
+ kLibgav1TransferCharacteristicsSmpte2084;
+constexpr TransferCharacteristics kTransferCharacteristicsSmpte428 =
+ kLibgav1TransferCharacteristicsSmpte428;
+constexpr TransferCharacteristics kTransferCharacteristicsHlg =
+ kLibgav1TransferCharacteristicsHlg;
+constexpr TransferCharacteristics kMaxTransferCharacteristics =
+ kLibgav1MaxTransferCharacteristics;
+
+using MatrixCoefficients = Libgav1MatrixCoefficients;
+constexpr MatrixCoefficients kMatrixCoefficientsIdentity =
+ kLibgav1MatrixCoefficientsIdentity;
+constexpr MatrixCoefficients kMatrixCoefficientsBt709 =
+ kLibgav1MatrixCoefficientsBt709;
+constexpr MatrixCoefficients kMatrixCoefficientsUnspecified =
+ kLibgav1MatrixCoefficientsUnspecified;
+constexpr MatrixCoefficients kMatrixCoefficientsFcc =
+ kLibgav1MatrixCoefficientsFcc;
+constexpr MatrixCoefficients kMatrixCoefficientsBt470BG =
+ kLibgav1MatrixCoefficientsBt470BG;
+constexpr MatrixCoefficients kMatrixCoefficientsBt601 =
+ kLibgav1MatrixCoefficientsBt601;
+constexpr MatrixCoefficients kMatrixCoefficientsSmpte240 =
+ kLibgav1MatrixCoefficientsSmpte240;
+constexpr MatrixCoefficients kMatrixCoefficientsSmpteYcgco =
+ kLibgav1MatrixCoefficientsSmpteYcgco;
+constexpr MatrixCoefficients kMatrixCoefficientsBt2020Ncl =
+ kLibgav1MatrixCoefficientsBt2020Ncl;
+constexpr MatrixCoefficients kMatrixCoefficientsBt2020Cl =
+ kLibgav1MatrixCoefficientsBt2020Cl;
+constexpr MatrixCoefficients kMatrixCoefficientsSmpte2085 =
+ kLibgav1MatrixCoefficientsSmpte2085;
+constexpr MatrixCoefficients kMatrixCoefficientsChromatNcl =
+ kLibgav1MatrixCoefficientsChromatNcl;
+constexpr MatrixCoefficients kMatrixCoefficientsChromatCl =
+ kLibgav1MatrixCoefficientsChromatCl;
+constexpr MatrixCoefficients kMatrixCoefficientsIctcp =
+ kLibgav1MatrixCoefficientsIctcp;
+constexpr MatrixCoefficients kMaxMatrixCoefficients =
+ kLibgav1MaxMatrixCoefficients;
+
+using ColorRange = Libgav1ColorRange;
+constexpr ColorRange kColorRangeStudio = kLibgav1ColorRangeStudio;
+constexpr ColorRange kColorRangeFull = kLibgav1ColorRangeFull;
+
+using DecoderBuffer = Libgav1DecoderBuffer;
+
+} // namespace libgav1
+#endif // defined(__cplusplus)
+
+#endif // LIBGAV1_SRC_GAV1_DECODER_BUFFER_H_
diff --git a/src/gav1/decoder_settings.h b/src/gav1/decoder_settings.h
new file mode 100644
index 0000000..ab22a4d
--- /dev/null
+++ b/src/gav1/decoder_settings.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_DECODER_SETTINGS_H_
+#define LIBGAV1_SRC_GAV1_DECODER_SETTINGS_H_
+
+#if defined(__cplusplus)
+#include <cstdint>
+#else
+#include <stdint.h>
+#endif // defined(__cplusplus)
+
+#include "gav1/frame_buffer.h"
+#include "gav1/symbol_visibility.h"
+
+// All the declarations in this file are part of the public ABI.
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// This callback is invoked by the decoder when it is done using an input frame
+// buffer. When frame_parallel is set to true, this callback must not be
+// nullptr. Otherwise, this callback is optional.
+//
+// |buffer_private_data| is the value passed in the EnqueueFrame() call.
+typedef void (*Libgav1ReleaseInputBufferCallback)(void* callback_private_data,
+ void* buffer_private_data);
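+
+// A minimal sketch of such a callback. It assumes the application passed a
+// heap-allocated compressed input buffer as |buffer_private_data| when it
+// enqueued the frame:
+//
+//   extern "C" void ReleaseInputBuffer(void* /*callback_private_data*/,
+//                                      void* buffer_private_data) {
+//     delete[] static_cast<uint8_t*>(buffer_private_data);
+//   }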
+
+typedef struct Libgav1DecoderSettings {
+ // Number of threads to use when decoding. Must be greater than 0. The library
+ // will create at most |threads| new threads. Defaults to 1 (no new threads
+ // will be created).
+ int threads;
+  // A boolean. Indicates to the decoder that frame parallel decoding is
+  // allowed. Note that this is just a request; the decoder decides how many
+  // frames to decode in parallel based on the video stream being decoded.
+ int frame_parallel;
+  // A boolean. In frame parallel mode, indicates whether
+  // Libgav1DecoderDequeueFrame should wait until an enqueued frame is
+  // available for dequeueing.
+ //
+ // If frame_parallel is 0, this setting is ignored.
+ int blocking_dequeue;
+  // Called when the first sequence header is received, or when a sequence
+  // header with different frame characteristics (bitdepth, monochrome,
+  // subsampling_x, subsampling_y, maximum frame width, or maximum frame
+  // height) is received.
+ Libgav1FrameBufferSizeChangedCallback on_frame_buffer_size_changed;
+ // Get frame buffer callback.
+ Libgav1GetFrameBufferCallback get_frame_buffer;
+ // Release frame buffer callback.
+ Libgav1ReleaseFrameBufferCallback release_frame_buffer;
+ // Release input frame buffer callback.
+ Libgav1ReleaseInputBufferCallback release_input_buffer;
+ // Passed as the private_data argument to the callbacks.
+ void* callback_private_data;
+ // A boolean. If set to 1, the decoder will output all the spatial and
+ // temporal layers.
+ int output_all_layers;
+ // Index of the operating point to decode.
+ int operating_point;
+ // Mask indicating the post processing filters that need to be applied to the
+ // reconstructed frame. Note this is an advanced setting and does not
+ // typically need to be changed.
+ // From LSB:
+ // Bit 0: Loop filter (deblocking filter).
+ // Bit 1: Cdef.
+ // Bit 2: SuperRes.
+ // Bit 3: Loop restoration.
+ // Bit 4: Film grain synthesis.
+ // All the bits other than the last 5 are ignored.
+ uint8_t post_filter_mask;
+} Libgav1DecoderSettings;
+
+LIBGAV1_PUBLIC void Libgav1DecoderSettingsInitDefault(
+ Libgav1DecoderSettings* settings);
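+
+// A minimal usage sketch: initialize the defaults, then override individual
+// fields before the decoder is created (error handling and decoder creation
+// are omitted here):
+//
+//   Libgav1DecoderSettings settings;
+//   Libgav1DecoderSettingsInitDefault(&settings);
+//   settings.threads = 4;  // Allow up to 4 threads.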
+
+#if defined(__cplusplus)
+} // extern "C"
+
+namespace libgav1 {
+
+using ReleaseInputBufferCallback = Libgav1ReleaseInputBufferCallback;
+
+// Applications must populate this structure before creating a decoder instance.
+struct DecoderSettings {
+ // Number of threads to use when decoding. Must be greater than 0. The library
+ // will create at most |threads| new threads. Defaults to 1 (no new threads
+ // will be created).
+ int threads = 1;
+  // Indicates to the decoder that frame parallel decoding is allowed. Note
+  // that this is just a request; the decoder decides how many frames to decode
+  // in parallel based on the video stream being decoded.
+ bool frame_parallel = false;
+  // In frame parallel mode, indicates whether DequeueFrame should wait until
+  // an enqueued frame is available for dequeueing.
+ //
+ // If frame_parallel is false, this setting is ignored.
+ bool blocking_dequeue = false;
+  // Called when the first sequence header is received, or when a sequence
+  // header with different frame characteristics (bitdepth, monochrome,
+  // subsampling_x, subsampling_y, maximum frame width, or maximum frame
+  // height) is received.
+ FrameBufferSizeChangedCallback on_frame_buffer_size_changed = nullptr;
+ // Get frame buffer callback.
+ GetFrameBufferCallback get_frame_buffer = nullptr;
+ // Release frame buffer callback.
+ ReleaseFrameBufferCallback release_frame_buffer = nullptr;
+ // Release input frame buffer callback.
+ ReleaseInputBufferCallback release_input_buffer = nullptr;
+ // Passed as the private_data argument to the callbacks.
+ void* callback_private_data = nullptr;
+ // If set to true, the decoder will output all the spatial and temporal
+ // layers.
+ bool output_all_layers = false;
+ // Index of the operating point to decode.
+ int operating_point = 0;
+ // Mask indicating the post processing filters that need to be applied to the
+ // reconstructed frame. Note this is an advanced setting and does not
+ // typically need to be changed.
+ // From LSB:
+ // Bit 0: Loop filter (deblocking filter).
+ // Bit 1: Cdef.
+ // Bit 2: SuperRes.
+ // Bit 3: Loop restoration.
+ // Bit 4: Film grain synthesis.
+ // All the bits other than the last 5 are ignored.
+ uint8_t post_filter_mask = 0x1f;
+};
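+
+// For example, a mask that applies every post processing filter except film
+// grain synthesis (bit 4 in the layout above) can be built as follows:
+//
+//   libgav1::DecoderSettings settings;
+//   settings.post_filter_mask = 0x1f & ~(1 << 4);  // 0x0f.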
+
+} // namespace libgav1
+#endif // defined(__cplusplus)
+#endif // LIBGAV1_SRC_GAV1_DECODER_SETTINGS_H_
diff --git a/src/gav1/frame_buffer.h b/src/gav1/frame_buffer.h
new file mode 100644
index 0000000..8132b61
--- /dev/null
+++ b/src/gav1/frame_buffer.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_FRAME_BUFFER_H_
+#define LIBGAV1_SRC_GAV1_FRAME_BUFFER_H_
+
+// All the declarations in this file are part of the public ABI. This file may
+// be included by both C and C++ files.
+
+#if defined(__cplusplus)
+#include <cstddef>
+#include <cstdint>
+#else
+#include <stddef.h>
+#include <stdint.h>
+#endif // defined(__cplusplus)
+
+#include "gav1/decoder_buffer.h"
+#include "gav1/status_code.h"
+#include "gav1/symbol_visibility.h"
+
+// The callback functions use the C linkage conventions.
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// This structure represents an allocated frame buffer.
+typedef struct Libgav1FrameBuffer {
+ // In the |plane| and |stride| arrays, the elements at indexes 0, 1, and 2
+ // are for the Y, U, and V planes, respectively.
+ uint8_t* plane[3]; // Pointers to the frame (excluding the borders) in the
+ // data buffers.
+ int stride[3]; // Row strides in bytes.
+ void* private_data; // Frame buffer's private data. Available for use by the
+ // release frame buffer callback. Also copied to the
+ // |buffer_private_data| field of DecoderBuffer for use
+ // by the consumer of a DecoderBuffer.
+} Libgav1FrameBuffer;
+
+// This callback is invoked by the decoder to provide information on the
+// subsequent frames in the video, until the next invocation of this callback
+// or the end of the video.
+//
+// |width| and |height| are the maximum frame width and height in pixels.
+// |left_border|, |right_border|, |top_border|, and |bottom_border| are the
+// maximum left, right, top, and bottom border sizes in pixels.
+// |stride_alignment| specifies the alignment of the row stride in bytes.
+//
+// Returns kLibgav1StatusOk on success, an error status on failure.
+//
+// NOTE: This callback may be omitted if the information is not useful to the
+// application.
+typedef Libgav1StatusCode (*Libgav1FrameBufferSizeChangedCallback)(
+ void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+ int width, int height, int left_border, int right_border, int top_border,
+ int bottom_border, int stride_alignment);
+
+// This callback is invoked by the decoder to allocate a frame buffer, which
+// consists of three data buffers, for the Y, U, and V planes, respectively.
+//
+// The callback must set |frame_buffer->plane[i]| to point to the data buffers
+// of the planes, and set |frame_buffer->stride[i]| to the row strides of the
+// planes. If |image_format| is kLibgav1ImageFormatMonochrome400, the callback
+// should set |frame_buffer->plane[1]| and |frame_buffer->plane[2]| to a null
+// pointer and set |frame_buffer->stride[1]| and |frame_buffer->stride[2]| to
+// 0. The callback may set |frame_buffer->private_data| to a value that will
+// be useful to the release frame buffer callback and the consumer of a
+// DecoderBuffer.
+//
+// |width| and |height| are the frame width and height in pixels.
+// |left_border|, |right_border|, |top_border|, and |bottom_border| are the
+// left, right, top, and bottom border sizes in pixels. |stride_alignment|
+// specifies the alignment of the row stride in bytes.
+//
+// Returns kLibgav1StatusOk on success, an error status on failure.
+typedef Libgav1StatusCode (*Libgav1GetFrameBufferCallback)(
+ void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+ int width, int height, int left_border, int right_border, int top_border,
+ int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer);
+
+// After a frame buffer is allocated, the decoder starts to write decoded video
+// to the frame buffer. When the frame buffer is ready for consumption, it is
+// made available to the application in a Decoder::DequeueFrame() call.
+// Afterwards, the decoder may continue to use the frame buffer in read-only
+// mode. When the decoder is finished using the frame buffer, it notifies the
+// application by calling the Libgav1ReleaseFrameBufferCallback.
+
+// This callback is invoked by the decoder to release a frame buffer.
+typedef void (*Libgav1ReleaseFrameBufferCallback)(void* callback_private_data,
+ void* buffer_private_data);
+
+// Libgav1ComputeFrameBufferInfo() and Libgav1SetFrameBuffer() are intended to
+// help clients implement frame buffer callbacks using memory buffers. First,
+// call Libgav1ComputeFrameBufferInfo(). If it succeeds, allocate y_buffer of
+// size info.y_buffer_size and allocate u_buffer and v_buffer, both of size
+// info.uv_buffer_size. Finally, pass y_buffer, u_buffer, v_buffer, and
+// buffer_private_data to Libgav1SetFrameBuffer().
+
+// This structure contains information useful for allocating memory for a frame
+// buffer.
+typedef struct Libgav1FrameBufferInfo {
+ size_t y_buffer_size; // Size in bytes of the Y buffer.
+ size_t uv_buffer_size; // Size in bytes of the U or V buffer.
+
+ // The following fields are consumed by Libgav1SetFrameBuffer(). Do not use
+ // them directly.
+ int y_stride; // Row stride in bytes of the Y buffer.
+ int uv_stride; // Row stride in bytes of the U or V buffer.
+ size_t y_plane_offset; // Offset in bytes of the frame (excluding the
+ // borders) in the Y buffer.
+ size_t uv_plane_offset; // Offset in bytes of the frame (excluding the
+ // borders) in the U or V buffer.
+ int stride_alignment; // The stride_alignment argument passed to
+ // Libgav1ComputeFrameBufferInfo().
+} Libgav1FrameBufferInfo;
+
+// Computes the information useful for allocating memory for a frame buffer.
+// On success, stores the output in |info|.
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1ComputeFrameBufferInfo(
+ int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+ int left_border, int right_border, int top_border, int bottom_border,
+ int stride_alignment, Libgav1FrameBufferInfo* info);
+
+// Sets the |frame_buffer| struct.
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1SetFrameBuffer(
+ const Libgav1FrameBufferInfo* info, uint8_t* y_buffer, uint8_t* u_buffer,
+ uint8_t* v_buffer, void* buffer_private_data,
+ Libgav1FrameBuffer* frame_buffer);
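+
+// A minimal sketch of frame buffer callbacks built on these helpers, following
+// the steps described above. It allocates with plain new[] and stores the
+// pointers in a small heap struct passed back through |buffer_private_data|;
+// allocation-failure handling and buffer pooling are omitted:
+//
+//   struct Buffers {
+//     uint8_t* y;
+//     uint8_t* u;
+//     uint8_t* v;
+//   };
+//
+//   extern "C" Libgav1StatusCode MyGetFrameBuffer(
+//       void* /*callback_private_data*/, int bitdepth,
+//       Libgav1ImageFormat image_format, int width, int height,
+//       int left_border, int right_border, int top_border, int bottom_border,
+//       int stride_alignment, Libgav1FrameBuffer* frame_buffer) {
+//     Libgav1FrameBufferInfo info;
+//     const Libgav1StatusCode status = Libgav1ComputeFrameBufferInfo(
+//         bitdepth, image_format, width, height, left_border, right_border,
+//         top_border, bottom_border, stride_alignment, &info);
+//     if (status != kLibgav1StatusOk) return status;
+//     Buffers* const buffers = new Buffers;
+//     buffers->y = new uint8_t[info.y_buffer_size];
+//     buffers->u = new uint8_t[info.uv_buffer_size];
+//     buffers->v = new uint8_t[info.uv_buffer_size];
+//     return Libgav1SetFrameBuffer(&info, buffers->y, buffers->u, buffers->v,
+//                                  buffers, frame_buffer);
+//   }
+//
+//   extern "C" void MyReleaseFrameBuffer(void* /*callback_private_data*/,
+//                                        void* buffer_private_data) {
+//     Buffers* const buffers = static_cast<Buffers*>(buffer_private_data);
+//     delete[] buffers->y;
+//     delete[] buffers->u;
+//     delete[] buffers->v;
+//     delete buffers;
+//   }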
+
+#if defined(__cplusplus)
+} // extern "C"
+
+// Declare type aliases for C++.
+namespace libgav1 {
+
+using FrameBuffer = Libgav1FrameBuffer;
+using FrameBufferSizeChangedCallback = Libgav1FrameBufferSizeChangedCallback;
+using GetFrameBufferCallback = Libgav1GetFrameBufferCallback;
+using ReleaseFrameBufferCallback = Libgav1ReleaseFrameBufferCallback;
+using FrameBufferInfo = Libgav1FrameBufferInfo;
+
+inline StatusCode ComputeFrameBufferInfo(int bitdepth, ImageFormat image_format,
+ int width, int height, int left_border,
+ int right_border, int top_border,
+ int bottom_border,
+ int stride_alignment,
+ FrameBufferInfo* info) {
+ return Libgav1ComputeFrameBufferInfo(bitdepth, image_format, width, height,
+ left_border, right_border, top_border,
+ bottom_border, stride_alignment, info);
+}
+
+inline StatusCode SetFrameBuffer(const FrameBufferInfo* info, uint8_t* y_buffer,
+ uint8_t* u_buffer, uint8_t* v_buffer,
+ void* buffer_private_data,
+ FrameBuffer* frame_buffer) {
+ return Libgav1SetFrameBuffer(info, y_buffer, u_buffer, v_buffer,
+ buffer_private_data, frame_buffer);
+}
+
+} // namespace libgav1
+#endif // defined(__cplusplus)
+
+#endif // LIBGAV1_SRC_GAV1_FRAME_BUFFER_H_
diff --git a/src/gav1/status_code.h b/src/gav1/status_code.h
new file mode 100644
index 0000000..d7476ca
--- /dev/null
+++ b/src/gav1/status_code.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_STATUS_CODE_H_
+#define LIBGAV1_SRC_GAV1_STATUS_CODE_H_
+
+#include "gav1/symbol_visibility.h"
+
+// All the declarations in this file are part of the public ABI. This file may
+// be included by both C and C++ files.
+
+// The Libgav1StatusCode enum type: A libgav1 function may return
+// Libgav1StatusCode to indicate success or the reason for failure.
+typedef enum {
+ // Success.
+ kLibgav1StatusOk = 0,
+
+ // An unknown error. Used as the default error status if error detail is not
+ // available.
+ kLibgav1StatusUnknownError = -1,
+
+ // An invalid function argument.
+ kLibgav1StatusInvalidArgument = -2,
+
+ // Memory allocation failure.
+ kLibgav1StatusOutOfMemory = -3,
+
+ // Ran out of a resource (other than memory).
+ kLibgav1StatusResourceExhausted = -4,
+
+ // The object is not initialized.
+ kLibgav1StatusNotInitialized = -5,
+
+ // An operation that can only be performed once has already been performed.
+ kLibgav1StatusAlready = -6,
+
+ // Not implemented, or not supported.
+ kLibgav1StatusUnimplemented = -7,
+
+ // An internal error in libgav1. Usually this indicates a programming error.
+ kLibgav1StatusInternalError = -8,
+
+ // The bitstream is not encoded correctly or violates a bitstream conformance
+ // requirement.
+ kLibgav1StatusBitstreamError = -9,
+
+ // The operation is not allowed at the moment. This is not a fatal error. Try
+ // again later.
+ kLibgav1StatusTryAgain = -10,
+
+ // Used only by DequeueFrame(). There are no enqueued frames, so there is
+ // nothing to dequeue. This is not a fatal error. Try enqueuing a frame before
+ // trying to dequeue again.
+ kLibgav1StatusNothingToDequeue = -11,
+
+ // An extra enumerator to prevent people from writing code that fails to
+ // compile when a new status code is added.
+ //
+ // Do not reference this enumerator. In particular, if you write code that
+ // switches on Libgav1StatusCode, add a default: case instead of a case that
+ // mentions this enumerator.
+ //
+ // Do not depend on the value (currently -1000) listed here. It may change in
+ // the future.
+ kLibgav1StatusReservedForFutureExpansionUseDefaultInSwitchInstead_ = -1000
+} Libgav1StatusCode;
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// Returns a human readable error string in en-US for the status code |status|.
+// Always returns a valid (non-NULL) string.
+LIBGAV1_PUBLIC const char* Libgav1GetErrorString(Libgav1StatusCode status);
+
+#if defined(__cplusplus)
+} // extern "C"
+
+namespace libgav1 {
+
+// Declare type aliases for C++.
+using StatusCode = Libgav1StatusCode;
+constexpr StatusCode kStatusOk = kLibgav1StatusOk;
+constexpr StatusCode kStatusUnknownError = kLibgav1StatusUnknownError;
+constexpr StatusCode kStatusInvalidArgument = kLibgav1StatusInvalidArgument;
+constexpr StatusCode kStatusOutOfMemory = kLibgav1StatusOutOfMemory;
+constexpr StatusCode kStatusResourceExhausted = kLibgav1StatusResourceExhausted;
+constexpr StatusCode kStatusNotInitialized = kLibgav1StatusNotInitialized;
+constexpr StatusCode kStatusAlready = kLibgav1StatusAlready;
+constexpr StatusCode kStatusUnimplemented = kLibgav1StatusUnimplemented;
+constexpr StatusCode kStatusInternalError = kLibgav1StatusInternalError;
+constexpr StatusCode kStatusBitstreamError = kLibgav1StatusBitstreamError;
+constexpr StatusCode kStatusTryAgain = kLibgav1StatusTryAgain;
+constexpr StatusCode kStatusNothingToDequeue = kLibgav1StatusNothingToDequeue;
+
+// Returns a human readable error string in en-US for the status code |status|.
+// Always returns a valid (non-NULL) string.
+inline const char* GetErrorString(StatusCode status) {
+ return Libgav1GetErrorString(status);
+}
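+
+// A minimal usage sketch, where SomeLibgav1Call() stands in for any libgav1
+// call that returns a StatusCode:
+//
+//   const libgav1::StatusCode status = SomeLibgav1Call();
+//   if (status != libgav1::kStatusOk) {
+//     fprintf(stderr, "libgav1 error: %s\n", libgav1::GetErrorString(status));
+//   }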
+
+} // namespace libgav1
+#endif // defined(__cplusplus)
+
+#endif // LIBGAV1_SRC_GAV1_STATUS_CODE_H_
diff --git a/src/gav1/symbol_visibility.h b/src/gav1/symbol_visibility.h
new file mode 100644
index 0000000..ad7498c
--- /dev/null
+++ b/src/gav1/symbol_visibility.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_SYMBOL_VISIBILITY_H_
+#define LIBGAV1_SRC_GAV1_SYMBOL_VISIBILITY_H_
+
+// This module defines the LIBGAV1_PUBLIC macro. LIBGAV1_PUBLIC, when combined
+// with the flags -fvisibility=hidden and -fvisibility-inlines-hidden, restricts
+// symbol availability when users use the shared object form of libgav1. The
+// intent is to prevent exposure of libgav1 internals to users of the library,
+// and to avoid ABI compatibility problems that changes to libgav1 internals
+// would cause for users of the libgav1 shared object.
+//
+// Examples:
+//
+// This form makes a class and all of its members part of the public API:
+//
+// class LIBGAV1_PUBLIC A {
+// public:
+// A();
+// ~A();
+// void Foo();
+// int Bar();
+// };
+//
+// A::A(), A::~A(), A::Foo(), and A::Bar() are all available to code linking to
+// the shared object when this form is used.
+//
+// This form exposes a single class method as part of the public API:
+//
+// class B {
+// public:
+// B();
+// ~B();
+// LIBGAV1_PUBLIC int Foo();
+// };
+//
+// In this example, only B::Foo() is available to users of the shared object.
+//
+// Non-class member functions can also be exposed individually:
+//
+// LIBGAV1_PUBLIC void Bar();
+//
+// In this example, Bar() would be available to users of the shared object.
+//
+// Much of the above information and more can be found at
+// https://gcc.gnu.org/wiki/Visibility
+
+#if !defined(LIBGAV1_PUBLIC)
+#if defined(_WIN32)
+#if defined(LIBGAV1_BUILDING_DLL) && LIBGAV1_BUILDING_DLL
+#if defined(__GNUC__)
+#define LIBGAV1_PUBLIC __attribute__((dllexport))
+#else
+#define LIBGAV1_PUBLIC __declspec(dllexport)
+#endif // defined(__GNUC__)
+#elif defined(LIBGAV1_BUILDING_DLL)
+#ifdef __GNUC__
+#define LIBGAV1_PUBLIC __attribute__((dllimport))
+#else
+#define LIBGAV1_PUBLIC __declspec(dllimport)
+#endif // defined(__GNUC__)
+#else
+#define LIBGAV1_PUBLIC
+#endif // defined(LIBGAV1_BUILDING_DLL) && LIBGAV1_BUILDING_DLL
+#else
+#if defined(__GNUC__) && __GNUC__ >= 4
+#define LIBGAV1_PUBLIC __attribute__((visibility("default")))
+#else
+#define LIBGAV1_PUBLIC
+#endif
+#endif // defined(_WIN32)
+#endif // defined(LIBGAV1_PUBLIC)
+
+#endif // LIBGAV1_SRC_GAV1_SYMBOL_VISIBILITY_H_
diff --git a/src/gav1/version.h b/src/gav1/version.h
new file mode 100644
index 0000000..78a573e
--- /dev/null
+++ b/src/gav1/version.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_VERSION_H_
+#define LIBGAV1_SRC_GAV1_VERSION_H_
+
+#include "gav1/symbol_visibility.h"
+
+// This library follows the principles described by Semantic Versioning
+// (https://semver.org).
+
+#define LIBGAV1_MAJOR_VERSION 0
+#define LIBGAV1_MINOR_VERSION 16
+#define LIBGAV1_PATCH_VERSION 1
+
+#define LIBGAV1_VERSION \
+ ((LIBGAV1_MAJOR_VERSION << 16) | (LIBGAV1_MINOR_VERSION << 8) | \
+ LIBGAV1_PATCH_VERSION)
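+
+// For example, version 0.16.1 is packed as 0x001001, so a compile time check
+// for a minimum required library version can be written as:
+//
+//   #if LIBGAV1_VERSION < ((0 << 16) | (16 << 8) | 0)
+//   #error libgav1 0.16.0 or later is required.
+//   #endif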
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// Returns the library's version number, packed in an int using 8 bits for
+// each of major/minor/patch. e.g., 1.2.3 is 0x010203.
+LIBGAV1_PUBLIC int Libgav1GetVersion(void);
+
+// Returns the library's version number as a string in the format
+// 'MAJOR.MINOR.PATCH'. Always returns a valid (non-NULL) string.
+LIBGAV1_PUBLIC const char* Libgav1GetVersionString(void);
+
+// Returns the build configuration used to produce the library. Always returns
+// a valid (non-NULL) string.
+LIBGAV1_PUBLIC const char* Libgav1GetBuildConfiguration(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+
+namespace libgav1 {
+
+// Returns the library's version number, packed in an int using 8 bits for
+// each of major/minor/patch. e.g., 1.2.3 is 0x010203.
+inline int GetVersion() { return Libgav1GetVersion(); }
+
+// Returns the library's version number as a string in the format
+// 'MAJOR.MINOR.PATCH'. Always returns a valid (non-NULL) string.
+inline const char* GetVersionString() { return Libgav1GetVersionString(); }
+
+// Returns the build configuration used to produce the library. Always returns
+// a valid (non-NULL) string.
+inline const char* GetBuildConfiguration() {
+ return Libgav1GetBuildConfiguration();
+}
+
+} // namespace libgav1
+#endif // defined(__cplusplus)
+
+#endif // LIBGAV1_SRC_GAV1_VERSION_H_
diff --git a/src/inter_intra_masks.inc b/src/inter_intra_masks.inc
new file mode 100644
index 0000000..2c15f9c
--- /dev/null
+++ b/src/inter_intra_masks.inc
@@ -0,0 +1,581 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file is just a convenience to separate out all the inter intra masks
+// from the code that uses them.
+
+// The tables in this file are computed based on Section 7.11.3.13 of the AV1
+// spec.
+
+constexpr uint8_t kInterIntraMaskDc[] = {
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
+
+constexpr uint8_t kInterIntraMaskVertical4x4[] = {
+ 60, 60, 60, 60, 19, 19, 19, 19, 6, 6, 6, 6, 2, 2, 2, 2};
+constexpr uint8_t kInterIntraMaskVertical4x8[] = {
+ 60, 60, 60, 60, 34, 34, 34, 34, 19, 19, 19, 19, 11, 11, 11, 11,
+ 6, 6, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 1, 1, 1, 1};
+constexpr uint8_t kInterIntraMaskVertical8x4[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34, 34,
+ 19, 19, 19, 19, 19, 19, 19, 19, 11, 11, 11, 11, 11, 11, 11, 11};
+constexpr uint8_t kInterIntraMaskVertical8x8[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34, 34,
+ 19, 19, 19, 19, 19, 19, 19, 19, 11, 11, 11, 11, 11, 11, 11, 11,
+ 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4,
+ 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1};
+constexpr uint8_t kInterIntraMaskVertical8x16[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45, 45, 45, 45, 45, 45, 34, 34, 34,
+ 34, 34, 34, 34, 34, 26, 26, 26, 26, 26, 26, 26, 26, 19, 19, 19, 19, 19, 19,
+ 19, 19, 15, 15, 15, 15, 15, 15, 15, 15, 11, 11, 11, 11, 11, 11, 11, 11, 8,
+ 8, 8, 8, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5,
+ 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3,
+ 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+constexpr uint8_t kInterIntraMaskVertical16x8[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
+constexpr uint8_t kInterIntraMaskVertical16x16[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1};
+constexpr uint8_t kInterIntraMaskVertical16x32[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+constexpr uint8_t kInterIntraMaskVertical32x16[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
+constexpr uint8_t kInterIntraMaskVertical32x32[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+constexpr uint8_t kInterIntraMaskHorizontal4x4[] = {60, 19, 6, 2, 60, 19, 6, 2,
+ 60, 19, 6, 2, 60, 19, 6, 2};
+constexpr uint8_t kInterIntraMaskHorizontal4x8[] = {
+ 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11,
+ 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11};
+constexpr uint8_t kInterIntraMaskHorizontal8x4[] = {
+ 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1,
+ 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1};
+constexpr uint8_t kInterIntraMaskHorizontal8x8[] = {
+ 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1,
+ 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1,
+ 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1,
+ 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1};
+constexpr uint8_t kInterIntraMaskHorizontal8x16[] = {
+ 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34,
+ 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15,
+ 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60,
+ 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26,
+ 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11,
+ 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45,
+ 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8};
+constexpr uint8_t kInterIntraMaskHorizontal16x8[] = {
+ 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34,
+ 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15,
+ 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6,
+ 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3,
+ 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1,
+ 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45,
+ 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1};
+constexpr uint8_t kInterIntraMaskHorizontal16x16[] = {
+ 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34,
+ 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15,
+ 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6,
+ 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3,
+ 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1,
+ 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45,
+ 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19,
+ 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8,
+ 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4,
+ 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2,
+ 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60,
+ 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26,
+ 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11,
+ 8, 6, 5, 4, 3, 2, 2, 1, 1};
+constexpr uint8_t kInterIntraMaskHorizontal16x32[] = {
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30,
+ 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+ 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+ 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52,
+ 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+ 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60,
+ 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39,
+ 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26,
+ 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+ 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11,
+ 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30,
+ 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+ 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+ 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52,
+ 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+ 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7};
+constexpr uint8_t kInterIntraMaskHorizontal32x16[] = {
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5,
+ 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30,
+ 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2,
+ 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+ 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1,
+ 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6,
+ 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2,
+ 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1,
+ 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6,
+ 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39,
+ 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3,
+ 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+ 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1,
+ 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
+ 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3,
+ 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1,
+ 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+ 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52,
+ 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4,
+ 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2,
+ 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+ 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1};
+constexpr uint8_t kInterIntraMaskHorizontal32x32[] = {
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5,
+ 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30,
+ 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2,
+ 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+ 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1,
+ 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6,
+ 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2,
+ 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1,
+ 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6,
+ 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39,
+ 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3,
+ 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+ 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1,
+ 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
+ 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3,
+ 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1,
+ 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+ 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52,
+ 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4,
+ 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2,
+ 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+ 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60,
+ 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4,
+ 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26,
+ 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2,
+ 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11,
+ 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1,
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5,
+ 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30,
+ 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2,
+ 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+ 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1,
+ 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6,
+ 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2,
+ 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1,
+ 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6,
+ 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39,
+ 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3,
+ 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+ 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1,
+ 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
+ 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3,
+ 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1,
+ 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+ 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1};
+
+constexpr uint8_t kInterIntraMaskSmooth4x4[] = {60, 60, 60, 60, 60, 19, 19, 19,
+ 60, 19, 6, 6, 60, 19, 6, 2};
+constexpr uint8_t kInterIntraMaskSmooth4x8[] = {
+ 60, 60, 60, 60, 60, 34, 34, 34, 60, 34, 19, 19, 60, 34, 19, 11,
+ 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11};
+constexpr uint8_t kInterIntraMaskSmooth8x4[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34,
+ 60, 34, 19, 19, 19, 19, 19, 19, 60, 34, 19, 11, 11, 11, 11, 11};
+constexpr uint8_t kInterIntraMaskSmooth8x8[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34,
+ 60, 34, 19, 19, 19, 19, 19, 19, 60, 34, 19, 11, 11, 11, 11, 11,
+ 60, 34, 19, 11, 6, 6, 6, 6, 60, 34, 19, 11, 6, 4, 4, 4,
+ 60, 34, 19, 11, 6, 4, 2, 2, 60, 34, 19, 11, 6, 4, 2, 1};
+constexpr uint8_t kInterIntraMaskSmooth8x16[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45, 45, 45, 45, 45, 60, 45, 34,
+ 34, 34, 34, 34, 34, 60, 45, 34, 26, 26, 26, 26, 26, 60, 45, 34, 26, 19, 19,
+ 19, 19, 60, 45, 34, 26, 19, 15, 15, 15, 60, 45, 34, 26, 19, 15, 11, 11, 60,
+ 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26,
+ 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11,
+ 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45,
+ 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8};
+constexpr uint8_t kInterIntraMaskSmooth16x8[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 60, 45, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 60, 45, 34, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 60, 45, 34, 26, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 60, 45, 34, 26, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 60, 45, 34, 26, 19, 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 45,
+ 34, 26, 19, 15, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8};
+constexpr uint8_t kInterIntraMaskSmooth16x16[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 60, 45, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 60, 45, 34, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 60, 45, 34, 26, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 60, 45, 34, 26, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 60, 45, 34, 26, 19, 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 45,
+ 34, 26, 19, 15, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 60, 45, 34, 26, 19,
+ 15, 11, 8, 6, 6, 6, 6, 6, 6, 6, 6, 60, 45, 34, 26, 19, 15, 11, 8,
+ 6, 5, 5, 5, 5, 5, 5, 5, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4,
+ 4, 4, 4, 4, 4, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 3, 3,
+ 3, 3, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 2, 2, 60,
+ 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 2, 2, 60, 45, 34, 26,
+ 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11,
+ 8, 6, 5, 4, 3, 2, 2, 1, 1};
+constexpr uint8_t kInterIntraMaskSmooth16x32[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 60, 52, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 60, 52, 45, 39, 39, 39, 39, 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 60, 52, 45, 39, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 60, 52, 45, 39, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 60, 52, 45, 39, 34, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 60, 52,
+ 45, 39, 34, 30, 26, 22, 22, 22, 22, 22, 22, 22, 22, 22, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 19, 19, 19, 19, 19, 19, 19, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 17, 17, 17, 17, 17, 17, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 15, 15, 15, 15, 15, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 13, 13,
+ 13, 13, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 11, 11, 11, 60,
+ 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 10, 10, 60, 52, 45, 39,
+ 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 8, 60, 52, 45, 39, 34, 30, 26,
+ 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+ 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11,
+ 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30,
+ 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+ 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+ 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52,
+ 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+ 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7};
+constexpr uint8_t kInterIntraMaskSmooth32x16[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 60, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 60, 52, 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 60, 52, 45, 39, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 60, 52, 45, 39, 34, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 60, 52, 45, 39, 34, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 60, 52, 45, 39,
+ 34, 30, 26, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 60, 52, 45, 39, 34, 30, 26, 22, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 52,
+ 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 15, 13, 11, 10, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+ 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
+constexpr uint8_t kInterIntraMaskSmooth32x32[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 60, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 60, 52, 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 60, 52, 45, 39, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 60, 52, 45, 39, 34, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 60, 52, 45, 39, 34, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 60, 52, 45, 39,
+ 34, 30, 26, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 60, 52, 45, 39, 34, 30, 26, 22, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 52,
+ 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 15, 13, 11, 10, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+ 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 60,
+ 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 60, 52, 45, 39, 34, 30, 26,
+ 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11,
+ 10, 8, 7, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 60, 52, 45, 39, 34, 30,
+ 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+ 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6,
+ 5, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6,
+ 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 60, 52, 45, 39,
+ 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+ 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1,
+ 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
+ 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3,
+ 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1,
+ 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+ 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1};
+
+// For each 2D array within this array, the indices are mapped as follows: 0,
+// 1, 2 and 3 in each dimension map to prediction dimensions 4, 8, 16 and 32
+// respectively. For example, the entry in [1][2] corresponds to a prediction
+// size of 8x16 (width == 8 and height == 16).
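+// A prediction dimension of 4, 8, 16 or 32 therefore maps to index 0, 1, 2 or
+// 3 (index == log2(dimension) - 2); e.g. a smooth-mode 8x16 prediction uses
+// kInterIntraMasks[kInterIntraModeSmooth][1][2], which is
+// kInterIntraMaskSmooth8x16.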
+const uint8_t* kInterIntraMasks[kNumInterIntraModes][4][4] = {
+ // kInterIntraModeDc. This is a special case where all the non-nullptr
+ // entries point to kInterIntraMaskDc (all entries of the array are 32). The
+ // width can be set according to the prediction size to achieve the desired
+ // result.
+ {{kInterIntraMaskDc, kInterIntraMaskDc, nullptr, nullptr},
+ {kInterIntraMaskDc, kInterIntraMaskDc, kInterIntraMaskDc, nullptr},
+ {nullptr, kInterIntraMaskDc, kInterIntraMaskDc, kInterIntraMaskDc},
+ {nullptr, nullptr, kInterIntraMaskDc, kInterIntraMaskDc}},
+ // kInterIntraModeVertical
+ {{kInterIntraMaskVertical4x4, kInterIntraMaskVertical4x8, nullptr, nullptr},
+ {kInterIntraMaskVertical8x4, kInterIntraMaskVertical8x8,
+ kInterIntraMaskVertical8x16, nullptr},
+ {nullptr, kInterIntraMaskVertical16x8, kInterIntraMaskVertical16x16,
+ kInterIntraMaskVertical16x32},
+ {nullptr, nullptr, kInterIntraMaskVertical32x16,
+ kInterIntraMaskVertical32x32}},
+ // kInterIntraModeHorizontal
+ {{kInterIntraMaskHorizontal4x4, kInterIntraMaskHorizontal4x8, nullptr,
+ nullptr},
+ {kInterIntraMaskHorizontal8x4, kInterIntraMaskHorizontal8x8,
+ kInterIntraMaskHorizontal8x16, nullptr},
+ {nullptr, kInterIntraMaskHorizontal16x8, kInterIntraMaskHorizontal16x16,
+ kInterIntraMaskHorizontal16x32},
+ {nullptr, nullptr, kInterIntraMaskHorizontal32x16,
+ kInterIntraMaskHorizontal32x32}},
+ // kInterIntraModeSmooth
+ {{kInterIntraMaskSmooth4x4, kInterIntraMaskSmooth4x8, nullptr, nullptr},
+ {kInterIntraMaskSmooth8x4, kInterIntraMaskSmooth8x8,
+ kInterIntraMaskSmooth8x16, nullptr},
+ {nullptr, kInterIntraMaskSmooth16x8, kInterIntraMaskSmooth16x16,
+ kInterIntraMaskSmooth16x32},
+ {nullptr, nullptr, kInterIntraMaskSmooth32x16,
+ kInterIntraMaskSmooth32x32}}};
diff --git a/src/internal_frame_buffer_list.cc b/src/internal_frame_buffer_list.cc
new file mode 100644
index 0000000..e2d2273
--- /dev/null
+++ b/src/internal_frame_buffer_list.cc
@@ -0,0 +1,122 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/internal_frame_buffer_list.h"
+
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "src/utils/common.h"
+
+namespace libgav1 {
+extern "C" {
+
+Libgav1StatusCode OnInternalFrameBufferSizeChanged(
+ void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+ int width, int height, int left_border, int right_border, int top_border,
+ int bottom_border, int stride_alignment) {
+ auto* buffer_list =
+ static_cast<InternalFrameBufferList*>(callback_private_data);
+ return buffer_list->OnFrameBufferSizeChanged(
+ bitdepth, image_format, width, height, left_border, right_border,
+ top_border, bottom_border, stride_alignment);
+}
+
+Libgav1StatusCode GetInternalFrameBuffer(
+ void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+ int width, int height, int left_border, int right_border, int top_border,
+ int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer) {
+ auto* buffer_list =
+ static_cast<InternalFrameBufferList*>(callback_private_data);
+ return buffer_list->GetFrameBuffer(
+ bitdepth, image_format, width, height, left_border, right_border,
+ top_border, bottom_border, stride_alignment, frame_buffer);
+}
+
+void ReleaseInternalFrameBuffer(void* callback_private_data,
+ void* buffer_private_data) {
+ auto* buffer_list =
+ static_cast<InternalFrameBufferList*>(callback_private_data);
+ buffer_list->ReleaseFrameBuffer(buffer_private_data);
+}
+
+} // extern "C"
+
+StatusCode InternalFrameBufferList::OnFrameBufferSizeChanged(
+ int /*bitdepth*/, Libgav1ImageFormat /*image_format*/, int /*width*/,
+ int /*height*/, int /*left_border*/, int /*right_border*/,
+ int /*top_border*/, int /*bottom_border*/, int /*stride_alignment*/) {
+ return kStatusOk;
+}
+
+StatusCode InternalFrameBufferList::GetFrameBuffer(
+ int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+ int left_border, int right_border, int top_border, int bottom_border,
+ int stride_alignment, Libgav1FrameBuffer* frame_buffer) {
+ FrameBufferInfo info;
+ StatusCode status = ComputeFrameBufferInfo(
+ bitdepth, image_format, width, height, left_border, right_border,
+ top_border, bottom_border, stride_alignment, &info);
+ if (status != kStatusOk) return status;
+
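+ // Reject sizes whose total (y_buffer_size + 2 * uv_buffer_size) would
+ // overflow size_t.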
+ if (info.uv_buffer_size > SIZE_MAX / 2 ||
+ info.y_buffer_size > SIZE_MAX - 2 * info.uv_buffer_size) {
+ return kStatusInvalidArgument;
+ }
+ const size_t min_size = info.y_buffer_size + 2 * info.uv_buffer_size;
+
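+ // Reuse a buffer that is not in use if one exists; otherwise append a new
+ // Buffer to |buffers_|.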
+ Buffer* buffer = nullptr;
+ for (auto& buffer_ptr : buffers_) {
+ if (!buffer_ptr->in_use) {
+ buffer = buffer_ptr.get();
+ break;
+ }
+ }
+ if (buffer == nullptr) {
+ std::unique_ptr<Buffer> new_buffer(new (std::nothrow) Buffer);
+ if (new_buffer == nullptr || !buffers_.push_back(std::move(new_buffer))) {
+ return kStatusOutOfMemory;
+ }
+ buffer = buffers_.back().get();
+ }
+
+ if (buffer->size < min_size) {
+ std::unique_ptr<uint8_t[], MallocDeleter> new_data(
+ static_cast<uint8_t*>(malloc(min_size)));
+ if (new_data == nullptr) return kStatusOutOfMemory;
+ buffer->data = std::move(new_data);
+ buffer->size = min_size;
+ }
+
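+ // Carve the single allocation into the Y plane followed by the U and V
+ // planes; U and V are null when |info.uv_buffer_size| is 0.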
+ uint8_t* const y_buffer = buffer->data.get();
+ uint8_t* const u_buffer =
+ (info.uv_buffer_size == 0) ? nullptr : y_buffer + info.y_buffer_size;
+ uint8_t* const v_buffer =
+ (info.uv_buffer_size == 0) ? nullptr : u_buffer + info.uv_buffer_size;
+ status = Libgav1SetFrameBuffer(&info, y_buffer, u_buffer, v_buffer, buffer,
+ frame_buffer);
+ if (status != kStatusOk) return status;
+ buffer->in_use = true;
+ return kStatusOk;
+}
+
+void InternalFrameBufferList::ReleaseFrameBuffer(void* buffer_private_data) {
+ auto* const buffer = static_cast<Buffer*>(buffer_private_data);
+ buffer->in_use = false;
+}
+
+} // namespace libgav1
diff --git a/src/internal_frame_buffer_list.h b/src/internal_frame_buffer_list.h
new file mode 100644
index 0000000..1c50b48
--- /dev/null
+++ b/src/internal_frame_buffer_list.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_INTERNAL_FRAME_BUFFER_LIST_H_
+#define LIBGAV1_SRC_INTERNAL_FRAME_BUFFER_LIST_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+#include "src/gav1/frame_buffer.h"
+#include "src/utils/memory.h"
+#include "src/utils/vector.h"
+
+namespace libgav1 {
+
+extern "C" Libgav1StatusCode OnInternalFrameBufferSizeChanged(
+ void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+ int width, int height, int left_border, int right_border, int top_border,
+ int bottom_border, int stride_alignment);
+
+extern "C" Libgav1StatusCode GetInternalFrameBuffer(
+ void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+ int width, int height, int left_border, int right_border, int top_border,
+ int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer);
+
+extern "C" void ReleaseInternalFrameBuffer(void* callback_private_data,
+ void* buffer_private_data);
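+
+// Usage sketch (illustrative): the decoder uses these callbacks, with a
+// pointer to an InternalFrameBufferList as |callback_private_data|, when the
+// application does not supply its own frame buffer callbacks. For example:
+//
+//   InternalFrameBufferList buffer_list;
+//   Libgav1FrameBuffer frame_buffer;
+//   if (GetInternalFrameBuffer(&buffer_list, /*bitdepth=*/8,
+//                              kLibgav1ImageFormatYuv420, /*width=*/1920,
+//                              /*height=*/1080, /*left_border=*/8,
+//                              /*right_border=*/8, /*top_border=*/8,
+//                              /*bottom_border=*/8, /*stride_alignment=*/16,
+//                              &frame_buffer) == kStatusOk) {
+//     // ... decode into |frame_buffer| ...
+//     ReleaseInternalFrameBuffer(&buffer_list, frame_buffer.private_data);
+//   }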
+
+class InternalFrameBufferList : public Allocable {
+ public:
+ InternalFrameBufferList() = default;
+
+ // Not copyable or movable.
+ InternalFrameBufferList(const InternalFrameBufferList&) = delete;
+ InternalFrameBufferList& operator=(const InternalFrameBufferList&) = delete;
+
+ ~InternalFrameBufferList() = default;
+
+ Libgav1StatusCode OnFrameBufferSizeChanged(int bitdepth,
+ Libgav1ImageFormat image_format,
+ int width, int height,
+ int left_border, int right_border,
+ int top_border, int bottom_border,
+ int stride_alignment);
+
+ Libgav1StatusCode GetFrameBuffer(int bitdepth,
+ Libgav1ImageFormat image_format, int width,
+ int height, int left_border,
+ int right_border, int top_border,
+ int bottom_border, int stride_alignment,
+ Libgav1FrameBuffer* frame_buffer);
+
+ void ReleaseFrameBuffer(void* buffer_private_data);
+
+ private:
+ struct Buffer : public Allocable {
+ std::unique_ptr<uint8_t[], MallocDeleter> data;
+ size_t size = 0;
+ bool in_use = false;
+ };
+
+ Vector<std::unique_ptr<Buffer>> buffers_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_INTERNAL_FRAME_BUFFER_LIST_H_
diff --git a/src/libgav1_decoder.cmake b/src/libgav1_decoder.cmake
new file mode 100644
index 0000000..b97d09d
--- /dev/null
+++ b/src/libgav1_decoder.cmake
@@ -0,0 +1,157 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_SRC_LIBGAV1_DECODER_CMAKE_)
+ return()
+endif() # LIBGAV1_SRC_LIBGAV1_DECODER_CMAKE_
+set(LIBGAV1_SRC_LIBGAV1_DECODER_CMAKE_ 1)
+
+list(APPEND libgav1_decoder_sources
+ "${libgav1_source}/buffer_pool.cc"
+ "${libgav1_source}/buffer_pool.h"
+ "${libgav1_source}/decoder_impl.cc"
+ "${libgav1_source}/decoder_impl.h"
+ "${libgav1_source}/decoder_state.h"
+ "${libgav1_source}/tile_scratch_buffer.cc"
+ "${libgav1_source}/tile_scratch_buffer.h"
+ "${libgav1_source}/film_grain.cc"
+ "${libgav1_source}/film_grain.h"
+ "${libgav1_source}/frame_buffer.cc"
+ "${libgav1_source}/frame_buffer_utils.h"
+ "${libgav1_source}/frame_scratch_buffer.h"
+ "${libgav1_source}/inter_intra_masks.inc"
+ "${libgav1_source}/internal_frame_buffer_list.cc"
+ "${libgav1_source}/internal_frame_buffer_list.h"
+ "${libgav1_source}/loop_restoration_info.cc"
+ "${libgav1_source}/loop_restoration_info.h"
+ "${libgav1_source}/motion_vector.cc"
+ "${libgav1_source}/motion_vector.h"
+ "${libgav1_source}/obu_parser.cc"
+ "${libgav1_source}/obu_parser.h"
+ "${libgav1_source}/post_filter/cdef.cc"
+ "${libgav1_source}/post_filter/deblock.cc"
+ "${libgav1_source}/post_filter/deblock_thresholds.inc"
+ "${libgav1_source}/post_filter/loop_restoration.cc"
+ "${libgav1_source}/post_filter/post_filter.cc"
+ "${libgav1_source}/post_filter/super_res.cc"
+ "${libgav1_source}/post_filter.h"
+ "${libgav1_source}/prediction_mask.cc"
+ "${libgav1_source}/prediction_mask.h"
+ "${libgav1_source}/quantizer.cc"
+ "${libgav1_source}/quantizer.h"
+ "${libgav1_source}/quantizer_tables.inc"
+ "${libgav1_source}/reconstruction.cc"
+ "${libgav1_source}/reconstruction.h"
+ "${libgav1_source}/residual_buffer_pool.cc"
+ "${libgav1_source}/residual_buffer_pool.h"
+ "${libgav1_source}/scan_tables.inc"
+ "${libgav1_source}/symbol_decoder_context.cc"
+ "${libgav1_source}/symbol_decoder_context.h"
+ "${libgav1_source}/symbol_decoder_context_cdfs.inc"
+ "${libgav1_source}/threading_strategy.cc"
+ "${libgav1_source}/threading_strategy.h"
+ "${libgav1_source}/tile.h"
+ "${libgav1_source}/tile/bitstream/mode_info.cc"
+ "${libgav1_source}/tile/bitstream/palette.cc"
+ "${libgav1_source}/tile/bitstream/partition.cc"
+ "${libgav1_source}/tile/bitstream/transform_size.cc"
+ "${libgav1_source}/tile/prediction.cc"
+ "${libgav1_source}/tile/tile.cc"
+ "${libgav1_source}/warp_prediction.cc"
+ "${libgav1_source}/warp_prediction.h"
+ "${libgav1_source}/yuv_buffer.cc"
+ "${libgav1_source}/yuv_buffer.h")
+
+list(APPEND libgav1_api_includes "${libgav1_source}/gav1/decoder.h"
+ "${libgav1_source}/gav1/decoder_buffer.h"
+ "${libgav1_source}/gav1/decoder_settings.h"
+ "${libgav1_source}/gav1/frame_buffer.h"
+ "${libgav1_source}/gav1/status_code.h"
+ "${libgav1_source}/gav1/symbol_visibility.h"
+ "${libgav1_source}/gav1/version.h")
+
+list(APPEND libgav1_api_sources "${libgav1_source}/decoder.cc"
+ "${libgav1_source}/decoder_settings.cc"
+ "${libgav1_source}/status_code.cc"
+ "${libgav1_source}/version.cc"
+ ${libgav1_api_includes})
+
+macro(libgav1_add_decoder_targets)
+ if(BUILD_SHARED_LIBS)
+ if(MSVC OR WIN32)
+ # To produce a DLL and an import library, the Windows tools require that the
+ # exported symbols are part of the DLL target. The unfortunate side effect is
+ # that a single configuration cannot output both the static library and the
+ # DLL: Windows users of the libgav1 build can have a DLL and an import
+ # library, or they can have a static library, but not both from a single
+ # configuration of the build (see the example after this block).
+ list(APPEND libgav1_shared_lib_sources ${libgav1_api_sources})
+ list(APPEND libgav1_static_lib_sources ${libgav1_api_includes})
+ else()
+ list(APPEND libgav1_shared_lib_sources ${libgav1_api_includes})
+ list(APPEND libgav1_static_lib_sources ${libgav1_api_sources})
+ endif()
+ else()
+ list(APPEND libgav1_static_lib_sources ${libgav1_api_sources})
+ endif()
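+ # For example (illustrative): passing -DBUILD_SHARED_LIBS=ON to cmake enables
+ # the libgav1_shared target defined below; without it only the static library
+ # is built.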
+
+ if(NOT ANDROID)
+ list(APPEND libgav1_absl_deps absl::base absl::synchronization)
+ endif()
+
+ libgav1_add_library(NAME libgav1_decoder TYPE OBJECT SOURCES
+ ${libgav1_decoder_sources} DEFINES ${libgav1_defines}
+ INCLUDES ${libgav1_include_paths})
+
+ libgav1_add_library(NAME
+ libgav1_static
+ OUTPUT_NAME
+ libgav1
+ TYPE
+ STATIC
+ SOURCES
+ ${libgav1_static_lib_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_include_paths}
+ LIB_DEPS
+ ${libgav1_absl_deps}
+ OBJLIB_DEPS
+ libgav1_dsp
+ libgav1_decoder
+ libgav1_utils
+ PUBLIC_INCLUDES
+ ${libgav1_source})
+
+ if(BUILD_SHARED_LIBS)
+ libgav1_add_library(NAME
+ libgav1_shared
+ OUTPUT_NAME
+ libgav1
+ TYPE
+ SHARED
+ SOURCES
+ ${libgav1_shared_lib_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_include_paths}
+ LIB_DEPS
+ libgav1_static
+ PUBLIC_INCLUDES
+ ${libgav1_source})
+ endif()
+endmacro()
diff --git a/src/loop_restoration_info.cc b/src/loop_restoration_info.cc
new file mode 100644
index 0000000..2dba57d
--- /dev/null
+++ b/src/loop_restoration_info.cc
@@ -0,0 +1,240 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/loop_restoration_info.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <new>
+
+#include "src/utils/common.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+// Controls how self-guided deltas are read.
+constexpr int kSgrProjReadControl = 4;
+// Maps the restoration type encoded in the compressed headers (restoration_type
+// element in the spec) of the bitstream to LoopRestorationType. This is used
+// only when the restoration type in the frame header is
+// LoopRestorationTypeSwitchable.
+constexpr LoopRestorationType kBitstreamRestorationTypeMap[] = {
+ kLoopRestorationTypeNone, kLoopRestorationTypeWiener,
+ kLoopRestorationTypeSgrProj};
+
+inline int CountLeadingZeroCoefficients(const int16_t* const filter) {
+ int number_zero_coefficients = 0;
+ if (filter[0] == 0) {
+ number_zero_coefficients++;
+ if (filter[1] == 0) {
+ number_zero_coefficients++;
+ if (filter[2] == 0) {
+ number_zero_coefficients++;
+ }
+ }
+ }
+ return number_zero_coefficients;
+}
+
+} // namespace
+
+bool LoopRestorationInfo::Reset(const LoopRestoration* const loop_restoration,
+ uint32_t width, uint32_t height,
+ int8_t subsampling_x, int8_t subsampling_y,
+ bool is_monochrome) {
+ loop_restoration_ = loop_restoration;
+ subsampling_x_ = subsampling_x;
+ subsampling_y_ = subsampling_y;
+
+ const int num_planes = is_monochrome ? kMaxPlanesMonochrome : kMaxPlanes;
+ int total_num_units = 0;
+ for (int plane = kPlaneY; plane < num_planes; ++plane) {
+ if (loop_restoration_->type[plane] == kLoopRestorationTypeNone) {
+ plane_needs_filtering_[plane] = false;
+ continue;
+ }
+ plane_needs_filtering_[plane] = true;
+ const int plane_width =
+ (plane == kPlaneY) ? width : SubsampledValue(width, subsampling_x_);
+ const int plane_height =
+ (plane == kPlaneY) ? height : SubsampledValue(height, subsampling_y_);
+ num_horizontal_units_[plane] =
+ std::max(1, RightShiftWithRounding(
+ plane_width, loop_restoration_->unit_size_log2[plane]));
+ num_vertical_units_[plane] = std::max(
+ 1, RightShiftWithRounding(plane_height,
+ loop_restoration_->unit_size_log2[plane]));
+ num_units_[plane] =
+ num_horizontal_units_[plane] * num_vertical_units_[plane];
+ total_num_units += num_units_[plane];
+ }
+ // Allocate the RestorationUnitInfo arrays for all planes in a single heap
+ // allocation and divide up the buffer into arrays of the right sizes.
+ if (!loop_restoration_info_buffer_.Resize(total_num_units)) {
+ return false;
+ }
+ RestorationUnitInfo* loop_restoration_info =
+ loop_restoration_info_buffer_.get();
+ for (int plane = kPlaneY; plane < num_planes; ++plane) {
+ if (loop_restoration_->type[plane] == kLoopRestorationTypeNone) {
+ continue;
+ }
+ loop_restoration_info_[plane] = loop_restoration_info;
+ loop_restoration_info += num_units_[plane];
+ }
+ return true;
+}
+
+bool LoopRestorationInfo::PopulateUnitInfoForSuperBlock(
+ Plane plane, BlockSize block_size, bool is_superres_scaled,
+ uint8_t superres_scale_denominator, int row4x4, int column4x4,
+ LoopRestorationUnitInfo* const unit_info) const {
+ assert(unit_info != nullptr);
+ if (!plane_needs_filtering_[plane]) return false;
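+ // Loop restoration is applied to the upscaled frame, so when superres is
+ // enabled the horizontal pixel positions are scaled by
+ // |superres_scale_denominator| / 8 before being converted to unit indices.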
+ const int numerator_column =
+ is_superres_scaled ? superres_scale_denominator : 1;
+ const int pixel_column_start =
+ RowOrColumn4x4ToPixel(column4x4, plane, subsampling_x_);
+ const int pixel_column_end = RowOrColumn4x4ToPixel(
+ column4x4 + kNum4x4BlocksWide[block_size], plane, subsampling_x_);
+ const int unit_row_log2 = loop_restoration_->unit_size_log2[plane];
+ const int denominator_column_log2 =
+ unit_row_log2 + (is_superres_scaled ? 3 : 0);
+ const int pixel_row_start =
+ RowOrColumn4x4ToPixel(row4x4, plane, subsampling_y_);
+ const int pixel_row_end = RowOrColumn4x4ToPixel(
+ row4x4 + kNum4x4BlocksHigh[block_size], plane, subsampling_y_);
+ unit_info->column_start = RightShiftWithCeiling(
+ pixel_column_start * numerator_column, denominator_column_log2);
+ unit_info->column_end = RightShiftWithCeiling(
+ pixel_column_end * numerator_column, denominator_column_log2);
+ unit_info->row_start = RightShiftWithCeiling(pixel_row_start, unit_row_log2);
+ unit_info->row_end = RightShiftWithCeiling(pixel_row_end, unit_row_log2);
+ unit_info->column_end =
+ std::min(unit_info->column_end, num_horizontal_units_[plane]);
+ unit_info->row_end = std::min(unit_info->row_end, num_vertical_units_[plane]);
+ return true;
+}
+
+void LoopRestorationInfo::ReadUnitCoefficients(
+ DaalaBitReader* const reader,
+ SymbolDecoderContext* const symbol_decoder_context, Plane plane,
+ int unit_id,
+ std::array<RestorationUnitInfo, kMaxPlanes>* const reference_unit_info) {
+ LoopRestorationType unit_restoration_type = kLoopRestorationTypeNone;
+ if (loop_restoration_->type[plane] == kLoopRestorationTypeSwitchable) {
+ unit_restoration_type = kBitstreamRestorationTypeMap
+ [reader->ReadSymbol<kRestorationTypeSymbolCount>(
+ symbol_decoder_context->restoration_type_cdf)];
+ } else if (loop_restoration_->type[plane] == kLoopRestorationTypeWiener) {
+ const bool use_wiener =
+ reader->ReadSymbol(symbol_decoder_context->use_wiener_cdf);
+ if (use_wiener) unit_restoration_type = kLoopRestorationTypeWiener;
+ } else if (loop_restoration_->type[plane] == kLoopRestorationTypeSgrProj) {
+ const bool use_sgrproj =
+ reader->ReadSymbol(symbol_decoder_context->use_sgrproj_cdf);
+ if (use_sgrproj) unit_restoration_type = kLoopRestorationTypeSgrProj;
+ }
+ loop_restoration_info_[plane][unit_id].type = unit_restoration_type;
+
+ if (unit_restoration_type == kLoopRestorationTypeWiener) {
+ ReadWienerInfo(reader, plane, unit_id, reference_unit_info);
+ } else if (unit_restoration_type == kLoopRestorationTypeSgrProj) {
+ ReadSgrProjInfo(reader, plane, unit_id, reference_unit_info);
+ }
+}
+
+void LoopRestorationInfo::ReadWienerInfo(
+ DaalaBitReader* const reader, Plane plane, int unit_id,
+ std::array<RestorationUnitInfo, kMaxPlanes>* const reference_unit_info) {
+ for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) {
+ if (plane != kPlaneY) {
+ loop_restoration_info_[plane][unit_id].wiener_info.filter[i][0] = 0;
+ }
+ int sum = 0;
+ for (int j = static_cast<int>(plane != kPlaneY); j < kNumWienerCoefficients;
+ ++j) {
+ const int8_t wiener_min = kWienerTapsMin[j];
+ const int8_t wiener_max = kWienerTapsMax[j];
+ const int control = j + 1;
+ int value;
+ if (!reader->DecodeSignedSubexpWithReference(
+ wiener_min, wiener_max + 1,
+ (*reference_unit_info)[plane].wiener_info.filter[i][j], control,
+ &value)) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Error decoding Wiener filter coefficients: plane %d, unit_id %d",
+ static_cast<int>(plane), unit_id);
+ return;
+ }
+ loop_restoration_info_[plane][unit_id].wiener_info.filter[i][j] = value;
+ (*reference_unit_info)[plane].wiener_info.filter[i][j] = value;
+ sum += value;
+ }
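+ // The Wiener kernel is symmetric and its taps sum to 128. The three side
+ // taps each appear twice, so the center tap is 128 - 2 * sum.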
+ loop_restoration_info_[plane][unit_id].wiener_info.filter[i][3] =
+ 128 - 2 * sum;
+ loop_restoration_info_[plane][unit_id]
+ .wiener_info.number_leading_zero_coefficients[i] =
+ CountLeadingZeroCoefficients(
+ loop_restoration_info_[plane][unit_id].wiener_info.filter[i]);
+ }
+}
+
+void LoopRestorationInfo::ReadSgrProjInfo(
+ DaalaBitReader* const reader, Plane plane, int unit_id,
+ std::array<RestorationUnitInfo, kMaxPlanes>* const reference_unit_info) {
+ const int sgr_proj_index =
+ static_cast<int>(reader->ReadLiteral(kSgrProjParamsBits));
+ loop_restoration_info_[plane][unit_id].sgr_proj_info.index = sgr_proj_index;
+ for (int i = 0; i < 2; ++i) {
+ const uint8_t radius = kSgrProjParams[sgr_proj_index][i * 2];
+ const int8_t multiplier_min = kSgrProjMultiplierMin[i];
+ const int8_t multiplier_max = kSgrProjMultiplierMax[i];
+ int multiplier;
+ if (radius != 0) {
+ if (!reader->DecodeSignedSubexpWithReference(
+ multiplier_min, multiplier_max + 1,
+ (*reference_unit_info)[plane].sgr_proj_info.multiplier[i],
+ kSgrProjReadControl, &multiplier)) {
+ LIBGAV1_DLOG(ERROR,
+ "Error decoding Self-guided filter coefficients: plane "
+ "%d, unit_id %d",
+ static_cast<int>(plane), unit_id);
+ return;
+ }
+ } else {
+ // The range of (*reference_unit_info)[plane].sgr_proj_info.multiplier[0]
+ // from DecodeSignedSubexpWithReference() is [-96, 31] and the default is
+ // -32, making Clip3(128 - 31, -32, 95) unnecessary.
+ static constexpr int kMultiplier[2] = {0, 95};
+ multiplier = kMultiplier[i];
+ assert(
+ i == 0 ||
+ Clip3((1 << kSgrProjPrecisionBits) -
+ (*reference_unit_info)[plane].sgr_proj_info.multiplier[0],
+ multiplier_min, multiplier_max) == kMultiplier[1]);
+ }
+ loop_restoration_info_[plane][unit_id].sgr_proj_info.multiplier[i] =
+ multiplier;
+ (*reference_unit_info)[plane].sgr_proj_info.multiplier[i] = multiplier;
+ }
+}
+
+} // namespace libgav1
diff --git a/src/loop_restoration_info.h b/src/loop_restoration_info.h
new file mode 100644
index 0000000..f174b89
--- /dev/null
+++ b/src/loop_restoration_info.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_LOOP_RESTORATION_INFO_H_
+#define LIBGAV1_SRC_LOOP_RESTORATION_INFO_H_
+
+#include <array>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "src/dsp/common.h"
+#include "src/symbol_decoder_context.h"
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+struct LoopRestorationUnitInfo {
+ int row_start;
+ int row_end;
+ int column_start;
+ int column_end;
+};
+
+class LoopRestorationInfo {
+ public:
+ LoopRestorationInfo() = default;
+
+ // Non copyable/movable.
+ LoopRestorationInfo(const LoopRestorationInfo&) = delete;
+ LoopRestorationInfo& operator=(const LoopRestorationInfo&) = delete;
+ LoopRestorationInfo(LoopRestorationInfo&&) = delete;
+ LoopRestorationInfo& operator=(LoopRestorationInfo&&) = delete;
+
+ bool Reset(const LoopRestoration* loop_restoration, uint32_t width,
+ uint32_t height, int8_t subsampling_x, int8_t subsampling_y,
+ bool is_monochrome);
+ // Populates the |unit_info| for the super block at |row4x4|, |column4x4|.
+ // Returns true on success, false otherwise.
+ bool PopulateUnitInfoForSuperBlock(Plane plane, BlockSize block_size,
+ bool is_superres_scaled,
+ uint8_t superres_scale_denominator,
+ int row4x4, int column4x4,
+ LoopRestorationUnitInfo* unit_info) const;
+ void ReadUnitCoefficients(DaalaBitReader* reader,
+ SymbolDecoderContext* symbol_decoder_context,
+ Plane plane, int unit_id,
+ std::array<RestorationUnitInfo, kMaxPlanes>*
+ reference_unit_info); // 5.11.58.
+ void ReadWienerInfo(
+ DaalaBitReader* reader, Plane plane, int unit_id,
+ std::array<RestorationUnitInfo, kMaxPlanes>* reference_unit_info);
+ void ReadSgrProjInfo(
+ DaalaBitReader* reader, Plane plane, int unit_id,
+ std::array<RestorationUnitInfo, kMaxPlanes>* reference_unit_info);
+
+ // Getters.
+ const RestorationUnitInfo* loop_restoration_info(Plane plane,
+ int unit_id) const {
+ return &loop_restoration_info_[plane][unit_id];
+ }
+
+ int num_horizontal_units(Plane plane) const {
+ return num_horizontal_units_[plane];
+ }
+ int num_vertical_units(Plane plane) const {
+ return num_vertical_units_[plane];
+ }
+ int num_units(Plane plane) const { return num_units_[plane]; }
+
+ private:
+ // If plane_needs_filtering_[plane] is true, loop_restoration_info_[plane]
+ // points to an array of num_units_[plane] elements.
+ RestorationUnitInfo* loop_restoration_info_[kMaxPlanes];
+ // Owns the memory that loop_restoration_info_[plane] points to.
+ DynamicBuffer<RestorationUnitInfo> loop_restoration_info_buffer_;
+ bool plane_needs_filtering_[kMaxPlanes];
+ const LoopRestoration* loop_restoration_;
+ int8_t subsampling_x_;
+ int8_t subsampling_y_;
+ int num_horizontal_units_[kMaxPlanes];
+ int num_vertical_units_[kMaxPlanes];
+ int num_units_[kMaxPlanes];
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_LOOP_RESTORATION_INFO_H_
diff --git a/src/motion_vector.cc b/src/motion_vector.cc
new file mode 100644
index 0000000..fdb1875
--- /dev/null
+++ b/src/motion_vector.cc
@@ -0,0 +1,1001 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/motion_vector.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+// Entry at index i is computed as:
+// Clip3(std::max(kBlockWidthPixels[i], kBlockHeightPixels[i]), 16, 112).
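+// For example, kBlock16x64 yields Clip3(std::max(16, 64), 16, 112) = 64 and
+// kBlock128x128 yields Clip3(128, 16, 112) = 112.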
+constexpr int kWarpValidThreshold[kMaxBlockSizes] = {
+ 16, 16, 16, 16, 16, 16, 32, 16, 16, 16, 32,
+ 64, 32, 32, 32, 64, 64, 64, 64, 112, 112, 112};
+
+// 7.10.2.10.
+void LowerMvPrecision(const ObuFrameHeader& frame_header,
+ MotionVector* const mvs) {
+ if (frame_header.allow_high_precision_mv) return;
+ if (frame_header.force_integer_mv != 0) {
+ for (auto& mv : mvs->mv) {
+ // The next line is equivalent to:
+ // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7;
+ // const int sign = mv >> 15;
+ // mv = ApplySign(value, sign);
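+ // e.g. mv = -5 becomes (-5 + 3 + 1) & ~7 = -8.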
+ mv = (mv + 3 - (mv >> 15)) & ~7;
+ }
+ } else {
+ for (auto& mv : mvs->mv) {
+ // The next line is equivalent to:
+ // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;
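+ // e.g. mv = -3 becomes (-3 + 1) & ~1 = -2.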
+ mv = (mv - (mv >> 15)) & ~1;
+ }
+ }
+}
+
+// 7.10.2.1.
+void SetupGlobalMv(const Tile::Block& block, int index,
+ MotionVector* const mv) {
+ const BlockParameters& bp = *block.bp;
+ const ObuFrameHeader& frame_header = block.tile.frame_header();
+ ReferenceFrameType reference_type = bp.reference_frame[index];
+ const auto& gm = frame_header.global_motion[reference_type];
+ if (reference_type == kReferenceFrameIntra ||
+ gm.type == kGlobalMotionTransformationTypeIdentity) {
+ mv->mv32 = 0;
+ return;
+ }
+ if (gm.type == kGlobalMotionTransformationTypeTranslation) {
+ for (int i = 0; i < 2; ++i) {
+ mv->mv[i] = gm.params[i] >> (kWarpedModelPrecisionBits - 3);
+ }
+ LowerMvPrecision(frame_header, mv);
+ return;
+ }
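+ // For rotation/zoom and affine global motion, |xc| and |yc| below are the
+ // displacement of the block's center pixel under the warp model, expressed
+ // in warped-model precision.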
+ const int x = MultiplyBy4(block.column4x4) + DivideBy2(block.width) - 1;
+ const int y = MultiplyBy4(block.row4x4) + DivideBy2(block.height) - 1;
+ const int xc = (gm.params[2] - (1 << kWarpedModelPrecisionBits)) * x +
+ gm.params[3] * y + gm.params[0];
+ const int yc = gm.params[4] * x +
+ (gm.params[5] - (1 << kWarpedModelPrecisionBits)) * y +
+ gm.params[1];
+ if (frame_header.allow_high_precision_mv) {
+ mv->mv[MotionVector::kRow] =
+ RightShiftWithRoundingSigned(yc, kWarpedModelPrecisionBits - 3);
+ mv->mv[MotionVector::kColumn] =
+ RightShiftWithRoundingSigned(xc, kWarpedModelPrecisionBits - 3);
+ } else {
+ mv->mv[MotionVector::kRow] = MultiplyBy2(
+ RightShiftWithRoundingSigned(yc, kWarpedModelPrecisionBits - 2));
+ mv->mv[MotionVector::kColumn] = MultiplyBy2(
+ RightShiftWithRoundingSigned(xc, kWarpedModelPrecisionBits - 2));
+ LowerMvPrecision(frame_header, mv);
+ }
+}
+
+constexpr BitMaskSet kPredictionModeNewMvMask(kPredictionModeNewMv,
+ kPredictionModeNewNewMv,
+ kPredictionModeNearNewMv,
+ kPredictionModeNewNearMv,
+ kPredictionModeNearestNewMv,
+ kPredictionModeNewNearestMv);
+
+// 7.10.2.8.
+void SearchStack(const Tile::Block& block, const BlockParameters& mv_bp,
+ int index, int weight, bool* const found_new_mv,
+ bool* const found_match, int* const num_mv_found) {
+ const BlockParameters& bp = *block.bp;
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motion =
+ block.tile.frame_header().global_motion;
+ PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+ MotionVector candidate_mv;
+ // LowerMvPrecision() is not necessary, since the values in
+ // |prediction_parameters.global_mv| and |mv_bp.mv| were generated by it.
+ const auto global_motion_type = global_motion[bp.reference_frame[0]].type;
+ if (IsGlobalMvBlock(mv_bp.is_global_mv_block, global_motion_type)) {
+ candidate_mv = prediction_parameters.global_mv[0];
+ } else {
+ candidate_mv = mv_bp.mv.mv[index];
+ }
+ *found_new_mv |= kPredictionModeNewMvMask.Contains(mv_bp.y_mode);
+ *found_match = true;
+ MotionVector* const ref_mv_stack = prediction_parameters.ref_mv_stack;
+ const int num_found = *num_mv_found;
+ const auto result = std::find_if(ref_mv_stack, ref_mv_stack + num_found,
+ [&candidate_mv](const MotionVector& ref_mv) {
+ return ref_mv == candidate_mv;
+ });
+ if (result != ref_mv_stack + num_found) {
+ prediction_parameters.IncreaseWeight(std::distance(ref_mv_stack, result),
+ weight);
+ return;
+ }
+ if (num_found >= kMaxRefMvStackSize) return;
+ ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters.SetWeightIndexStackEntry(num_found, weight);
+ ++*num_mv_found;
+}
+
+// 7.10.2.9.
+void CompoundSearchStack(const Tile::Block& block, const BlockParameters& mv_bp,
+ int weight, bool* const found_new_mv,
+ bool* const found_match, int* const num_mv_found) {
+ const BlockParameters& bp = *block.bp;
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motion =
+ block.tile.frame_header().global_motion;
+ PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+ // LowerMvPrecision() is not necessary, since the values in
+ // |prediction_parameters.global_mv| and |mv_bp.mv| were generated by it.
+ CompoundMotionVector candidate_mv = mv_bp.mv;
+ for (int i = 0; i < 2; ++i) {
+ const auto global_motion_type = global_motion[bp.reference_frame[i]].type;
+ if (IsGlobalMvBlock(mv_bp.is_global_mv_block, global_motion_type)) {
+ candidate_mv.mv[i] = prediction_parameters.global_mv[i];
+ }
+ }
+ *found_new_mv |= kPredictionModeNewMvMask.Contains(mv_bp.y_mode);
+ *found_match = true;
+ CompoundMotionVector* const compound_ref_mv_stack =
+ prediction_parameters.compound_ref_mv_stack;
+ const int num_found = *num_mv_found;
+ const auto result =
+ std::find_if(compound_ref_mv_stack, compound_ref_mv_stack + num_found,
+ [&candidate_mv](const CompoundMotionVector& ref_mv) {
+ return ref_mv == candidate_mv;
+ });
+ if (result != compound_ref_mv_stack + num_found) {
+ prediction_parameters.IncreaseWeight(
+ std::distance(compound_ref_mv_stack, result), weight);
+ return;
+ }
+ if (num_found >= kMaxRefMvStackSize) return;
+ compound_ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters.SetWeightIndexStackEntry(num_found, weight);
+ ++*num_mv_found;
+}
+
+// 7.10.2.7.
+void AddReferenceMvCandidate(const Tile::Block& block,
+ const BlockParameters& mv_bp, bool is_compound,
+ int weight, bool* const found_new_mv,
+ bool* const found_match, int* const num_mv_found) {
+ if (!mv_bp.is_inter) return;
+ const BlockParameters& bp = *block.bp;
+ if (is_compound) {
+ if (mv_bp.reference_frame[0] == bp.reference_frame[0] &&
+ mv_bp.reference_frame[1] == bp.reference_frame[1]) {
+ CompoundSearchStack(block, mv_bp, weight, found_new_mv, found_match,
+ num_mv_found);
+ }
+ return;
+ }
+ for (int i = 0; i < 2; ++i) {
+ if (mv_bp.reference_frame[i] == bp.reference_frame[0]) {
+ SearchStack(block, mv_bp, i, weight, found_new_mv, found_match,
+ num_mv_found);
+ }
+ }
+}
+
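+// Lower bound (in 4x4 block units) on the step used by ScanRow()/ScanColumn()
+// when walking neighboring blocks for reference MV candidates.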
+int GetMinimumStep(int block_width_or_height4x4, int delta_row_or_column) {
+ assert(delta_row_or_column < 0);
+ if (block_width_or_height4x4 >= 16) return 4;
+ if (delta_row_or_column < -1) return 2;
+ return 0;
+}
+
+// 7.10.2.2.
+void ScanRow(const Tile::Block& block, int mv_column, int delta_row,
+ bool is_compound, bool* const found_new_mv,
+ bool* const found_match, int* const num_mv_found) {
+ const int mv_row = block.row4x4 + delta_row;
+ const Tile& tile = block.tile;
+ if (!tile.IsTopInside(mv_row + 1)) return;
+ const int width4x4 = block.width4x4;
+ const int min_step = GetMinimumStep(width4x4, delta_row);
+ BlockParameters** bps = tile.BlockParametersAddress(mv_row, mv_column);
+ BlockParameters** const end_bps =
+ bps + std::min({static_cast<int>(width4x4),
+ tile.frame_header().columns4x4 - block.column4x4, 16});
+ do {
+ const BlockParameters& mv_bp = **bps;
+ const int step = std::max(
+ std::min(width4x4, static_cast<int>(kNum4x4BlocksWide[mv_bp.size])),
+ min_step);
+ AddReferenceMvCandidate(block, mv_bp, is_compound, MultiplyBy2(step),
+ found_new_mv, found_match, num_mv_found);
+ bps += step;
+ } while (bps < end_bps);
+}
+
+// 7.10.2.3.
+void ScanColumn(const Tile::Block& block, int mv_row, int delta_column,
+ bool is_compound, bool* const found_new_mv,
+ bool* const found_match, int* const num_mv_found) {
+ const int mv_column = block.column4x4 + delta_column;
+ const Tile& tile = block.tile;
+ if (!tile.IsLeftInside(mv_column + 1)) return;
+ const int height4x4 = block.height4x4;
+ const int min_step = GetMinimumStep(height4x4, delta_column);
+ const ptrdiff_t stride = tile.BlockParametersStride();
+ BlockParameters** bps = tile.BlockParametersAddress(mv_row, mv_column);
+ BlockParameters** const end_bps =
+ bps + stride * std::min({static_cast<int>(height4x4),
+ tile.frame_header().rows4x4 - block.row4x4, 16});
+ do {
+ const BlockParameters& mv_bp = **bps;
+ const int step = std::max(
+ std::min(height4x4, static_cast<int>(kNum4x4BlocksHigh[mv_bp.size])),
+ min_step);
+ AddReferenceMvCandidate(block, mv_bp, is_compound, MultiplyBy2(step),
+ found_new_mv, found_match, num_mv_found);
+ bps += step * stride;
+ } while (bps < end_bps);
+}
+
+// 7.10.2.4.
+void ScanPoint(const Tile::Block& block, int delta_row, int delta_column,
+ bool is_compound, bool* const found_new_mv,
+ bool* const found_match, int* const num_mv_found) {
+ const int mv_row = block.row4x4 + delta_row;
+ const int mv_column = block.column4x4 + delta_column;
+ const Tile& tile = block.tile;
+ if (!tile.IsInside(mv_row, mv_column) ||
+ !tile.HasParameters(mv_row, mv_column)) {
+ return;
+ }
+ const BlockParameters& mv_bp = tile.Parameters(mv_row, mv_column);
+ if (mv_bp.reference_frame[0] == kReferenceFrameNone) return;
+ AddReferenceMvCandidate(block, mv_bp, is_compound, 4, found_new_mv,
+ found_match, num_mv_found);
+}
+
+// 7.10.2.6.
+void AddTemporalReferenceMvCandidate(
+ const ObuFrameHeader& frame_header, const int reference_offsets[2],
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets, int count, bool is_compound,
+ int* const zero_mv_context, int* const num_mv_found,
+ PredictionParameters* const prediction_parameters) {
+ const int mv_projection_function_index =
+ frame_header.allow_high_precision_mv ? 2 : frame_header.force_integer_mv;
+ const MotionVector* const global_mv = prediction_parameters->global_mv;
+ if (is_compound) {
+ CompoundMotionVector candidate_mvs[kMaxTemporalMvCandidatesWithPadding];
+ const dsp::Dsp& dsp = *dsp::GetDspTable(8);
+ dsp.mv_projection_compound[mv_projection_function_index](
+ temporal_mvs, temporal_reference_offsets, reference_offsets, count,
+ candidate_mvs);
+ if (*zero_mv_context == -1) {
+ int max_difference =
+ std::max(std::abs(candidate_mvs[0].mv[0].mv[0] - global_mv[0].mv[0]),
+ std::abs(candidate_mvs[0].mv[0].mv[1] - global_mv[0].mv[1]));
+ max_difference =
+ std::max(max_difference,
+ std::abs(candidate_mvs[0].mv[1].mv[0] - global_mv[1].mv[0]));
+ max_difference =
+ std::max(max_difference,
+ std::abs(candidate_mvs[0].mv[1].mv[1] - global_mv[1].mv[1]));
+ *zero_mv_context = static_cast<int>(max_difference >= 16);
+ }
+ CompoundMotionVector* const compound_ref_mv_stack =
+ prediction_parameters->compound_ref_mv_stack;
+ int num_found = *num_mv_found;
+ int index = 0;
+ do {
+ const CompoundMotionVector& candidate_mv = candidate_mvs[index];
+ const auto result =
+ std::find_if(compound_ref_mv_stack, compound_ref_mv_stack + num_found,
+ [&candidate_mv](const CompoundMotionVector& ref_mv) {
+ return ref_mv == candidate_mv;
+ });
+ if (result != compound_ref_mv_stack + num_found) {
+ prediction_parameters->IncreaseWeight(
+ std::distance(compound_ref_mv_stack, result), 2);
+ continue;
+ }
+ if (num_found >= kMaxRefMvStackSize) continue;
+ compound_ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters->SetWeightIndexStackEntry(num_found, 2);
+ ++num_found;
+ } while (++index < count);
+ *num_mv_found = num_found;
+ return;
+ }
+ MotionVector* const ref_mv_stack = prediction_parameters->ref_mv_stack;
+ if (reference_offsets[0] == 0) {
+ if (*zero_mv_context == -1) {
+ const int max_difference =
+ std::max(std::abs(global_mv[0].mv[0]), std::abs(global_mv[0].mv[1]));
+ *zero_mv_context = static_cast<int>(max_difference >= 16);
+ }
+ const MotionVector candidate_mv = {};
+ const int num_found = *num_mv_found;
+ const auto result =
+ std::find_if(ref_mv_stack, ref_mv_stack + num_found,
+ [&candidate_mv](const MotionVector& ref_mv) {
+ return ref_mv == candidate_mv;
+ });
+ if (result != ref_mv_stack + num_found) {
+ prediction_parameters->IncreaseWeight(std::distance(ref_mv_stack, result),
+ 2 * count);
+ return;
+ }
+ if (num_found >= kMaxRefMvStackSize) return;
+ ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters->SetWeightIndexStackEntry(num_found, 2 * count);
+ ++*num_mv_found;
+ return;
+ }
+ alignas(kMaxAlignment)
+ MotionVector candidate_mvs[kMaxTemporalMvCandidatesWithPadding];
+ const dsp::Dsp& dsp = *dsp::GetDspTable(8);
+ dsp.mv_projection_single[mv_projection_function_index](
+ temporal_mvs, temporal_reference_offsets, reference_offsets[0], count,
+ candidate_mvs);
+ if (*zero_mv_context == -1) {
+ const int max_difference =
+ std::max(std::abs(candidate_mvs[0].mv[0] - global_mv[0].mv[0]),
+ std::abs(candidate_mvs[0].mv[1] - global_mv[0].mv[1]));
+ *zero_mv_context = static_cast<int>(max_difference >= 16);
+ }
+ int num_found = *num_mv_found;
+ int index = 0;
+ do {
+ const MotionVector& candidate_mv = candidate_mvs[index];
+ const auto result =
+ std::find_if(ref_mv_stack, ref_mv_stack + num_found,
+ [&candidate_mv](const MotionVector& ref_mv) {
+ return ref_mv == candidate_mv;
+ });
+ if (result != ref_mv_stack + num_found) {
+ prediction_parameters->IncreaseWeight(std::distance(ref_mv_stack, result),
+ 2);
+ continue;
+ }
+ if (num_found >= kMaxRefMvStackSize) continue;
+ ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters->SetWeightIndexStackEntry(num_found, 2);
+ ++num_found;
+ } while (++index < count);
+ *num_mv_found = num_found;
+}
+
+// Part of 7.10.2.5.
+bool IsWithinTheSame64x64Block(const Tile::Block& block, int delta_row,
+ int delta_column) {
+ const int row = (block.row4x4 & 15) + delta_row;
+ const int column = (block.column4x4 & 15) + delta_column;
+ // |block.height4x4| is at least 2 for all elements in |kTemporalScanMask|.
+ // So |row| is always non-negative.

+ assert(row >= 0);
+ return row < 16 && column >= 0 && column < 16;
+}
+
+constexpr BitMaskSet kTemporalScanMask(kBlock8x8, kBlock8x16, kBlock8x32,
+ kBlock16x8, kBlock16x16, kBlock16x32,
+ kBlock32x8, kBlock32x16, kBlock32x32);
+
+// 7.10.2.5.
+void TemporalScan(const Tile::Block& block, bool is_compound,
+ int* const zero_mv_context, int* const num_mv_found) {
+ const int step_w = (block.width4x4 >= 16) ? 4 : 2;
+ const int step_h = (block.height4x4 >= 16) ? 4 : 2;
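+ // The temporal motion field is stored at 8x8 granularity, so candidates are
+ // sampled at odd 4x4 positions and indexed with (mv_row >> 1, mv_column >> 1).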
+ const int row_start = block.row4x4 | 1;
+ const int column_start = block.column4x4 | 1;
+ const int row_end =
+ row_start + std::min(static_cast<int>(block.height4x4), 16);
+ const int column_end =
+ column_start + std::min(static_cast<int>(block.width4x4), 16);
+ const Tile& tile = block.tile;
+ const TemporalMotionField& motion_field = tile.motion_field();
+ const int stride = motion_field.mv.columns();
+ const MotionVector* motion_field_mv = motion_field.mv[0];
+ const int8_t* motion_field_reference_offset =
+ motion_field.reference_offset[0];
+ alignas(kMaxAlignment)
+ MotionVector temporal_mvs[kMaxTemporalMvCandidatesWithPadding];
+ int8_t temporal_reference_offsets[kMaxTemporalMvCandidatesWithPadding];
+ int count = 0;
+ int offset = stride * (row_start >> 1);
+ int mv_row = row_start;
+ do {
+ int mv_column = column_start;
+ do {
+ // Both horizontal and vertical offsets are positive. Only bottom and
+ // right boundaries need to be checked.
+ if (tile.IsBottomRightInside(mv_row, mv_column)) {
+ const int x8 = mv_column >> 1;
+ const MotionVector temporal_mv = motion_field_mv[offset + x8];
+ if (temporal_mv.mv[0] == kInvalidMvValue) {
+ if (mv_row == row_start && mv_column == column_start) {
+ *zero_mv_context = 1;
+ }
+ } else {
+ temporal_mvs[count] = temporal_mv;
+ temporal_reference_offsets[count++] =
+ motion_field_reference_offset[offset + x8];
+ }
+ }
+ mv_column += step_w;
+ } while (mv_column < column_end);
+ offset += stride * step_h >> 1;
+ mv_row += step_h;
+ } while (mv_row < row_end);
+ if (kTemporalScanMask.Contains(block.size)) {
+ const int temporal_sample_positions[3][2] = {
+ {block.height4x4, -2},
+ {block.height4x4, block.width4x4},
+ {block.height4x4 - 2, block.width4x4}};
+ // Getting the address of an element in Array2D is slow. Precalculate the
+ // offsets.
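+    // Each offset equals stride * (mv_row >> 1) + (mv_column >> 1) for the
+    // corresponding sample position, with mv_row = row_start + position[0]
+    // and mv_column = column_start + position[1].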
+ int temporal_sample_offsets[3];
+ temporal_sample_offsets[0] = stride * ((row_start + block.height4x4) >> 1) +
+ ((column_start - 2) >> 1);
+ temporal_sample_offsets[1] =
+ temporal_sample_offsets[0] + ((block.width4x4 + 2) >> 1);
+ temporal_sample_offsets[2] = temporal_sample_offsets[1] - stride;
+ for (int i = 0; i < 3; i++) {
+ const int row = temporal_sample_positions[i][0];
+ const int column = temporal_sample_positions[i][1];
+ if (!IsWithinTheSame64x64Block(block, row, column)) continue;
+ const int mv_row = row_start + row;
+ const int mv_column = column_start + column;
+      // IsWithinTheSame64x64Block() guarantees the reference block is within
+      // the top and left boundaries.
+ if (!tile.IsBottomRightInside(mv_row, mv_column)) continue;
+ const MotionVector temporal_mv =
+ motion_field_mv[temporal_sample_offsets[i]];
+ if (temporal_mv.mv[0] != kInvalidMvValue) {
+ temporal_mvs[count] = temporal_mv;
+ temporal_reference_offsets[count++] =
+ motion_field_reference_offset[temporal_sample_offsets[i]];
+ }
+ }
+ }
+ if (count != 0) {
+ BlockParameters* const bp = block.bp;
+ int reference_offsets[2];
+ const int offset_0 = tile.current_frame()
+ .reference_info()
+ ->relative_distance_to[bp->reference_frame[0]];
+ reference_offsets[0] =
+ Clip3(offset_0, -kMaxFrameDistance, kMaxFrameDistance);
+ if (is_compound) {
+ const int offset_1 = tile.current_frame()
+ .reference_info()
+ ->relative_distance_to[bp->reference_frame[1]];
+ reference_offsets[1] =
+ Clip3(offset_1, -kMaxFrameDistance, kMaxFrameDistance);
+ // Pad so that SIMD implementations won't read uninitialized memory.
+ if ((count & 1) != 0) {
+ temporal_mvs[count].mv32 = 0;
+ temporal_reference_offsets[count] = 0;
+ }
+ } else {
+ // Pad so that SIMD implementations won't read uninitialized memory.
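+      // ((count + 3) & ~3) rounds |count| up to the next multiple of 4.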
+ for (int i = count; i < ((count + 3) & ~3); ++i) {
+ temporal_mvs[i].mv32 = 0;
+ temporal_reference_offsets[i] = 0;
+ }
+ }
+ AddTemporalReferenceMvCandidate(
+ tile.frame_header(), reference_offsets, temporal_mvs,
+ temporal_reference_offsets, count, is_compound, zero_mv_context,
+ num_mv_found, &(*bp->prediction_parameters));
+ }
+}
+
+// Part of 7.10.2.13.
+void AddExtraCompoundMvCandidate(const Tile::Block& block, int mv_row,
+ int mv_column, int* const ref_id_count,
+ MotionVector ref_id[2][2],
+ int* const ref_diff_count,
+ MotionVector ref_diff[2][2]) {
+ const auto& bp = block.tile.Parameters(mv_row, mv_column);
+ const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias =
+ block.tile.reference_frame_sign_bias();
+ for (int i = 0; i < 2; ++i) {
+ const ReferenceFrameType candidate_reference_frame = bp.reference_frame[i];
+ if (candidate_reference_frame <= kReferenceFrameIntra) continue;
+ for (int j = 0; j < 2; ++j) {
+ MotionVector candidate_mv = bp.mv.mv[i];
+ const ReferenceFrameType block_reference_frame =
+ block.bp->reference_frame[j];
+ if (candidate_reference_frame == block_reference_frame &&
+ ref_id_count[j] < 2) {
+ ref_id[j][ref_id_count[j]] = candidate_mv;
+ ++ref_id_count[j];
+ } else if (ref_diff_count[j] < 2) {
+ if (reference_frame_sign_bias[candidate_reference_frame] !=
+ reference_frame_sign_bias[block_reference_frame]) {
+ candidate_mv.mv[0] *= -1;
+ candidate_mv.mv[1] *= -1;
+ }
+ ref_diff[j][ref_diff_count[j]] = candidate_mv;
+ ++ref_diff_count[j];
+ }
+ }
+ }
+}
+
+// Part of 7.10.2.13.
+void AddExtraSingleMvCandidate(const Tile::Block& block, int mv_row,
+ int mv_column, int* const num_mv_found) {
+ const auto& bp = block.tile.Parameters(mv_row, mv_column);
+ const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias =
+ block.tile.reference_frame_sign_bias();
+ const ReferenceFrameType block_reference_frame = block.bp->reference_frame[0];
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ MotionVector* const ref_mv_stack = prediction_parameters.ref_mv_stack;
+ int num_found = *num_mv_found;
+ for (int i = 0; i < 2; ++i) {
+ const ReferenceFrameType candidate_reference_frame = bp.reference_frame[i];
+ if (candidate_reference_frame <= kReferenceFrameIntra) continue;
+ MotionVector candidate_mv = bp.mv.mv[i];
+ if (reference_frame_sign_bias[candidate_reference_frame] !=
+ reference_frame_sign_bias[block_reference_frame]) {
+ candidate_mv.mv[0] *= -1;
+ candidate_mv.mv[1] *= -1;
+ }
+ assert(num_found <= 2);
+ if ((num_found != 0 && ref_mv_stack[0] == candidate_mv) ||
+ (num_found == 2 && ref_mv_stack[1] == candidate_mv)) {
+ continue;
+ }
+ ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters.SetWeightIndexStackEntry(num_found, 0);
+ ++num_found;
+ }
+ *num_mv_found = num_found;
+}
+
+// 7.10.2.12.
+void ExtraSearch(const Tile::Block& block, bool is_compound,
+ int* const num_mv_found) {
+ const Tile& tile = block.tile;
+ const int num4x4 = std::min({static_cast<int>(block.width4x4),
+ tile.frame_header().columns4x4 - block.column4x4,
+ static_cast<int>(block.height4x4),
+ tile.frame_header().rows4x4 - block.row4x4, 16});
+ int ref_id_count[2] = {};
+ MotionVector ref_id[2][2] = {};
+ int ref_diff_count[2] = {};
+ MotionVector ref_diff[2][2] = {};
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
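+  // Pass 0 scans the row of 4x4 blocks above the current block; pass 1 scans
+  // the column to its left.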
+ for (int pass = 0; pass < 2 && *num_mv_found < 2; ++pass) {
+ for (int i = 0; i < num4x4;) {
+ const int mv_row = block.row4x4 + ((pass == 0) ? -1 : i);
+ const int mv_column = block.column4x4 + ((pass == 0) ? i : -1);
+ if (!tile.IsTopLeftInside(mv_row + 1, mv_column + 1)) break;
+ if (is_compound) {
+ AddExtraCompoundMvCandidate(block, mv_row, mv_column, ref_id_count,
+ ref_id, ref_diff_count, ref_diff);
+ } else {
+ AddExtraSingleMvCandidate(block, mv_row, mv_column, num_mv_found);
+ if (*num_mv_found >= 2) break;
+ }
+ const auto& bp = tile.Parameters(mv_row, mv_column);
+ i +=
+ (pass == 0) ? kNum4x4BlocksWide[bp.size] : kNum4x4BlocksHigh[bp.size];
+ }
+ }
+ if (is_compound) {
+ // Merge compound mode extra search into mv stack.
+ CompoundMotionVector* const compound_ref_mv_stack =
+ prediction_parameters.compound_ref_mv_stack;
+ CompoundMotionVector combined_mvs[2] = {};
+ for (int i = 0; i < 2; ++i) {
+ int count = 0;
+ assert(ref_id_count[i] <= 2);
+ for (int j = 0; j < ref_id_count[i]; ++j, ++count) {
+ combined_mvs[count].mv[i] = ref_id[i][j];
+ }
+ for (int j = 0; j < ref_diff_count[i] && count < 2; ++j, ++count) {
+ combined_mvs[count].mv[i] = ref_diff[i][j];
+ }
+ for (; count < 2; ++count) {
+ combined_mvs[count].mv[i] = prediction_parameters.global_mv[i];
+ }
+ }
+ if (*num_mv_found == 1) {
+ if (combined_mvs[0] == compound_ref_mv_stack[0]) {
+ compound_ref_mv_stack[1] = combined_mvs[1];
+ } else {
+ compound_ref_mv_stack[1] = combined_mvs[0];
+ }
+ prediction_parameters.SetWeightIndexStackEntry(1, 0);
+ } else {
+ assert(*num_mv_found == 0);
+ for (int i = 0; i < 2; ++i) {
+ compound_ref_mv_stack[i] = combined_mvs[i];
+ prediction_parameters.SetWeightIndexStackEntry(i, 0);
+ }
+ }
+ *num_mv_found = 2;
+ } else {
+    // Single prediction mode.
+ MotionVector* const ref_mv_stack = prediction_parameters.ref_mv_stack;
+ for (int i = *num_mv_found; i < 2; ++i) {
+ ref_mv_stack[i] = prediction_parameters.global_mv[0];
+ prediction_parameters.SetWeightIndexStackEntry(i, 0);
+ }
+ }
+}
+
+void DescendingOrderTwo(int* const a, int* const b) {
+ if (*a < *b) {
+ std::swap(*a, *b);
+ }
+}
+
+// Comparator used for sorting candidate motion vectors in descending order of
+// their weights (as specified in 7.10.2.11).
+bool CompareCandidateMotionVectors(const int16_t& lhs, const int16_t& rhs) {
+ return lhs > rhs;
+}
+
+void SortWeightIndexStack(const int size, const int sort_to_n,
+ int16_t* const weight_index_stack) {
+ if (size <= 1) return;
+ if (size <= 3) {
+ // Specialize small sort sizes to speed up.
+ int weight_index_0 = weight_index_stack[0];
+ int weight_index_1 = weight_index_stack[1];
+ DescendingOrderTwo(&weight_index_0, &weight_index_1);
+ if (size == 3) {
+ int weight_index_2 = weight_index_stack[2];
+ DescendingOrderTwo(&weight_index_1, &weight_index_2);
+ DescendingOrderTwo(&weight_index_0, &weight_index_1);
+ weight_index_stack[2] = weight_index_2;
+ }
+ weight_index_stack[0] = weight_index_0;
+ weight_index_stack[1] = weight_index_1;
+ return;
+ }
+ if (sort_to_n == 1) {
+ // std::max_element() is not efficient. Find the max element in a loop.
+ int16_t max_element = weight_index_stack[0];
+ int i = 1;
+ do {
+ max_element = std::max(max_element, weight_index_stack[i]);
+ } while (++i < size);
+ weight_index_stack[0] = max_element;
+ return;
+ }
+ std::partial_sort(&weight_index_stack[0], &weight_index_stack[sort_to_n],
+ &weight_index_stack[size], CompareCandidateMotionVectors);
+}
+
+// 7.10.2.14 (part 2).
+void ComputeContexts(bool found_new_mv, int nearest_matches, int total_matches,
+ int* new_mv_context, int* reference_mv_context) {
+ switch (nearest_matches) {
+ case 0:
+ *new_mv_context = std::min(total_matches, 1);
+ *reference_mv_context = total_matches;
+ break;
+ case 1:
+ *new_mv_context = 3 - static_cast<int>(found_new_mv);
+ *reference_mv_context = 2 + total_matches;
+ break;
+ default:
+ *new_mv_context = 5 - static_cast<int>(found_new_mv);
+ *reference_mv_context = 5;
+ break;
+ }
+}
+
+// 7.10.4.2.
+void AddSample(const Tile::Block& block, int delta_row, int delta_column,
+ int* const num_warp_samples, int* const num_samples_scanned,
+ int candidates[kMaxLeastSquaresSamples][4]) {
+ if (*num_samples_scanned >= kMaxLeastSquaresSamples) return;
+ const int mv_row = block.row4x4 + delta_row;
+ const int mv_column = block.column4x4 + delta_column;
+ const Tile& tile = block.tile;
+ if (!tile.IsInside(mv_row, mv_column) ||
+ !tile.HasParameters(mv_row, mv_column)) {
+ return;
+ }
+ const BlockParameters& bp = *block.bp;
+ const BlockParameters& mv_bp = tile.Parameters(mv_row, mv_column);
+ if (mv_bp.reference_frame[0] != bp.reference_frame[0] ||
+ mv_bp.reference_frame[1] != kReferenceFrameNone) {
+ return;
+ }
+ ++*num_samples_scanned;
+ const int candidate_height4x4 = kNum4x4BlocksHigh[mv_bp.size];
+ const int candidate_row = mv_row & ~(candidate_height4x4 - 1);
+ const int candidate_width4x4 = kNum4x4BlocksWide[mv_bp.size];
+ const int candidate_column = mv_column & ~(candidate_width4x4 - 1);
+ const BlockParameters& candidate_bp =
+ tile.Parameters(candidate_row, candidate_column);
+ const int mv_diff_row =
+ std::abs(candidate_bp.mv.mv[0].mv[0] - bp.mv.mv[0].mv[0]);
+ const int mv_diff_column =
+ std::abs(candidate_bp.mv.mv[0].mv[1] - bp.mv.mv[0].mv[1]);
+ const bool is_valid =
+ mv_diff_row + mv_diff_column <= kWarpValidThreshold[block.size];
+ if (!is_valid && *num_samples_scanned > 1) {
+ return;
+ }
+ const int mid_y =
+ MultiplyBy4(candidate_row) + MultiplyBy2(candidate_height4x4) - 1;
+ const int mid_x =
+ MultiplyBy4(candidate_column) + MultiplyBy2(candidate_width4x4) - 1;
+ candidates[*num_warp_samples][0] = MultiplyBy8(mid_y);
+ candidates[*num_warp_samples][1] = MultiplyBy8(mid_x);
+ candidates[*num_warp_samples][2] =
+ MultiplyBy8(mid_y) + candidate_bp.mv.mv[0].mv[0];
+ candidates[*num_warp_samples][3] =
+ MultiplyBy8(mid_x) + candidate_bp.mv.mv[0].mv[1];
+ if (is_valid) ++*num_warp_samples;
+}
+
+// 7.9.2.
+// In the spec, |dst_sign| is either 1 or -1. Here we set |dst_sign| to either 0
+// or -1 so that it can be XORed and subtracted directly in ApplySign() and
+// corresponding SIMD implementations.
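+// With a sign of 0, (v ^ sign) - sign leaves v unchanged; with a sign of -1
+// (all bits set), it yields -v in two's complement arithmetic.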
+bool MotionFieldProjection(
+ const ObuFrameHeader& frame_header,
+ const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+ reference_frames,
+ ReferenceFrameType source, int reference_to_current_with_sign, int dst_sign,
+ int y8_start, int y8_end, int x8_start, int x8_end,
+ TemporalMotionField* const motion_field) {
+ const int source_index =
+ frame_header.reference_frame_index[source - kReferenceFrameLast];
+ auto* const source_frame = reference_frames[source_index].get();
+ assert(source_frame != nullptr);
+ assert(dst_sign == 0 || dst_sign == -1);
+ if (source_frame->rows4x4() != frame_header.rows4x4 ||
+ source_frame->columns4x4() != frame_header.columns4x4 ||
+ IsIntraFrame(source_frame->frame_type())) {
+ return false;
+ }
+ assert(reference_to_current_with_sign >= -kMaxFrameDistance);
+ if (reference_to_current_with_sign > kMaxFrameDistance) return true;
+ const ReferenceInfo& reference_info = *source_frame->reference_info();
+ const dsp::Dsp& dsp = *dsp::GetDspTable(8);
+ dsp.motion_field_projection_kernel(
+ reference_info, reference_to_current_with_sign, dst_sign, y8_start,
+ y8_end, x8_start, x8_end, motion_field);
+ return true;
+}
+
+} // namespace
+
+void FindMvStack(const Tile::Block& block, bool is_compound,
+ MvContexts* const contexts) {
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ SetupGlobalMv(block, 0, &prediction_parameters.global_mv[0]);
+ if (is_compound) SetupGlobalMv(block, 1, &prediction_parameters.global_mv[1]);
+ bool found_new_mv = false;
+ bool found_row_match = false;
+ int num_mv_found = 0;
+ ScanRow(block, block.column4x4, -1, is_compound, &found_new_mv,
+ &found_row_match, &num_mv_found);
+ bool found_column_match = false;
+ ScanColumn(block, block.row4x4, -1, is_compound, &found_new_mv,
+ &found_column_match, &num_mv_found);
+ if (std::max(block.width4x4, block.height4x4) <= 16) {
+ ScanPoint(block, -1, block.width4x4, is_compound, &found_new_mv,
+ &found_row_match, &num_mv_found);
+ }
+ const int nearest_matches =
+ static_cast<int>(found_row_match) + static_cast<int>(found_column_match);
+ prediction_parameters.nearest_mv_count = num_mv_found;
+ if (block.tile.frame_header().use_ref_frame_mvs) {
+    // Initialize to an invalid value; it will be set by the temporal scan.
+ contexts->zero_mv = -1;
+ TemporalScan(block, is_compound, &contexts->zero_mv, &num_mv_found);
+ } else {
+ contexts->zero_mv = 0;
+ }
+ bool dummy_bool = false;
+ ScanPoint(block, -1, -1, is_compound, &dummy_bool, &found_row_match,
+ &num_mv_found);
+ static constexpr int deltas[2] = {-3, -5};
+ for (int i = 0; i < 2; ++i) {
+ if (i == 0 || block.height4x4 > 1) {
+ ScanRow(block, block.column4x4 | 1, deltas[i] + (block.row4x4 & 1),
+ is_compound, &dummy_bool, &found_row_match, &num_mv_found);
+ }
+ if (i == 0 || block.width4x4 > 1) {
+ ScanColumn(block, block.row4x4 | 1, deltas[i] + (block.column4x4 & 1),
+ is_compound, &dummy_bool, &found_column_match, &num_mv_found);
+ }
+ }
+ if (num_mv_found < 2) {
+ ExtraSearch(block, is_compound, &num_mv_found);
+ } else {
+    // The sort of |weight_index_stack| could be moved to Tile::AssignIntraMv()
+    // and Tile::AssignInterMv(), where only a partial sort up to the maximum
+    // index needed would be required. However, the speed gain is trivial.
+    // For the intra case, only the first 1 or 2 mvs in the stack will be used.
+    // For the inter case, |prediction_parameters.ref_mv_index| is at most 3,
+    // so the partial sort only needs to cover the first 4 mvs.
+ SortWeightIndexStack(prediction_parameters.nearest_mv_count, 4,
+ prediction_parameters.weight_index_stack);
+ // When there are 4 or more nearest mvs, the other mvs will not be used.
+ if (prediction_parameters.nearest_mv_count < 4) {
+ SortWeightIndexStack(
+ num_mv_found - prediction_parameters.nearest_mv_count,
+ 4 - prediction_parameters.nearest_mv_count,
+ prediction_parameters.weight_index_stack +
+ prediction_parameters.nearest_mv_count);
+ }
+ }
+ prediction_parameters.ref_mv_count = num_mv_found;
+ const int total_matches =
+ static_cast<int>(found_row_match) + static_cast<int>(found_column_match);
+ ComputeContexts(found_new_mv, nearest_matches, total_matches,
+ &contexts->new_mv, &contexts->reference_mv);
+ // The mv stack clamping process is in Tile::AssignIntraMv() and
+ // Tile::AssignInterMv(), and only up to two mvs are clamped.
+}
+
+void FindWarpSamples(const Tile::Block& block, int* const num_warp_samples,
+ int* const num_samples_scanned,
+ int candidates[kMaxLeastSquaresSamples][4]) {
+ const Tile& tile = block.tile;
+ bool top_left = true;
+ bool top_right = true;
+ int step = 1;
+ if (block.top_available[kPlaneY]) {
+ BlockSize source_size =
+ tile.Parameters(block.row4x4 - 1, block.column4x4).size;
+ const int source_width4x4 = kNum4x4BlocksWide[source_size];
+ if (block.width4x4 <= source_width4x4) {
+ // The & here is equivalent to % since source_width4x4 is a power of two.
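+      // (For non-negative x, x & (n - 1) == x % n when n is a power of two.)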
+ const int column_offset = -(block.column4x4 & (source_width4x4 - 1));
+ if (column_offset < 0) top_left = false;
+ if (column_offset + source_width4x4 > block.width4x4) top_right = false;
+ AddSample(block, -1, 0, num_warp_samples, num_samples_scanned,
+ candidates);
+ } else {
+ for (int i = 0;
+ i < std::min(static_cast<int>(block.width4x4),
+ tile.frame_header().columns4x4 - block.column4x4);
+ i += step) {
+ source_size =
+ tile.Parameters(block.row4x4 - 1, block.column4x4 + i).size;
+ step = std::min(static_cast<int>(block.width4x4),
+ static_cast<int>(kNum4x4BlocksWide[source_size]));
+ AddSample(block, -1, i, num_warp_samples, num_samples_scanned,
+ candidates);
+ }
+ }
+ }
+ if (block.left_available[kPlaneY]) {
+ BlockSize source_size =
+ tile.Parameters(block.row4x4, block.column4x4 - 1).size;
+ const int source_height4x4 = kNum4x4BlocksHigh[source_size];
+ if (block.height4x4 <= source_height4x4) {
+ const int row_offset = -(block.row4x4 & (source_height4x4 - 1));
+ if (row_offset < 0) top_left = false;
+ AddSample(block, 0, -1, num_warp_samples, num_samples_scanned,
+ candidates);
+ } else {
+ for (int i = 0; i < std::min(static_cast<int>(block.height4x4),
+ tile.frame_header().rows4x4 - block.row4x4);
+ i += step) {
+ source_size =
+ tile.Parameters(block.row4x4 + i, block.column4x4 - 1).size;
+ step = std::min(static_cast<int>(block.height4x4),
+ static_cast<int>(kNum4x4BlocksHigh[source_size]));
+ AddSample(block, i, -1, num_warp_samples, num_samples_scanned,
+ candidates);
+ }
+ }
+ }
+ if (top_left) {
+ AddSample(block, -1, -1, num_warp_samples, num_samples_scanned, candidates);
+ }
+ if (top_right && block.size <= kBlock64x64) {
+ AddSample(block, -1, block.width4x4, num_warp_samples, num_samples_scanned,
+ candidates);
+ }
+ if (*num_warp_samples == 0 && *num_samples_scanned > 0) *num_warp_samples = 1;
+}
+
+void SetupMotionField(
+ const ObuFrameHeader& frame_header, const RefCountedBuffer& current_frame,
+ const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+ reference_frames,
+ int row4x4_start, int row4x4_end, int column4x4_start, int column4x4_end,
+ TemporalMotionField* const motion_field) {
+ assert(frame_header.use_ref_frame_mvs);
+ const int y8_start = DivideBy2(row4x4_start);
+ const int y8_end = DivideBy2(std::min(row4x4_end, frame_header.rows4x4));
+ const int x8_start = DivideBy2(column4x4_start);
+ const int x8_end =
+ DivideBy2(std::min(column4x4_end, frame_header.columns4x4));
+ const int last_index = frame_header.reference_frame_index[0];
+ const ReferenceInfo& reference_info = *current_frame.reference_info();
+ if (!IsIntraFrame(reference_frames[last_index]->frame_type())) {
+ const int last_alternate_order_hint =
+ reference_frames[last_index]
+ ->reference_info()
+ ->order_hint[kReferenceFrameAlternate];
+ const int current_gold_order_hint =
+ reference_info.order_hint[kReferenceFrameGolden];
+ if (last_alternate_order_hint != current_gold_order_hint) {
+ const int reference_offset_last =
+ -reference_info.relative_distance_from[kReferenceFrameLast];
+ if (std::abs(reference_offset_last) <= kMaxFrameDistance) {
+ MotionFieldProjection(frame_header, reference_frames,
+ kReferenceFrameLast, reference_offset_last, -1,
+ y8_start, y8_end, x8_start, x8_end, motion_field);
+ }
+ }
+ }
+ int ref_stamp = 1;
+ const int reference_offset_backward =
+ reference_info.relative_distance_from[kReferenceFrameBackward];
+ if (reference_offset_backward > 0 &&
+ MotionFieldProjection(frame_header, reference_frames,
+ kReferenceFrameBackward, reference_offset_backward,
+ 0, y8_start, y8_end, x8_start, x8_end,
+ motion_field)) {
+ --ref_stamp;
+ }
+ const int reference_offset_alternate2 =
+ reference_info.relative_distance_from[kReferenceFrameAlternate2];
+ if (reference_offset_alternate2 > 0 &&
+ MotionFieldProjection(frame_header, reference_frames,
+ kReferenceFrameAlternate2,
+ reference_offset_alternate2, 0, y8_start, y8_end,
+ x8_start, x8_end, motion_field)) {
+ --ref_stamp;
+ }
+ if (ref_stamp >= 0) {
+ const int reference_offset_alternate =
+ reference_info.relative_distance_from[kReferenceFrameAlternate];
+ if (reference_offset_alternate > 0 &&
+ MotionFieldProjection(frame_header, reference_frames,
+ kReferenceFrameAlternate,
+ reference_offset_alternate, 0, y8_start, y8_end,
+ x8_start, x8_end, motion_field)) {
+ --ref_stamp;
+ }
+ }
+ if (ref_stamp >= 0) {
+ const int reference_offset_last2 =
+ -reference_info.relative_distance_from[kReferenceFrameLast2];
+ if (std::abs(reference_offset_last2) <= kMaxFrameDistance) {
+ MotionFieldProjection(frame_header, reference_frames,
+ kReferenceFrameLast2, reference_offset_last2, -1,
+ y8_start, y8_end, x8_start, x8_end, motion_field);
+ }
+ }
+}
+
+} // namespace libgav1
diff --git a/src/motion_vector.h b/src/motion_vector.h
new file mode 100644
index 0000000..d739e80
--- /dev/null
+++ b/src/motion_vector.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_MOTION_VECTOR_H_
+#define LIBGAV1_SRC_MOTION_VECTOR_H_
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+
+#include "src/buffer_pool.h"
+#include "src/obu_parser.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+constexpr bool IsGlobalMvBlock(bool is_global_mv_block,
+ GlobalMotionTransformationType type) {
+ return is_global_mv_block &&
+ type > kGlobalMotionTransformationTypeTranslation;
+}
+
+// The |contexts| output parameter may be null. If the caller does not need
+// the |contexts| output, pass nullptr as the argument.
+void FindMvStack(const Tile::Block& block, bool is_compound,
+ MvContexts* contexts); // 7.10.2
+
+void FindWarpSamples(const Tile::Block& block, int* num_warp_samples,
+ int* num_samples_scanned,
+ int candidates[kMaxLeastSquaresSamples][4]); // 7.10.4.
+
+// Section 7.9.1 in the spec. But this is done per tile instead of for the whole
+// frame.
+void SetupMotionField(
+ const ObuFrameHeader& frame_header, const RefCountedBuffer& current_frame,
+ const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+ reference_frames,
+ int row4x4_start, int row4x4_end, int column4x4_start, int column4x4_end,
+ TemporalMotionField* motion_field);
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_MOTION_VECTOR_H_
diff --git a/src/obu_parser.cc b/src/obu_parser.cc
new file mode 100644
index 0000000..bbf00ed
--- /dev/null
+++ b/src/obu_parser.cc
@@ -0,0 +1,2885 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/obu_parser.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/buffer_pool.h"
+#include "src/decoder_impl.h"
+#include "src/motion_vector.h"
+#include "src/utils/common.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+// 5.9.16.
+// Find the smallest value of k such that block_size << k is greater than or
+// equal to target.
+//
+// NOTE: TileLog2(block_size, target) is equal to
+// CeilLog2(ceil((double)target / block_size))
+// where the division is a floating-point number division. (This equality holds
+// even when |target| is equal to 0.) In the special case of block_size == 1,
+// TileLog2(1, target) is equal to CeilLog2(target).
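+// For example, TileLog2(64, 200) is 2 (64 << 1 = 128 < 200 but
+// 64 << 2 = 256 >= 200), and TileLog2(1, 200) is CeilLog2(200) = 8.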
+int TileLog2(int block_size, int target) {
+ int k = 0;
+ for (; (block_size << k) < target; ++k) {
+ }
+ return k;
+}
+
+void ParseBitStreamLevel(BitStreamLevel* const level, uint8_t level_bits) {
+ level->major = kMinimumMajorBitstreamLevel + (level_bits >> 2);
+ level->minor = level_bits & 3;
+}
+
+// This function assumes loop_filter is zero-initialized, so it only needs to
+// set the nonzero default values.
+void SetDefaultRefDeltas(LoopFilter* const loop_filter) {
+ loop_filter->ref_deltas[kReferenceFrameIntra] = 1;
+ loop_filter->ref_deltas[kReferenceFrameGolden] = -1;
+ loop_filter->ref_deltas[kReferenceFrameAlternate] = -1;
+ loop_filter->ref_deltas[kReferenceFrameAlternate2] = -1;
+}
+
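+// |operating_point_idc| is a 12-bit mask in which bits 0..7 select temporal
+// layers and bits 8..11 select spatial layers, which is why InSpatialLayer()
+// shifts by (spatial_id + 8).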
+bool InTemporalLayer(int operating_point_idc, int temporal_id) {
+ return ((operating_point_idc >> temporal_id) & 1) != 0;
+}
+
+bool InSpatialLayer(int operating_point_idc, int spatial_id) {
+ return ((operating_point_idc >> (spatial_id + 8)) & 1) != 0;
+}
+
+// Returns the index of the last nonzero byte in the |data| buffer of |size|
+// bytes. If there is no nonzero byte in the |data| buffer, returns -1.
+int GetLastNonzeroByteIndex(const uint8_t* data, size_t size) {
+ // Scan backward for a nonzero byte.
+ if (size > INT_MAX) return -1;
+ int i = static_cast<int>(size) - 1;
+ while (i >= 0 && data[i] == 0) {
+ --i;
+ }
+ return i;
+}
+
+// A cleanup helper class that releases the frame buffer reference held in
+// |frame| in the destructor.
+class RefCountedBufferPtrCleanup {
+ public:
+ explicit RefCountedBufferPtrCleanup(RefCountedBufferPtr* frame)
+ : frame_(*frame) {}
+
+ // Not copyable or movable.
+ RefCountedBufferPtrCleanup(const RefCountedBufferPtrCleanup&) = delete;
+ RefCountedBufferPtrCleanup& operator=(const RefCountedBufferPtrCleanup&) =
+ delete;
+
+ ~RefCountedBufferPtrCleanup() { frame_ = nullptr; }
+
+ private:
+ RefCountedBufferPtr& frame_;
+};
+
+} // namespace
+
+bool ObuSequenceHeader::ParametersChanged(const ObuSequenceHeader& old) const {
+ // Note that the operating_parameters field is not compared per Section 7.5:
+ // Within a particular coded video sequence, the contents of
+ // sequence_header_obu must be bit-identical each time the sequence header
+ // appears except for the contents of operating_parameters_info.
+ return memcmp(this, &old,
+ offsetof(ObuSequenceHeader, operating_parameters)) != 0;
+}
+
+// Macros to avoid repeated error checks in the parser code.
+#define OBU_LOG_AND_RETURN_FALSE \
+ do { \
+ LIBGAV1_DLOG(ERROR, "%s:%d (%s): Not enough bits.", __FILE__, __LINE__, \
+ __func__); \
+ return false; \
+ } while (false)
+#define OBU_PARSER_FAIL \
+ do { \
+ if (scratch == -1) { \
+ OBU_LOG_AND_RETURN_FALSE; \
+ } \
+ } while (false)
+#define OBU_READ_BIT_OR_FAIL \
+ scratch = bit_reader_->ReadBit(); \
+ OBU_PARSER_FAIL
+#define OBU_READ_LITERAL_OR_FAIL(n) \
+ scratch = bit_reader_->ReadLiteral(n); \
+ OBU_PARSER_FAIL
+#define OBU_READ_UVLC_OR_FAIL(x) \
+ do { \
+ if (!bit_reader_->ReadUvlc(&(x))) { \
+ OBU_LOG_AND_RETURN_FALSE; \
+ } \
+ } while (false)
+
+bool ObuParser::ParseColorConfig(ObuSequenceHeader* sequence_header) {
+ int64_t scratch;
+ ColorConfig* const color_config = &sequence_header->color_config;
+ OBU_READ_BIT_OR_FAIL;
+ const auto high_bitdepth = static_cast<bool>(scratch);
+ if (sequence_header->profile == kProfile2 && high_bitdepth) {
+ OBU_READ_BIT_OR_FAIL;
+ const auto is_twelve_bit = static_cast<bool>(scratch);
+ color_config->bitdepth = is_twelve_bit ? 12 : 10;
+ } else {
+ color_config->bitdepth = high_bitdepth ? 10 : 8;
+ }
+ if (sequence_header->profile == kProfile1) {
+ color_config->is_monochrome = false;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ color_config->is_monochrome = static_cast<bool>(scratch);
+ }
+ OBU_READ_BIT_OR_FAIL;
+ const auto color_description_present_flag = static_cast<bool>(scratch);
+ if (color_description_present_flag) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ color_config->color_primary = static_cast<ColorPrimary>(scratch);
+ OBU_READ_LITERAL_OR_FAIL(8);
+ color_config->transfer_characteristics =
+ static_cast<TransferCharacteristics>(scratch);
+ OBU_READ_LITERAL_OR_FAIL(8);
+ color_config->matrix_coefficients =
+ static_cast<MatrixCoefficients>(scratch);
+ } else {
+ color_config->color_primary = kColorPrimaryUnspecified;
+ color_config->transfer_characteristics =
+ kTransferCharacteristicsUnspecified;
+ color_config->matrix_coefficients = kMatrixCoefficientsUnspecified;
+ }
+ if (color_config->is_monochrome) {
+ OBU_READ_BIT_OR_FAIL;
+ color_config->color_range = static_cast<ColorRange>(scratch);
+    // Set subsampling_x and subsampling_y to 1 for monochrome. This allows
+    // monochrome to be supported in profile 0, which requires subsampling_x
+    // and subsampling_y to be 1.
+ color_config->subsampling_x = 1;
+ color_config->subsampling_y = 1;
+ color_config->chroma_sample_position = kChromaSamplePositionUnknown;
+ } else {
+ if (color_config->color_primary == kColorPrimaryBt709 &&
+ color_config->transfer_characteristics ==
+ kTransferCharacteristicsSrgb &&
+ color_config->matrix_coefficients == kMatrixCoefficientsIdentity) {
+ color_config->color_range = kColorRangeFull;
+ color_config->subsampling_x = 0;
+ color_config->subsampling_y = 0;
+ // YUV 4:4:4 is only allowed in profile 1, or profile 2 with bit depth 12.
+ // See the table at the beginning of Section 6.4.1.
+ if (sequence_header->profile != kProfile1 &&
+ (sequence_header->profile != kProfile2 ||
+ color_config->bitdepth != 12)) {
+ LIBGAV1_DLOG(ERROR,
+ "YUV 4:4:4 is not allowed in profile %d for bitdepth %d.",
+ sequence_header->profile, color_config->bitdepth);
+ return false;
+ }
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ color_config->color_range = static_cast<ColorRange>(scratch);
+ if (sequence_header->profile == kProfile0) {
+ color_config->subsampling_x = 1;
+ color_config->subsampling_y = 1;
+ } else if (sequence_header->profile == kProfile1) {
+ color_config->subsampling_x = 0;
+ color_config->subsampling_y = 0;
+ } else {
+ if (color_config->bitdepth == 12) {
+ OBU_READ_BIT_OR_FAIL;
+ color_config->subsampling_x = scratch;
+ if (color_config->subsampling_x == 1) {
+ OBU_READ_BIT_OR_FAIL;
+ color_config->subsampling_y = scratch;
+ } else {
+ color_config->subsampling_y = 0;
+ }
+ } else {
+ color_config->subsampling_x = 1;
+ color_config->subsampling_y = 0;
+ }
+ }
+ if (color_config->subsampling_x == 1 &&
+ color_config->subsampling_y == 1) {
+ OBU_READ_LITERAL_OR_FAIL(2);
+ color_config->chroma_sample_position =
+ static_cast<ChromaSamplePosition>(scratch);
+ }
+ }
+ OBU_READ_BIT_OR_FAIL;
+ color_config->separate_uv_delta_q = static_cast<bool>(scratch);
+ }
+ if (color_config->matrix_coefficients == kMatrixCoefficientsIdentity &&
+ (color_config->subsampling_x != 0 || color_config->subsampling_y != 0)) {
+ LIBGAV1_DLOG(ERROR,
+ "matrix_coefficients is MC_IDENTITY, but subsampling_x (%d) "
+ "and subsampling_y (%d) are not both 0.",
+ color_config->subsampling_x, color_config->subsampling_y);
+ return false;
+ }
+ return true;
+}
+
+bool ObuParser::ParseTimingInfo(ObuSequenceHeader* sequence_header) {
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header->timing_info_present_flag = static_cast<bool>(scratch);
+ if (!sequence_header->timing_info_present_flag) return true;
+ TimingInfo* const info = &sequence_header->timing_info;
+ OBU_READ_LITERAL_OR_FAIL(32);
+ info->num_units_in_tick = static_cast<uint32_t>(scratch);
+ if (info->num_units_in_tick == 0) {
+ LIBGAV1_DLOG(ERROR, "num_units_in_tick is 0.");
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(32);
+ info->time_scale = static_cast<uint32_t>(scratch);
+ if (info->time_scale == 0) {
+ LIBGAV1_DLOG(ERROR, "time_scale is 0.");
+ return false;
+ }
+ OBU_READ_BIT_OR_FAIL;
+ info->equal_picture_interval = static_cast<bool>(scratch);
+ if (info->equal_picture_interval) {
+ OBU_READ_UVLC_OR_FAIL(info->num_ticks_per_picture);
+ ++info->num_ticks_per_picture;
+ }
+ return true;
+}
+
+bool ObuParser::ParseDecoderModelInfo(ObuSequenceHeader* sequence_header) {
+ if (!sequence_header->timing_info_present_flag) return true;
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header->decoder_model_info_present_flag = static_cast<bool>(scratch);
+ if (!sequence_header->decoder_model_info_present_flag) return true;
+ DecoderModelInfo* const info = &sequence_header->decoder_model_info;
+ OBU_READ_LITERAL_OR_FAIL(5);
+ info->encoder_decoder_buffer_delay_length = 1 + scratch;
+ OBU_READ_LITERAL_OR_FAIL(32);
+ info->num_units_in_decoding_tick = static_cast<uint32_t>(scratch);
+ OBU_READ_LITERAL_OR_FAIL(5);
+ info->buffer_removal_time_length = 1 + scratch;
+ OBU_READ_LITERAL_OR_FAIL(5);
+ info->frame_presentation_time_length = 1 + scratch;
+ return true;
+}
+
+bool ObuParser::ParseOperatingParameters(ObuSequenceHeader* sequence_header,
+ int index) {
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header->decoder_model_present_for_operating_point[index] =
+ static_cast<bool>(scratch);
+ if (!sequence_header->decoder_model_present_for_operating_point[index]) {
+ return true;
+ }
+ OperatingParameters* const params = &sequence_header->operating_parameters;
+ OBU_READ_LITERAL_OR_FAIL(
+ sequence_header->decoder_model_info.encoder_decoder_buffer_delay_length);
+ params->decoder_buffer_delay[index] = static_cast<uint32_t>(scratch);
+ OBU_READ_LITERAL_OR_FAIL(
+ sequence_header->decoder_model_info.encoder_decoder_buffer_delay_length);
+ params->encoder_buffer_delay[index] = static_cast<uint32_t>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ params->low_delay_mode_flag[index] = static_cast<bool>(scratch);
+ return true;
+}
+
+bool ObuParser::ParseSequenceHeader(bool seen_frame_header) {
+ ObuSequenceHeader sequence_header = {};
+ int64_t scratch;
+ OBU_READ_LITERAL_OR_FAIL(3);
+ if (scratch >= kMaxProfiles) {
+ LIBGAV1_DLOG(ERROR, "Invalid profile: %d.", static_cast<int>(scratch));
+ return false;
+ }
+ sequence_header.profile = static_cast<BitstreamProfile>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.still_picture = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.reduced_still_picture_header = static_cast<bool>(scratch);
+ if (sequence_header.reduced_still_picture_header) {
+ if (!sequence_header.still_picture) {
+ LIBGAV1_DLOG(
+ ERROR, "reduced_still_picture_header is 1, but still_picture is 0.");
+ return false;
+ }
+ sequence_header.operating_points = 1;
+ sequence_header.operating_point_idc[0] = 0;
+ OBU_READ_LITERAL_OR_FAIL(5);
+ ParseBitStreamLevel(&sequence_header.level[0], scratch);
+ } else {
+ if (!ParseTimingInfo(&sequence_header) ||
+ !ParseDecoderModelInfo(&sequence_header)) {
+ return false;
+ }
+ OBU_READ_BIT_OR_FAIL;
+ const auto initial_display_delay_present_flag = static_cast<bool>(scratch);
+ OBU_READ_LITERAL_OR_FAIL(5);
+ sequence_header.operating_points = static_cast<int>(1 + scratch);
+ if (operating_point_ >= sequence_header.operating_points) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Invalid operating point: %d (valid range is [0,%d] inclusive).",
+ operating_point_, sequence_header.operating_points - 1);
+ return false;
+ }
+ for (int i = 0; i < sequence_header.operating_points; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(12);
+ sequence_header.operating_point_idc[i] = static_cast<int>(scratch);
+ for (int j = 0; j < i; ++j) {
+ if (sequence_header.operating_point_idc[i] ==
+ sequence_header.operating_point_idc[j]) {
+ LIBGAV1_DLOG(ERROR,
+ "operating_point_idc[%d] (%d) is equal to "
+ "operating_point_idc[%d] (%d).",
+ i, sequence_header.operating_point_idc[i], j,
+ sequence_header.operating_point_idc[j]);
+ return false;
+ }
+ }
+ OBU_READ_LITERAL_OR_FAIL(5);
+ ParseBitStreamLevel(&sequence_header.level[i], scratch);
+ if (sequence_header.level[i].major > 3) {
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.tier[i] = scratch;
+ }
+ if (sequence_header.decoder_model_info_present_flag &&
+ !ParseOperatingParameters(&sequence_header, i)) {
+ return false;
+ }
+ if (initial_display_delay_present_flag) {
+ OBU_READ_BIT_OR_FAIL;
+ if (static_cast<bool>(scratch)) {
+ OBU_READ_LITERAL_OR_FAIL(4);
+ sequence_header.initial_display_delay[i] = 1 + scratch;
+ }
+ }
+ }
+ }
+ OBU_READ_LITERAL_OR_FAIL(4);
+ sequence_header.frame_width_bits = 1 + scratch;
+ OBU_READ_LITERAL_OR_FAIL(4);
+ sequence_header.frame_height_bits = 1 + scratch;
+ OBU_READ_LITERAL_OR_FAIL(sequence_header.frame_width_bits);
+ sequence_header.max_frame_width = static_cast<int32_t>(1 + scratch);
+ OBU_READ_LITERAL_OR_FAIL(sequence_header.frame_height_bits);
+ sequence_header.max_frame_height = static_cast<int32_t>(1 + scratch);
+ if (!sequence_header.reduced_still_picture_header) {
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.frame_id_numbers_present = static_cast<bool>(scratch);
+ }
+ if (sequence_header.frame_id_numbers_present) {
+ OBU_READ_LITERAL_OR_FAIL(4);
+ sequence_header.delta_frame_id_length_bits = 2 + scratch;
+ OBU_READ_LITERAL_OR_FAIL(3);
+ sequence_header.frame_id_length_bits =
+ sequence_header.delta_frame_id_length_bits + 1 + scratch;
+ // Section 6.8.2: It is a requirement of bitstream conformance that the
+ // number of bits needed to read display_frame_id does not exceed 16. This
+ // is equivalent to the constraint that idLen <= 16.
+ if (sequence_header.frame_id_length_bits > 16) {
+ LIBGAV1_DLOG(ERROR, "Invalid frame_id_length_bits: %d.",
+ sequence_header.frame_id_length_bits);
+ return false;
+ }
+ }
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.use_128x128_superblock = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_filter_intra = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_intra_edge_filter = static_cast<bool>(scratch);
+ if (sequence_header.reduced_still_picture_header) {
+ sequence_header.force_screen_content_tools = kSelectScreenContentTools;
+ sequence_header.force_integer_mv = kSelectIntegerMv;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_interintra_compound = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_masked_compound = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_warped_motion = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_dual_filter = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_order_hint = static_cast<bool>(scratch);
+ if (sequence_header.enable_order_hint) {
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_jnt_comp = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_ref_frame_mvs = static_cast<bool>(scratch);
+ }
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.choose_screen_content_tools = static_cast<bool>(scratch);
+ if (sequence_header.choose_screen_content_tools) {
+ sequence_header.force_screen_content_tools = kSelectScreenContentTools;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.force_screen_content_tools = scratch;
+ }
+ if (sequence_header.force_screen_content_tools > 0) {
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.choose_integer_mv = static_cast<bool>(scratch);
+ if (sequence_header.choose_integer_mv) {
+ sequence_header.force_integer_mv = kSelectIntegerMv;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.force_integer_mv = scratch;
+ }
+ } else {
+ sequence_header.force_integer_mv = kSelectIntegerMv;
+ }
+ if (sequence_header.enable_order_hint) {
+ OBU_READ_LITERAL_OR_FAIL(3);
+ sequence_header.order_hint_bits = 1 + scratch;
+ sequence_header.order_hint_shift_bits =
+ Mod32(32 - sequence_header.order_hint_bits);
+ }
+ }
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_superres = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_cdef = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_restoration = static_cast<bool>(scratch);
+ if (!ParseColorConfig(&sequence_header)) return false;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.film_grain_params_present = static_cast<bool>(scratch);
+ // Compare new sequence header with old sequence header.
+ if (has_sequence_header_ &&
+ sequence_header.ParametersChanged(sequence_header_)) {
+ // Between the frame header OBU and the last tile group OBU of the frame,
+ // do not allow the sequence header to change.
+ if (seen_frame_header) {
+ LIBGAV1_DLOG(ERROR, "Sequence header changed in the middle of a frame.");
+ return false;
+ }
+ decoder_state_.ClearReferenceFrames();
+ }
+ sequence_header_ = sequence_header;
+ has_sequence_header_ = true;
+ // Section 6.4.1: It is a requirement of bitstream conformance that if
+ // OperatingPointIdc is equal to 0, then obu_extension_flag is equal to 0 for
+ // all OBUs that follow this sequence header until the next sequence header.
+ extension_disallowed_ =
+ (sequence_header_.operating_point_idc[operating_point_] == 0);
+ return true;
+}
+
+// Marks reference frames as invalid when they are too far in the past to be
+// referenced by the frame id mechanism.
+void ObuParser::MarkInvalidReferenceFrames() {
+ // The current lower bound of the frame ids for reference frames.
+ int lower_bound = decoder_state_.current_frame_id -
+ (1 << sequence_header_.delta_frame_id_length_bits);
+ // True if lower_bound is smaller than current_frame_id. False if lower_bound
+ // wraps around (in modular arithmetic) to the other side of current_frame_id.
+ bool lower_bound_is_smaller = true;
+ if (lower_bound <= 0) {
+ lower_bound += 1 << sequence_header_.frame_id_length_bits;
+ lower_bound_is_smaller = false;
+ }
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ const uint16_t reference_frame_id = decoder_state_.reference_frame_id[i];
+ if (lower_bound_is_smaller) {
+ if (reference_frame_id > decoder_state_.current_frame_id ||
+ reference_frame_id < lower_bound) {
+ decoder_state_.reference_valid[i] = false;
+ }
+ } else {
+ if (reference_frame_id > decoder_state_.current_frame_id &&
+ reference_frame_id < lower_bound) {
+ decoder_state_.reference_valid[i] = false;
+ }
+ }
+ }
+}
+
+bool ObuParser::ParseFrameSizeAndRenderSize() {
+ int64_t scratch;
+ // Frame Size.
+ if (frame_header_.frame_size_override_flag) {
+ OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_width_bits);
+ frame_header_.width = static_cast<int32_t>(1 + scratch);
+ OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_height_bits);
+ frame_header_.height = static_cast<int32_t>(1 + scratch);
+ if (frame_header_.width > sequence_header_.max_frame_width ||
+ frame_header_.height > sequence_header_.max_frame_height) {
+ LIBGAV1_DLOG(ERROR,
+ "Frame dimensions are larger than the maximum values");
+ return false;
+ }
+ } else {
+ frame_header_.width = sequence_header_.max_frame_width;
+ frame_header_.height = sequence_header_.max_frame_height;
+ }
+ if (!ParseSuperResParametersAndComputeImageSize()) return false;
+
+ // Render Size.
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.render_and_frame_size_different = static_cast<bool>(scratch);
+ if (frame_header_.render_and_frame_size_different) {
+ OBU_READ_LITERAL_OR_FAIL(16);
+ frame_header_.render_width = static_cast<int32_t>(1 + scratch);
+ OBU_READ_LITERAL_OR_FAIL(16);
+ frame_header_.render_height = static_cast<int32_t>(1 + scratch);
+ } else {
+ frame_header_.render_width = frame_header_.upscaled_width;
+ frame_header_.render_height = frame_header_.height;
+ }
+
+ return true;
+}
+
+bool ObuParser::ParseSuperResParametersAndComputeImageSize() {
+ int64_t scratch;
+ // SuperRes.
+ frame_header_.upscaled_width = frame_header_.width;
+ frame_header_.use_superres = false;
+ if (sequence_header_.enable_superres) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.use_superres = static_cast<bool>(scratch);
+ }
+ if (frame_header_.use_superres) {
+ OBU_READ_LITERAL_OR_FAIL(3);
+ // 9 is the smallest value for the denominator.
+ frame_header_.superres_scale_denominator = scratch + 9;
+ frame_header_.width =
+ (frame_header_.upscaled_width * kSuperResScaleNumerator +
+ (frame_header_.superres_scale_denominator / 2)) /
+ frame_header_.superres_scale_denominator;
+ } else {
+ frame_header_.superres_scale_denominator = kSuperResScaleNumerator;
+ }
+ assert(frame_header_.width != 0);
+ assert(frame_header_.height != 0);
+ // Check if multiplying upscaled_width by height would overflow.
+ assert(frame_header_.upscaled_width >= frame_header_.width);
+ if (frame_header_.upscaled_width > INT32_MAX / frame_header_.height) {
+ LIBGAV1_DLOG(ERROR, "Frame dimensions too big: width=%d height=%d.",
+ frame_header_.width, frame_header_.height);
+ return false;
+ }
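+  // The frame dimensions in 4x4 block units, with the width and height
+  // rounded up to a multiple of 8 pixels.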
+ frame_header_.columns4x4 = ((frame_header_.width + 7) >> 3) << 1;
+ frame_header_.rows4x4 = ((frame_header_.height + 7) >> 3) << 1;
+ return true;
+}
+
+bool ObuParser::ValidateInterFrameSize() const {
+ for (int index : frame_header_.reference_frame_index) {
+ const RefCountedBuffer* reference_frame =
+ decoder_state_.reference_frame[index].get();
+ if (2 * frame_header_.width < reference_frame->upscaled_width() ||
+ 2 * frame_header_.height < reference_frame->frame_height() ||
+ frame_header_.width > 16 * reference_frame->upscaled_width() ||
+ frame_header_.height > 16 * reference_frame->frame_height()) {
+ LIBGAV1_DLOG(ERROR,
+ "Invalid inter frame size: width=%d, height=%d. Reference "
+ "frame: index=%d, upscaled width=%d, height=%d.",
+ frame_header_.width, frame_header_.height, index,
+ reference_frame->upscaled_width(),
+ reference_frame->frame_height());
+ return false;
+ }
+ }
+ return true;
+}
+
+bool ObuParser::ParseReferenceOrderHint() {
+ if (!frame_header_.error_resilient_mode ||
+ !sequence_header_.enable_order_hint) {
+ return true;
+ }
+ int64_t scratch;
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(sequence_header_.order_hint_bits);
+ frame_header_.reference_order_hint[i] = scratch;
+ if (frame_header_.reference_order_hint[i] !=
+ decoder_state_.reference_order_hint[i]) {
+ decoder_state_.reference_valid[i] = false;
+ }
+ }
+ return true;
+}
+
+// static
+int ObuParser::FindLatestBackwardReference(
+ const int current_frame_hint,
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+ const std::array<bool, kNumReferenceFrameTypes>& used_frame) {
+ int ref = -1;
+ int latest_order_hint = INT_MIN;
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ const int hint = shifted_order_hints[i];
+ if (!used_frame[i] && hint >= current_frame_hint &&
+ hint >= latest_order_hint) {
+ ref = i;
+ latest_order_hint = hint;
+ }
+ }
+ return ref;
+}
+
+// static
+int ObuParser::FindEarliestBackwardReference(
+ const int current_frame_hint,
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+ const std::array<bool, kNumReferenceFrameTypes>& used_frame) {
+ int ref = -1;
+ int earliest_order_hint = INT_MAX;
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ const int hint = shifted_order_hints[i];
+ if (!used_frame[i] && hint >= current_frame_hint &&
+ hint < earliest_order_hint) {
+ ref = i;
+ earliest_order_hint = hint;
+ }
+ }
+ return ref;
+}
+
+// static
+int ObuParser::FindLatestForwardReference(
+ const int current_frame_hint,
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+ const std::array<bool, kNumReferenceFrameTypes>& used_frame) {
+ int ref = -1;
+ int latest_order_hint = INT_MIN;
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ const int hint = shifted_order_hints[i];
+ if (!used_frame[i] && hint < current_frame_hint &&
+ hint >= latest_order_hint) {
+ ref = i;
+ latest_order_hint = hint;
+ }
+ }
+ return ref;
+}
+
+// static
+int ObuParser::FindReferenceWithSmallestOutputOrder(
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints) {
+ int ref = -1;
+ int earliest_order_hint = INT_MAX;
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ const int hint = shifted_order_hints[i];
+ if (hint < earliest_order_hint) {
+ ref = i;
+ earliest_order_hint = hint;
+ }
+ }
+ return ref;
+}
+
+// Computes the elements in the frame_header_.reference_frame_index array
+// based on:
+// * the syntax elements last_frame_idx and gold_frame_idx, and
+// * the values stored within the decoder_state_.reference_order_hint array
+// (these values represent the least significant bits of the expected output
+// order of the frames).
+//
+// Frame type: {
+// libgav1_name spec_name int
+// kReferenceFrameLast, LAST_FRAME 1
+// kReferenceFrameLast2, LAST2_FRAME 2
+// kReferenceFrameLast3, LAST3_FRAME 3
+// kReferenceFrameGolden, GOLDEN_FRAME 4
+// kReferenceFrameBackward, BWDREF_FRAME 5
+// kReferenceFrameAlternate2, ALTREF2_FRAME 6
+// kReferenceFrameAlternate, ALTREF_FRAME 7
+// }
+//
+// A typical case of a group of pictures (frames) in display order:
+// (However, bitstream conformance may allow more complex cases.)
+//
+// | | | | | | | |
+// | | | | | | | |
+// | | | | | | | |
+// | | | | | | | |
+//
+// 4 3 2 1 current_frame 5 6 7
+//
+bool ObuParser::SetFrameReferences(const int8_t last_frame_idx,
+ const int8_t gold_frame_idx) {
+ // Set the ref_frame_idx entries for kReferenceFrameLast and
+ // kReferenceFrameGolden to last_frame_idx and gold_frame_idx. Initialize
+ // the other entries to -1.
+ for (int8_t& reference_frame_index : frame_header_.reference_frame_index) {
+ reference_frame_index = -1;
+ }
+ frame_header_
+ .reference_frame_index[kReferenceFrameLast - kReferenceFrameLast] =
+ last_frame_idx;
+ frame_header_
+ .reference_frame_index[kReferenceFrameGolden - kReferenceFrameLast] =
+ gold_frame_idx;
+
+ // used_frame records which reference frames have been used.
+ std::array<bool, kNumReferenceFrameTypes> used_frame;
+ used_frame.fill(false);
+ used_frame[last_frame_idx] = true;
+ used_frame[gold_frame_idx] = true;
+
+ assert(sequence_header_.order_hint_bits >= 1);
+ const int current_frame_hint = 1 << (sequence_header_.order_hint_bits - 1);
+ // shifted_order_hints contains the expected output order shifted such that
+ // the current frame has hint equal to current_frame_hint.
+ std::array<int, kNumReferenceFrameTypes> shifted_order_hints;
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ const int relative_distance = GetRelativeDistance(
+ decoder_state_.reference_order_hint[i], frame_header_.order_hint,
+ sequence_header_.order_hint_shift_bits);
+ shifted_order_hints[i] = current_frame_hint + relative_distance;
+ }
+
+ // The expected output orders for kReferenceFrameLast and
+ // kReferenceFrameGolden.
+ const int last_order_hint = shifted_order_hints[last_frame_idx];
+ const int gold_order_hint = shifted_order_hints[gold_frame_idx];
+
+ // Section 7.8: It is a requirement of bitstream conformance that
+ // lastOrderHint and goldOrderHint are strictly less than curFrameHint.
+ if (last_order_hint >= current_frame_hint ||
+ gold_order_hint >= current_frame_hint) {
+ return false;
+ }
+
+ // Find a backward reference to the frame with highest output order. If
+ // found, set the kReferenceFrameAlternate reference to that backward
+ // reference.
+ int ref = FindLatestBackwardReference(current_frame_hint, shifted_order_hints,
+ used_frame);
+ if (ref >= 0) {
+ frame_header_
+ .reference_frame_index[kReferenceFrameAlternate - kReferenceFrameLast] =
+ ref;
+ used_frame[ref] = true;
+ }
+
+ // Find a backward reference to the closest frame. If found, set the
+ // kReferenceFrameBackward reference to that backward reference.
+ ref = FindEarliestBackwardReference(current_frame_hint, shifted_order_hints,
+ used_frame);
+ if (ref >= 0) {
+ frame_header_
+ .reference_frame_index[kReferenceFrameBackward - kReferenceFrameLast] =
+ ref;
+ used_frame[ref] = true;
+ }
+
+ // Set the kReferenceFrameAlternate2 reference to the next closest backward
+ // reference.
+ ref = FindEarliestBackwardReference(current_frame_hint, shifted_order_hints,
+ used_frame);
+ if (ref >= 0) {
+ frame_header_.reference_frame_index[kReferenceFrameAlternate2 -
+ kReferenceFrameLast] = ref;
+ used_frame[ref] = true;
+ }
+
+ // The remaining references are set to be forward references in
+ // reverse chronological order.
+ static constexpr ReferenceFrameType
+ kRefFrameList[kNumInterReferenceFrameTypes - 2] = {
+ kReferenceFrameLast2, kReferenceFrameLast3, kReferenceFrameBackward,
+ kReferenceFrameAlternate2, kReferenceFrameAlternate};
+ for (const ReferenceFrameType ref_frame : kRefFrameList) {
+ if (frame_header_.reference_frame_index[ref_frame - kReferenceFrameLast] <
+ 0) {
+ ref = FindLatestForwardReference(current_frame_hint, shifted_order_hints,
+ used_frame);
+ if (ref >= 0) {
+ frame_header_.reference_frame_index[ref_frame - kReferenceFrameLast] =
+ ref;
+ used_frame[ref] = true;
+ }
+ }
+ }
+
+ // Finally, any remaining references are set to the reference frame with
+ // smallest output order.
+ ref = FindReferenceWithSmallestOutputOrder(shifted_order_hints);
+ assert(ref >= 0);
+ for (int8_t& reference_frame_index : frame_header_.reference_frame_index) {
+ if (reference_frame_index < 0) {
+ reference_frame_index = ref;
+ }
+ }
+
+ return true;
+}
+
+bool ObuParser::ParseLoopFilterParameters() {
+ LoopFilter* const loop_filter = &frame_header_.loop_filter;
+ if (frame_header_.coded_lossless || frame_header_.allow_intrabc) {
+ SetDefaultRefDeltas(loop_filter);
+ return true;
+ }
+ // IsIntraFrame implies kPrimaryReferenceNone.
+ assert(!IsIntraFrame(frame_header_.frame_type) ||
+ frame_header_.primary_reference_frame == kPrimaryReferenceNone);
+ if (frame_header_.primary_reference_frame == kPrimaryReferenceNone) {
+ // Part of the setup_past_independence() function in the spec. It is not
+ // necessary to set loop_filter->delta_enabled to true. See
+ // https://crbug.com/aomedia/2305.
+ SetDefaultRefDeltas(loop_filter);
+ } else {
+ // Part of the load_previous() function in the spec.
+ const int prev_frame_index =
+ frame_header_
+ .reference_frame_index[frame_header_.primary_reference_frame];
+ const RefCountedBuffer* prev_frame =
+ decoder_state_.reference_frame[prev_frame_index].get();
+ loop_filter->ref_deltas = prev_frame->loop_filter_ref_deltas();
+ loop_filter->mode_deltas = prev_frame->loop_filter_mode_deltas();
+ }
+ int64_t scratch;
+ for (int i = 0; i < 2; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(6);
+ loop_filter->level[i] = scratch;
+ }
+ if (!sequence_header_.color_config.is_monochrome &&
+ (loop_filter->level[0] != 0 || loop_filter->level[1] != 0)) {
+ for (int i = 2; i < 4; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(6);
+ loop_filter->level[i] = scratch;
+ }
+ }
+ OBU_READ_LITERAL_OR_FAIL(3);
+ loop_filter->sharpness = scratch;
+ OBU_READ_BIT_OR_FAIL;
+ loop_filter->delta_enabled = static_cast<bool>(scratch);
+ if (loop_filter->delta_enabled) {
+ OBU_READ_BIT_OR_FAIL;
+ loop_filter->delta_update = static_cast<bool>(scratch);
+ if (loop_filter->delta_update) {
+ for (auto& ref_delta : loop_filter->ref_deltas) {
+ OBU_READ_BIT_OR_FAIL;
+ const auto update_ref_delta = static_cast<bool>(scratch);
+ if (update_ref_delta) {
+ int scratch_int;
+ if (!bit_reader_->ReadInverseSignedLiteral(6, &scratch_int)) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits.");
+ return false;
+ }
+ ref_delta = scratch_int;
+ }
+ }
+ for (auto& mode_delta : loop_filter->mode_deltas) {
+ OBU_READ_BIT_OR_FAIL;
+ const auto update_mode_delta = static_cast<bool>(scratch);
+ if (update_mode_delta) {
+ int scratch_int;
+ if (!bit_reader_->ReadInverseSignedLiteral(6, &scratch_int)) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits.");
+ return false;
+ }
+ mode_delta = scratch_int;
+ }
+ }
+ }
+ } else {
+ loop_filter->delta_update = false;
+ }
+ return true;
+}
+
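+// This method implements the read_delta_q() function in the spec: a presence
+// bit followed, if set, by a 6-bit inverse signed literal.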
+bool ObuParser::ParseDeltaQuantizer(int8_t* const delta) {
+ int64_t scratch;
+ *delta = 0;
+ OBU_READ_BIT_OR_FAIL;
+ const auto delta_coded = static_cast<bool>(scratch);
+ if (delta_coded) {
+ int scratch_int;
+ if (!bit_reader_->ReadInverseSignedLiteral(6, &scratch_int)) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits.");
+ return false;
+ }
+ *delta = scratch_int;
+ }
+ return true;
+}
+
+bool ObuParser::ParseQuantizerParameters() {
+ int64_t scratch;
+ QuantizerParameters* const quantizer = &frame_header_.quantizer;
+ OBU_READ_LITERAL_OR_FAIL(8);
+ quantizer->base_index = scratch;
+ if (!ParseDeltaQuantizer(&quantizer->delta_dc[kPlaneY])) return false;
+ if (!sequence_header_.color_config.is_monochrome) {
+ bool diff_uv_delta = false;
+ if (sequence_header_.color_config.separate_uv_delta_q) {
+ OBU_READ_BIT_OR_FAIL;
+ diff_uv_delta = static_cast<bool>(scratch);
+ }
+ if (!ParseDeltaQuantizer(&quantizer->delta_dc[kPlaneU]) ||
+ !ParseDeltaQuantizer(&quantizer->delta_ac[kPlaneU])) {
+ return false;
+ }
+ if (diff_uv_delta) {
+ if (!ParseDeltaQuantizer(&quantizer->delta_dc[kPlaneV]) ||
+ !ParseDeltaQuantizer(&quantizer->delta_ac[kPlaneV])) {
+ return false;
+ }
+ } else {
+ quantizer->delta_dc[kPlaneV] = quantizer->delta_dc[kPlaneU];
+ quantizer->delta_ac[kPlaneV] = quantizer->delta_ac[kPlaneU];
+ }
+ }
+ OBU_READ_BIT_OR_FAIL;
+ quantizer->use_matrix = static_cast<bool>(scratch);
+ if (quantizer->use_matrix) {
+ OBU_READ_LITERAL_OR_FAIL(4);
+ quantizer->matrix_level[kPlaneY] = scratch;
+ OBU_READ_LITERAL_OR_FAIL(4);
+ quantizer->matrix_level[kPlaneU] = scratch;
+ if (sequence_header_.color_config.separate_uv_delta_q) {
+ OBU_READ_LITERAL_OR_FAIL(4);
+ quantizer->matrix_level[kPlaneV] = scratch;
+ } else {
+ quantizer->matrix_level[kPlaneV] = quantizer->matrix_level[kPlaneU];
+ }
+ }
+ return true;
+}
+
+// This method implements the following functions in the spec:
+// - segmentation_params()
+// - part of setup_past_independence(): Set the FeatureData and FeatureEnabled
+// arrays to all 0.
+// - part of load_previous(): Call load_segmentation_params().
+//
+// A careful analysis of the spec shows that the part of
+// setup_past_independence() can be optimized away and that the part of
+// load_previous() only needs to be invoked under a specific condition.
+// Although the logic looks different from the spec, it is equivalent and more
+// efficient.
+bool ObuParser::ParseSegmentationParameters() {
+ int64_t scratch;
+ Segmentation* const segmentation = &frame_header_.segmentation;
+ OBU_READ_BIT_OR_FAIL;
+ segmentation->enabled = static_cast<bool>(scratch);
+ if (!segmentation->enabled) return true;
+ if (frame_header_.primary_reference_frame == kPrimaryReferenceNone) {
+ segmentation->update_map = true;
+ segmentation->update_data = true;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ segmentation->update_map = static_cast<bool>(scratch);
+ if (segmentation->update_map) {
+ OBU_READ_BIT_OR_FAIL;
+ segmentation->temporal_update = static_cast<bool>(scratch);
+ }
+ OBU_READ_BIT_OR_FAIL;
+ segmentation->update_data = static_cast<bool>(scratch);
+ if (!segmentation->update_data) {
+ // Part of the load_previous() function in the spec.
+ const int prev_frame_index =
+ frame_header_
+ .reference_frame_index[frame_header_.primary_reference_frame];
+ decoder_state_.reference_frame[prev_frame_index]
+ ->GetSegmentationParameters(segmentation);
+ return true;
+ }
+ }
+ for (int8_t i = 0; i < kMaxSegments; ++i) {
+ for (int8_t j = 0; j < kSegmentFeatureMax; ++j) {
+ OBU_READ_BIT_OR_FAIL;
+ segmentation->feature_enabled[i][j] = static_cast<bool>(scratch);
+ if (segmentation->feature_enabled[i][j]) {
+ if (Segmentation::FeatureSigned(static_cast<SegmentFeature>(j))) {
+ int scratch_int;
+ if (!bit_reader_->ReadInverseSignedLiteral(
+ kSegmentationFeatureBits[j], &scratch_int)) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits.");
+ return false;
+ }
+ segmentation->feature_data[i][j] =
+ Clip3(scratch_int, -kSegmentationFeatureMaxValues[j],
+ kSegmentationFeatureMaxValues[j]);
+ } else {
+ if (kSegmentationFeatureBits[j] > 0) {
+ OBU_READ_LITERAL_OR_FAIL(kSegmentationFeatureBits[j]);
+ segmentation->feature_data[i][j] = Clip3(
+ static_cast<int>(scratch), 0, kSegmentationFeatureMaxValues[j]);
+ } else {
+ segmentation->feature_data[i][j] = 0;
+ }
+ }
+ segmentation->last_active_segment_id = i;
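+        // Any enabled feature at or above kSegmentFeatureReferenceFrame
+        // requires the segment id to be coded before the skip syntax element.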
+ if (j >= kSegmentFeatureReferenceFrame) {
+ segmentation->segment_id_pre_skip = true;
+ }
+ }
+ }
+ }
+ return true;
+}
+
+bool ObuParser::ParseQuantizerIndexDeltaParameters() {
+ int64_t scratch;
+ if (frame_header_.quantizer.base_index > 0) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.delta_q.present = static_cast<bool>(scratch);
+ if (frame_header_.delta_q.present) {
+ OBU_READ_LITERAL_OR_FAIL(2);
+ frame_header_.delta_q.scale = scratch;
+ }
+ }
+ return true;
+}
+
+bool ObuParser::ParseLoopFilterDeltaParameters() {
+ int64_t scratch;
+ if (frame_header_.delta_q.present) {
+ if (!frame_header_.allow_intrabc) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.delta_lf.present = static_cast<bool>(scratch);
+ }
+ if (frame_header_.delta_lf.present) {
+ OBU_READ_LITERAL_OR_FAIL(2);
+ frame_header_.delta_lf.scale = scratch;
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.delta_lf.multi = static_cast<bool>(scratch);
+ }
+ }
+ return true;
+}
+
+void ObuParser::ComputeSegmentLosslessAndQIndex() {
+ frame_header_.coded_lossless = true;
+ Segmentation* const segmentation = &frame_header_.segmentation;
+ const QuantizerParameters* const quantizer = &frame_header_.quantizer;
+ for (int i = 0; i < kMaxSegments; ++i) {
+ segmentation->qindex[i] =
+ GetQIndex(*segmentation, i, quantizer->base_index);
+ segmentation->lossless[i] =
+ segmentation->qindex[i] == 0 && quantizer->delta_dc[kPlaneY] == 0 &&
+ quantizer->delta_dc[kPlaneU] == 0 &&
+ quantizer->delta_ac[kPlaneU] == 0 &&
+ quantizer->delta_dc[kPlaneV] == 0 && quantizer->delta_ac[kPlaneV] == 0;
+ if (!segmentation->lossless[i]) frame_header_.coded_lossless = false;
+ // The spec calls for setting up a two-dimensional SegQMLevel array here.
+ // We avoid the SegQMLevel array by using segmentation->lossless[i] and
+ // quantizer->matrix_level[plane] directly in the reconstruct process of
+ // Section 7.12.3.
+ }
+ frame_header_.upscaled_lossless =
+ frame_header_.coded_lossless &&
+ frame_header_.width == frame_header_.upscaled_width;
+}
+
+bool ObuParser::ParseCdefParameters() {
+ const int coeff_shift = sequence_header_.color_config.bitdepth - 8;
+ if (frame_header_.coded_lossless || frame_header_.allow_intrabc ||
+ !sequence_header_.enable_cdef) {
+ frame_header_.cdef.damping = 3 + coeff_shift;
+ return true;
+ }
+ Cdef* const cdef = &frame_header_.cdef;
+ int64_t scratch;
+ OBU_READ_LITERAL_OR_FAIL(2);
+ cdef->damping = scratch + 3 + coeff_shift;
+ OBU_READ_LITERAL_OR_FAIL(2);
+ cdef->bits = scratch;
+ for (int i = 0; i < (1 << cdef->bits); ++i) {
+ OBU_READ_LITERAL_OR_FAIL(4);
+ cdef->y_primary_strength[i] = scratch << coeff_shift;
+ OBU_READ_LITERAL_OR_FAIL(2);
+ cdef->y_secondary_strength[i] = scratch;
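+    // A coded secondary strength of 3 is mapped to 4; a strength of 3 itself
+    // is never used.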
+ if (cdef->y_secondary_strength[i] == 3) ++cdef->y_secondary_strength[i];
+ cdef->y_secondary_strength[i] <<= coeff_shift;
+ if (sequence_header_.color_config.is_monochrome) continue;
+ OBU_READ_LITERAL_OR_FAIL(4);
+ cdef->uv_primary_strength[i] = scratch << coeff_shift;
+ OBU_READ_LITERAL_OR_FAIL(2);
+ cdef->uv_secondary_strength[i] = scratch;
+ if (cdef->uv_secondary_strength[i] == 3) ++cdef->uv_secondary_strength[i];
+ cdef->uv_secondary_strength[i] <<= coeff_shift;
+ }
+ return true;
+}
+
+bool ObuParser::ParseLoopRestorationParameters() {
+ if (frame_header_.upscaled_lossless || frame_header_.allow_intrabc ||
+ !sequence_header_.enable_restoration) {
+ return true;
+ }
+ int64_t scratch;
+ bool uses_loop_restoration = false;
+ bool uses_chroma_loop_restoration = false;
+ LoopRestoration* const loop_restoration = &frame_header_.loop_restoration;
+ const int num_planes = sequence_header_.color_config.is_monochrome
+ ? kMaxPlanesMonochrome
+ : kMaxPlanes;
+ for (int i = 0; i < num_planes; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(2);
+ loop_restoration->type[i] = static_cast<LoopRestorationType>(scratch);
+ if (loop_restoration->type[i] != kLoopRestorationTypeNone) {
+ uses_loop_restoration = true;
+ if (i > 0) uses_chroma_loop_restoration = true;
+ }
+ }
+ if (uses_loop_restoration) {
+ uint8_t unit_shift;
+ if (sequence_header_.use_128x128_superblock) {
+ OBU_READ_BIT_OR_FAIL;
+ unit_shift = scratch + 1;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ unit_shift = scratch;
+ if (unit_shift != 0) {
+ OBU_READ_BIT_OR_FAIL;
+ const uint8_t unit_extra_shift = scratch;
+ unit_shift += unit_extra_shift;
+ }
+ }
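+    // unit_size_log2 is the log2 of the restoration unit size in pixels, so
+    // the luma unit size is 64, 128 or 256.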
+ loop_restoration->unit_size_log2[kPlaneY] = 6 + unit_shift;
+ uint8_t uv_shift = 0;
+ if (sequence_header_.color_config.subsampling_x != 0 &&
+ sequence_header_.color_config.subsampling_y != 0 &&
+ uses_chroma_loop_restoration) {
+ OBU_READ_BIT_OR_FAIL;
+ uv_shift = scratch;
+ }
+ loop_restoration->unit_size_log2[kPlaneU] =
+ loop_restoration->unit_size_log2[kPlaneV] =
+ loop_restoration->unit_size_log2[0] - uv_shift;
+ }
+ return true;
+}
+
+bool ObuParser::ParseTxModeSyntax() {
+ if (frame_header_.coded_lossless) {
+ frame_header_.tx_mode = kTxModeOnly4x4;
+ return true;
+ }
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.tx_mode = (scratch == 1) ? kTxModeSelect : kTxModeLargest;
+ return true;
+}
+
+bool ObuParser::ParseFrameReferenceModeSyntax() {
+ int64_t scratch;
+ if (!IsIntraFrame(frame_header_.frame_type)) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.reference_mode_select = static_cast<bool>(scratch);
+ }
+ return true;
+}
+
+bool ObuParser::IsSkipModeAllowed() {
+ if (IsIntraFrame(frame_header_.frame_type) ||
+ !frame_header_.reference_mode_select ||
+ !sequence_header_.enable_order_hint) {
+ return false;
+ }
+ // Identify the nearest forward and backward references.
+ int forward_index = -1;
+ int backward_index = -1;
+ int forward_hint = -1;
+ int backward_hint = -1;
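+  // A negative relative distance means the reference precedes the current
+  // frame in display order (a forward reference); a positive distance means
+  // it follows (a backward reference).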
+ for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
+ const unsigned int reference_hint =
+ decoder_state_
+ .reference_order_hint[frame_header_.reference_frame_index[i]];
+ // TODO(linfengz): |relative_distance| equals
+ // current_frame_->reference_info()->
+ // relative_distance_from[i + kReferenceFrameLast];
+ // However, the unit test ObuParserTest.SkipModeParameters() would fail.
+ // Will figure out how to initialize |current_frame_.reference_info_| in the
+ // RefCountedBuffer later.
+ const int relative_distance =
+ GetRelativeDistance(reference_hint, frame_header_.order_hint,
+ sequence_header_.order_hint_shift_bits);
+ if (relative_distance < 0) {
+ if (forward_index < 0 ||
+ GetRelativeDistance(reference_hint, forward_hint,
+ sequence_header_.order_hint_shift_bits) > 0) {
+ forward_index = i;
+ forward_hint = reference_hint;
+ }
+ } else if (relative_distance > 0) {
+ if (backward_index < 0 ||
+ GetRelativeDistance(reference_hint, backward_hint,
+ sequence_header_.order_hint_shift_bits) < 0) {
+ backward_index = i;
+ backward_hint = reference_hint;
+ }
+ }
+ }
+ if (forward_index < 0) return false;
+ if (backward_index >= 0) {
+ // Bidirectional prediction.
+ frame_header_.skip_mode_frame[0] = static_cast<ReferenceFrameType>(
+ kReferenceFrameLast + std::min(forward_index, backward_index));
+ frame_header_.skip_mode_frame[1] = static_cast<ReferenceFrameType>(
+ kReferenceFrameLast + std::max(forward_index, backward_index));
+ return true;
+ }
+ // Forward prediction only. Identify the second nearest forward reference.
+ int second_forward_index = -1;
+ int second_forward_hint = -1;
+ for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
+ const unsigned int reference_hint =
+ decoder_state_
+ .reference_order_hint[frame_header_.reference_frame_index[i]];
+ if (GetRelativeDistance(reference_hint, forward_hint,
+ sequence_header_.order_hint_shift_bits) < 0) {
+ if (second_forward_index < 0 ||
+ GetRelativeDistance(reference_hint, second_forward_hint,
+ sequence_header_.order_hint_shift_bits) > 0) {
+ second_forward_index = i;
+ second_forward_hint = reference_hint;
+ }
+ }
+ }
+ if (second_forward_index < 0) return false;
+ frame_header_.skip_mode_frame[0] = static_cast<ReferenceFrameType>(
+ kReferenceFrameLast + std::min(forward_index, second_forward_index));
+ frame_header_.skip_mode_frame[1] = static_cast<ReferenceFrameType>(
+ kReferenceFrameLast + std::max(forward_index, second_forward_index));
+ return true;
+}
+
+bool ObuParser::ParseSkipModeParameters() {
+ if (!IsSkipModeAllowed()) return true;
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.skip_mode_present = static_cast<bool>(scratch);
+ return true;
+}
+
+// Sets frame_header_.global_motion[ref].params[index].
+bool ObuParser::ParseGlobalParamSyntax(
+ int ref, int index,
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>&
+ prev_global_motions) {
+ GlobalMotion* const global_motion = &frame_header_.global_motion[ref];
+ const GlobalMotion* const prev_global_motion = &prev_global_motions[ref];
+ int abs_bits = kGlobalMotionAlphaBits;
+ int precision_bits = kGlobalMotionAlphaPrecisionBits;
+ if (index < 2) {
+ if (global_motion->type == kGlobalMotionTransformationTypeTranslation) {
+ const auto high_precision_mv_factor =
+ static_cast<int>(!frame_header_.allow_high_precision_mv);
+ abs_bits = kGlobalMotionTranslationOnlyBits - high_precision_mv_factor;
+ precision_bits =
+ kGlobalMotionTranslationOnlyPrecisionBits - high_precision_mv_factor;
+ } else {
+ abs_bits = kGlobalMotionTranslationBits;
+ precision_bits = kGlobalMotionTranslationPrecisionBits;
+ }
+ }
+ const int precision_diff = kWarpedModelPrecisionBits - precision_bits;
+ const int round = (index % 3 == 2) ? 1 << kWarpedModelPrecisionBits : 0;
+ const int sub = (index % 3 == 2) ? 1 << precision_bits : 0;
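+  // Indices 2 and 5 (index % 3 == 2) are the diagonal terms of the warp
+  // matrix and are coded relative to 1.0 (1 << kWarpedModelPrecisionBits);
+  // |round| and |sub| re-center those terms around that value.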
+ const int mx = 1 << abs_bits;
+ const int reference =
+ (prev_global_motion->params[index] >> precision_diff) - sub;
+ int scratch;
+ if (!bit_reader_->DecodeSignedSubexpWithReference(
+ -mx, mx + 1, reference, kGlobalMotionReadControl, &scratch)) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits.");
+ return false;
+ }
+ global_motion->params[index] = LeftShift(scratch, precision_diff) + round;
+ return true;
+}
+
+bool ObuParser::ParseGlobalMotionParameters() {
+ for (int ref = kReferenceFrameLast; ref <= kReferenceFrameAlternate; ++ref) {
+ frame_header_.global_motion[ref].type =
+ kGlobalMotionTransformationTypeIdentity;
+ for (int i = 0; i < 6; ++i) {
+ frame_header_.global_motion[ref].params[i] =
+ (i % 3 == 2) ? 1 << kWarpedModelPrecisionBits : 0;
+ }
+ }
+ if (IsIntraFrame(frame_header_.frame_type)) return true;
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>* prev_global_motions =
+ nullptr;
+ if (frame_header_.primary_reference_frame == kPrimaryReferenceNone) {
+ // Part of the setup_past_independence() function in the spec. The value
+ // that the spec says PrevGmParams[ref][i] should be set to is exactly
+ // the value frame_header_.global_motion[ref].params[i] is set to by the
+ // for loop above. Therefore prev_global_motions can simply point to
+ // frame_header_.global_motion.
+ prev_global_motions = &frame_header_.global_motion;
+ } else {
+ // Part of the load_previous() function in the spec.
+ const int prev_frame_index =
+ frame_header_
+ .reference_frame_index[frame_header_.primary_reference_frame];
+ prev_global_motions =
+ &decoder_state_.reference_frame[prev_frame_index]->GlobalMotions();
+ }
+ for (int ref = kReferenceFrameLast; ref <= kReferenceFrameAlternate; ++ref) {
+ GlobalMotion* const global_motion = &frame_header_.global_motion[ref];
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ const auto is_global = static_cast<bool>(scratch);
+ if (is_global) {
+ OBU_READ_BIT_OR_FAIL;
+ const auto is_rot_zoom = static_cast<bool>(scratch);
+ if (is_rot_zoom) {
+ global_motion->type = kGlobalMotionTransformationTypeRotZoom;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ const auto is_translation = static_cast<bool>(scratch);
+ global_motion->type = is_translation
+ ? kGlobalMotionTransformationTypeTranslation
+ : kGlobalMotionTransformationTypeAffine;
+ }
+ } else {
+ global_motion->type = kGlobalMotionTransformationTypeIdentity;
+ }
+ if (global_motion->type >= kGlobalMotionTransformationTypeRotZoom) {
+ if (!ParseGlobalParamSyntax(ref, 2, *prev_global_motions) ||
+ !ParseGlobalParamSyntax(ref, 3, *prev_global_motions)) {
+ return false;
+ }
+ if (global_motion->type == kGlobalMotionTransformationTypeAffine) {
+ if (!ParseGlobalParamSyntax(ref, 4, *prev_global_motions) ||
+ !ParseGlobalParamSyntax(ref, 5, *prev_global_motions)) {
+ return false;
+ }
+ } else {
+ global_motion->params[4] = -global_motion->params[3];
+ global_motion->params[5] = global_motion->params[2];
+ }
+ }
+ if (global_motion->type >= kGlobalMotionTransformationTypeTranslation) {
+ if (!ParseGlobalParamSyntax(ref, 0, *prev_global_motions) ||
+ !ParseGlobalParamSyntax(ref, 1, *prev_global_motions)) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+bool ObuParser::ParseFilmGrainParameters() {
+ if (!sequence_header_.film_grain_params_present ||
+ (!frame_header_.show_frame && !frame_header_.showable_frame)) {
+ // frame_header_.film_grain_params is already zero-initialized.
+ return true;
+ }
+
+ FilmGrainParams& film_grain_params = frame_header_.film_grain_params;
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ film_grain_params.apply_grain = static_cast<bool>(scratch);
+ if (!film_grain_params.apply_grain) {
+ // film_grain_params is already zero-initialized.
+ return true;
+ }
+
+ OBU_READ_LITERAL_OR_FAIL(16);
+ film_grain_params.grain_seed = static_cast<int>(scratch);
+ film_grain_params.update_grain = true;
+ if (frame_header_.frame_type == kFrameInter) {
+ OBU_READ_BIT_OR_FAIL;
+ film_grain_params.update_grain = static_cast<bool>(scratch);
+ }
+ if (!film_grain_params.update_grain) {
+ OBU_READ_LITERAL_OR_FAIL(3);
+ film_grain_params.reference_index = static_cast<int>(scratch);
+ bool found = false;
+ for (const auto index : frame_header_.reference_frame_index) {
+ if (film_grain_params.reference_index == index) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ static_assert(sizeof(frame_header_.reference_frame_index) /
+ sizeof(frame_header_.reference_frame_index[0]) ==
+ 7,
+ "");
+ LIBGAV1_DLOG(ERROR,
+ "Invalid value for film_grain_params_ref_idx (%d). "
+ "ref_frame_idx = {%d, %d, %d, %d, %d, %d, %d}",
+ film_grain_params.reference_index,
+ frame_header_.reference_frame_index[0],
+ frame_header_.reference_frame_index[1],
+ frame_header_.reference_frame_index[2],
+ frame_header_.reference_frame_index[3],
+ frame_header_.reference_frame_index[4],
+ frame_header_.reference_frame_index[5],
+ frame_header_.reference_frame_index[6]);
+ return false;
+ }
+ const RefCountedBuffer* grain_params_reference_frame =
+ decoder_state_.reference_frame[film_grain_params.reference_index].get();
+ if (grain_params_reference_frame == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Buffer %d does not contain a decoded frame",
+ film_grain_params.reference_index);
+ return false;
+ }
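+    // load_grain_params(): inherit the film grain parameters of the reference
+    // frame, but keep the grain_seed, update_grain and reference_index values
+    // parsed above.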
+ const int temp_grain_seed = film_grain_params.grain_seed;
+ const bool temp_update_grain = film_grain_params.update_grain;
+ const int temp_reference_index = film_grain_params.reference_index;
+ film_grain_params = grain_params_reference_frame->film_grain_params();
+ film_grain_params.grain_seed = temp_grain_seed;
+ film_grain_params.update_grain = temp_update_grain;
+ film_grain_params.reference_index = temp_reference_index;
+ return true;
+ }
+
+ OBU_READ_LITERAL_OR_FAIL(4);
+ film_grain_params.num_y_points = scratch;
+ if (film_grain_params.num_y_points > 14) {
+ LIBGAV1_DLOG(ERROR, "Invalid value for num_y_points (%d).",
+ film_grain_params.num_y_points);
+ return false;
+ }
+ for (int i = 0; i < film_grain_params.num_y_points; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.point_y_value[i] = scratch;
+ if (i != 0 && film_grain_params.point_y_value[i - 1] >=
+ film_grain_params.point_y_value[i]) {
+ LIBGAV1_DLOG(ERROR, "point_y_value[%d] (%d) >= point_y_value[%d] (%d).",
+ i - 1, film_grain_params.point_y_value[i - 1], i,
+ film_grain_params.point_y_value[i]);
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.point_y_scaling[i] = scratch;
+ }
+ if (sequence_header_.color_config.is_monochrome) {
+ film_grain_params.chroma_scaling_from_luma = false;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ film_grain_params.chroma_scaling_from_luma = static_cast<bool>(scratch);
+ }
+ if (sequence_header_.color_config.is_monochrome ||
+ film_grain_params.chroma_scaling_from_luma ||
+ (sequence_header_.color_config.subsampling_x == 1 &&
+ sequence_header_.color_config.subsampling_y == 1 &&
+ film_grain_params.num_y_points == 0)) {
+ film_grain_params.num_u_points = 0;
+ film_grain_params.num_v_points = 0;
+ } else {
+ OBU_READ_LITERAL_OR_FAIL(4);
+ film_grain_params.num_u_points = scratch;
+ if (film_grain_params.num_u_points > 10) {
+ LIBGAV1_DLOG(ERROR, "Invalid value for num_u_points (%d).",
+ film_grain_params.num_u_points);
+ return false;
+ }
+ for (int i = 0; i < film_grain_params.num_u_points; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.point_u_value[i] = scratch;
+ if (i != 0 && film_grain_params.point_u_value[i - 1] >=
+ film_grain_params.point_u_value[i]) {
+ LIBGAV1_DLOG(ERROR, "point_u_value[%d] (%d) >= point_u_value[%d] (%d).",
+ i - 1, film_grain_params.point_u_value[i - 1], i,
+ film_grain_params.point_u_value[i]);
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.point_u_scaling[i] = scratch;
+ }
+ OBU_READ_LITERAL_OR_FAIL(4);
+ film_grain_params.num_v_points = scratch;
+ if (film_grain_params.num_v_points > 10) {
+ LIBGAV1_DLOG(ERROR, "Invalid value for num_v_points (%d).",
+ film_grain_params.num_v_points);
+ return false;
+ }
+ if (sequence_header_.color_config.subsampling_x == 1 &&
+ sequence_header_.color_config.subsampling_y == 1 &&
+ (film_grain_params.num_u_points == 0) !=
+ (film_grain_params.num_v_points == 0)) {
+ LIBGAV1_DLOG(ERROR,
+ "Invalid values for num_u_points (%d) and num_v_points (%d) "
+ "for 4:2:0 chroma subsampling.",
+ film_grain_params.num_u_points,
+ film_grain_params.num_v_points);
+ return false;
+ }
+ for (int i = 0; i < film_grain_params.num_v_points; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.point_v_value[i] = scratch;
+ if (i != 0 && film_grain_params.point_v_value[i - 1] >=
+ film_grain_params.point_v_value[i]) {
+ LIBGAV1_DLOG(ERROR, "point_v_value[%d] (%d) >= point_v_value[%d] (%d).",
+ i - 1, film_grain_params.point_v_value[i - 1], i,
+ film_grain_params.point_v_value[i]);
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.point_v_scaling[i] = scratch;
+ }
+ }
+ OBU_READ_LITERAL_OR_FAIL(2);
+ film_grain_params.chroma_scaling = scratch + 8;
+ OBU_READ_LITERAL_OR_FAIL(2);
+ film_grain_params.auto_regression_coeff_lag = scratch;
+
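+  // The number of auto-regression coefficients is 2 * lag * (lag + 1) for
+  // luma. Chroma uses one extra coefficient (predicting from luma) when luma
+  // scaling points are present.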
+ const int num_pos_y =
+ MultiplyBy2(film_grain_params.auto_regression_coeff_lag) *
+ (film_grain_params.auto_regression_coeff_lag + 1);
+ int num_pos_uv = num_pos_y;
+ if (film_grain_params.num_y_points > 0) {
+ ++num_pos_uv;
+ for (int i = 0; i < num_pos_y; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.auto_regression_coeff_y[i] =
+ static_cast<int8_t>(scratch - 128);
+ }
+ }
+ if (film_grain_params.chroma_scaling_from_luma ||
+ film_grain_params.num_u_points > 0) {
+ for (int i = 0; i < num_pos_uv; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.auto_regression_coeff_u[i] =
+ static_cast<int8_t>(scratch - 128);
+ }
+ }
+ if (film_grain_params.chroma_scaling_from_luma ||
+ film_grain_params.num_v_points > 0) {
+ for (int i = 0; i < num_pos_uv; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.auto_regression_coeff_v[i] =
+ static_cast<int8_t>(scratch - 128);
+ }
+ }
+ OBU_READ_LITERAL_OR_FAIL(2);
+ film_grain_params.auto_regression_shift = static_cast<uint8_t>(scratch + 6);
+ OBU_READ_LITERAL_OR_FAIL(2);
+ film_grain_params.grain_scale_shift = static_cast<int>(scratch);
+ if (film_grain_params.num_u_points > 0) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.u_multiplier = static_cast<int8_t>(scratch - 128);
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.u_luma_multiplier = static_cast<int8_t>(scratch - 128);
+ OBU_READ_LITERAL_OR_FAIL(9);
+ film_grain_params.u_offset = static_cast<int16_t>(scratch - 256);
+ }
+ if (film_grain_params.num_v_points > 0) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.v_multiplier = static_cast<int8_t>(scratch - 128);
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.v_luma_multiplier = static_cast<int8_t>(scratch - 128);
+ OBU_READ_LITERAL_OR_FAIL(9);
+ film_grain_params.v_offset = static_cast<int16_t>(scratch - 256);
+ }
+ OBU_READ_BIT_OR_FAIL;
+ film_grain_params.overlap_flag = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ film_grain_params.clip_to_restricted_range = static_cast<bool>(scratch);
+ return true;
+}
+
+bool ObuParser::ParseTileInfoSyntax() {
+ TileInfo* const tile_info = &frame_header_.tile_info;
+ const int sb_columns = sequence_header_.use_128x128_superblock
+ ? ((frame_header_.columns4x4 + 31) >> 5)
+ : ((frame_header_.columns4x4 + 15) >> 4);
+ const int sb_rows = sequence_header_.use_128x128_superblock
+ ? ((frame_header_.rows4x4 + 31) >> 5)
+ : ((frame_header_.rows4x4 + 15) >> 4);
+ tile_info->sb_columns = sb_columns;
+ tile_info->sb_rows = sb_rows;
+ const int sb_shift = sequence_header_.use_128x128_superblock ? 5 : 4;
+ const int sb_size = 2 + sb_shift;
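+  // sb_shift is the log2 of the superblock size in 4x4 units and sb_size is
+  // the log2 of the superblock size in pixels (6 or 7). kMaxTileWidth and
+  // kMaxTileArea are in pixels and are converted to superblock units below.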
+ const int sb_max_tile_width = kMaxTileWidth >> sb_size;
+ const int sb_max_tile_area = kMaxTileArea >> MultiplyBy2(sb_size);
+ const int minlog2_tile_columns = TileLog2(sb_max_tile_width, sb_columns);
+ const int maxlog2_tile_columns =
+ CeilLog2(std::min(sb_columns, static_cast<int>(kMaxTileColumns)));
+ const int maxlog2_tile_rows =
+ CeilLog2(std::min(sb_rows, static_cast<int>(kMaxTileRows)));
+ const int min_log2_tiles = std::max(
+ minlog2_tile_columns, TileLog2(sb_max_tile_area, sb_rows * sb_columns));
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ tile_info->uniform_spacing = static_cast<bool>(scratch);
+ if (tile_info->uniform_spacing) {
+ // Read tile columns.
+ tile_info->tile_columns_log2 = minlog2_tile_columns;
+ while (tile_info->tile_columns_log2 < maxlog2_tile_columns) {
+ OBU_READ_BIT_OR_FAIL;
+ if (scratch == 0) break;
+ ++tile_info->tile_columns_log2;
+ }
+
+ // Compute tile column starts.
+ const int sb_tile_width =
+ (sb_columns + (1 << tile_info->tile_columns_log2) - 1) >>
+ tile_info->tile_columns_log2;
+ if (sb_tile_width <= 0) return false;
+ int i = 0;
+ for (int sb_start = 0; sb_start < sb_columns; sb_start += sb_tile_width) {
+ if (i >= kMaxTileColumns) {
+ LIBGAV1_DLOG(ERROR,
+ "tile_columns would be greater than kMaxTileColumns.");
+ return false;
+ }
+ tile_info->tile_column_start[i++] = sb_start << sb_shift;
+ }
+ tile_info->tile_column_start[i] = frame_header_.columns4x4;
+ tile_info->tile_columns = i;
+
+ // Read tile rows.
+ const int minlog2_tile_rows =
+ std::max(min_log2_tiles - tile_info->tile_columns_log2, 0);
+ tile_info->tile_rows_log2 = minlog2_tile_rows;
+ while (tile_info->tile_rows_log2 < maxlog2_tile_rows) {
+ OBU_READ_BIT_OR_FAIL;
+ if (scratch == 0) break;
+ ++tile_info->tile_rows_log2;
+ }
+
+ // Compute tile row starts.
+ const int sb_tile_height =
+ (sb_rows + (1 << tile_info->tile_rows_log2) - 1) >>
+ tile_info->tile_rows_log2;
+ if (sb_tile_height <= 0) return false;
+ i = 0;
+ for (int sb_start = 0; sb_start < sb_rows; sb_start += sb_tile_height) {
+ if (i >= kMaxTileRows) {
+ LIBGAV1_DLOG(ERROR, "tile_rows would be greater than kMaxTileRows.");
+ return false;
+ }
+ tile_info->tile_row_start[i++] = sb_start << sb_shift;
+ }
+ tile_info->tile_row_start[i] = frame_header_.rows4x4;
+ tile_info->tile_rows = i;
+ } else {
+ int widest_tile_sb = 1;
+ int i = 0;
+ for (int sb_start = 0; sb_start < sb_columns; ++i) {
+ if (i >= kMaxTileColumns) {
+ LIBGAV1_DLOG(ERROR,
+ "tile_columns would be greater than kMaxTileColumns.");
+ return false;
+ }
+ tile_info->tile_column_start[i] = sb_start << sb_shift;
+ const int max_width =
+ std::min(sb_columns - sb_start, static_cast<int>(sb_max_tile_width));
+ if (!bit_reader_->DecodeUniform(
+ max_width, &tile_info->tile_column_width_in_superblocks[i])) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits.");
+ return false;
+ }
+ ++tile_info->tile_column_width_in_superblocks[i];
+ widest_tile_sb = std::max(tile_info->tile_column_width_in_superblocks[i],
+ widest_tile_sb);
+ sb_start += tile_info->tile_column_width_in_superblocks[i];
+ }
+ tile_info->tile_column_start[i] = frame_header_.columns4x4;
+ tile_info->tile_columns = i;
+ tile_info->tile_columns_log2 = CeilLog2(tile_info->tile_columns);
+
+ int max_tile_area_sb = sb_rows * sb_columns;
+ if (min_log2_tiles > 0) max_tile_area_sb >>= min_log2_tiles + 1;
+ const int max_tile_height_sb =
+ std::max(max_tile_area_sb / widest_tile_sb, 1);
+
+ i = 0;
+ for (int sb_start = 0; sb_start < sb_rows; ++i) {
+ if (i >= kMaxTileRows) {
+ LIBGAV1_DLOG(ERROR, "tile_rows would be greater than kMaxTileRows.");
+ return false;
+ }
+ tile_info->tile_row_start[i] = sb_start << sb_shift;
+ const int max_height = std::min(sb_rows - sb_start, max_tile_height_sb);
+ if (!bit_reader_->DecodeUniform(
+ max_height, &tile_info->tile_row_height_in_superblocks[i])) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits.");
+ return false;
+ }
+ ++tile_info->tile_row_height_in_superblocks[i];
+ sb_start += tile_info->tile_row_height_in_superblocks[i];
+ }
+ tile_info->tile_row_start[i] = frame_header_.rows4x4;
+ tile_info->tile_rows = i;
+ tile_info->tile_rows_log2 = CeilLog2(tile_info->tile_rows);
+ }
+ tile_info->tile_count = tile_info->tile_rows * tile_info->tile_columns;
+ if (!tile_buffers_.reserve(tile_info->tile_count)) {
+ LIBGAV1_DLOG(ERROR, "Unable to allocate memory for tile_buffers_.");
+ return false;
+ }
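+  // context_update_id corresponds to context_update_tile_id in the spec: the
+  // tile whose final CDF values are used when the frame CDFs are updated.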
+ tile_info->context_update_id = 0;
+ const int tile_bits =
+ tile_info->tile_columns_log2 + tile_info->tile_rows_log2;
+ if (tile_bits != 0) {
+ OBU_READ_LITERAL_OR_FAIL(tile_bits);
+ tile_info->context_update_id = static_cast<int16_t>(scratch);
+ if (tile_info->context_update_id >= tile_info->tile_count) {
+ LIBGAV1_DLOG(ERROR, "Invalid context_update_tile_id (%d) >= %d.",
+ tile_info->context_update_id, tile_info->tile_count);
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(2);
+ tile_info->tile_size_bytes = 1 + scratch;
+ }
+ return true;
+}
+
+bool ObuParser::ReadAllowWarpedMotion() {
+ if (IsIntraFrame(frame_header_.frame_type) ||
+ frame_header_.error_resilient_mode ||
+ !sequence_header_.enable_warped_motion) {
+ return true;
+ }
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.allow_warped_motion = static_cast<bool>(scratch);
+ return true;
+}
+
+bool ObuParser::ParseFrameParameters() {
+ int64_t scratch;
+ if (sequence_header_.reduced_still_picture_header) {
+ frame_header_.show_frame = true;
+ current_frame_ = buffer_pool_->GetFreeBuffer();
+ if (current_frame_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool.");
+ return false;
+ }
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.show_existing_frame = static_cast<bool>(scratch);
+ if (frame_header_.show_existing_frame) {
+ OBU_READ_LITERAL_OR_FAIL(3);
+ frame_header_.frame_to_show = scratch;
+ if (sequence_header_.decoder_model_info_present_flag &&
+ !sequence_header_.timing_info.equal_picture_interval) {
+ OBU_READ_LITERAL_OR_FAIL(
+ sequence_header_.decoder_model_info.frame_presentation_time_length);
+ frame_header_.frame_presentation_time = static_cast<uint32_t>(scratch);
+ }
+ if (sequence_header_.frame_id_numbers_present) {
+ OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_id_length_bits);
+ frame_header_.display_frame_id = static_cast<uint16_t>(scratch);
+ // Section 6.8.2: It is a requirement of bitstream conformance that
+ // whenever display_frame_id is read, the value matches
+ // RefFrameId[ frame_to_show_map_idx ] ..., and that
+ // RefValid[ frame_to_show_map_idx ] is equal to 1.
+ if (frame_header_.display_frame_id !=
+ decoder_state_
+ .reference_frame_id[frame_header_.frame_to_show] ||
+ !decoder_state_.reference_valid[frame_header_.frame_to_show]) {
+ LIBGAV1_DLOG(ERROR,
+ "Reference buffer %d has a frame id number mismatch.",
+ frame_header_.frame_to_show);
+ return false;
+ }
+ }
+ // Section 7.18.2. Note: This is also needed for Section 7.21 if
+ // frame_type is kFrameKey.
+ current_frame_ =
+ decoder_state_.reference_frame[frame_header_.frame_to_show];
+ if (current_frame_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Buffer %d does not contain a decoded frame",
+ frame_header_.frame_to_show);
+ return false;
+ }
+ // Section 6.8.2: It is a requirement of bitstream conformance that
+ // when show_existing_frame is used to show a previous frame, that the
+ // value of showable_frame for the previous frame was equal to 1.
+ if (!current_frame_->showable_frame()) {
+ LIBGAV1_DLOG(ERROR, "Buffer %d does not contain a showable frame",
+ frame_header_.frame_to_show);
+ return false;
+ }
+ if (current_frame_->frame_type() == kFrameKey) {
+ frame_header_.refresh_frame_flags = 0xff;
+ // Section 6.8.2: It is a requirement of bitstream conformance that
+ // when show_existing_frame is used to show a previous frame with
+ // RefFrameType[ frame_to_show_map_idx ] equal to KEY_FRAME, that
+ // the frame is output via the show_existing_frame mechanism at most
+ // once.
+ current_frame_->set_showable_frame(false);
+
+ // Section 7.21. Note: decoder_state_.current_frame_id must be set
+ // only when frame_type is kFrameKey per the spec. Among all the
+ // variables set in Section 7.21, current_frame_id is the only one
+ // whose value lives across frames. (PrevFrameID is set equal to the
+ // current_frame_id value for the previous frame.)
+ decoder_state_.current_frame_id =
+ decoder_state_.reference_frame_id[frame_header_.frame_to_show];
+ decoder_state_.order_hint =
+ decoder_state_.reference_order_hint[frame_header_.frame_to_show];
+ }
+ return true;
+ }
+ current_frame_ = buffer_pool_->GetFreeBuffer();
+ if (current_frame_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool.");
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(2);
+ frame_header_.frame_type = static_cast<FrameType>(scratch);
+ current_frame_->set_frame_type(frame_header_.frame_type);
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.show_frame = static_cast<bool>(scratch);
+ if (frame_header_.show_frame &&
+ sequence_header_.decoder_model_info_present_flag &&
+ !sequence_header_.timing_info.equal_picture_interval) {
+ OBU_READ_LITERAL_OR_FAIL(
+ sequence_header_.decoder_model_info.frame_presentation_time_length);
+ frame_header_.frame_presentation_time = static_cast<uint32_t>(scratch);
+ }
+ if (frame_header_.show_frame) {
+ frame_header_.showable_frame = (frame_header_.frame_type != kFrameKey);
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.showable_frame = static_cast<bool>(scratch);
+ }
+ current_frame_->set_showable_frame(frame_header_.showable_frame);
+ if (frame_header_.frame_type == kFrameSwitch ||
+ (frame_header_.frame_type == kFrameKey && frame_header_.show_frame)) {
+ frame_header_.error_resilient_mode = true;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.error_resilient_mode = static_cast<bool>(scratch);
+ }
+ }
+ if (frame_header_.frame_type == kFrameKey && frame_header_.show_frame) {
+ decoder_state_.reference_valid.fill(false);
+ decoder_state_.reference_order_hint.fill(0);
+ decoder_state_.reference_frame.fill(nullptr);
+ }
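+  // The coded syntax element is disable_cdf_update; enable_cdf_update stores
+  // its negation.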
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.enable_cdf_update = !static_cast<bool>(scratch);
+ if (sequence_header_.force_screen_content_tools ==
+ kSelectScreenContentTools) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.allow_screen_content_tools = static_cast<bool>(scratch);
+ } else {
+ frame_header_.allow_screen_content_tools =
+ static_cast<bool>(sequence_header_.force_screen_content_tools);
+ }
+ if (frame_header_.allow_screen_content_tools) {
+ if (sequence_header_.force_integer_mv == kSelectIntegerMv) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.force_integer_mv = scratch;
+ } else {
+ frame_header_.force_integer_mv = sequence_header_.force_integer_mv;
+ }
+ } else {
+ frame_header_.force_integer_mv = 0;
+ }
+ if (IsIntraFrame(frame_header_.frame_type)) {
+ frame_header_.force_integer_mv = 1;
+ }
+ if (sequence_header_.frame_id_numbers_present) {
+ OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_id_length_bits);
+ frame_header_.current_frame_id = static_cast<uint16_t>(scratch);
+ const int previous_frame_id = decoder_state_.current_frame_id;
+ decoder_state_.current_frame_id = frame_header_.current_frame_id;
+ if (frame_header_.frame_type != kFrameKey || !frame_header_.show_frame) {
+ if (previous_frame_id >= 0) {
+ // Section 6.8.2: ..., it is a requirement of bitstream conformance
+ // that all of the following conditions are true:
+ // * current_frame_id is not equal to PrevFrameID,
+ // * DiffFrameID is less than 1 << ( idLen - 1 )
+ int diff_frame_id = decoder_state_.current_frame_id - previous_frame_id;
+ const int id_length_max_value =
+ 1 << sequence_header_.frame_id_length_bits;
+ if (diff_frame_id <= 0) {
+ diff_frame_id += id_length_max_value;
+ }
+ if (diff_frame_id >= DivideBy2(id_length_max_value)) {
+ LIBGAV1_DLOG(ERROR,
+ "current_frame_id (%d) equals or differs too much from "
+ "previous_frame_id (%d).",
+ decoder_state_.current_frame_id, previous_frame_id);
+ return false;
+ }
+ }
+ MarkInvalidReferenceFrames();
+ }
+ } else {
+ frame_header_.current_frame_id = 0;
+ decoder_state_.current_frame_id = frame_header_.current_frame_id;
+ }
+ if (frame_header_.frame_type == kFrameSwitch) {
+ frame_header_.frame_size_override_flag = true;
+ } else if (!sequence_header_.reduced_still_picture_header) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.frame_size_override_flag = static_cast<bool>(scratch);
+ }
+ if (sequence_header_.order_hint_bits > 0) {
+ OBU_READ_LITERAL_OR_FAIL(sequence_header_.order_hint_bits);
+ frame_header_.order_hint = scratch;
+ }
+ decoder_state_.order_hint = frame_header_.order_hint;
+ if (IsIntraFrame(frame_header_.frame_type) ||
+ frame_header_.error_resilient_mode) {
+ frame_header_.primary_reference_frame = kPrimaryReferenceNone;
+ } else {
+ OBU_READ_LITERAL_OR_FAIL(3);
+ frame_header_.primary_reference_frame = scratch;
+ }
+ if (sequence_header_.decoder_model_info_present_flag) {
+ OBU_READ_BIT_OR_FAIL;
+ const auto buffer_removal_time_present = static_cast<bool>(scratch);
+ if (buffer_removal_time_present) {
+ for (int i = 0; i < sequence_header_.operating_points; ++i) {
+ if (!sequence_header_.decoder_model_present_for_operating_point[i]) {
+ continue;
+ }
+ const int index = sequence_header_.operating_point_idc[i];
+ if (index == 0 ||
+ (InTemporalLayer(index, obu_headers_.back().temporal_id) &&
+ InSpatialLayer(index, obu_headers_.back().spatial_id))) {
+ OBU_READ_LITERAL_OR_FAIL(
+ sequence_header_.decoder_model_info.buffer_removal_time_length);
+ frame_header_.buffer_removal_time[i] = static_cast<uint32_t>(scratch);
+ }
+ }
+ }
+ }
+ if (frame_header_.frame_type == kFrameSwitch ||
+ (frame_header_.frame_type == kFrameKey && frame_header_.show_frame)) {
+ frame_header_.refresh_frame_flags = 0xff;
+ } else {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ frame_header_.refresh_frame_flags = scratch;
+ // Section 6.8.2: If frame_type is equal to INTRA_ONLY_FRAME, it is a
+ // requirement of bitstream conformance that refresh_frame_flags is not
+ // equal to 0xff.
+ if (frame_header_.frame_type == kFrameIntraOnly &&
+ frame_header_.refresh_frame_flags == 0xff) {
+ LIBGAV1_DLOG(ERROR, "Intra only frames cannot have refresh flags 0xFF.");
+ return false;
+ }
+ }
+ if ((!IsIntraFrame(frame_header_.frame_type) ||
+ frame_header_.refresh_frame_flags != 0xff) &&
+ !ParseReferenceOrderHint()) {
+ return false;
+ }
+ if (IsIntraFrame(frame_header_.frame_type)) {
+ if (!ParseFrameSizeAndRenderSize()) return false;
+ if (frame_header_.allow_screen_content_tools &&
+ frame_header_.width == frame_header_.upscaled_width) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.allow_intrabc = static_cast<bool>(scratch);
+ }
+ } else {
+ if (!sequence_header_.enable_order_hint) {
+ frame_header_.frame_refs_short_signaling = false;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.frame_refs_short_signaling = static_cast<bool>(scratch);
+ if (frame_header_.frame_refs_short_signaling) {
+ OBU_READ_LITERAL_OR_FAIL(3);
+ const int8_t last_frame_idx = scratch;
+ OBU_READ_LITERAL_OR_FAIL(3);
+ const int8_t gold_frame_idx = scratch;
+ if (!SetFrameReferences(last_frame_idx, gold_frame_idx)) {
+ return false;
+ }
+ }
+ }
+ for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
+ if (!frame_header_.frame_refs_short_signaling) {
+ OBU_READ_LITERAL_OR_FAIL(3);
+ frame_header_.reference_frame_index[i] = scratch;
+ }
+ const int reference_frame_index = frame_header_.reference_frame_index[i];
+ assert(reference_frame_index >= 0);
+ // Section 6.8.2: It is a requirement of bitstream conformance that
+ // RefValid[ ref_frame_idx[ i ] ] is equal to 1 ...
+ // The remainder of the statement is handled by ParseSequenceHeader().
+      // Note: if support for Annex C (Error resilience behavior) is added,
+      // this check should be omitted per C.5, Decoder consequences of
+      // processable frames.
+ if (!decoder_state_.reference_valid[reference_frame_index]) {
+ LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not valid.", i,
+ reference_frame_index);
+ return false;
+ }
+ // Check if the inter frame requests a nonexistent reference, whether or
+ // not frame_refs_short_signaling is used.
+ if (decoder_state_.reference_frame[reference_frame_index] == nullptr) {
+ LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not a decoded frame.", i,
+ reference_frame_index);
+ return false;
+ }
+ if (sequence_header_.frame_id_numbers_present) {
+ OBU_READ_LITERAL_OR_FAIL(sequence_header_.delta_frame_id_length_bits);
+ const int delta_frame_id = static_cast<int>(1 + scratch);
+ const int id_length_max_value =
+ 1 << sequence_header_.frame_id_length_bits;
+ frame_header_.expected_frame_id[i] =
+ (frame_header_.current_frame_id + id_length_max_value -
+ delta_frame_id) %
+ id_length_max_value;
+ // Section 6.8.2: It is a requirement of bitstream conformance that
+ // whenever expectedFrameId[ i ] is calculated, the value matches
+ // RefFrameId[ ref_frame_idx[ i ] ] ...
+ //
+ // Section 6.8.2: It is a requirement of bitstream conformance that
+ // RefValid[ ref_frame_idx[ i ] ] is equal to 1, ...
+ if (frame_header_.expected_frame_id[i] !=
+ decoder_state_.reference_frame_id[reference_frame_index] ||
+ !decoder_state_.reference_valid[reference_frame_index]) {
+ LIBGAV1_DLOG(ERROR,
+ "Reference buffer %d has a frame id number mismatch.",
+ reference_frame_index);
+ return false;
+ }
+ }
+ }
+ if (frame_header_.frame_size_override_flag &&
+ !frame_header_.error_resilient_mode) {
+ // Section 5.9.7.
+ for (int index : frame_header_.reference_frame_index) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.found_reference = static_cast<bool>(scratch);
+ if (frame_header_.found_reference) {
+ const RefCountedBuffer* reference_frame =
+ decoder_state_.reference_frame[index].get();
+ // frame_header_.upscaled_width will be set in the
+ // ParseSuperResParametersAndComputeImageSize() call below.
+ frame_header_.width = reference_frame->upscaled_width();
+ frame_header_.height = reference_frame->frame_height();
+ frame_header_.render_width = reference_frame->render_width();
+ frame_header_.render_height = reference_frame->render_height();
+ if (!ParseSuperResParametersAndComputeImageSize()) return false;
+ break;
+ }
+ }
+ if (!frame_header_.found_reference && !ParseFrameSizeAndRenderSize()) {
+ return false;
+ }
+ } else {
+ if (!ParseFrameSizeAndRenderSize()) return false;
+ }
+ if (!ValidateInterFrameSize()) return false;
+ if (frame_header_.force_integer_mv != 0) {
+ frame_header_.allow_high_precision_mv = false;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.allow_high_precision_mv = static_cast<bool>(scratch);
+ }
+ OBU_READ_BIT_OR_FAIL;
+ const auto is_filter_switchable = static_cast<bool>(scratch);
+ if (is_filter_switchable) {
+ frame_header_.interpolation_filter = kInterpolationFilterSwitchable;
+ } else {
+ OBU_READ_LITERAL_OR_FAIL(2);
+ frame_header_.interpolation_filter =
+ static_cast<InterpolationFilter>(scratch);
+ }
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.is_motion_mode_switchable = static_cast<bool>(scratch);
+ if (frame_header_.error_resilient_mode ||
+ !sequence_header_.enable_ref_frame_mvs) {
+ frame_header_.use_ref_frame_mvs = false;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.use_ref_frame_mvs = static_cast<bool>(scratch);
+ }
+ }
+ // At this point, we have parsed the frame and render sizes and computed
+ // the image size, whether it's an intra or inter frame. So we can save
+ // the sizes in the current frame now.
+ if (!current_frame_->SetFrameDimensions(frame_header_)) {
+ LIBGAV1_DLOG(ERROR, "Setting current frame dimensions failed.");
+ return false;
+ }
+ if (!IsIntraFrame(frame_header_.frame_type)) {
+ // Initialize the kReferenceFrameIntra type reference frame information to
+ // simplify the frame type validation in motion field projection.
+ // Set the kReferenceFrameIntra type |order_hint_| to
+    // |frame_header_.order_hint|. This guarantees that, in SIMD
+    // implementations, the other reference frame information of the
+    // kReferenceFrameIntra type can be correctly initialized by the following
+    // loop, with |frame_header_.order_hint| serving as the |hint|.
+ ReferenceInfo* const reference_info = current_frame_->reference_info();
+ reference_info->order_hint[kReferenceFrameIntra] = frame_header_.order_hint;
+ reference_info->relative_distance_from[kReferenceFrameIntra] = 0;
+ reference_info->relative_distance_to[kReferenceFrameIntra] = 0;
+ reference_info->skip_references[kReferenceFrameIntra] = true;
+ reference_info->projection_divisions[kReferenceFrameIntra] = 0;
+
+ for (int i = kReferenceFrameLast; i <= kNumInterReferenceFrameTypes; ++i) {
+ const auto reference_frame = static_cast<ReferenceFrameType>(i);
+ const uint8_t hint =
+ decoder_state_.reference_order_hint
+ [frame_header_.reference_frame_index[i - kReferenceFrameLast]];
+ reference_info->order_hint[reference_frame] = hint;
+ const int relative_distance_from =
+ GetRelativeDistance(hint, frame_header_.order_hint,
+ sequence_header_.order_hint_shift_bits);
+ const int relative_distance_to =
+ GetRelativeDistance(frame_header_.order_hint, hint,
+ sequence_header_.order_hint_shift_bits);
+ reference_info->relative_distance_from[reference_frame] =
+ relative_distance_from;
+ reference_info->relative_distance_to[reference_frame] =
+ relative_distance_to;
+ reference_info->skip_references[reference_frame] =
+ relative_distance_to > kMaxFrameDistance || relative_distance_to <= 0;
+ reference_info->projection_divisions[reference_frame] =
+ reference_info->skip_references[reference_frame]
+ ? 0
+ : kProjectionMvDivisionLookup[relative_distance_to];
+ decoder_state_.reference_frame_sign_bias[reference_frame] =
+ relative_distance_from > 0;
+ }
+ }
+ if (frame_header_.enable_cdf_update &&
+ !sequence_header_.reduced_still_picture_header) {
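+    // The coded syntax element is disable_frame_end_update_cdf; store its
+    // negation.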
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.enable_frame_end_update_cdf = !static_cast<bool>(scratch);
+ } else {
+ frame_header_.enable_frame_end_update_cdf = false;
+ }
+ return true;
+}
+
+bool ObuParser::ParseFrameHeader() {
+ // Section 6.8.1: It is a requirement of bitstream conformance that a
+ // sequence header OBU has been received before a frame header OBU.
+ if (!has_sequence_header_) return false;
+ if (!ParseFrameParameters()) return false;
+ if (frame_header_.show_existing_frame) return true;
+ assert(!obu_headers_.empty());
+ current_frame_->set_spatial_id(obu_headers_.back().spatial_id);
+ current_frame_->set_temporal_id(obu_headers_.back().temporal_id);
+ bool status = ParseTileInfoSyntax() && ParseQuantizerParameters() &&
+ ParseSegmentationParameters();
+ if (!status) return false;
+ current_frame_->SetSegmentationParameters(frame_header_.segmentation);
+ status =
+ ParseQuantizerIndexDeltaParameters() && ParseLoopFilterDeltaParameters();
+ if (!status) return false;
+ ComputeSegmentLosslessAndQIndex();
+ // Section 6.8.2: It is a requirement of bitstream conformance that
+ // delta_q_present is equal to 0 when CodedLossless is equal to 1.
+ if (frame_header_.coded_lossless && frame_header_.delta_q.present) {
+ return false;
+ }
+ status = ParseLoopFilterParameters();
+ if (!status) return false;
+ current_frame_->SetLoopFilterDeltas(frame_header_.loop_filter);
+ status = ParseCdefParameters() && ParseLoopRestorationParameters() &&
+ ParseTxModeSyntax() && ParseFrameReferenceModeSyntax() &&
+ ParseSkipModeParameters() && ReadAllowWarpedMotion();
+ if (!status) return false;
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.reduced_tx_set = static_cast<bool>(scratch);
+ status = ParseGlobalMotionParameters();
+ if (!status) return false;
+ current_frame_->SetGlobalMotions(frame_header_.global_motion);
+ status = ParseFilmGrainParameters();
+ if (!status) return false;
+ if (sequence_header_.film_grain_params_present) {
+ current_frame_->set_film_grain_params(frame_header_.film_grain_params);
+ }
+ return true;
+}
+
+bool ObuParser::ParsePadding(const uint8_t* data, size_t size) {
+ // The spec allows a padding OBU to be header-only (i.e., |size| = 0). So
+ // check trailing bits only if |size| > 0.
+ if (size == 0) return true;
+ // The payload of a padding OBU is byte aligned. Therefore the first
+ // trailing byte should be 0x80. See https://crbug.com/aomedia/2393.
+ const int i = GetLastNonzeroByteIndex(data, size);
+ if (i < 0) {
+ LIBGAV1_DLOG(ERROR, "Trailing bit is missing.");
+ return false;
+ }
+ if (data[i] != 0x80) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "The last nonzero byte of the payload data is 0x%x, should be 0x80.",
+ data[i]);
+ return false;
+ }
+ // Skip all bits before the trailing bit.
+ bit_reader_->SkipBytes(i);
+ return true;
+}
+
+bool ObuParser::ParseMetadataScalability() {
+ int64_t scratch;
+ // scalability_mode_idc
+ OBU_READ_LITERAL_OR_FAIL(8);
+ const auto scalability_mode_idc = static_cast<int>(scratch);
+ if (scalability_mode_idc == kScalabilitySS) {
+ // Parse scalability_structure().
+ // spatial_layers_cnt_minus_1
+ OBU_READ_LITERAL_OR_FAIL(2);
+ const auto spatial_layers_count = static_cast<int>(scratch) + 1;
+ // spatial_layer_dimensions_present_flag
+ OBU_READ_BIT_OR_FAIL;
+ const auto spatial_layer_dimensions_present_flag =
+ static_cast<bool>(scratch);
+ // spatial_layer_description_present_flag
+ OBU_READ_BIT_OR_FAIL;
+ const auto spatial_layer_description_present_flag =
+ static_cast<bool>(scratch);
+ // temporal_group_description_present_flag
+ OBU_READ_BIT_OR_FAIL;
+ const auto temporal_group_description_present_flag =
+ static_cast<bool>(scratch);
+ // scalability_structure_reserved_3bits
+ OBU_READ_LITERAL_OR_FAIL(3);
+ if (scratch != 0) {
+ LIBGAV1_DLOG(WARNING,
+ "scalability_structure_reserved_3bits is not zero.");
+ }
+ if (spatial_layer_dimensions_present_flag) {
+ for (int i = 0; i < spatial_layers_count; ++i) {
+ // spatial_layer_max_width[i]
+ OBU_READ_LITERAL_OR_FAIL(16);
+ // spatial_layer_max_height[i]
+ OBU_READ_LITERAL_OR_FAIL(16);
+ }
+ }
+ if (spatial_layer_description_present_flag) {
+ for (int i = 0; i < spatial_layers_count; ++i) {
+ // spatial_layer_ref_id[i]
+ OBU_READ_LITERAL_OR_FAIL(8);
+ }
+ }
+ if (temporal_group_description_present_flag) {
+ // temporal_group_size
+ OBU_READ_LITERAL_OR_FAIL(8);
+ const auto temporal_group_size = static_cast<int>(scratch);
+ for (int i = 0; i < temporal_group_size; ++i) {
+ // temporal_group_temporal_id[i]
+ OBU_READ_LITERAL_OR_FAIL(3);
+ // temporal_group_temporal_switching_up_point_flag[i]
+ OBU_READ_BIT_OR_FAIL;
+ // temporal_group_spatial_switching_up_point_flag[i]
+ OBU_READ_BIT_OR_FAIL;
+ // temporal_group_ref_cnt[i]
+ OBU_READ_LITERAL_OR_FAIL(3);
+ const auto temporal_group_ref_count = static_cast<int>(scratch);
+ for (int j = 0; j < temporal_group_ref_count; ++j) {
+ // temporal_group_ref_pic_diff[i][j]
+ OBU_READ_LITERAL_OR_FAIL(8);
+ }
+ }
+ }
+ }
+ return true;
+}
+
+bool ObuParser::ParseMetadataTimecode() {
+ int64_t scratch;
+ // counting_type: should be the same for all pictures in the coded video
+ // sequence. 7..31 are reserved.
+ OBU_READ_LITERAL_OR_FAIL(5);
+ // full_timestamp_flag
+ OBU_READ_BIT_OR_FAIL;
+ const auto full_timestamp_flag = static_cast<bool>(scratch);
+ // discontinuity_flag
+ OBU_READ_BIT_OR_FAIL;
+ // cnt_dropped_flag
+ OBU_READ_BIT_OR_FAIL;
+ // n_frames
+ OBU_READ_LITERAL_OR_FAIL(9);
+ if (full_timestamp_flag) {
+ // seconds_value
+ OBU_READ_LITERAL_OR_FAIL(6);
+ const auto seconds_value = static_cast<int>(scratch);
+ if (seconds_value > 59) {
+ LIBGAV1_DLOG(ERROR, "Invalid seconds_value %d.", seconds_value);
+ return false;
+ }
+ // minutes_value
+ OBU_READ_LITERAL_OR_FAIL(6);
+ const auto minutes_value = static_cast<int>(scratch);
+ if (minutes_value > 59) {
+ LIBGAV1_DLOG(ERROR, "Invalid minutes_value %d.", minutes_value);
+ return false;
+ }
+ // hours_value
+ OBU_READ_LITERAL_OR_FAIL(5);
+ const auto hours_value = static_cast<int>(scratch);
+ if (hours_value > 23) {
+ LIBGAV1_DLOG(ERROR, "Invalid hours_value %d.", hours_value);
+ return false;
+ }
+ } else {
+ // seconds_flag
+ OBU_READ_BIT_OR_FAIL;
+ const auto seconds_flag = static_cast<bool>(scratch);
+ if (seconds_flag) {
+ // seconds_value
+ OBU_READ_LITERAL_OR_FAIL(6);
+ const auto seconds_value = static_cast<int>(scratch);
+ if (seconds_value > 59) {
+ LIBGAV1_DLOG(ERROR, "Invalid seconds_value %d.", seconds_value);
+ return false;
+ }
+ // minutes_flag
+ OBU_READ_BIT_OR_FAIL;
+ const auto minutes_flag = static_cast<bool>(scratch);
+ if (minutes_flag) {
+ // minutes_value
+ OBU_READ_LITERAL_OR_FAIL(6);
+ const auto minutes_value = static_cast<int>(scratch);
+ if (minutes_value > 59) {
+ LIBGAV1_DLOG(ERROR, "Invalid minutes_value %d.", minutes_value);
+ return false;
+ }
+ // hours_flag
+ OBU_READ_BIT_OR_FAIL;
+ const auto hours_flag = static_cast<bool>(scratch);
+ if (hours_flag) {
+ // hours_value
+ OBU_READ_LITERAL_OR_FAIL(5);
+ const auto hours_value = static_cast<int>(scratch);
+ if (hours_value > 23) {
+ LIBGAV1_DLOG(ERROR, "Invalid hours_value %d.", hours_value);
+ return false;
+ }
+ }
+ }
+ }
+ }
+ // time_offset_length: should be the same for all pictures in the coded
+ // video sequence.
+ OBU_READ_LITERAL_OR_FAIL(5);
+ const auto time_offset_length = static_cast<int>(scratch);
+ if (time_offset_length > 0) {
+ // time_offset_value
+ OBU_READ_LITERAL_OR_FAIL(time_offset_length);
+ }
+ // Compute clockTimestamp. Section 6.7.7:
+ // When timing_info_present_flag is equal to 1 and discontinuity_flag is
+ // equal to 0, the value of clockTimestamp shall be greater than or equal
+ // to the value of clockTimestamp for the previous set of clock timestamp
+ // syntax elements in output order.
+ return true;
+}
+
+bool ObuParser::ParseMetadata(const uint8_t* data, size_t size) {
+ const size_t start_offset = bit_reader_->byte_offset();
+ size_t metadata_type;
+ if (!bit_reader_->ReadUnsignedLeb128(&metadata_type)) {
+ LIBGAV1_DLOG(ERROR, "Could not read metadata_type.");
+ return false;
+ }
+ const size_t metadata_type_size = bit_reader_->byte_offset() - start_offset;
+ if (size < metadata_type_size) {
+    LIBGAV1_DLOG(
+        ERROR,
+        "metadata_type is longer than the metadata OBU payload (%zu vs %zu).",
+        metadata_type_size, size);
+ return false;
+ }
+ data += metadata_type_size;
+ size -= metadata_type_size;
+ int64_t scratch;
+ switch (metadata_type) {
+ case kMetadataTypeHdrContentLightLevel:
+ OBU_READ_LITERAL_OR_FAIL(16);
+ metadata_.max_cll = scratch;
+ OBU_READ_LITERAL_OR_FAIL(16);
+ metadata_.max_fall = scratch;
+ break;
+ case kMetadataTypeHdrMasteringDisplayColorVolume:
+ for (int i = 0; i < 3; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(16);
+ metadata_.primary_chromaticity_x[i] = scratch;
+ OBU_READ_LITERAL_OR_FAIL(16);
+ metadata_.primary_chromaticity_y[i] = scratch;
+ }
+ OBU_READ_LITERAL_OR_FAIL(16);
+ metadata_.white_point_chromaticity_x = scratch;
+ OBU_READ_LITERAL_OR_FAIL(16);
+ metadata_.white_point_chromaticity_y = scratch;
+ OBU_READ_LITERAL_OR_FAIL(32);
+ metadata_.luminance_max = static_cast<uint32_t>(scratch);
+ OBU_READ_LITERAL_OR_FAIL(32);
+ metadata_.luminance_min = static_cast<uint32_t>(scratch);
+ break;
+ case kMetadataTypeScalability:
+ if (!ParseMetadataScalability()) return false;
+ break;
+ case kMetadataTypeItutT35: {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ metadata_.itu_t_t35_country_code = static_cast<uint8_t>(scratch);
+ ++data;
+ --size;
+ if (metadata_.itu_t_t35_country_code == 0xFF) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ metadata_.itu_t_t35_country_code_extension_byte =
+ static_cast<uint8_t>(scratch);
+ ++data;
+ --size;
+ }
+ // Read itu_t_t35_payload_bytes. Section 6.7.2 of the spec says:
+ // itu_t_t35_payload_bytes shall be bytes containing data registered as
+ // specified in Recommendation ITU-T T.35.
+ // Therefore itu_t_t35_payload_bytes is byte aligned and the first
+ // trailing byte should be 0x80. Since the exact syntax of
+ // itu_t_t35_payload_bytes is not defined in the AV1 spec, identify the
+ // end of itu_t_t35_payload_bytes by searching for the trailing bit.
+ const int i = GetLastNonzeroByteIndex(data, size);
+ if (i < 0) {
+ LIBGAV1_DLOG(ERROR, "Trailing bit is missing.");
+ return false;
+ }
+ if (data[i] != 0x80) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "itu_t_t35_payload_bytes is not byte aligned. The last nonzero "
+ "byte of the payload data is 0x%x, should be 0x80.",
+ data[i]);
+ return false;
+ }
+ if (i != 0) {
+ // data[0]..data[i - 1] are itu_t_t35_payload_bytes.
+ metadata_.itu_t_t35_payload_bytes.reset(new (std::nothrow) uint8_t[i]);
+ if (metadata_.itu_t_t35_payload_bytes == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Allocation of itu_t_t35_payload_bytes failed.");
+ return false;
+ }
+ memcpy(metadata_.itu_t_t35_payload_bytes.get(), data, i);
+ metadata_.itu_t_t35_payload_size = i;
+ }
+ // Skip all bits before the trailing bit.
+ bit_reader_->SkipBytes(i);
+ break;
+ }
+ case kMetadataTypeTimecode:
+ if (!ParseMetadataTimecode()) return false;
+ break;
+ default: {
+ // metadata_type is equal to a value reserved for future use or a user
+ // private value.
+ //
+ // The Note in Section 5.8.1 says "Decoders should ignore the entire OBU
+ // if they do not understand the metadata_type." Find the trailing bit
+ // and skip all bits before the trailing bit.
+ const int i = GetLastNonzeroByteIndex(data, size);
+ if (i >= 0) {
+ // The last 1 bit in the last nonzero byte is the trailing bit. Skip
+ // all bits before the trailing bit.
+ const int n = CountTrailingZeros(data[i]);
+ bit_reader_->SkipBits(i * 8 + 7 - n);
+ }
+ break;
+ }
+ }
+ return true;
+}
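+// For illustration, the trailing-bit search used above can be sketched as a
+// helper that scans backwards for the last nonzero byte (a minimal sketch of
+// an assumed helper; the actual definition may differ):
+//
+//   int GetLastNonzeroByteIndex(const uint8_t* data, size_t size) {
+//     int i = static_cast<int>(size) - 1;
+//     while (i >= 0 && data[i] == 0) --i;
+//     return i;  // -1 means the trailing bit is missing.
+//   }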
+
+bool ObuParser::AddTileBuffers(int start, int end, size_t total_size,
+ size_t tg_header_size,
+ size_t bytes_consumed_so_far) {
+ // Validate that the tile group start and end are within the allowed range.
+ if (start != next_tile_group_start_ || start > end ||
+ end >= frame_header_.tile_info.tile_count) {
+ LIBGAV1_DLOG(ERROR,
+ "Invalid tile group start %d or end %d: expected tile group "
+ "start %d, tile_count %d.",
+ start, end, next_tile_group_start_,
+ frame_header_.tile_info.tile_count);
+ return false;
+ }
+ next_tile_group_start_ = end + 1;
+
+ if (total_size < tg_header_size) {
+    LIBGAV1_DLOG(ERROR, "total_size (%zu) is less than tg_header_size (%zu).",
+ total_size, tg_header_size);
+ return false;
+ }
+ size_t bytes_left = total_size - tg_header_size;
+ const uint8_t* data = data_ + bytes_consumed_so_far + tg_header_size;
+ for (int tile_number = start; tile_number <= end; ++tile_number) {
+ size_t tile_size = 0;
+ if (tile_number != end) {
+ RawBitReader bit_reader(data, bytes_left);
+ if (!bit_reader.ReadLittleEndian(frame_header_.tile_info.tile_size_bytes,
+ &tile_size)) {
+ LIBGAV1_DLOG(ERROR, "Could not read tile size for tile #%d",
+ tile_number);
+ return false;
+ }
+ ++tile_size;
+ data += frame_header_.tile_info.tile_size_bytes;
+ bytes_left -= frame_header_.tile_info.tile_size_bytes;
+ if (tile_size > bytes_left) {
+ LIBGAV1_DLOG(ERROR, "Invalid tile size %zu for tile #%d", tile_size,
+ tile_number);
+ return false;
+ }
+ } else {
+ tile_size = bytes_left;
+ if (tile_size == 0) {
+ LIBGAV1_DLOG(ERROR, "Invalid tile size %zu for tile #%d", tile_size,
+ tile_number);
+ return false;
+ }
+ }
+ // The memory for this has been allocated in ParseTileInfoSyntax(). So it is
+ // safe to use push_back_unchecked here.
+ tile_buffers_.push_back_unchecked({data, tile_size});
+ data += tile_size;
+ bytes_left -= tile_size;
+ }
+ bit_reader_->SkipBytes(total_size - tg_header_size);
+ return true;
+}
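+// Illustration of the tile size field handled above (hypothetical numbers):
+// with tile_info.tile_size_bytes == 2 and the next two payload bytes being
+// 0x0A 0x00, ReadLittleEndian() yields 10, so after the "+ 1" the tile data
+// occupies the 11 bytes that immediately follow the size field. The last tile
+// in the tile group carries no size field and consumes all remaining bytes.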
+
+bool ObuParser::ParseTileGroup(size_t size, size_t bytes_consumed_so_far) {
+ const TileInfo* const tile_info = &frame_header_.tile_info;
+ const size_t start_offset = bit_reader_->byte_offset();
+ const int tile_bits =
+ tile_info->tile_columns_log2 + tile_info->tile_rows_log2;
+ if (tile_bits == 0) {
+ return AddTileBuffers(0, 0, size, 0, bytes_consumed_so_far);
+ }
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ const auto tile_start_and_end_present_flag = static_cast<bool>(scratch);
+ if (!tile_start_and_end_present_flag) {
+ if (!bit_reader_->AlignToNextByte()) {
+ LIBGAV1_DLOG(ERROR, "Byte alignment has non zero bits.");
+ return false;
+ }
+ return AddTileBuffers(0, tile_info->tile_count - 1, size, 1,
+ bytes_consumed_so_far);
+ }
+ if (obu_headers_.back().type == kObuFrame) {
+ // 6.10.1: If obu_type is equal to OBU_FRAME, it is a requirement of
+ // bitstream conformance that the value of tile_start_and_end_present_flag
+ // is equal to 0.
+ LIBGAV1_DLOG(ERROR,
+ "tile_start_and_end_present_flag must be 0 in Frame OBU");
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(tile_bits);
+ const int start = static_cast<int>(scratch);
+ OBU_READ_LITERAL_OR_FAIL(tile_bits);
+ const int end = static_cast<int>(scratch);
+ if (!bit_reader_->AlignToNextByte()) {
+ LIBGAV1_DLOG(ERROR, "Byte alignment has non zero bits.");
+ return false;
+ }
+ const size_t tg_header_size = bit_reader_->byte_offset() - start_offset;
+ return AddTileBuffers(start, end, size, tg_header_size,
+ bytes_consumed_so_far);
+}
+
+bool ObuParser::ParseHeader() {
+ ObuHeader obu_header;
+ int64_t scratch = bit_reader_->ReadBit();
+ if (scratch != 0) {
+ LIBGAV1_DLOG(ERROR, "forbidden_bit is not zero.");
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(4);
+ obu_header.type = static_cast<libgav1::ObuType>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ const auto extension_flag = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ obu_header.has_size_field = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL; // reserved.
+ if (scratch != 0) {
+ LIBGAV1_DLOG(WARNING, "obu_reserved_1bit is not zero.");
+ }
+ obu_header.has_extension = extension_flag;
+ if (extension_flag) {
+ if (extension_disallowed_) {
+ LIBGAV1_DLOG(ERROR,
+ "OperatingPointIdc is 0, but obu_extension_flag is 1.");
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(3);
+ obu_header.temporal_id = scratch;
+ OBU_READ_LITERAL_OR_FAIL(2);
+ obu_header.spatial_id = scratch;
+ OBU_READ_LITERAL_OR_FAIL(3); // reserved.
+ if (scratch != 0) {
+ LIBGAV1_DLOG(WARNING, "extension_header_reserved_3bits is not zero.");
+ }
+ } else {
+ obu_header.temporal_id = 0;
+ obu_header.spatial_id = 0;
+ }
+ return obu_headers_.push_back(obu_header);
+}
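+// For reference, the fixed-size layout parsed above is:
+//   obu_header:           forbidden_bit (1) | obu_type (4) |
+//                         obu_extension_flag (1) | obu_has_size_field (1) |
+//                         obu_reserved_1bit (1)
+//   obu_extension_header: temporal_id (3) | spatial_id (2) |
+//                         extension_header_reserved_3bits (3)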
+
+#undef OBU_READ_UVLC_OR_FAIL
+#undef OBU_READ_LITERAL_OR_FAIL
+#undef OBU_READ_BIT_OR_FAIL
+#undef OBU_PARSER_FAIL
+#undef OBU_LOG_AND_RETURN_FALSE
+
+bool ObuParser::InitBitReader(const uint8_t* const data, size_t size) {
+ bit_reader_.reset(new (std::nothrow) RawBitReader(data, size));
+ return bit_reader_ != nullptr;
+}
+
+bool ObuParser::HasData() const { return size_ > 0; }
+
+StatusCode ObuParser::ParseOneFrame(RefCountedBufferPtr* const current_frame) {
+ if (data_ == nullptr || size_ == 0) return kStatusInvalidArgument;
+
+ assert(current_frame_ == nullptr);
+ // This is used to release any references held in case of parsing failure.
+ RefCountedBufferPtrCleanup current_frame_cleanup(&current_frame_);
+
+ const uint8_t* data = data_;
+ size_t size = size_;
+
+ // Clear everything except the sequence header.
+ obu_headers_.clear();
+ frame_header_ = {};
+ metadata_ = {};
+ tile_buffers_.clear();
+ next_tile_group_start_ = 0;
+
+ bool parsed_one_full_frame = false;
+ bool seen_frame_header = false;
+ const uint8_t* frame_header = nullptr;
+ size_t frame_header_size_in_bits = 0;
+ while (size > 0 && !parsed_one_full_frame) {
+ if (!InitBitReader(data, size)) {
+ LIBGAV1_DLOG(ERROR, "Failed to initialize bit reader.");
+ return kStatusOutOfMemory;
+ }
+ if (!ParseHeader()) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse OBU Header.");
+ return kStatusBitstreamError;
+ }
+ const ObuHeader& obu_header = obu_headers_.back();
+ if (!obu_header.has_size_field) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "has_size_field is zero. libgav1 does not support such streams.");
+ return kStatusUnimplemented;
+ }
+ const size_t obu_header_size = bit_reader_->byte_offset();
+ size_t obu_size;
+ if (!bit_reader_->ReadUnsignedLeb128(&obu_size)) {
+ LIBGAV1_DLOG(ERROR, "Could not read OBU size.");
+ return kStatusBitstreamError;
+ }
+ const size_t obu_length_size = bit_reader_->byte_offset() - obu_header_size;
+ if (size - bit_reader_->byte_offset() < obu_size) {
+      LIBGAV1_DLOG(ERROR, "Not enough bytes left to parse OBU %zu vs %zu.",
+                   size - bit_reader_->byte_offset(), obu_size);
+ return kStatusBitstreamError;
+ }
+
+ const ObuType obu_type = obu_header.type;
+ if (obu_type != kObuSequenceHeader && obu_type != kObuTemporalDelimiter &&
+ has_sequence_header_ &&
+ sequence_header_.operating_point_idc[operating_point_] != 0 &&
+ obu_header.has_extension &&
+ (!InTemporalLayer(
+ sequence_header_.operating_point_idc[operating_point_],
+ obu_header.temporal_id) ||
+ !InSpatialLayer(sequence_header_.operating_point_idc[operating_point_],
+ obu_header.spatial_id))) {
+ obu_headers_.pop_back();
+ bit_reader_->SkipBytes(obu_size);
+ data += bit_reader_->byte_offset();
+ size -= bit_reader_->byte_offset();
+ continue;
+ }
+
+ const size_t obu_start_position = bit_reader_->bit_offset();
+ // The bit_reader_ is byte aligned after reading obu_header and obu_size.
+ // Therefore the byte offset can be computed as obu_start_position >> 3
+ // below.
+ assert((obu_start_position & 7) == 0);
+ bool obu_skipped = false;
+ switch (obu_type) {
+ case kObuTemporalDelimiter:
+ break;
+ case kObuSequenceHeader:
+ if (!ParseSequenceHeader(seen_frame_header)) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse SequenceHeader OBU.");
+ return kStatusBitstreamError;
+ }
+ if (sequence_header_.color_config.bitdepth > LIBGAV1_MAX_BITDEPTH) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Bitdepth %d is not supported. The maximum bitdepth is %d.",
+ sequence_header_.color_config.bitdepth, LIBGAV1_MAX_BITDEPTH);
+ return kStatusUnimplemented;
+ }
+ break;
+ case kObuFrameHeader:
+ if (seen_frame_header) {
+ LIBGAV1_DLOG(ERROR,
+ "Frame header found but frame header was already seen.");
+ return kStatusBitstreamError;
+ }
+ if (!ParseFrameHeader()) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse FrameHeader OBU.");
+ return kStatusBitstreamError;
+ }
+ frame_header = &data[obu_start_position >> 3];
+ frame_header_size_in_bits =
+ bit_reader_->bit_offset() - obu_start_position;
+ seen_frame_header = true;
+ parsed_one_full_frame = frame_header_.show_existing_frame;
+ break;
+ case kObuRedundantFrameHeader: {
+ if (!seen_frame_header) {
+ LIBGAV1_DLOG(ERROR,
+ "Redundant frame header found but frame header was not "
+ "yet seen.");
+ return kStatusBitstreamError;
+ }
+ const size_t fh_size = (frame_header_size_in_bits + 7) >> 3;
+ if (obu_size < fh_size ||
+ memcmp(frame_header, &data[obu_start_position >> 3], fh_size) !=
+ 0) {
+ LIBGAV1_DLOG(ERROR,
+ "Redundant frame header differs from frame header.");
+ return kStatusBitstreamError;
+ }
+ bit_reader_->SkipBits(frame_header_size_in_bits);
+ break;
+ }
+ case kObuFrame: {
+ const size_t fh_start_offset = bit_reader_->byte_offset();
+ if (seen_frame_header) {
+ LIBGAV1_DLOG(ERROR,
+ "Frame header found but frame header was already seen.");
+ return kStatusBitstreamError;
+ }
+ if (!ParseFrameHeader()) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse FrameHeader in Frame OBU.");
+ return kStatusBitstreamError;
+ }
+ // Section 6.8.2: If obu_type is equal to OBU_FRAME, it is a
+ // requirement of bitstream conformance that show_existing_frame is
+ // equal to 0.
+ if (frame_header_.show_existing_frame) {
+ LIBGAV1_DLOG(ERROR, "Frame OBU cannot set show_existing_frame to 1.");
+ return kStatusBitstreamError;
+ }
+ if (!bit_reader_->AlignToNextByte()) {
+ LIBGAV1_DLOG(ERROR, "Byte alignment has non zero bits.");
+ return kStatusBitstreamError;
+ }
+ const size_t fh_size = bit_reader_->byte_offset() - fh_start_offset;
+ if (fh_size >= obu_size) {
+ LIBGAV1_DLOG(ERROR, "Frame header size (%zu) >= obu_size (%zu).",
+ fh_size, obu_size);
+ return kStatusBitstreamError;
+ }
+ if (!ParseTileGroup(obu_size - fh_size,
+ size_ - size + bit_reader_->byte_offset())) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse TileGroup in Frame OBU.");
+ return kStatusBitstreamError;
+ }
+ parsed_one_full_frame = true;
+ break;
+ }
+ case kObuTileGroup:
+ if (!ParseTileGroup(obu_size,
+ size_ - size + bit_reader_->byte_offset())) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse TileGroup OBU.");
+ return kStatusBitstreamError;
+ }
+ parsed_one_full_frame =
+ (next_tile_group_start_ == frame_header_.tile_info.tile_count);
+ break;
+ case kObuTileList:
+ LIBGAV1_DLOG(ERROR, "Decoding of tile list OBUs is not supported.");
+ return kStatusUnimplemented;
+ case kObuPadding:
+ if (!ParsePadding(&data[obu_start_position >> 3], obu_size)) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse Padding OBU.");
+ return kStatusBitstreamError;
+ }
+ break;
+ case kObuMetadata:
+ if (!ParseMetadata(&data[obu_start_position >> 3], obu_size)) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse Metadata OBU.");
+ return kStatusBitstreamError;
+ }
+ break;
+ default:
+ // Skip reserved OBUs. Section 6.2.2: Reserved units are for future use
+ // and shall be ignored by AV1 decoder.
+ bit_reader_->SkipBytes(obu_size);
+ obu_skipped = true;
+ break;
+ }
+ if (obu_size > 0 && !obu_skipped && obu_type != kObuFrame &&
+ obu_type != kObuTileGroup) {
+ const size_t parsed_obu_size_in_bits =
+ bit_reader_->bit_offset() - obu_start_position;
+ if (obu_size * 8 < parsed_obu_size_in_bits) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Parsed OBU size (%zu bits) is greater than expected OBU size "
+            "(%zu bytes) for obu_type: %d.",
+ parsed_obu_size_in_bits, obu_size, obu_type);
+ return kStatusBitstreamError;
+ }
+ if (!bit_reader_->VerifyAndSkipTrailingBits(obu_size * 8 -
+ parsed_obu_size_in_bits)) {
+ LIBGAV1_DLOG(ERROR,
+ "Error when verifying trailing bits for obu type: %d",
+ obu_type);
+ return kStatusBitstreamError;
+ }
+ }
+ const size_t bytes_consumed = bit_reader_->byte_offset();
+ const size_t consumed_obu_size =
+ bytes_consumed - obu_length_size - obu_header_size;
+ if (consumed_obu_size != obu_size) {
+ LIBGAV1_DLOG(ERROR,
+                   "OBU size (%zu) and consumed size (%zu) do not match for "
+ "obu_type: %d.",
+ obu_size, consumed_obu_size, obu_type);
+ return kStatusBitstreamError;
+ }
+ data += bytes_consumed;
+ size -= bytes_consumed;
+ }
+ if (!parsed_one_full_frame && seen_frame_header) {
+ LIBGAV1_DLOG(ERROR, "The last tile group in the frame was not received.");
+ return kStatusBitstreamError;
+ }
+ data_ = data;
+ size_ = size;
+ *current_frame = std::move(current_frame_);
+ return kStatusOk;
+}
+
+} // namespace libgav1
diff --git a/src/obu_parser.h b/src/obu_parser.h
new file mode 100644
index 0000000..86d165f
--- /dev/null
+++ b/src/obu_parser.h
@@ -0,0 +1,406 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_OBU_PARSER_H_
+#define LIBGAV1_SRC_OBU_PARSER_H_
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "src/buffer_pool.h"
+#include "src/decoder_state.h"
+#include "src/dsp/common.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/status_code.h"
+#include "src/quantizer.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/raw_bit_reader.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/vector.h"
+
+namespace libgav1 {
+
+// structs and enums related to Open Bitstream Units (OBU).
+
+enum {
+ kMinimumMajorBitstreamLevel = 2,
+ kSelectScreenContentTools = 2,
+ kSelectIntegerMv = 2,
+ kLoopRestorationTileSizeMax = 256,
+ kGlobalMotionAlphaBits = 12,
+ kGlobalMotionTranslationBits = 12,
+ kGlobalMotionTranslationOnlyBits = 9,
+ kGlobalMotionAlphaPrecisionBits = 15,
+ kGlobalMotionTranslationPrecisionBits = 6,
+ kGlobalMotionTranslationOnlyPrecisionBits = 3,
+ kMaxTileWidth = 4096,
+ kMaxTileArea = 4096 * 2304,
+ kPrimaryReferenceNone = 7,
+ // A special value of the scalability_mode_idc syntax element that indicates
+ // the picture prediction structure is specified in scalability_structure().
+ kScalabilitySS = 14
+}; // anonymous enum
+
+struct ObuHeader {
+ ObuType type;
+ bool has_extension;
+ bool has_size_field;
+ int8_t temporal_id;
+ int8_t spatial_id;
+};
+
+enum BitstreamProfile : uint8_t {
+ kProfile0,
+ kProfile1,
+ kProfile2,
+ kMaxProfiles
+};
+
+// In the bitstream the level is encoded in five bits: the first three bits
+// encode |major| - 2 and the last two bits encode |minor|.
+//
+// If the mapped level (major.minor) is in the tables in Annex A.3, there are
+// bitstream conformance requirements on the maximum or minimum values of
+// several variables. The encoded value of 31 (which corresponds to the mapped
+// level 9.3) is the "maximum parameters" level and imposes no level-based
+// constraints on the bitstream.
+struct BitStreamLevel {
+ uint8_t major; // Range: 2-9.
+ uint8_t minor; // Range: 0-3.
+};
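+// A minimal sketch of the mapping described above (illustrative only; the
+// hypothetical helper name is not part of the parser):
+//
+//   BitStreamLevel LevelFromSeqLevelIdx(uint8_t seq_level_idx /* 5 bits */) {
+//     BitStreamLevel level;
+//     level.major = (seq_level_idx >> 2) + 2;  // Upper 3 bits store major - 2.
+//     level.minor = seq_level_idx & 3;         // Lower 2 bits store minor.
+//     return level;
+//   }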
+
+struct ColorConfig {
+ int8_t bitdepth;
+ bool is_monochrome;
+ ColorPrimary color_primary;
+ TransferCharacteristics transfer_characteristics;
+ MatrixCoefficients matrix_coefficients;
+ // A binary value (0 or 1) that is associated with the VideoFullRangeFlag
+  // variable specified in ISO/IEC 23091-4/ITU-T H.273.
+ // * 0: the studio swing representation.
+ // * 1: the full swing representation.
+ ColorRange color_range;
+ int8_t subsampling_x;
+ int8_t subsampling_y;
+ ChromaSamplePosition chroma_sample_position;
+ bool separate_uv_delta_q;
+};
+
+struct TimingInfo {
+ uint32_t num_units_in_tick;
+ uint32_t time_scale;
+ bool equal_picture_interval;
+ uint32_t num_ticks_per_picture;
+};
+
+struct DecoderModelInfo {
+ uint8_t encoder_decoder_buffer_delay_length;
+ uint32_t num_units_in_decoding_tick;
+ uint8_t buffer_removal_time_length;
+ uint8_t frame_presentation_time_length;
+};
+
+struct OperatingParameters {
+ uint32_t decoder_buffer_delay[kMaxOperatingPoints];
+ uint32_t encoder_buffer_delay[kMaxOperatingPoints];
+ bool low_delay_mode_flag[kMaxOperatingPoints];
+};
+
+struct ObuSequenceHeader {
+ // Section 7.5:
+ // Within a particular coded video sequence, the contents of
+ // sequence_header_obu must be bit-identical each time the sequence header
+ // appears except for the contents of operating_parameters_info. A new
+ // coded video sequence is required if the sequence header parameters
+ // change.
+ //
+ // IMPORTANT: ParametersChanged() is implemented with a memcmp() call. For
+ // this to work, this object and the |old| object must be initialized with
+ // an empty brace-enclosed list, which initializes any padding to zero bits.
+ // See https://en.cppreference.com/w/cpp/language/zero_initialization.
+ bool ParametersChanged(const ObuSequenceHeader& old) const;
+
+ BitstreamProfile profile;
+ bool still_picture;
+ bool reduced_still_picture_header;
+ int operating_points;
+ int operating_point_idc[kMaxOperatingPoints];
+ BitStreamLevel level[kMaxOperatingPoints];
+ int8_t tier[kMaxOperatingPoints];
+ int8_t frame_width_bits;
+ int8_t frame_height_bits;
+ int32_t max_frame_width;
+ int32_t max_frame_height;
+ bool frame_id_numbers_present;
+ int8_t frame_id_length_bits;
+ int8_t delta_frame_id_length_bits;
+ bool use_128x128_superblock;
+ bool enable_filter_intra;
+ bool enable_intra_edge_filter;
+ bool enable_interintra_compound;
+ bool enable_masked_compound;
+ bool enable_warped_motion;
+ bool enable_dual_filter;
+ bool enable_order_hint;
+ // If enable_order_hint is true, order_hint_bits is in the range [1, 8].
+ // If enable_order_hint is false, order_hint_bits is 0.
+ int8_t order_hint_bits;
+ // order_hint_shift_bits equals (32 - order_hint_bits) % 32.
+ // This is used frequently in GetRelativeDistance().
+ uint8_t order_hint_shift_bits;
+ bool enable_jnt_comp;
+ bool enable_ref_frame_mvs;
+ bool choose_screen_content_tools;
+ int8_t force_screen_content_tools;
+ bool choose_integer_mv;
+ int8_t force_integer_mv;
+ bool enable_superres;
+ bool enable_cdef;
+ bool enable_restoration;
+ ColorConfig color_config;
+ bool timing_info_present_flag;
+ TimingInfo timing_info;
+ bool decoder_model_info_present_flag;
+ DecoderModelInfo decoder_model_info;
+ bool decoder_model_present_for_operating_point[kMaxOperatingPoints];
+ bool initial_display_delay_present_flag;
+ uint8_t initial_display_delay[kMaxOperatingPoints];
+ bool film_grain_params_present;
+
+ // IMPORTANT: the operating_parameters member must be at the end of the
+ // struct so that ParametersChanged() can be implemented with a memcmp()
+ // call.
+ OperatingParameters operating_parameters;
+};
+// Verify it is safe to use offsetof with ObuSequenceHeader and to use memcmp
+// to compare two ObuSequenceHeader objects.
+static_assert(std::is_standard_layout<ObuSequenceHeader>::value, "");
+// Verify operating_parameters is the last member of ObuSequenceHeader. The
+// second assertion assumes that ObuSequenceHeader has no padding after the
+// operating_parameters field. The first assertion is a sufficient condition
+// for ObuSequenceHeader to have no padding after the operating_parameters
+// field.
+static_assert(alignof(ObuSequenceHeader) == alignof(OperatingParameters), "");
+static_assert(sizeof(ObuSequenceHeader) ==
+ offsetof(ObuSequenceHeader, operating_parameters) +
+ sizeof(OperatingParameters),
+ "");
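+// A sketch of how the layout verified above allows ParametersChanged() to be
+// implemented with a single memcmp() (illustrative; shown only to motivate
+// the static_asserts):
+//
+//   bool ObuSequenceHeader::ParametersChanged(
+//       const ObuSequenceHeader& old) const {
+//     return memcmp(this, &old,
+//                   offsetof(ObuSequenceHeader, operating_parameters)) != 0;
+//   }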
+
+struct TileBuffer {
+ const uint8_t* data;
+ size_t size;
+};
+
+enum MetadataType : uint8_t {
+ // 0 is reserved for AOM use.
+ kMetadataTypeHdrContentLightLevel = 1,
+ kMetadataTypeHdrMasteringDisplayColorVolume = 2,
+ kMetadataTypeScalability = 3,
+ kMetadataTypeItutT35 = 4,
+ kMetadataTypeTimecode = 5,
+ // 6-31 are unregistered user private.
+ // 32 and greater are reserved for AOM use.
+};
+
+struct ObuMetadata {
+ // Maximum content light level.
+ uint16_t max_cll;
+ // Maximum frame-average light level.
+ uint16_t max_fall;
+ uint16_t primary_chromaticity_x[3];
+ uint16_t primary_chromaticity_y[3];
+ uint16_t white_point_chromaticity_x;
+ uint16_t white_point_chromaticity_y;
+ uint32_t luminance_max;
+ uint32_t luminance_min;
+ // ITU-T T.35.
+ uint8_t itu_t_t35_country_code;
+ uint8_t itu_t_t35_country_code_extension_byte; // Valid if
+ // itu_t_t35_country_code is
+ // 0xFF.
+ std::unique_ptr<uint8_t[]> itu_t_t35_payload_bytes;
+ size_t itu_t_t35_payload_size;
+};
+
+class ObuParser : public Allocable {
+ public:
+ ObuParser(const uint8_t* const data, size_t size, int operating_point,
+ BufferPool* const buffer_pool, DecoderState* const decoder_state)
+ : data_(data),
+ size_(size),
+ operating_point_(operating_point),
+ buffer_pool_(buffer_pool),
+ decoder_state_(*decoder_state) {}
+
+ // Not copyable or movable.
+ ObuParser(const ObuParser& rhs) = delete;
+ ObuParser& operator=(const ObuParser& rhs) = delete;
+
+ // Returns true if there is more data that needs to be parsed.
+ bool HasData() const;
+
+ // Parses a sequence of Open Bitstream Units until a decodable frame is found
+ // (or until the end of stream is reached). A decodable frame is considered to
+ // be found when one of the following happens:
+ // * A kObuFrame is seen.
+ // * The kObuTileGroup containing the last tile is seen.
+  //   * A kObuFrameHeader with show_existing_frame = true is seen.
+ //
+ // If the parsing is successful, relevant fields will be populated. The fields
+ // are valid only if the return value is kStatusOk. Returns kStatusOk on
+ // success, an error status otherwise. On success, |current_frame| will be
+ // populated with a valid frame buffer.
+ StatusCode ParseOneFrame(RefCountedBufferPtr* current_frame);
+
+ // Getters. Only valid if ParseOneFrame() completes successfully.
+ const Vector<ObuHeader>& obu_headers() const { return obu_headers_; }
+ const ObuSequenceHeader& sequence_header() const { return sequence_header_; }
+ const ObuFrameHeader& frame_header() const { return frame_header_; }
+ const Vector<TileBuffer>& tile_buffers() const { return tile_buffers_; }
+ const ObuMetadata& metadata() const { return metadata_; }
+
+ // Setters.
+ void set_sequence_header(const ObuSequenceHeader& sequence_header) {
+ sequence_header_ = sequence_header;
+ has_sequence_header_ = true;
+ }
+
+ // Moves |tile_buffers_| into |tile_buffers|.
+ void MoveTileBuffers(Vector<TileBuffer>* tile_buffers) {
+ *tile_buffers = std::move(tile_buffers_);
+ }
+
+ private:
+ // Initializes the bit reader. This is a function of its own to make unit
+ // testing of private functions simpler.
+ LIBGAV1_MUST_USE_RESULT bool InitBitReader(const uint8_t* data, size_t size);
+
+ // Parse helper functions.
+ bool ParseHeader(); // 5.3.2 and 5.3.3.
+ bool ParseColorConfig(ObuSequenceHeader* sequence_header); // 5.5.2.
+ bool ParseTimingInfo(ObuSequenceHeader* sequence_header); // 5.5.3.
+ bool ParseDecoderModelInfo(ObuSequenceHeader* sequence_header); // 5.5.4.
+ bool ParseOperatingParameters(ObuSequenceHeader* sequence_header,
+ int index); // 5.5.5.
+ bool ParseSequenceHeader(bool seen_frame_header); // 5.5.1.
+ bool ParseFrameParameters(); // 5.9.2, 5.9.7 and 5.9.10.
+ void MarkInvalidReferenceFrames(); // 5.9.4.
+ bool ParseFrameSizeAndRenderSize(); // 5.9.5 and 5.9.6.
+ bool ParseSuperResParametersAndComputeImageSize(); // 5.9.8 and 5.9.9.
+ // Checks the bitstream conformance requirement in Section 6.8.6.
+ bool ValidateInterFrameSize() const;
+ bool ParseReferenceOrderHint();
+ static int FindLatestBackwardReference(
+ const int current_frame_hint,
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+ const std::array<bool, kNumReferenceFrameTypes>& used_frame);
+ static int FindEarliestBackwardReference(
+ const int current_frame_hint,
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+ const std::array<bool, kNumReferenceFrameTypes>& used_frame);
+ static int FindLatestForwardReference(
+ const int current_frame_hint,
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+ const std::array<bool, kNumReferenceFrameTypes>& used_frame);
+ static int FindReferenceWithSmallestOutputOrder(
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints);
+ bool SetFrameReferences(int8_t last_frame_idx,
+ int8_t gold_frame_idx); // 7.8.
+ bool ParseLoopFilterParameters(); // 5.9.11.
+ bool ParseDeltaQuantizer(int8_t* delta); // 5.9.13.
+ bool ParseQuantizerParameters(); // 5.9.12.
+ bool ParseSegmentationParameters(); // 5.9.14.
+ bool ParseQuantizerIndexDeltaParameters(); // 5.9.17.
+ bool ParseLoopFilterDeltaParameters(); // 5.9.18.
+ void ComputeSegmentLosslessAndQIndex();
+ bool ParseCdefParameters(); // 5.9.19.
+ bool ParseLoopRestorationParameters(); // 5.9.20.
+ bool ParseTxModeSyntax(); // 5.9.21.
+ bool ParseFrameReferenceModeSyntax(); // 5.9.23.
+ // Returns whether skip mode is allowed. When it returns true, it also sets
+ // the frame_header_.skip_mode_frame array.
+ bool IsSkipModeAllowed();
+ bool ParseSkipModeParameters(); // 5.9.22.
+ bool ReadAllowWarpedMotion();
+ bool ParseGlobalParamSyntax(
+ int ref, int index,
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>&
+ prev_global_motions); // 5.9.25.
+ bool ParseGlobalMotionParameters(); // 5.9.24.
+ bool ParseFilmGrainParameters(); // 5.9.30.
+ bool ParseTileInfoSyntax(); // 5.9.15.
+ bool ParseFrameHeader(); // 5.9.
+ // |data| and |size| specify the payload data of the padding OBU.
+ // NOTE: Although the payload data is available in the bit_reader_ member,
+ // it is also passed to ParsePadding() as function parameters so that
+ // ParsePadding() can find the trailing bit of the OBU and skip over the
+ // payload data as an opaque chunk of data.
+ bool ParsePadding(const uint8_t* data, size_t size); // 5.7.
+ bool ParseMetadataScalability(); // 5.8.5 and 5.8.6.
+ bool ParseMetadataTimecode(); // 5.8.7.
+ // |data| and |size| specify the payload data of the metadata OBU.
+ // NOTE: Although the payload data is available in the bit_reader_ member,
+ // it is also passed to ParseMetadata() as function parameters so that
+ // ParseMetadata() can find the trailing bit of the OBU and either extract
+ // or skip over the payload data as an opaque chunk of data.
+ bool ParseMetadata(const uint8_t* data, size_t size); // 5.8.
+ // Adds and populates the TileBuffer for each tile in the tile group and
+  // updates |next_tile_group_start_|.
+ bool AddTileBuffers(int start, int end, size_t total_size,
+ size_t tg_header_size, size_t bytes_consumed_so_far);
+ bool ParseTileGroup(size_t size, size_t bytes_consumed_so_far); // 5.11.1.
+
+ // Parser elements.
+ std::unique_ptr<RawBitReader> bit_reader_;
+ const uint8_t* data_;
+ size_t size_;
+ const int operating_point_;
+
+ // OBU elements. Only valid if ParseOneFrame() completes successfully.
+ Vector<ObuHeader> obu_headers_;
+ ObuSequenceHeader sequence_header_ = {};
+ ObuFrameHeader frame_header_ = {};
+ Vector<TileBuffer> tile_buffers_;
+ ObuMetadata metadata_ = {};
+ // The expected starting tile number of the next Tile Group.
+ int next_tile_group_start_ = 0;
+ // If true, the sequence_header_ field is valid.
+ bool has_sequence_header_ = false;
+ // If true, the obu_extension_flag syntax element in the OBU header must be
+ // 0. Set to true when parsing a sequence header if OperatingPointIdc is 0.
+ bool extension_disallowed_ = false;
+
+ BufferPool* const buffer_pool_;
+ DecoderState& decoder_state_;
+ // Used by ParseOneFrame() to populate the current frame that is being
+ // decoded. The invariant maintained is that this variable will be nullptr at
+ // the beginning and at the end of each call to ParseOneFrame(). This ensures
+ // that the ObuParser is not holding on to any references to the current
+ // frame once the ParseOneFrame() call is complete.
+ RefCountedBufferPtr current_frame_;
+
+ // For unit testing private functions.
+ friend class ObuParserTest;
+};
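+// A usage sketch (illustrative; assumes |buffer_pool| and |decoder_state|
+// were created by the caller):
+//
+//   ObuParser parser(data, size, /*operating_point=*/0, &buffer_pool,
+//                    &decoder_state);
+//   while (parser.HasData()) {
+//     RefCountedBufferPtr current_frame;
+//     const StatusCode status = parser.ParseOneFrame(&current_frame);
+//     if (status != kStatusOk) break;
+//     // parser.frame_header(), parser.tile_buffers() etc. are now valid.
+//   }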
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_OBU_PARSER_H_
diff --git a/src/post_filter.h b/src/post_filter.h
new file mode 100644
index 0000000..800d51d
--- /dev/null
+++ b/src/post_filter.h
@@ -0,0 +1,565 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_POST_FILTER_H_
+#define LIBGAV1_SRC_POST_FILTER_H_
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/loop_restoration_info.h"
+#include "src/obu_parser.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/threadpool.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+// This class applies in-loop filtering for each frame after it is
+// reconstructed. The in-loop filtering contains all post processing filtering
+// for the reconstructed frame, including deblock filter, CDEF, superres,
+// and loop restoration.
+// Historically, for example in libaom, loop filter refers to deblock filter.
+// To avoid name conflicts, we call this class PostFilter (post processing).
+// In-loop post filtering order is:
+// deblock --> CDEF --> super resolution --> loop restoration.
+// When CDEF and super resolution are not used, deblocking and restoration can
+// be combined so that the frame buffer is only filtered once.
+class PostFilter {
+ public:
+ // This class does not take ownership of the masks/restoration_info, but it
+ // may change their values.
+ //
+ // The overall flow of data in this class (for both single and multi-threaded
+ // cases) is as follows:
+ // -> Input: |frame_buffer_|.
+ // -> Initialize |source_buffer_|, |cdef_buffer_|, |superres_buffer_| and
+ // |loop_restoration_buffer_|.
+ // -> Deblocking:
+ // * Input: |source_buffer_|
+ // * Output: |source_buffer_|
+ // -> CDEF:
+ // * Input: |source_buffer_|
+ // * Output: |cdef_buffer_|
+ // -> SuperRes:
+ // * Input: |cdef_buffer_|
+ // * Output: |superres_buffer_|
+ // -> Loop Restoration:
+ // * Input: |superres_buffer_|
+ // * Output: |loop_restoration_buffer_|.
+ // -> Now |frame_buffer_| contains the filtered frame.
+ PostFilter(const ObuFrameHeader& frame_header,
+ const ObuSequenceHeader& sequence_header,
+ FrameScratchBuffer* frame_scratch_buffer, YuvBuffer* frame_buffer,
+ const dsp::Dsp* dsp, int do_post_filter_mask);
+
+  // Not copyable or movable.
+ PostFilter(const PostFilter&) = delete;
+ PostFilter& operator=(const PostFilter&) = delete;
+ PostFilter(PostFilter&&) = delete;
+ PostFilter& operator=(PostFilter&&) = delete;
+
+ // The overall function that applies all post processing filtering with
+ // multiple threads.
+ // * The filtering order is:
+  //   deblock --> CDEF --> super resolution --> loop restoration.
+ // * The output of each filter is the input for the following filter. A
+ // special case is that loop restoration needs a few rows of the deblocked
+ // frame and the entire cdef filtered frame:
+ // deblock --> CDEF --> super resolution --> loop restoration.
+ // | ^
+ // | |
+ // -----------> super resolution -----
+ // * Any of these filters could be present or absent.
+ // * |frame_buffer_| points to the decoded frame buffer. When
+ // ApplyFilteringThreaded() is called, |frame_buffer_| is modified by each
+ // of the filters as described below.
+ // Filter behavior (multi-threaded):
+ // * Deblock: In-place filtering. The output is written to |source_buffer_|.
+ // If cdef and loop restoration are both on, then 4 rows (as
+ // specified by |kLoopRestorationBorderRows|) in every 64x64 block
+ // is copied into |loop_restoration_border_|.
+  //            are copied into |loop_restoration_border_|.
+ // the input and the output is written into |cdef_buffer_| (which is
+ // the same as |source_buffer_|).
+ // * SuperRes: Near in-place filtering. Uses the |cdef_buffer_| and
+ // |superres_line_buffer_| as the input and the output is written
+ // into |superres_buffer_| (which is just |cdef_buffer_| with a
+ // shift to the top).
+ // * Restoration: Near in-place filtering.
+ // Uses the |superres_buffer_| and |loop_restoration_border_|
+ // as the input and the output is written into
+ // |loop_restoration_buffer_| (which is just |superres_buffer_|
+ // with a shift to the left).
+ void ApplyFilteringThreaded();
+
+ // Does the overall post processing filter for one superblock row starting at
+ // |row4x4| with height 4*|sb4x4|. If |do_deblock| is false, deblocking filter
+ // will not be applied.
+ //
+ // Filter behavior (single-threaded):
+ // * Deblock: In-place filtering. The output is written to |source_buffer_|.
+ // If cdef and loop restoration are both on, then 4 rows (as
+ // specified by |kLoopRestorationBorderRows|) in every 64x64 block
+  //            are copied into |loop_restoration_border_|.
+ // * Cdef: In-place filtering. The output is written into |cdef_buffer_|
+ // (which is just |source_buffer_| with a shift to the top-left).
+ // * SuperRes: Near in-place filtering. Uses the |cdef_buffer_| as the input
+ // and the output is written into |superres_buffer_| (which is
+ // just |cdef_buffer_| with a shift to the top).
+ // * Restoration: Near in-place filtering.
+ // Uses the |superres_buffer_| and |loop_restoration_border_|
+ // as the input and the output is written into
+ // |loop_restoration_buffer_| (which is just |superres_buffer_|
+ // with a shift to the left or top-left).
+ // Returns the index of the last row whose post processing is complete and can
+ // be used for referencing.
+ int ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4, bool is_last_row,
+ bool do_deblock);
+
+ // Apply deblocking filter in one direction (specified by |loop_filter_type|)
+ // for the superblock row starting at |row4x4_start| for columns starting from
+ // |column4x4_start| in increments of 16 (or 8 for chroma with subsampling)
+ // until the smallest multiple of 16 that is >= |column4x4_end| or until
+ // |frame_header_.columns4x4|, whichever is lower. This function must be
+ // called only if |DoDeblock()| returns true.
+ void ApplyDeblockFilter(LoopFilterType loop_filter_type, int row4x4_start,
+ int column4x4_start, int column4x4_end, int sb4x4);
+
+ static bool DoCdef(const ObuFrameHeader& frame_header,
+ int do_post_filter_mask) {
+ return (frame_header.cdef.bits > 0 ||
+ frame_header.cdef.y_primary_strength[0] > 0 ||
+ frame_header.cdef.y_secondary_strength[0] > 0 ||
+ frame_header.cdef.uv_primary_strength[0] > 0 ||
+ frame_header.cdef.uv_secondary_strength[0] > 0) &&
+ (do_post_filter_mask & 0x02) != 0;
+ }
+ bool DoCdef() const { return DoCdef(frame_header_, do_post_filter_mask_); }
+  // If the filter levels for the Y plane (0 for vertical, 1 for horizontal)
+  // are all zero, the deblock filter will not be applied.
+ static bool DoDeblock(const ObuFrameHeader& frame_header,
+ uint8_t do_post_filter_mask) {
+ return (frame_header.loop_filter.level[0] > 0 ||
+ frame_header.loop_filter.level[1] > 0) &&
+ (do_post_filter_mask & 0x01) != 0;
+ }
+ bool DoDeblock() const {
+ return DoDeblock(frame_header_, do_post_filter_mask_);
+ }
+
+ uint8_t GetZeroDeltaDeblockFilterLevel(int segment_id, int level_index,
+ ReferenceFrameType type,
+ int mode_id) const {
+ return deblock_filter_levels_[segment_id][level_index][type][mode_id];
+ }
+ // Computes the deblock filter levels using |delta_lf| and stores them in
+ // |deblock_filter_levels|.
+ void ComputeDeblockFilterLevels(
+ const int8_t delta_lf[kFrameLfCount],
+ uint8_t deblock_filter_levels[kMaxSegments][kFrameLfCount]
+ [kNumReferenceFrameTypes][2]) const;
+ // Returns true if loop restoration will be performed for the given parameters
+ // and mask.
+ static bool DoRestoration(const LoopRestoration& loop_restoration,
+ uint8_t do_post_filter_mask, int num_planes) {
+ if (num_planes == kMaxPlanesMonochrome) {
+ return loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone &&
+ (do_post_filter_mask & 0x08) != 0;
+ }
+ return (loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone ||
+ loop_restoration.type[kPlaneU] != kLoopRestorationTypeNone ||
+ loop_restoration.type[kPlaneV] != kLoopRestorationTypeNone) &&
+ (do_post_filter_mask & 0x08) != 0;
+ }
+ bool DoRestoration() const {
+ return DoRestoration(loop_restoration_, do_post_filter_mask_, planes_);
+ }
+
+ // Returns a pointer to the unfiltered buffer. This is used by the Tile class
+ // to determine where to write the output of the tile decoding process taking
+ // in-place filtering offsets into consideration.
+ uint8_t* GetUnfilteredBuffer(int plane) { return source_buffer_[plane]; }
+ const YuvBuffer& frame_buffer() const { return frame_buffer_; }
+
+ // Returns true if SuperRes will be performed for the given frame header and
+ // mask.
+ static bool DoSuperRes(const ObuFrameHeader& frame_header,
+ uint8_t do_post_filter_mask) {
+ return frame_header.width != frame_header.upscaled_width &&
+ (do_post_filter_mask & 0x04) != 0;
+ }
+ bool DoSuperRes() const {
+ return DoSuperRes(frame_header_, do_post_filter_mask_);
+ }
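+  // To summarize the checks above, |do_post_filter_mask_| gates the filters
+  // with one bit each: 0x01 deblocking, 0x02 CDEF, 0x04 SuperRes and
+  // 0x08 loop restoration.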
+ LoopRestorationInfo* restoration_info() const { return restoration_info_; }
+ uint8_t* GetBufferOffset(uint8_t* base_buffer, int stride, Plane plane,
+ int row, int column) const {
+ return base_buffer + (row >> subsampling_y_[plane]) * stride +
+ ((column >> subsampling_x_[plane]) << pixel_size_log2_);
+ }
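+  // Example of the offset arithmetic above (hypothetical values): for a
+  // 10-bit chroma plane (pixel_size_log2_ == 1, subsampling 1 in both
+  // directions), row 8 and column 16 map to
+  // base_buffer + (8 >> 1) * stride + ((16 >> 1) << 1) bytes, i.e. 4 chroma
+  // rows down and 16 bytes (8 pixels) across.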
+ uint8_t* GetSourceBuffer(Plane plane, int row4x4, int column4x4) const {
+ return GetBufferOffset(source_buffer_[plane], frame_buffer_.stride(plane),
+ plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
+ }
+ uint8_t* GetCdefBuffer(Plane plane, int row4x4, int column4x4) const {
+ return GetBufferOffset(cdef_buffer_[plane], frame_buffer_.stride(plane),
+ plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
+ }
+ uint8_t* GetSuperResBuffer(Plane plane, int row4x4, int column4x4) const {
+ return GetBufferOffset(superres_buffer_[plane], frame_buffer_.stride(plane),
+ plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
+ }
+
+ template <typename Pixel>
+ static void ExtendFrame(Pixel* frame_start, int width, int height,
+ ptrdiff_t stride, int left, int right, int top,
+ int bottom);
+
+ private:
+ // The type of the HorizontalDeblockFilter and VerticalDeblockFilter member
+ // functions.
+ using DeblockFilter = void (PostFilter::*)(int row4x4_start,
+ int column4x4_start);
+ // The lookup table for picking the deblock filter, according to deblock
+ // filter type.
+ const DeblockFilter deblock_filter_func_[2] = {
+ &PostFilter::VerticalDeblockFilter, &PostFilter::HorizontalDeblockFilter};
+
+ // Functions common to all post filters.
+
+  // Extends the frame by setting each border pixel to the value of the
+  // closest pixel on the frame boundary.
+ void ExtendFrameBoundary(uint8_t* frame_start, int width, int height,
+ ptrdiff_t stride, int left, int right, int top,
+ int bottom) const;
+ // Extend frame boundary for referencing if the frame will be saved as a
+ // reference frame.
+ void ExtendBordersForReferenceFrame();
+ // Copies the deblocked pixels needed for loop restoration.
+ void CopyDeblockedPixels(Plane plane, int row4x4);
+ // Copies the border for one superblock row. If |for_loop_restoration| is
+ // true, then it assumes that the border extension is being performed for the
+ // input of the loop restoration process. If |for_loop_restoration| is false,
+ // then it assumes that the border extension is being performed for using the
+ // current frame as a reference frame. In this case, |progress_row_| is also
+ // updated.
+ void CopyBordersForOneSuperBlockRow(int row4x4, int sb4x4,
+ bool for_loop_restoration);
+ // Sets up the |loop_restoration_border_| for loop restoration.
+ // TODO(linfengz): Unify duplicates in the following two functions if
+ // possible.
+ // This is called when there is no CDEF filter. We copy rows from
+ // |superres_buffer_| and do the line extension.
+ void SetupLoopRestorationBorder(int row4x4_start);
+ // This is called when there is CDEF filter. We copy rows from
+ // |source_buffer_|, apply superres and do the line extension.
+ void SetupLoopRestorationBorder(int row4x4_start, int sb4x4);
+  // Returns true if we can perform border extension in loop (i.e., without
+  // waiting until the entire frame is decoded). If intra block copy is
+  // allowed, we do in-loop border extension only if the upscaled_width is the
+  // same as 4 * columns4x4. Otherwise, we cannot do in-loop border extension
+  // since those pixels may be used by intra block copy.
+ bool DoBorderExtensionInLoop() const {
+ return !frame_header_.allow_intrabc ||
+ frame_header_.upscaled_width ==
+ MultiplyBy4(frame_header_.columns4x4);
+ }
+ template <typename Pixel>
+ void CopyPlane(const Pixel* src, ptrdiff_t src_stride, int width, int height,
+ Pixel* dst, ptrdiff_t dst_stride) {
+ assert(height > 0);
+ do {
+ memcpy(dst, src, width * sizeof(Pixel));
+ src += src_stride;
+ dst += dst_stride;
+ } while (--height != 0);
+ }
+
+ // Worker function used for multi-threaded implementation of Deblocking, CDEF
+ // and Loop Restoration.
+ using WorkerFunction = void (PostFilter::*)(std::atomic<int>* row4x4_atomic);
+ // Schedules |worker| jobs to the |thread_pool_|, runs them in the calling
+ // thread and returns once all the jobs are completed.
+ void RunJobs(WorkerFunction worker);
+
+ // Functions for the Deblocking filter.
+
+ static int GetIndex(int row4x4) { return DivideBy4(row4x4); }
+ static int GetShift(int row4x4, int column4x4) {
+ return ((row4x4 & 3) << 4) | column4x4;
+ }
+ int GetDeblockUnitId(int row_unit, int column_unit) const {
+ return row_unit * num_64x64_blocks_per_row_ + column_unit;
+ }
+ bool GetHorizontalDeblockFilterEdgeInfo(int row4x4, int column4x4,
+ uint8_t* level, int* step,
+ int* filter_length) const;
+ void GetHorizontalDeblockFilterEdgeInfoUV(int row4x4, int column4x4,
+ uint8_t* level_u, uint8_t* level_v,
+ int* step,
+ int* filter_length) const;
+ bool GetVerticalDeblockFilterEdgeInfo(int row4x4, int column4x4,
+ BlockParameters* const* bp_ptr,
+ uint8_t* level, int* step,
+ int* filter_length) const;
+ void GetVerticalDeblockFilterEdgeInfoUV(int column4x4,
+ BlockParameters* const* bp_ptr,
+ uint8_t* level_u, uint8_t* level_v,
+ int* step, int* filter_length) const;
+ void HorizontalDeblockFilter(int row4x4_start, int column4x4_start);
+ void VerticalDeblockFilter(int row4x4_start, int column4x4_start);
+ // HorizontalDeblockFilter and VerticalDeblockFilter must have the correct
+ // signature.
+ static_assert(std::is_same<decltype(&PostFilter::HorizontalDeblockFilter),
+ DeblockFilter>::value,
+ "");
+ static_assert(std::is_same<decltype(&PostFilter::VerticalDeblockFilter),
+ DeblockFilter>::value,
+ "");
+ // Applies deblock filtering for the superblock row starting at |row4x4| with
+ // a height of 4*|sb4x4|.
+ void ApplyDeblockFilterForOneSuperBlockRow(int row4x4, int sb4x4);
+ // Worker function used for multi-threaded deblocking.
+ template <LoopFilterType loop_filter_type>
+ void DeblockFilterWorker(std::atomic<int>* row4x4_atomic);
+ static_assert(
+ std::is_same<
+ decltype(&PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>),
+ WorkerFunction>::value,
+ "");
+ static_assert(
+ std::is_same<
+ decltype(&PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>),
+ WorkerFunction>::value,
+ "");
+
+ // Functions for the cdef filter.
+
+ // Copies the deblocked pixels necessary for use by the multi-threaded cdef
+ // implementation into |cdef_border_|.
+ void SetupCdefBorder(int row4x4);
+ // This function prepares the input source block for cdef filtering. The input
+ // source block contains a 12x12 block, with the inner 8x8 as the desired
+ // filter region. It pads the block if the 12x12 block includes out of frame
+  // filter region. If the 12x12 block includes out-of-frame pixels, they are
+  // padded with a large value. This achieves the required behavior defined in
+ template <typename Pixel>
+ void PrepareCdefBlock(int block_width4x4, int block_height4x4, int row4x4,
+ int column4x4, uint16_t* cdef_source,
+ ptrdiff_t cdef_stride, bool y_plane,
+ const uint8_t border_columns[kMaxPlanes][256],
+ bool use_border_columns);
+ // Applies cdef for one 64x64 block.
+ template <typename Pixel>
+ void ApplyCdefForOneUnit(uint16_t* cdef_block, int index, int block_width4x4,
+ int block_height4x4, int row4x4_start,
+ int column4x4_start,
+ uint8_t border_columns[2][kMaxPlanes][256],
+ bool use_border_columns[2][2]);
+ // Helper function used by ApplyCdefForOneSuperBlockRow to avoid some code
+ // duplication.
+ void ApplyCdefForOneSuperBlockRowHelper(
+ uint16_t* cdef_block, uint8_t border_columns[2][kMaxPlanes][256],
+ int row4x4, int block_height4x4);
+ // Applies CDEF filtering for the superblock row starting at |row4x4| with a
+ // height of 4*|sb4x4|.
+ void ApplyCdefForOneSuperBlockRow(int row4x4, int sb4x4, bool is_last_row);
+ // Worker function used for multi-threaded CDEF.
+ void ApplyCdefWorker(std::atomic<int>* row4x4_atomic);
+ static_assert(std::is_same<decltype(&PostFilter::ApplyCdefWorker),
+ WorkerFunction>::value,
+ "");
+
+ // Functions for the SuperRes filter.
+
+  // Applies super resolution to |src| for |rows[plane]| rows of each plane.
+  // If |line_buffer_row| is larger than or equal to 0, one more row will be
+  // processed, and the line buffer indicated by |line_buffer_row| will be
+  // used as its source.
+ void ApplySuperRes(
+ const std::array<uint8_t*, kMaxPlanes>& src,
+ const std::array<int, kMaxPlanes>& rows, int line_buffer_row,
+ const std::array<uint8_t*, kMaxPlanes>& dst); // Section 7.16.
+ // Applies SuperRes for the superblock row starting at |row4x4| with a height
+ // of 4*|sb4x4|.
+ void ApplySuperResForOneSuperBlockRow(int row4x4, int sb4x4,
+ bool is_last_row);
+ void ApplySuperResThreaded();
+
+ // Functions for the Loop Restoration filter.
+
+ // Notes about Loop Restoration:
+  // (1). The loop restoration processing unit size defaults to 64x64.
+  //      Only when the remaining filtering area is smaller than 64x64 is the
+  //      processing unit size the actual area size.
+ // For U/V plane, it is (64 >> subsampling_x) x (64 >> subsampling_y).
+ // (2). Loop restoration unit size can be 64x64, 128x128, 256x256 for Y
+ // plane. The unit size for chroma can be the same or half, depending on
+ // subsampling. If either subsampling_x or subsampling_y is one, unit size
+ // is halved on both x and y sides.
+ // All loop restoration units have the same size for one plane.
+ // One loop restoration unit could contain multiple processing units.
+ // But they share the same sets of loop restoration parameters.
+ // (3). Loop restoration has a row offset, kRestorationUnitOffset = 8. The
+  //      size of the first row of loop restoration units and processing units
+  //      is shrunk by the offset.
+ // (4). Loop restoration units wrap the bottom and the right of the frame,
+  //      if the remaining area is small. The criterion is whether the number of
+ // remaining rows/columns is smaller than half of loop restoration unit
+ // size.
+ // For example, if the frame size is 140x140, loop restoration unit size is
+ // 128x128. The size of the first loop restoration unit is 128x(128-8) =
+ // 128 columns x 120 rows.
+  // Since 140 - 120 < 128/2, the remaining 20 rows will be folded into the
+  // loop restoration unit. Similarly, the remaining 12 columns will also be
+  // folded into the current loop restoration unit. So, even though the frame
+  // size is 140x140, there is only one loop restoration unit. If the
+  // processing unit is 64x64,
+ // then sizes of the first row of processing units are 64x56, 64x56, 12x56,
+ // respectively. The second row is 64x64, 64x64, 12x64.
+ // The third row is 64x20, 64x20, 12x20.
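+  // One way to express the folding rule from (4) is the following sketch,
+  // assuming the count_units_in_frame() formula from the specification
+  // (illustrative only):
+  //
+  //   int CountRestorationUnits(int frame_size, int unit_size) {
+  //     // Remainders smaller than half a unit are folded into the last unit.
+  //     return std::max(1, (frame_size + (unit_size >> 1)) / unit_size);
+  //   }
+  //
+  //   // 140-pixel frame, 128-pixel units: (140 + 64) / 128 == 1 unit.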
+
+ // |stride| is shared by |src_buffer| and |dst_buffer|.
+ template <typename Pixel>
+ void ApplyLoopRestorationForOneRow(const Pixel* src_buffer, ptrdiff_t stride,
+ Plane plane, int plane_height,
+ int plane_width, int y, int unit_row,
+ int current_process_unit_height,
+ int plane_unit_size, Pixel* dst_buffer);
+ // Applies loop restoration for the superblock row starting at |row4x4_start|
+ // with a height of 4*|sb4x4|.
+ template <typename Pixel>
+ void ApplyLoopRestorationForOneSuperBlockRow(int row4x4_start, int sb4x4);
+ // Helper function that calls the right variant of
+ // ApplyLoopRestorationForOneSuperBlockRow based on the bitdepth.
+ void ApplyLoopRestoration(int row4x4_start, int sb4x4);
+ // Worker function used for multithreaded Loop Restoration.
+ void ApplyLoopRestorationWorker(std::atomic<int>* row4x4_atomic);
+ static_assert(std::is_same<decltype(&PostFilter::ApplyLoopRestorationWorker),
+ WorkerFunction>::value,
+ "");
+
+ const ObuFrameHeader& frame_header_;
+ const LoopRestoration& loop_restoration_;
+ const dsp::Dsp& dsp_;
+ const int num_64x64_blocks_per_row_;
+ const int upscaled_width_;
+ const int width_;
+ const int height_;
+ const int8_t bitdepth_;
+ const int8_t subsampling_x_[kMaxPlanes];
+ const int8_t subsampling_y_[kMaxPlanes];
+ const int8_t planes_;
+ const int pixel_size_log2_;
+ const uint8_t* const inner_thresh_;
+ const uint8_t* const outer_thresh_;
+ const bool needs_chroma_deblock_;
+ // This stores the deblocking filter levels assuming that the delta is zero.
+ // This will be used by all superblocks whose delta is zero (without having to
+ // recompute them). The dimensions (in order) are: segment_id, level_index
+ // (based on plane and direction), reference_frame and mode_id.
+ uint8_t deblock_filter_levels_[kMaxSegments][kFrameLfCount]
+ [kNumReferenceFrameTypes][2];
+ // Stores the SuperRes info for the frame.
+ struct {
+ int upscaled_width;
+ int initial_subpixel_x;
+ int step;
+ } super_res_info_[kMaxPlanes];
+ const Array2D<int16_t>& cdef_index_;
+ const Array2D<TransformSize>& inter_transform_sizes_;
+ LoopRestorationInfo* const restoration_info_;
+ uint8_t* const superres_coefficients_[kNumPlaneTypes];
+ // Line buffer used by multi-threaded ApplySuperRes().
+ // In the multi-threaded case, this buffer will store the last downscaled row
+ // input of each thread to avoid overwrites by the first upscaled row output
+ // of the thread below it.
+ YuvBuffer& superres_line_buffer_;
+ const BlockParametersHolder& block_parameters_;
+ // Frame buffer to hold cdef filtered frame.
+ YuvBuffer cdef_filtered_buffer_;
+ // Input frame buffer.
+ YuvBuffer& frame_buffer_;
+ // A view into |frame_buffer_| that points to the input and output of the
+ // deblocking process.
+ uint8_t* source_buffer_[kMaxPlanes];
+ // A view into |frame_buffer_| that points to the output of the CDEF filtered
+ // planes (to facilitate in-place CDEF filtering).
+ uint8_t* cdef_buffer_[kMaxPlanes];
+ // A view into |frame_buffer_| that points to the planes after the SuperRes
+ // filter is applied (to facilitate in-place SuperRes).
+ uint8_t* superres_buffer_[kMaxPlanes];
+ // A view into |frame_buffer_| that points to the output of the Loop Restored
+ // planes (to facilitate in-place Loop Restoration).
+ uint8_t* loop_restoration_buffer_[kMaxPlanes];
+ YuvBuffer& cdef_border_;
+ // Buffer used to store the border pixels that are necessary for loop
+ // restoration. This buffer will store 4 rows for every 64x64 block (4 rows
+ // for every 32x32 for chroma with subsampling). The indices of the rows that
+ // are stored are specified in |kLoopRestorationBorderRows|. First 4 rows of
+ // this buffer are never populated and never used.
+ // This buffer is used only when both of the following conditions are true:
+ // (1). Loop Restoration is on.
+ // (2). Cdef is on, or multi-threading is enabled for post filter.
+ YuvBuffer& loop_restoration_border_;
+ const uint8_t do_post_filter_mask_;
+ ThreadPool* const thread_pool_;
+
+ // Tracks the progress of the post filters.
+ int progress_row_ = -1;
+
+ // A block buffer to hold the input that is converted to uint16_t before
+ // cdef filtering. Only used in single threaded case. Y plane is processed
+ // separately. U and V planes are processed together. So it is sufficient to
+ // have this buffer to accommodate 2 planes at a time.
+ uint16_t cdef_block_[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 2];
+
+ template <int bitdepth, typename Pixel>
+ friend class PostFilterSuperResTest;
+
+ template <int bitdepth, typename Pixel>
+ friend class PostFilterHelperFuncTest;
+};
+
+extern template void PostFilter::ExtendFrame<uint8_t>(uint8_t* frame_start,
+ int width, int height,
+ ptrdiff_t stride,
+ int left, int right,
+ int top, int bottom);
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+extern template void PostFilter::ExtendFrame<uint16_t>(uint16_t* frame_start,
+ int width, int height,
+ ptrdiff_t stride,
+ int left, int right,
+ int top, int bottom);
+#endif
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_POST_FILTER_H_
diff --git a/src/post_filter/cdef.cc b/src/post_filter/cdef.cc
new file mode 100644
index 0000000..994f448
--- /dev/null
+++ b/src/post_filter/cdef.cc
@@ -0,0 +1,660 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "src/post_filter.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kStep64x64 = 16; // =64/4.
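+// Bit 3 of the stored luma filtering direction marks a skipped block (see
+// |direction_y| in ApplyCdefForOneUnit()).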
+constexpr int kCdefSkip = 8;
+
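+// Maps the luma filtering direction to the direction used for the chroma
+// planes, indexed by [subsampling_x][subsampling_y][luma_direction].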
+constexpr uint8_t kCdefUvDirection[2][2][8] = {
+ {{0, 1, 2, 3, 4, 5, 6, 7}, {1, 2, 2, 2, 3, 4, 6, 0}},
+ {{7, 0, 2, 4, 5, 6, 6, 6}, {0, 1, 2, 3, 4, 5, 6, 7}}};
+
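+// Rows of each 64-pixel-high row of the frame (32 for chroma with vertical
+// subsampling) that are saved into |cdef_border_| by SetupCdefBorder(): the
+// first two and the last two rows. Indexed by [subsampling_y][i].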
+constexpr int kCdefBorderRows[2][4] = {{0, 1, 62, 63}, {0, 1, 30, 31}};
+
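+// Copies one row of the cdef input into |dst| (always uint16_t). The left
+// |kCdefBorder| columns are set to kCdefLargeValue at the left frame edge,
+// and are otherwise copied from |left_border| when it is provided or from the
+// frame. The columns beyond |block_width| (up to unit_width + kCdefBorder)
+// are set to kCdefLargeValue at the right frame edge and copied from the
+// frame otherwise. When Pixel is already 16 bits wide the copies are done
+// with memcpy; otherwise each pixel is widened to uint16_t.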
+template <typename Pixel>
+void CopyRowForCdef(const Pixel* src, int block_width, int unit_width,
+ bool is_frame_left, bool is_frame_right,
+ uint16_t* const dst, const Pixel* left_border = nullptr) {
+ if (sizeof(src[0]) == sizeof(dst[0])) {
+ if (is_frame_left) {
+ Memset(dst - kCdefBorder, kCdefLargeValue, kCdefBorder);
+ } else if (left_border == nullptr) {
+ memcpy(dst - kCdefBorder, src - kCdefBorder,
+ kCdefBorder * sizeof(dst[0]));
+ } else {
+ memcpy(dst - kCdefBorder, left_border, kCdefBorder * sizeof(dst[0]));
+ }
+ memcpy(dst, src, block_width * sizeof(dst[0]));
+ if (is_frame_right) {
+ Memset(dst + block_width, kCdefLargeValue,
+ unit_width + kCdefBorder - block_width);
+ } else {
+ memcpy(dst + block_width, src + block_width,
+ (unit_width + kCdefBorder - block_width) * sizeof(dst[0]));
+ }
+ return;
+ }
+ if (is_frame_left) {
+ for (int x = -kCdefBorder; x < 0; ++x) {
+ dst[x] = static_cast<uint16_t>(kCdefLargeValue);
+ }
+ } else if (left_border == nullptr) {
+ for (int x = -kCdefBorder; x < 0; ++x) {
+ dst[x] = src[x];
+ }
+ } else {
+ for (int x = -kCdefBorder; x < 0; ++x) {
+ dst[x] = left_border[x + kCdefBorder];
+ }
+ }
+ for (int x = 0; x < block_width; ++x) {
+ dst[x] = src[x];
+ }
+ for (int x = block_width; x < unit_width + kCdefBorder; ++x) {
+ dst[x] = is_frame_right ? static_cast<uint16_t>(kCdefLargeValue) : src[x];
+ }
+}
+
+// For |height| rows, copy |width| pixels of size |pixel_size| from |src| to
+// |dst|.
+void CopyPixels(const uint8_t* src, int src_stride, uint8_t* dst,
+ int dst_stride, int width, int height, size_t pixel_size) {
+ int y = height;
+ do {
+ memcpy(dst, src, width * pixel_size);
+ src += src_stride;
+ dst += dst_stride;
+ } while (--y != 0);
+}
+
+} // namespace
+
+void PostFilter::SetupCdefBorder(int row4x4) {
+ assert(row4x4 >= 0);
+ assert(DoCdef());
+ int plane = kPlaneY;
+ do {
+ const ptrdiff_t src_stride = frame_buffer_.stride(plane);
+ const ptrdiff_t dst_stride = cdef_border_.stride(plane);
+ const int row_offset = DivideBy4(row4x4);
+ const int num_pixels = SubsampledValue(
+ MultiplyBy4(frame_header_.columns4x4), subsampling_x_[plane]);
+ const int row_width = num_pixels << pixel_size_log2_;
+ const int plane_height = SubsampledValue(MultiplyBy4(frame_header_.rows4x4),
+ subsampling_y_[plane]);
+ for (int i = 0; i < 4; ++i) {
+ const int row = kCdefBorderRows[subsampling_y_[plane]][i];
+ const int absolute_row =
+ (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+ if (absolute_row >= plane_height) break;
+ const uint8_t* src =
+ GetSourceBuffer(static_cast<Plane>(plane), row4x4, 0) +
+ row * src_stride;
+ uint8_t* dst = cdef_border_.data(plane) + dst_stride * (row_offset + i);
+ memcpy(dst, src, row_width);
+ }
+ } while (++plane < planes_);
+}
+
+template <typename Pixel>
+void PostFilter::PrepareCdefBlock(int block_width4x4, int block_height4x4,
+ int row4x4, int column4x4,
+ uint16_t* cdef_source, ptrdiff_t cdef_stride,
+ const bool y_plane,
+ const uint8_t border_columns[kMaxPlanes][256],
+ bool use_border_columns) {
+ assert(y_plane || planes_ == kMaxPlanes);
+ const int max_planes = y_plane ? 1 : kMaxPlanes;
+ const int8_t subsampling_x = y_plane ? 0 : subsampling_x_[kPlaneU];
+ const int8_t subsampling_y = y_plane ? 0 : subsampling_y_[kPlaneU];
+ const int start_x = MultiplyBy4(column4x4) >> subsampling_x;
+ const int start_y = MultiplyBy4(row4x4) >> subsampling_y;
+ const int plane_width = SubsampledValue(width_, subsampling_x);
+ const int plane_height = SubsampledValue(height_, subsampling_y);
+ const int block_width = MultiplyBy4(block_width4x4) >> subsampling_x;
+ const int block_height = MultiplyBy4(block_height4x4) >> subsampling_y;
+  // |unit_width| and |unit_height| are the same as |block_width| and
+  // |block_height| unless the block reaches the frame boundary, where
+  // block_width < 64 or block_height < 64. |unit_width| and |unit_height|
+  // guarantee that we operate on multiples of 8 pixels (4 for chroma with
+  // subsampling).
+ const int unit_width = Align(block_width, 8 >> subsampling_x);
+ const int unit_height = Align(block_height, 8 >> subsampling_y);
+ const bool is_frame_left = column4x4 == 0;
+ const bool is_frame_right = start_x + block_width >= plane_width;
+ const bool is_frame_top = row4x4 == 0;
+ const bool is_frame_bottom = start_y + block_height >= plane_height;
+ const int y_offset = is_frame_top ? 0 : kCdefBorder;
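+  // |cdef_border_| stores 4 saved rows per 64-pixel-high row of the frame
+  // (see SetupCdefBorder()). For a block that is not at the top of the frame,
+  // the two rows directly above it are the last two rows saved for the
+  // previous 64-pixel-high row, hence the offset of -2.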
+ const int cdef_border_row_offset = DivideBy4(row4x4) - (is_frame_top ? 0 : 2);
+
+ for (int plane = y_plane ? kPlaneY : kPlaneU; plane < max_planes; ++plane) {
+ uint16_t* cdef_src = cdef_source + static_cast<int>(plane == kPlaneV) *
+ kCdefUnitSizeWithBorders *
+ kCdefUnitSizeWithBorders;
+ const int src_stride = frame_buffer_.stride(plane) / sizeof(Pixel);
+ const Pixel* src_buffer =
+ reinterpret_cast<const Pixel*>(source_buffer_[plane]) +
+ (start_y - y_offset) * src_stride + start_x;
+ const int cdef_border_stride = cdef_border_.stride(plane) / sizeof(Pixel);
+ const Pixel* cdef_border =
+ (thread_pool_ == nullptr)
+ ? nullptr
+ : reinterpret_cast<const Pixel*>(cdef_border_.data(plane)) +
+ cdef_border_row_offset * cdef_border_stride + start_x;
+
+ // All the copying code will use negative indices for populating the left
+ // border. So the starting point is set to kCdefBorder.
+ cdef_src += kCdefBorder;
+
+    // Copy the top 2 rows as follows:
+    // If is_frame_top is true, both rows are set to kCdefLargeValue.
+ // Otherwise:
+ // If multi-threaded filtering is off, the rows are copied from
+ // |src_buffer|.
+ // Otherwise, the rows are copied from |cdef_border|.
+ if (is_frame_top) {
+ for (int y = 0; y < kCdefBorder; ++y) {
+ Memset(cdef_src - kCdefBorder, kCdefLargeValue,
+ unit_width + 2 * kCdefBorder);
+ cdef_src += cdef_stride;
+ }
+ } else {
+ const Pixel* top_border =
+ (thread_pool_ == nullptr) ? src_buffer : cdef_border;
+ const int top_border_stride =
+ (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
+ for (int y = 0; y < kCdefBorder; ++y) {
+ CopyRowForCdef(top_border, block_width, unit_width, is_frame_left,
+ is_frame_right, cdef_src);
+ top_border += top_border_stride;
+ cdef_src += cdef_stride;
+ // We need to increment |src_buffer| and |cdef_border| in this loop to
+ // set them up for the subsequent loops below.
+ src_buffer += src_stride;
+ cdef_border += cdef_border_stride;
+ }
+ }
+
+    // Copy the body as follows:
+ // If multi-threaded filtering is off or if is_frame_bottom is true, all the
+ // rows are copied from |src_buffer|.
+ // Otherwise, the first |block_height|-kCdefBorder rows are copied from
+    // |src_buffer| and the last kCdefBorder rows are copied from |cdef_border|.
+ int y = block_height;
+ const int y_threshold =
+ (thread_pool_ == nullptr || is_frame_bottom) ? 0 : kCdefBorder;
+ const Pixel* left_border =
+ (thread_pool_ == nullptr || !use_border_columns)
+ ? nullptr
+ : reinterpret_cast<const Pixel*>(border_columns[plane]);
+ do {
+ CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
+ is_frame_right, cdef_src, left_border);
+ cdef_src += cdef_stride;
+ src_buffer += src_stride;
+ if (left_border != nullptr) left_border += kCdefBorder;
+ } while (--y != y_threshold);
+
+ if (y > 0) {
+ assert(y == kCdefBorder);
+ // |cdef_border| now points to the top 2 rows of the current block. For
+ // the next loop, we need it to point to the bottom 2 rows of the
+ // current block. So increment it by 2 rows.
+ cdef_border += MultiplyBy2(cdef_border_stride);
+ for (int i = 0; i < kCdefBorder; ++i) {
+ CopyRowForCdef(cdef_border, block_width, unit_width, is_frame_left,
+ is_frame_right, cdef_src);
+ cdef_src += cdef_stride;
+ cdef_border += cdef_border_stride;
+ }
+ }
+
+    // Copy the bottom 2 rows as follows:
+    // If is_frame_bottom is true, both rows are set to kCdefLargeValue.
+ // Otherwise:
+ // If multi-threaded filtering is off, the rows are copied from
+ // |src_buffer|.
+ // Otherwise, the rows are copied from |cdef_border|.
+ y = 0;
+ if (is_frame_bottom) {
+ do {
+ Memset(cdef_src - kCdefBorder, kCdefLargeValue,
+ unit_width + 2 * kCdefBorder);
+ cdef_src += cdef_stride;
+ } while (++y < kCdefBorder + unit_height - block_height);
+ } else {
+ const Pixel* bottom_border =
+ (thread_pool_ == nullptr) ? src_buffer : cdef_border;
+ const int bottom_border_stride =
+ (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
+ do {
+ CopyRowForCdef(bottom_border, block_width, unit_width, is_frame_left,
+ is_frame_right, cdef_src);
+ bottom_border += bottom_border_stride;
+ cdef_src += cdef_stride;
+ } while (++y < kCdefBorder + unit_height - block_height);
+ }
+ }
+}
+
+template <typename Pixel>
+void PostFilter::ApplyCdefForOneUnit(uint16_t* cdef_block, const int index,
+ const int block_width4x4,
+ const int block_height4x4,
+ const int row4x4_start,
+ const int column4x4_start,
+ uint8_t border_columns[2][kMaxPlanes][256],
+ bool use_border_columns[2][2]) {
+ // Cdef operates in 8x8 blocks (4x4 for chroma with subsampling).
+ static constexpr int kStep = 8;
+ static constexpr int kStep4x4 = 2;
+
+ int cdef_buffer_row_base_stride[kMaxPlanes];
+ uint8_t* cdef_buffer_row_base[kMaxPlanes];
+ int src_buffer_row_base_stride[kMaxPlanes];
+ const uint8_t* src_buffer_row_base[kMaxPlanes];
+ const uint16_t* cdef_src_row_base[kMaxPlanes];
+ int cdef_src_row_base_stride[kMaxPlanes];
+ int column_step[kMaxPlanes];
+ assert(planes_ >= 1);
+ int plane = kPlaneY;
+ do {
+ cdef_buffer_row_base[plane] =
+ GetCdefBuffer(static_cast<Plane>(plane), row4x4_start, column4x4_start);
+ cdef_buffer_row_base_stride[plane] =
+ frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
+ src_buffer_row_base[plane] = GetSourceBuffer(static_cast<Plane>(plane),
+ row4x4_start, column4x4_start);
+ src_buffer_row_base_stride[plane] =
+ frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
+ cdef_src_row_base[plane] =
+ cdef_block +
+ static_cast<int>(plane == kPlaneV) * kCdefUnitSizeWithBorders *
+ kCdefUnitSizeWithBorders +
+ kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder;
+ cdef_src_row_base_stride[plane] =
+ kCdefUnitSizeWithBorders * (kStep >> subsampling_y_[plane]);
+ column_step[plane] = (kStep >> subsampling_x_[plane]) * sizeof(Pixel);
+ } while (++plane < planes_);
+
+ // |border_columns| contains two buffers. In each call to this function, we
+ // will use one of them as the "destination" for the current call. And the
+ // other one as the "source" for the current call (which would have been the
+ // "destination" of the previous call). We will use the src_index to populate
+ // the borders which were backed up in the previous call. We will use the
+ // dst_index to populate the borders to be used in the next call.
+ const int border_columns_src_index = DivideBy16(column4x4_start) & 1;
+ const int border_columns_dst_index = border_columns_src_index ^ 1;
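+  // |column4x4_start| advances by 16 (one 64x64 unit) between consecutive
+  // calls, so DivideBy16(column4x4_start) & 1 alternates and the two buffers
+  // swap roles on every call.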
+
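+  // A cdef index of -1 means that this 64x64 unit is not cdef filtered; the
+  // input is copied through unchanged (the copy is only done when running
+  // single-threaded).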
+ if (index == -1) {
+ if (thread_pool_ == nullptr) {
+ int plane = kPlaneY;
+ do {
+ CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
+ cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
+ MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
+ MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+ sizeof(Pixel));
+ } while (++plane < planes_);
+ }
+ use_border_columns[border_columns_dst_index][0] = false;
+ use_border_columns[border_columns_dst_index][1] = false;
+ return;
+ }
+
+ const bool is_frame_right =
+ MultiplyBy4(column4x4_start) + MultiplyBy4(block_width4x4) >= width_;
+ if (!is_frame_right && thread_pool_ != nullptr) {
+ // Backup the last 2 columns for use in the next iteration.
+ use_border_columns[border_columns_dst_index][0] = true;
+ const uint8_t* src_line =
+ GetSourceBuffer(kPlaneY, row4x4_start,
+ column4x4_start + block_width4x4) -
+ kCdefBorder * sizeof(Pixel);
+ CopyPixels(src_line, frame_buffer_.stride(kPlaneY),
+ border_columns[border_columns_dst_index][kPlaneY],
+ kCdefBorder * sizeof(Pixel), kCdefBorder,
+ MultiplyBy4(block_height4x4), sizeof(Pixel));
+ }
+
+ PrepareCdefBlock<Pixel>(
+ block_width4x4, block_height4x4, row4x4_start, column4x4_start,
+ cdef_block, kCdefUnitSizeWithBorders, true,
+ (border_columns != nullptr) ? border_columns[border_columns_src_index]
+ : nullptr,
+ use_border_columns[border_columns_src_index][0]);
+
+ // Stored direction used during the u/v pass. If bit 3 is set, then block is
+ // a skip.
+ uint8_t direction_y[8 * 8];
+ int y_index = 0;
+
+ const uint8_t y_primary_strength =
+ frame_header_.cdef.y_primary_strength[index];
+ const uint8_t y_secondary_strength =
+ frame_header_.cdef.y_secondary_strength[index];
+  // The y strength index is 0 when both the primary and secondary strengths
+  // are non-zero, 1 for primary only and 2 for secondary only. Only the
+  // secondary bit is known here; the primary bit is ORed in below, after the
+  // primary strength has been adjusted by the variance.
+ int y_strength_index = static_cast<int>(y_secondary_strength == 0);
+
+ const bool compute_direction_and_variance =
+ (y_primary_strength | frame_header_.cdef.uv_primary_strength[index]) != 0;
+ BlockParameters* const* bp_row0_base =
+ block_parameters_.Address(row4x4_start, column4x4_start);
+ BlockParameters* const* bp_row1_base =
+ bp_row0_base + block_parameters_.columns4x4();
+ const int bp_stride = MultiplyBy2(block_parameters_.columns4x4());
+ int row4x4 = row4x4_start;
+ do {
+ uint8_t* cdef_buffer_base = cdef_buffer_row_base[kPlaneY];
+ const uint8_t* src_buffer_base = src_buffer_row_base[kPlaneY];
+ const uint16_t* cdef_src_base = cdef_src_row_base[kPlaneY];
+ BlockParameters* const* bp0 = bp_row0_base;
+ BlockParameters* const* bp1 = bp_row1_base;
+ int column4x4 = column4x4_start;
+ do {
+ const int block_width = kStep;
+ const int block_height = kStep;
+ const int cdef_stride = frame_buffer_.stride(kPlaneY);
+ uint8_t* const cdef_buffer = cdef_buffer_base;
+ const uint16_t* const cdef_src = cdef_src_base;
+ const int src_stride = frame_buffer_.stride(kPlaneY);
+ const uint8_t* const src_buffer = src_buffer_base;
+
+ const bool skip = (*bp0)->skip && (*(bp0 + 1))->skip && (*bp1)->skip &&
+ (*(bp1 + 1))->skip;
+
+ if (skip) { // No cdef filtering.
+ direction_y[y_index] = kCdefSkip;
+ if (thread_pool_ == nullptr) {
+ CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+ block_width, block_height, sizeof(Pixel));
+ }
+ } else {
+ // Zero out residual skip flag.
+ direction_y[y_index] = 0;
+
+ int variance = 0;
+ if (compute_direction_and_variance) {
+ if (thread_pool_ == nullptr ||
+ row4x4 + kStep4x4 < row4x4_start + block_height4x4) {
+ dsp_.cdef_direction(src_buffer, src_stride, &direction_y[y_index],
+ &variance);
+ } else if (sizeof(Pixel) == 2) {
+ dsp_.cdef_direction(cdef_src, kCdefUnitSizeWithBorders * 2,
+ &direction_y[y_index], &variance);
+ } else {
+ // If we are in the last row4x4 for this unit, then the last two
+ // input rows have to come from |cdef_border_|. Since we already
+ // have |cdef_src| populated correctly, use that as the input
+ // for the direction process.
+ uint8_t direction_src[8][8];
+ const uint16_t* cdef_src_line = cdef_src;
+ for (auto& direction_src_line : direction_src) {
+ for (int i = 0; i < 8; ++i) {
+ direction_src_line[i] = cdef_src_line[i];
+ }
+ cdef_src_line += kCdefUnitSizeWithBorders;
+ }
+ dsp_.cdef_direction(direction_src, 8, &direction_y[y_index],
+ &variance);
+ }
+ }
+ const int direction =
+ (y_primary_strength == 0) ? 0 : direction_y[y_index];
+ const int variance_strength =
+ ((variance >> 6) != 0) ? std::min(FloorLog2(variance >> 6), 12) : 0;
+ const uint8_t primary_strength =
+ (variance != 0)
+ ? (y_primary_strength * (4 + variance_strength) + 8) >> 4
+ : 0;
+ if ((primary_strength | y_secondary_strength) == 0) {
+ if (thread_pool_ == nullptr) {
+ CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+ block_width, block_height, sizeof(Pixel));
+ }
+ } else {
+ const int strength_index =
+ y_strength_index | (static_cast<int>(primary_strength == 0) << 1);
+ dsp_.cdef_filters[1][strength_index](
+ cdef_src, kCdefUnitSizeWithBorders, block_height,
+ primary_strength, y_secondary_strength,
+ frame_header_.cdef.damping, direction, cdef_buffer, cdef_stride);
+ }
+ }
+ cdef_buffer_base += column_step[kPlaneY];
+ src_buffer_base += column_step[kPlaneY];
+ cdef_src_base += column_step[kPlaneY] / sizeof(Pixel);
+
+ bp0 += kStep4x4;
+ bp1 += kStep4x4;
+ column4x4 += kStep4x4;
+ y_index++;
+ } while (column4x4 < column4x4_start + block_width4x4);
+
+ cdef_buffer_row_base[kPlaneY] += cdef_buffer_row_base_stride[kPlaneY];
+ src_buffer_row_base[kPlaneY] += src_buffer_row_base_stride[kPlaneY];
+ cdef_src_row_base[kPlaneY] += cdef_src_row_base_stride[kPlaneY];
+ bp_row0_base += bp_stride;
+ bp_row1_base += bp_stride;
+ row4x4 += kStep4x4;
+ } while (row4x4 < row4x4_start + block_height4x4);
+
+ if (planes_ == kMaxPlanesMonochrome) {
+ return;
+ }
+
+ const uint8_t uv_primary_strength =
+ frame_header_.cdef.uv_primary_strength[index];
+ const uint8_t uv_secondary_strength =
+ frame_header_.cdef.uv_secondary_strength[index];
+
+ if ((uv_primary_strength | uv_secondary_strength) == 0) {
+ if (thread_pool_ == nullptr) {
+ for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+ CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
+ cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
+ MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
+ MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+ sizeof(Pixel));
+ }
+ }
+ use_border_columns[border_columns_dst_index][1] = false;
+ return;
+ }
+
+ if (!is_frame_right && thread_pool_ != nullptr) {
+ use_border_columns[border_columns_dst_index][1] = true;
+ for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+ // Backup the last 2 columns for use in the next iteration.
+ const uint8_t* src_line =
+ GetSourceBuffer(static_cast<Plane>(plane), row4x4_start,
+ column4x4_start + block_width4x4) -
+ kCdefBorder * sizeof(Pixel);
+ CopyPixels(src_line, frame_buffer_.stride(plane),
+ border_columns[border_columns_dst_index][plane],
+ kCdefBorder * sizeof(Pixel), kCdefBorder,
+ MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+ sizeof(Pixel));
+ }
+ }
+
+ PrepareCdefBlock<Pixel>(
+ block_width4x4, block_height4x4, row4x4_start, column4x4_start,
+ cdef_block, kCdefUnitSizeWithBorders, false,
+ (border_columns != nullptr) ? border_columns[border_columns_src_index]
+ : nullptr,
+ use_border_columns[border_columns_src_index][1]);
+
+ // uv_strength_index is 0 for both primary and secondary strengths being
+ // non-zero, 1 for primary only, 2 for secondary only.
+ const int uv_strength_index =
+ (static_cast<int>(uv_primary_strength == 0) << 1) |
+ static_cast<int>(uv_secondary_strength == 0);
+ for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+ const int8_t subsampling_x = subsampling_x_[plane];
+ const int8_t subsampling_y = subsampling_y_[plane];
+ const int block_width = kStep >> subsampling_x;
+ const int block_height = kStep >> subsampling_y;
+ int row4x4 = row4x4_start;
+
+ y_index = 0;
+ do {
+ uint8_t* cdef_buffer_base = cdef_buffer_row_base[plane];
+ const uint8_t* src_buffer_base = src_buffer_row_base[plane];
+ const uint16_t* cdef_src_base = cdef_src_row_base[plane];
+ int column4x4 = column4x4_start;
+ do {
+ const int cdef_stride = frame_buffer_.stride(plane);
+ uint8_t* const cdef_buffer = cdef_buffer_base;
+ const int src_stride = frame_buffer_.stride(plane);
+ const uint8_t* const src_buffer = src_buffer_base;
+ const uint16_t* const cdef_src = cdef_src_base;
+ const bool skip = (direction_y[y_index] & kCdefSkip) != 0;
+ int dual_cdef = 0;
+
+ if (skip) { // No cdef filtering.
+ if (thread_pool_ == nullptr) {
+ CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+ block_width, block_height, sizeof(Pixel));
+ }
+ } else {
+ // Make sure block pair is not out of bounds.
+ if (column4x4 + (kStep4x4 * 2) <= column4x4_start + block_width4x4) {
+ // Enable dual processing if subsampling_x is 1.
+ dual_cdef = subsampling_x;
+ }
+
+ int direction = (uv_primary_strength == 0)
+ ? 0
+ : kCdefUvDirection[subsampling_x][subsampling_y]
+ [direction_y[y_index]];
+
+ if (dual_cdef != 0) {
+ if (uv_primary_strength &&
+ direction_y[y_index] != direction_y[y_index + 1]) {
+ // Disable dual processing if the second block of the pair does
+ // not have the same direction.
+ dual_cdef = 0;
+ }
+
+ // Disable dual processing if the second block of the pair is a
+ // skip.
+ if (direction_y[y_index + 1] == kCdefSkip) {
+ dual_cdef = 0;
+ }
+ }
+
+ // Block width is 8 if either dual_cdef is true or subsampling_x == 0.
+ const int width_index = dual_cdef | (subsampling_x ^ 1);
+ dsp_.cdef_filters[width_index][uv_strength_index](
+ cdef_src, kCdefUnitSizeWithBorders, block_height,
+ uv_primary_strength, uv_secondary_strength,
+ frame_header_.cdef.damping - 1, direction, cdef_buffer,
+ cdef_stride);
+ }
+ // When dual_cdef is set, the above cdef_filter() will process 2 blocks,
+ // so adjust the pointers and indexes for 2 blocks.
+ cdef_buffer_base += column_step[plane] << dual_cdef;
+ src_buffer_base += column_step[plane] << dual_cdef;
+ cdef_src_base += (column_step[plane] / sizeof(Pixel)) << dual_cdef;
+ column4x4 += kStep4x4 << dual_cdef;
+ y_index += 1 << dual_cdef;
+ } while (column4x4 < column4x4_start + block_width4x4);
+
+ cdef_buffer_row_base[plane] += cdef_buffer_row_base_stride[plane];
+ src_buffer_row_base[plane] += src_buffer_row_base_stride[plane];
+ cdef_src_row_base[plane] += cdef_src_row_base_stride[plane];
+ row4x4 += kStep4x4;
+ } while (row4x4 < row4x4_start + block_height4x4);
+ }
+}
+
+void PostFilter::ApplyCdefForOneSuperBlockRowHelper(
+ uint16_t* cdef_block, uint8_t border_columns[2][kMaxPlanes][256],
+ int row4x4, int block_height4x4) {
+ bool use_border_columns[2][2] = {};
+ for (int column4x4 = 0; column4x4 < frame_header_.columns4x4;
+ column4x4 += kStep64x64) {
+ const int index = cdef_index_[DivideBy16(row4x4)][DivideBy16(column4x4)];
+ const int block_width4x4 =
+ std::min(kStep64x64, frame_header_.columns4x4 - column4x4);
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ ApplyCdefForOneUnit<uint16_t>(cdef_block, index, block_width4x4,
+ block_height4x4, row4x4, column4x4,
+ border_columns, use_border_columns);
+ continue;
+ }
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ ApplyCdefForOneUnit<uint8_t>(cdef_block, index, block_width4x4,
+ block_height4x4, row4x4, column4x4,
+ border_columns, use_border_columns);
+ }
+}
+
+void PostFilter::ApplyCdefForOneSuperBlockRow(int row4x4_start, int sb4x4,
+ bool is_last_row) {
+ assert(row4x4_start >= 0);
+ assert(DoCdef());
+ for (int y = 0; y < sb4x4; y += kStep64x64) {
+ const int row4x4 = row4x4_start + y;
+ if (row4x4 >= frame_header_.rows4x4) return;
+
+ // Apply cdef for the last 8 rows of the previous superblock row.
+ // One exception: If the superblock size is 128x128 and is_last_row is true,
+ // then we simply apply cdef for the entire superblock row without any lag.
+ // In that case, apply cdef for the previous superblock row only during the
+ // first iteration (y == 0).
+ if (row4x4 > 0 && (!is_last_row || y == 0)) {
+ assert(row4x4 >= 16);
+ ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4 - 2, 2);
+ }
+
+ // Apply cdef for the current superblock row. If this is the last superblock
+ // row we apply cdef for all the rows, otherwise we leave out the last 8
+ // rows.
+ const int block_height4x4 =
+ std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
+ const int height4x4 = block_height4x4 - (is_last_row ? 0 : 2);
+ if (height4x4 > 0) {
+ ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4,
+ height4x4);
+ }
+ }
+}
+
+void PostFilter::ApplyCdefWorker(std::atomic<int>* row4x4_atomic) {
+ int row4x4;
+ uint16_t cdef_block[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 2];
+ // Each border_column buffer has to store 64 rows and 2 columns for each
+ // plane. For 10bit, that is 64*2*2 = 256 bytes.
+ alignas(kMaxAlignment) uint8_t border_columns[2][kMaxPlanes][256];
+ while ((row4x4 = row4x4_atomic->fetch_add(
+ kStep64x64, std::memory_order_relaxed)) < frame_header_.rows4x4) {
+ const int block_height4x4 =
+ std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
+ ApplyCdefForOneSuperBlockRowHelper(cdef_block, border_columns, row4x4,
+ block_height4x4);
+ }
+}
+
+} // namespace libgav1
diff --git a/src/post_filter/deblock.cc b/src/post_filter/deblock.cc
new file mode 100644
index 0000000..9b5ed0f
--- /dev/null
+++ b/src/post_filter/deblock.cc
@@ -0,0 +1,523 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <atomic>
+
+#include "src/post_filter.h"
+
+namespace libgav1 {
+namespace {
+
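+// High edge variance threshold passed to the loop filters: level >> 4.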
+constexpr uint8_t HevThresh(int level) { return DivideBy16(level); }
+
+// GetLoopFilterSize* functions depend on this exact ordering of the
+// LoopFilterSize enums.
+static_assert(dsp::kLoopFilterSize4 == 0, "");
+static_assert(dsp::kLoopFilterSize6 == 1, "");
+static_assert(dsp::kLoopFilterSize8 == 2, "");
+static_assert(dsp::kLoopFilterSize14 == 3, "");
+
+dsp::LoopFilterSize GetLoopFilterSizeY(int filter_length) {
+ // |filter_length| must be a power of 2.
+ assert((filter_length & (filter_length - 1)) == 0);
+ // This code is the branch free equivalent of:
+ // if (filter_length == 4) return kLoopFilterSize4;
+ // if (filter_length == 8) return kLoopFilterSize8;
+ // return kLoopFilterSize14;
+ return static_cast<dsp::LoopFilterSize>(
+ MultiplyBy2(static_cast<int>(filter_length > 4)) +
+ static_cast<int>(filter_length > 8));
+}
+
+constexpr dsp::LoopFilterSize GetLoopFilterSizeUV(int filter_length) {
+ // For U & V planes, size is kLoopFilterSize4 if |filter_length| is 4,
+ // otherwise size is kLoopFilterSize6.
+ return static_cast<dsp::LoopFilterSize>(filter_length != 4);
+}
+
+bool NonBlockBorderNeedsFilter(const BlockParameters& bp, int filter_id,
+ uint8_t* const level) {
+ if (bp.deblock_filter_level[filter_id] == 0 || (bp.skip && bp.is_inter)) {
+ return false;
+ }
+ *level = bp.deblock_filter_level[filter_id];
+ return true;
+}
+
+// 7.14.5.
+void ComputeDeblockFilterLevelsHelper(
+ const ObuFrameHeader& frame_header, int segment_id, int level_index,
+ const int8_t delta_lf[kFrameLfCount],
+ uint8_t deblock_filter_levels[kNumReferenceFrameTypes][2]) {
+ const int delta = delta_lf[frame_header.delta_lf.multi ? level_index : 0];
+ uint8_t level = Clip3(frame_header.loop_filter.level[level_index] + delta, 0,
+ kMaxLoopFilterValue);
+ const auto feature = static_cast<SegmentFeature>(
+ kSegmentFeatureLoopFilterYVertical + level_index);
+ level =
+ Clip3(level + frame_header.segmentation.feature_data[segment_id][feature],
+ 0, kMaxLoopFilterValue);
+ if (!frame_header.loop_filter.delta_enabled) {
+ static_assert(sizeof(deblock_filter_levels[0][0]) == 1, "");
+ memset(deblock_filter_levels, level, kNumReferenceFrameTypes * 2);
+ return;
+ }
+ assert(frame_header.loop_filter.delta_enabled);
+ const int shift = level >> 5;
+ deblock_filter_levels[kReferenceFrameIntra][0] = Clip3(
+ level +
+ LeftShift(frame_header.loop_filter.ref_deltas[kReferenceFrameIntra],
+ shift),
+ 0, kMaxLoopFilterValue);
+ // deblock_filter_levels[kReferenceFrameIntra][1] is never used. So it does
+ // not have to be populated.
+ for (int reference_frame = kReferenceFrameIntra + 1;
+ reference_frame < kNumReferenceFrameTypes; ++reference_frame) {
+ for (int mode_id = 0; mode_id < 2; ++mode_id) {
+ deblock_filter_levels[reference_frame][mode_id] = Clip3(
+ level +
+ LeftShift(frame_header.loop_filter.ref_deltas[reference_frame] +
+ frame_header.loop_filter.mode_deltas[mode_id],
+ shift),
+ 0, kMaxLoopFilterValue);
+ }
+ }
+}
+
+} // namespace
+
+void PostFilter::ComputeDeblockFilterLevels(
+ const int8_t delta_lf[kFrameLfCount],
+ uint8_t deblock_filter_levels[kMaxSegments][kFrameLfCount]
+ [kNumReferenceFrameTypes][2]) const {
+ if (!DoDeblock()) return;
+ for (int segment_id = 0;
+ segment_id < (frame_header_.segmentation.enabled ? kMaxSegments : 1);
+ ++segment_id) {
+ int level_index = 0;
+ for (; level_index < 2; ++level_index) {
+ ComputeDeblockFilterLevelsHelper(
+ frame_header_, segment_id, level_index, delta_lf,
+ deblock_filter_levels[segment_id][level_index]);
+ }
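+    // The U (level_index 2) and V (level_index 3) filter levels are computed
+    // only when the corresponding loop filter level is non-zero.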
+ for (; level_index < kFrameLfCount; ++level_index) {
+ if (frame_header_.loop_filter.level[level_index] != 0) {
+ ComputeDeblockFilterLevelsHelper(
+ frame_header_, segment_id, level_index, delta_lf,
+ deblock_filter_levels[segment_id][level_index]);
+ }
+ }
+ }
+}
+
+bool PostFilter::GetHorizontalDeblockFilterEdgeInfo(int row4x4, int column4x4,
+ uint8_t* level, int* step,
+ int* filter_length) const {
+ *step = kTransformHeight[inter_transform_sizes_[row4x4][column4x4]];
+ if (row4x4 == 0) return false;
+
+ const BlockParameters* bp = block_parameters_.Find(row4x4, column4x4);
+ const int row4x4_prev = row4x4 - 1;
+ assert(row4x4_prev >= 0);
+ const BlockParameters* bp_prev =
+ block_parameters_.Find(row4x4_prev, column4x4);
+
+ if (bp == bp_prev) {
+ // Not a border.
+ if (!NonBlockBorderNeedsFilter(*bp, 1, level)) return false;
+ } else {
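+    // It is a border. Use this block's level and fall back to the level of
+    // the block above when it is zero.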
+ const uint8_t level_this = bp->deblock_filter_level[1];
+ *level = level_this;
+ if (level_this == 0) {
+ const uint8_t level_prev = bp_prev->deblock_filter_level[1];
+ if (level_prev == 0) return false;
+ *level = level_prev;
+ }
+ }
+ const int step_prev =
+ kTransformHeight[inter_transform_sizes_[row4x4_prev][column4x4]];
+ *filter_length = std::min(*step, step_prev);
+ return true;
+}
+
+void PostFilter::GetHorizontalDeblockFilterEdgeInfoUV(
+ int row4x4, int column4x4, uint8_t* level_u, uint8_t* level_v, int* step,
+ int* filter_length) const {
+ const int subsampling_x = subsampling_x_[kPlaneU];
+ const int subsampling_y = subsampling_y_[kPlaneU];
+ row4x4 = GetDeblockPosition(row4x4, subsampling_y);
+ column4x4 = GetDeblockPosition(column4x4, subsampling_x);
+ const BlockParameters* bp = block_parameters_.Find(row4x4, column4x4);
+ *level_u = 0;
+ *level_v = 0;
+ *step = kTransformHeight[bp->uv_transform_size];
+ if (row4x4 == subsampling_y) {
+ return;
+ }
+
+ bool need_filter_u = frame_header_.loop_filter.level[kPlaneU + 1] != 0;
+ bool need_filter_v = frame_header_.loop_filter.level[kPlaneV + 1] != 0;
+ assert(need_filter_u || need_filter_v);
+ const int filter_id_u =
+ kDeblockFilterLevelIndex[kPlaneU][kLoopFilterTypeHorizontal];
+ const int filter_id_v =
+ kDeblockFilterLevelIndex[kPlaneV][kLoopFilterTypeHorizontal];
+ const int row4x4_prev = row4x4 - (1 << subsampling_y);
+ assert(row4x4_prev >= 0);
+ const BlockParameters* bp_prev =
+ block_parameters_.Find(row4x4_prev, column4x4);
+
+ if (bp == bp_prev) {
+ // Not a border.
+ const bool skip = bp->skip && bp->is_inter;
+ need_filter_u =
+ need_filter_u && bp->deblock_filter_level[filter_id_u] != 0 && !skip;
+ need_filter_v =
+ need_filter_v && bp->deblock_filter_level[filter_id_v] != 0 && !skip;
+ if (!need_filter_u && !need_filter_v) return;
+ if (need_filter_u) *level_u = bp->deblock_filter_level[filter_id_u];
+ if (need_filter_v) *level_v = bp->deblock_filter_level[filter_id_v];
+ *filter_length = *step;
+ return;
+ }
+
+ // It is a border.
+ if (need_filter_u) {
+ const uint8_t level_u_this = bp->deblock_filter_level[filter_id_u];
+ *level_u = level_u_this;
+ if (level_u_this == 0) {
+ *level_u = bp_prev->deblock_filter_level[filter_id_u];
+ }
+ }
+ if (need_filter_v) {
+ const uint8_t level_v_this = bp->deblock_filter_level[filter_id_v];
+ *level_v = level_v_this;
+ if (level_v_this == 0) {
+ *level_v = bp_prev->deblock_filter_level[filter_id_v];
+ }
+ }
+ const int step_prev = kTransformHeight[bp_prev->uv_transform_size];
+ *filter_length = std::min(*step, step_prev);
+}
+
+bool PostFilter::GetVerticalDeblockFilterEdgeInfo(
+ int row4x4, int column4x4, BlockParameters* const* bp_ptr, uint8_t* level,
+ int* step, int* filter_length) const {
+ const BlockParameters* bp = *bp_ptr;
+ *step = kTransformWidth[inter_transform_sizes_[row4x4][column4x4]];
+ if (column4x4 == 0) return false;
+
+ const int filter_id = 0;
+ const int column4x4_prev = column4x4 - 1;
+ assert(column4x4_prev >= 0);
+ const BlockParameters* bp_prev = *(bp_ptr - 1);
+ if (bp == bp_prev) {
+ // Not a border.
+ if (!NonBlockBorderNeedsFilter(*bp, filter_id, level)) return false;
+ } else {
+ // It is a border.
+ const uint8_t level_this = bp->deblock_filter_level[filter_id];
+ *level = level_this;
+ if (level_this == 0) {
+ const uint8_t level_prev = bp_prev->deblock_filter_level[filter_id];
+ if (level_prev == 0) return false;
+ *level = level_prev;
+ }
+ }
+ const int step_prev =
+ kTransformWidth[inter_transform_sizes_[row4x4][column4x4_prev]];
+ *filter_length = std::min(*step, step_prev);
+ return true;
+}
+
+void PostFilter::GetVerticalDeblockFilterEdgeInfoUV(
+ int column4x4, BlockParameters* const* bp_ptr, uint8_t* level_u,
+ uint8_t* level_v, int* step, int* filter_length) const {
+ const int subsampling_x = subsampling_x_[kPlaneU];
+ column4x4 = GetDeblockPosition(column4x4, subsampling_x);
+ const BlockParameters* bp = *bp_ptr;
+ *level_u = 0;
+ *level_v = 0;
+ *step = kTransformWidth[bp->uv_transform_size];
+ if (column4x4 == subsampling_x) {
+ return;
+ }
+
+ bool need_filter_u = frame_header_.loop_filter.level[kPlaneU + 1] != 0;
+ bool need_filter_v = frame_header_.loop_filter.level[kPlaneV + 1] != 0;
+ assert(need_filter_u || need_filter_v);
+ const int filter_id_u =
+ kDeblockFilterLevelIndex[kPlaneU][kLoopFilterTypeVertical];
+ const int filter_id_v =
+ kDeblockFilterLevelIndex[kPlaneV][kLoopFilterTypeVertical];
+ const BlockParameters* bp_prev = *(bp_ptr - (ptrdiff_t{1} << subsampling_x));
+
+ if (bp == bp_prev) {
+ // Not a border.
+ const bool skip = bp->skip && bp->is_inter;
+ need_filter_u =
+ need_filter_u && bp->deblock_filter_level[filter_id_u] != 0 && !skip;
+ need_filter_v =
+ need_filter_v && bp->deblock_filter_level[filter_id_v] != 0 && !skip;
+ if (!need_filter_u && !need_filter_v) return;
+ if (need_filter_u) *level_u = bp->deblock_filter_level[filter_id_u];
+ if (need_filter_v) *level_v = bp->deblock_filter_level[filter_id_v];
+ *filter_length = *step;
+ return;
+ }
+
+ // It is a border.
+ if (need_filter_u) {
+ const uint8_t level_u_this = bp->deblock_filter_level[filter_id_u];
+ *level_u = level_u_this;
+ if (level_u_this == 0) {
+ *level_u = bp_prev->deblock_filter_level[filter_id_u];
+ }
+ }
+ if (need_filter_v) {
+ const uint8_t level_v_this = bp->deblock_filter_level[filter_id_v];
+ *level_v = level_v_this;
+ if (level_v_this == 0) {
+ *level_v = bp_prev->deblock_filter_level[filter_id_v];
+ }
+ }
+ const int step_prev = kTransformWidth[bp_prev->uv_transform_size];
+ *filter_length = std::min(*step, step_prev);
+}
+
+void PostFilter::HorizontalDeblockFilter(int row4x4_start,
+ int column4x4_start) {
+ const int column_step = 1;
+ const int src_step = 4 << pixel_size_log2_;
+ const ptrdiff_t src_stride = frame_buffer_.stride(kPlaneY);
+ uint8_t* src = GetSourceBuffer(kPlaneY, row4x4_start, column4x4_start);
+ int row_step;
+ uint8_t level;
+ int filter_length;
+
+ for (int column4x4 = 0; column4x4 < kNum4x4InLoopFilterUnit &&
+ MultiplyBy4(column4x4_start + column4x4) < width_;
+ column4x4 += column_step, src += src_step) {
+ uint8_t* src_row = src;
+ for (int row4x4 = 0; row4x4 < kNum4x4InLoopFilterUnit &&
+ MultiplyBy4(row4x4_start + row4x4) < height_;
+ row4x4 += row_step) {
+ const bool need_filter = GetHorizontalDeblockFilterEdgeInfo(
+ row4x4_start + row4x4, column4x4_start + column4x4, &level, &row_step,
+ &filter_length);
+ if (need_filter) {
+ const dsp::LoopFilterSize size = GetLoopFilterSizeY(filter_length);
+ dsp_.loop_filters[size][kLoopFilterTypeHorizontal](
+ src_row, src_stride, outer_thresh_[level], inner_thresh_[level],
+ HevThresh(level));
+ }
+ // TODO(chengchen): use shifts instead of multiplication.
+ src_row += row_step * src_stride;
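+      // |row_step| is the transform height in pixels; convert it to units of
+      // 4x4 blocks for the row counter.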
+ row_step = DivideBy4(row_step);
+ }
+ }
+
+ if (needs_chroma_deblock_) {
+ const int8_t subsampling_x = subsampling_x_[kPlaneU];
+ const int8_t subsampling_y = subsampling_y_[kPlaneU];
+ const int column_step = 1 << subsampling_x;
+ const ptrdiff_t src_stride_u = frame_buffer_.stride(kPlaneU);
+ const ptrdiff_t src_stride_v = frame_buffer_.stride(kPlaneV);
+ uint8_t* src_u = GetSourceBuffer(kPlaneU, row4x4_start, column4x4_start);
+ uint8_t* src_v = GetSourceBuffer(kPlaneV, row4x4_start, column4x4_start);
+ int row_step;
+ uint8_t level_u;
+ uint8_t level_v;
+ int filter_length;
+
+ for (int column4x4 = 0; column4x4 < kNum4x4InLoopFilterUnit &&
+ MultiplyBy4(column4x4_start + column4x4) < width_;
+ column4x4 += column_step, src_u += src_step, src_v += src_step) {
+ uint8_t* src_row_u = src_u;
+ uint8_t* src_row_v = src_v;
+ for (int row4x4 = 0; row4x4 < kNum4x4InLoopFilterUnit &&
+ MultiplyBy4(row4x4_start + row4x4) < height_;
+ row4x4 += row_step) {
+ GetHorizontalDeblockFilterEdgeInfoUV(
+ row4x4_start + row4x4, column4x4_start + column4x4, &level_u,
+ &level_v, &row_step, &filter_length);
+ if (level_u != 0) {
+ const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length);
+ dsp_.loop_filters[size][kLoopFilterTypeHorizontal](
+ src_row_u, src_stride_u, outer_thresh_[level_u],
+ inner_thresh_[level_u], HevThresh(level_u));
+ }
+ if (level_v != 0) {
+ const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length);
+ dsp_.loop_filters[size][kLoopFilterTypeHorizontal](
+ src_row_v, src_stride_v, outer_thresh_[level_v],
+ inner_thresh_[level_v], HevThresh(level_v));
+ }
+ src_row_u += row_step * src_stride_u;
+ src_row_v += row_step * src_stride_v;
+ row_step = DivideBy4(row_step << subsampling_y);
+ }
+ }
+ }
+}
+
+void PostFilter::VerticalDeblockFilter(int row4x4_start, int column4x4_start) {
+ const ptrdiff_t row_stride = MultiplyBy4(frame_buffer_.stride(kPlaneY));
+ const ptrdiff_t src_stride = frame_buffer_.stride(kPlaneY);
+ uint8_t* src = GetSourceBuffer(kPlaneY, row4x4_start, column4x4_start);
+ int column_step;
+ uint8_t level;
+ int filter_length;
+
+ BlockParameters* const* bp_row_base =
+ block_parameters_.Address(row4x4_start, column4x4_start);
+ const int bp_stride = block_parameters_.columns4x4();
+ const int column_step_shift = pixel_size_log2_;
+ for (int row4x4 = 0; row4x4 < kNum4x4InLoopFilterUnit &&
+ MultiplyBy4(row4x4_start + row4x4) < height_;
+ ++row4x4, src += row_stride, bp_row_base += bp_stride) {
+ uint8_t* src_row = src;
+ BlockParameters* const* bp = bp_row_base;
+ for (int column4x4 = 0; column4x4 < kNum4x4InLoopFilterUnit &&
+ MultiplyBy4(column4x4_start + column4x4) < width_;
+ column4x4 += column_step, bp += column_step) {
+ const bool need_filter = GetVerticalDeblockFilterEdgeInfo(
+ row4x4_start + row4x4, column4x4_start + column4x4, bp, &level,
+ &column_step, &filter_length);
+ if (need_filter) {
+ const dsp::LoopFilterSize size = GetLoopFilterSizeY(filter_length);
+ dsp_.loop_filters[size][kLoopFilterTypeVertical](
+ src_row, src_stride, outer_thresh_[level], inner_thresh_[level],
+ HevThresh(level));
+ }
+ src_row += column_step << column_step_shift;
+ column_step = DivideBy4(column_step);
+ }
+ }
+
+ if (needs_chroma_deblock_) {
+ const int8_t subsampling_x = subsampling_x_[kPlaneU];
+ const int8_t subsampling_y = subsampling_y_[kPlaneU];
+ const int row_step = 1 << subsampling_y;
+ uint8_t* src_u = GetSourceBuffer(kPlaneU, row4x4_start, column4x4_start);
+ uint8_t* src_v = GetSourceBuffer(kPlaneV, row4x4_start, column4x4_start);
+ const ptrdiff_t src_stride_u = frame_buffer_.stride(kPlaneU);
+ const ptrdiff_t src_stride_v = frame_buffer_.stride(kPlaneV);
+ const ptrdiff_t row_stride_u = MultiplyBy4(frame_buffer_.stride(kPlaneU));
+ const ptrdiff_t row_stride_v = MultiplyBy4(frame_buffer_.stride(kPlaneV));
+ const LoopFilterType type = kLoopFilterTypeVertical;
+ int column_step;
+ uint8_t level_u;
+ uint8_t level_v;
+ int filter_length;
+
+ BlockParameters* const* bp_row_base = block_parameters_.Address(
+ GetDeblockPosition(row4x4_start, subsampling_y),
+ GetDeblockPosition(column4x4_start, subsampling_x));
+ const int bp_stride = block_parameters_.columns4x4() << subsampling_y;
+ for (int row4x4 = 0; row4x4 < kNum4x4InLoopFilterUnit &&
+ MultiplyBy4(row4x4_start + row4x4) < height_;
+ row4x4 += row_step, src_u += row_stride_u, src_v += row_stride_v,
+ bp_row_base += bp_stride) {
+ uint8_t* src_row_u = src_u;
+ uint8_t* src_row_v = src_v;
+ BlockParameters* const* bp = bp_row_base;
+ for (int column4x4 = 0; column4x4 < kNum4x4InLoopFilterUnit &&
+ MultiplyBy4(column4x4_start + column4x4) < width_;
+ column4x4 += column_step, bp += column_step) {
+ GetVerticalDeblockFilterEdgeInfoUV(column4x4_start + column4x4, bp,
+ &level_u, &level_v, &column_step,
+ &filter_length);
+ if (level_u != 0) {
+ const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length);
+ dsp_.loop_filters[size][type](
+ src_row_u, src_stride_u, outer_thresh_[level_u],
+ inner_thresh_[level_u], HevThresh(level_u));
+ }
+ if (level_v != 0) {
+ const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length);
+ dsp_.loop_filters[size][type](
+ src_row_v, src_stride_v, outer_thresh_[level_v],
+ inner_thresh_[level_v], HevThresh(level_v));
+ }
+ src_row_u += column_step << column_step_shift;
+ src_row_v += column_step << column_step_shift;
+ column_step = DivideBy4(column_step << subsampling_x);
+ }
+ }
+ }
+}
+
+void PostFilter::ApplyDeblockFilterForOneSuperBlockRow(int row4x4_start,
+ int sb4x4) {
+ assert(row4x4_start >= 0);
+ assert(DoDeblock());
+ for (int y = 0; y < sb4x4; y += 16) {
+ const int row4x4 = row4x4_start + y;
+ if (row4x4 >= frame_header_.rows4x4) break;
+ int column4x4;
+ for (column4x4 = 0; column4x4 < frame_header_.columns4x4;
+ column4x4 += kNum4x4InLoopFilterUnit) {
+ // First apply vertical filtering
+ VerticalDeblockFilter(row4x4, column4x4);
+
+ // Delay one superblock to apply horizontal filtering.
+ if (column4x4 != 0) {
+ HorizontalDeblockFilter(row4x4, column4x4 - kNum4x4InLoopFilterUnit);
+ }
+ }
+ // Horizontal filtering for the last 64x64 block.
+ HorizontalDeblockFilter(row4x4, column4x4 - kNum4x4InLoopFilterUnit);
+ }
+}
+
+template <LoopFilterType loop_filter_type>
+void PostFilter::DeblockFilterWorker(std::atomic<int>* row4x4_atomic) {
+ int row4x4;
+ while ((row4x4 = row4x4_atomic->fetch_add(kNum4x4InLoopFilterUnit,
+ std::memory_order_relaxed)) <
+ frame_header_.rows4x4) {
+ for (int column4x4 = 0; column4x4 < frame_header_.columns4x4;
+ column4x4 += kNum4x4InLoopFilterUnit) {
+ (this->*deblock_filter_func_[loop_filter_type])(row4x4, column4x4);
+ }
+ }
+}
+
+template void PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>(
+ std::atomic<int>* row4x4_atomic);
+template void PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>(
+ std::atomic<int>* row4x4_atomic);
+
+void PostFilter::ApplyDeblockFilter(LoopFilterType loop_filter_type,
+ int row4x4_start, int column4x4_start,
+ int column4x4_end, int sb4x4) {
+ assert(row4x4_start >= 0);
+ assert(DoDeblock());
+
+ column4x4_end = std::min(column4x4_end, frame_header_.columns4x4);
+ if (column4x4_start >= column4x4_end) return;
+
+ const DeblockFilter deblock_filter = deblock_filter_func_[loop_filter_type];
+ const int sb_height4x4 =
+ std::min(sb4x4, frame_header_.rows4x4 - row4x4_start);
+ for (int y = 0; y < sb_height4x4; y += kNum4x4InLoopFilterUnit) {
+ const int row4x4 = row4x4_start + y;
+ for (int column4x4 = column4x4_start; column4x4 < column4x4_end;
+ column4x4 += kNum4x4InLoopFilterUnit) {
+ (this->*deblock_filter)(row4x4, column4x4);
+ }
+ }
+}
+
+} // namespace libgav1
diff --git a/src/post_filter/deblock_thresholds.inc b/src/post_filter/deblock_thresholds.inc
new file mode 100644
index 0000000..ca12aaa
--- /dev/null
+++ b/src/post_filter/deblock_thresholds.inc
@@ -0,0 +1,85 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Thresholds for the deblocking filter. Precomputed values of part of Section
+// 7.14.4 for all possible values of sharpness.
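+// For filter level l and sharpness s, the inner ("limit") threshold below
+// works out to l >> ((s > 0) + (s > 4)), clamped to the range [1, 9 - s] when
+// s > 0 (and to a minimum of 1 when s == 0), and the outer ("blimit")
+// threshold is 2 * (l + 2) + inner.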
+
+constexpr uint8_t kInnerThresh[8][kMaxLoopFilterValue + 1] = {
+ {1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63},
+ {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8},
+ {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7},
+ {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6},
+ {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5},
+ {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4},
+ {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3},
+ {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}};
+
+constexpr uint8_t kOuterThresh[8][kMaxLoopFilterValue + 1] = {
+ {5, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40,
+ 43, 46, 49, 52, 55, 58, 61, 64, 67, 70, 73, 76, 79,
+ 82, 85, 88, 91, 94, 97, 100, 103, 106, 109, 112, 115, 118,
+ 121, 124, 127, 130, 133, 136, 139, 142, 145, 148, 151, 154, 157,
+ 160, 163, 166, 169, 172, 175, 178, 181, 184, 187, 190, 193},
+ {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34,
+ 36, 39, 41, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88,
+ 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114,
+ 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138},
+ {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34,
+ 36, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61,
+ 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87,
+ 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113,
+ 115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137},
+ {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34,
+ 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60,
+ 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86,
+ 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112,
+ 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136},
+ {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 33,
+ 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59,
+ 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85,
+ 87, 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111,
+ 113, 115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135},
+ {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 31,
+ 33, 35, 37, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58,
+ 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84,
+ 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110,
+ 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134},
+ {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 31,
+ 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57,
+ 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83,
+ 85, 87, 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109,
+ 111, 113, 115, 117, 119, 121, 123, 125, 127, 129, 131, 133},
+ {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56,
+ 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82,
+ 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108,
+ 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132}};
diff --git a/src/post_filter/loop_restoration.cc b/src/post_filter/loop_restoration.cc
new file mode 100644
index 0000000..3d5da90
--- /dev/null
+++ b/src/post_filter/loop_restoration.cc
@@ -0,0 +1,172 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "src/post_filter.h"
+#include "src/utils/blocking_counter.h"
+
+namespace libgav1 {
+
+template <typename Pixel>
+void PostFilter::ApplyLoopRestorationForOneRow(
+ const Pixel* src_buffer, const ptrdiff_t stride, const Plane plane,
+ const int plane_height, const int plane_width, const int unit_y,
+ const int unit_row, const int current_process_unit_height,
+ const int plane_unit_size, Pixel* dst_buffer) {
+ const int num_horizontal_units =
+ restoration_info_->num_horizontal_units(static_cast<Plane>(plane));
+ const RestorationUnitInfo* const restoration_info =
+ restoration_info_->loop_restoration_info(static_cast<Plane>(plane),
+ unit_row * num_horizontal_units);
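+  // When cdef is applied or filtering is multi-threaded, loop restoration runs
+  // in place and the rows it needs above and below each unit may already have
+  // been overwritten, so they are read from |loop_restoration_border_| (see
+  // the comment in post_filter.h).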
+ const bool in_place = DoCdef() || thread_pool_ != nullptr;
+ const Pixel* border = nullptr;
+ src_buffer += unit_y * stride;
+ if (in_place) {
+ assert(loop_restoration_border_.stride(plane) ==
+ static_cast<int>(sizeof(Pixel) * stride));
+ const int border_unit_y = std::max(
+ RightShiftWithCeiling(unit_y, 4 - subsampling_y_[plane]) - 4, 0);
+ border =
+ reinterpret_cast<const Pixel*>(loop_restoration_border_.data(plane)) +
+ border_unit_y * stride;
+ }
+ int unit_column = 0;
+ int column = 0;
+ do {
+ const int current_process_unit_width =
+ std::min(plane_unit_size, plane_width - column);
+ const Pixel* src = src_buffer + column;
+ unit_column = std::min(unit_column, num_horizontal_units - 1);
+ if (restoration_info[unit_column].type == kLoopRestorationTypeNone) {
+ Pixel* dst = dst_buffer + column;
+ if (in_place) {
+ int k = current_process_unit_height;
+ do {
+ memmove(dst, src, current_process_unit_width * sizeof(Pixel));
+ src += stride;
+ dst += stride;
+ } while (--k != 0);
+ } else {
+ CopyPlane(src, stride, current_process_unit_width,
+ current_process_unit_height, dst, stride);
+ }
+ } else {
+ const Pixel* top_border = src - kRestorationVerticalBorder * stride;
+ const Pixel* bottom_border = src + current_process_unit_height * stride;
+ const bool frame_bottom_border =
+ (unit_y + current_process_unit_height >= plane_height);
+ if (in_place && (unit_y != 0 || !frame_bottom_border)) {
+ const Pixel* loop_restoration_border = border + column;
+ if (unit_y != 0) {
+ top_border = loop_restoration_border;
+ loop_restoration_border += 4 * stride;
+ }
+ if (!frame_bottom_border) {
+ bottom_border =
+ loop_restoration_border + kRestorationVerticalBorder * stride;
+ }
+ }
+ RestorationBuffer restoration_buffer;
+ const LoopRestorationType type = restoration_info[unit_column].type;
+ assert(type == kLoopRestorationTypeSgrProj ||
+ type == kLoopRestorationTypeWiener);
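+      // |type| is either kLoopRestorationTypeWiener or
+      // kLoopRestorationTypeSgrProj here; subtracting 2 maps the two values
+      // to the two entries of |loop_restorations|.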
+ const dsp::LoopRestorationFunc restoration_func =
+ dsp_.loop_restorations[type - 2];
+ restoration_func(restoration_info[unit_column], src, top_border,
+ bottom_border, stride, current_process_unit_width,
+ current_process_unit_height, &restoration_buffer,
+ dst_buffer + column);
+ }
+ ++unit_column;
+ column += plane_unit_size;
+ } while (column < plane_width);
+}
+
+template <typename Pixel>
+void PostFilter::ApplyLoopRestorationForOneSuperBlockRow(const int row4x4_start,
+ const int sb4x4) {
+ assert(row4x4_start >= 0);
+ assert(DoRestoration());
+ int plane = kPlaneY;
+ do {
+ if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+ continue;
+ }
+ const ptrdiff_t stride = frame_buffer_.stride(plane) / sizeof(Pixel);
+ const int unit_height_offset =
+ kRestorationUnitOffset >> subsampling_y_[plane];
+ const int plane_height = SubsampledValue(height_, subsampling_y_[plane]);
+ const int plane_width =
+ SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+ const int plane_unit_size = 1 << loop_restoration_.unit_size_log2[plane];
+ const int plane_process_unit_height =
+ kRestorationUnitHeight >> subsampling_y_[plane];
+ int y = (row4x4_start == 0)
+ ? 0
+ : (MultiplyBy4(row4x4_start) >> subsampling_y_[plane]) -
+ unit_height_offset;
+ int expected_height = plane_process_unit_height -
+ ((row4x4_start == 0) ? unit_height_offset : 0);
+ int current_process_unit_height;
+ for (int sb_y = 0; sb_y < sb4x4;
+ sb_y += 16, y += current_process_unit_height) {
+ if (y >= plane_height) break;
+ const int unit_row = std::min(
+ (y + unit_height_offset) >> loop_restoration_.unit_size_log2[plane],
+ restoration_info_->num_vertical_units(static_cast<Plane>(plane)) - 1);
+ current_process_unit_height = std::min(expected_height, plane_height - y);
+ expected_height = plane_process_unit_height;
+ ApplyLoopRestorationForOneRow<Pixel>(
+ reinterpret_cast<Pixel*>(superres_buffer_[plane]), stride,
+ static_cast<Plane>(plane), plane_height, plane_width, y, unit_row,
+ current_process_unit_height, plane_unit_size,
+ reinterpret_cast<Pixel*>(loop_restoration_buffer_[plane]) +
+ y * stride);
+ }
+ } while (++plane < planes_);
+}
+
+void PostFilter::ApplyLoopRestoration(const int row4x4_start, const int sb4x4) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ ApplyLoopRestorationForOneSuperBlockRow<uint16_t>(row4x4_start, sb4x4);
+ return;
+ }
+#endif
+ ApplyLoopRestorationForOneSuperBlockRow<uint8_t>(row4x4_start, sb4x4);
+}
+
+void PostFilter::ApplyLoopRestorationWorker(std::atomic<int>* row4x4_atomic) {
+ int row4x4;
+ // Loop Restoration operates with a lag of 8 rows (4 for chroma with
+ // subsampling) and hence we need to make sure to cover the last 8 rows of the
+ // last superblock row. So we run this loop for an extra iteration to
+ // accomplish that.
+ const int row4x4_end = frame_header_.rows4x4 + kNum4x4InLoopRestorationUnit;
+ while ((row4x4 = row4x4_atomic->fetch_add(kNum4x4InLoopRestorationUnit,
+ std::memory_order_relaxed)) <
+ row4x4_end) {
+ CopyBordersForOneSuperBlockRow(row4x4, kNum4x4InLoopRestorationUnit,
+ /*for_loop_restoration=*/true);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ ApplyLoopRestorationForOneSuperBlockRow<uint16_t>(
+ row4x4, kNum4x4InLoopRestorationUnit);
+ continue;
+ }
+#endif
+ ApplyLoopRestorationForOneSuperBlockRow<uint8_t>(
+ row4x4, kNum4x4InLoopRestorationUnit);
+ }
+}
+
+} // namespace libgav1
diff --git a/src/post_filter/post_filter.cc b/src/post_filter/post_filter.cc
new file mode 100644
index 0000000..0eacf34
--- /dev/null
+++ b/src/post_filter/post_filter.cc
@@ -0,0 +1,601 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/post_filter.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/post_filter/deblock_thresholds.inc"
+
+// Row indices of the loop restoration border. These are used to populate
+// |loop_restoration_border_| when either cdef is on or multithreading is
+// enabled. The array is indexed by subsampling_y.
+constexpr int kLoopRestorationBorderRows[2] = {54, 26};
+
+} // namespace
+
+// The following example illustrates how ExtendFrame() extends a frame.
+// Suppose the frame width is 8 and height is 4, and left, right, top, and
+// bottom are all equal to 3.
+//
+// Before:
+//
+// ABCDEFGH
+// IJKLMNOP
+// QRSTUVWX
+// YZabcdef
+//
+// After:
+//
+// AAA|ABCDEFGH|HHH [3]
+// AAA|ABCDEFGH|HHH
+// AAA|ABCDEFGH|HHH
+// ---+--------+---
+// AAA|ABCDEFGH|HHH [1]
+// III|IJKLMNOP|PPP
+// QQQ|QRSTUVWX|XXX
+// YYY|YZabcdef|fff
+// ---+--------+---
+// YYY|YZabcdef|fff [2]
+// YYY|YZabcdef|fff
+// YYY|YZabcdef|fff
+//
+// ExtendFrame() first extends the rows to the left and to the right[1]. Then
+// it copies the extended last row to the bottom borders[2]. Finally it copies
+// the extended first row to the top borders[3].
+// static
+template <typename Pixel>
+void PostFilter::ExtendFrame(Pixel* const frame_start, const int width,
+ const int height, const ptrdiff_t stride,
+ const int left, const int right, const int top,
+ const int bottom) {
+ Pixel* src = frame_start;
+ // Copy to left and right borders.
+ int y = height;
+ do {
+ ExtendLine<Pixel>(src, width, left, right);
+ src += stride;
+ } while (--y != 0);
+ // Copy to bottom borders. For performance we copy |stride| pixels
+ // (potentially including some padding pixels) in each row, ending at the
+ // bottom right border pixel. In the diagram the asterisks indicate padding
+ // pixels.
+ //
+ // |<--- stride --->|
+ // **YYY|YZabcdef|fff <-- Copy from the extended last row.
+ // -----+--------+---
+ // **YYY|YZabcdef|fff
+ // **YYY|YZabcdef|fff
+ // **YYY|YZabcdef|fff <-- bottom right border pixel
+ assert(src == frame_start + height * stride);
+ Pixel* dst = src - left;
+ src = dst - stride;
+ for (int y = 0; y < bottom; ++y) {
+ memcpy(dst, src, sizeof(Pixel) * stride);
+ dst += stride;
+ }
+ // Copy to top borders. For performance we copy |stride| pixels (potentially
+ // including some padding pixels) in each row, starting from the top left
+ // border pixel. In the diagram the asterisks indicate padding pixels.
+ //
+ // +-- top left border pixel
+ // |
+ // v
+ // AAA|ABCDEFGH|HHH**
+ // AAA|ABCDEFGH|HHH**
+ // AAA|ABCDEFGH|HHH**
+ // ---+--------+-----
+ // AAA|ABCDEFGH|HHH** <-- Copy from the extended first row.
+ // |<--- stride --->|
+ src = frame_start - left;
+ dst = frame_start - left - top * stride;
+ for (int y = 0; y < top; ++y) {
+ memcpy(dst, src, sizeof(Pixel) * stride);
+ dst += stride;
+ }
+}
+
+template void PostFilter::ExtendFrame<uint8_t>(uint8_t* const frame_start,
+ const int width,
+ const int height,
+ const ptrdiff_t stride,
+ const int left, const int right,
+ const int top, const int bottom);
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void PostFilter::ExtendFrame<uint16_t>(
+ uint16_t* const frame_start, const int width, const int height,
+ const ptrdiff_t stride, const int left, const int right, const int top,
+ const int bottom);
+#endif
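The diagram above is easiest to internalize with a tiny standalone version of the same edge-replication pattern. The sketch below is not the library's ExtendFrame(); it assumes a single plane stored in a std::vector sized (height + 2 * border) * (width + 2 * border), and the function name is made up for illustration.

#include <cstdint>
#include <cstring>
#include <vector>

// Replicates the visible edges of one plane into its borders: side borders
// first, then the bottom rows, then the top rows, mirroring the order used
// by ExtendFrame() above.
void ExtendPlaneSketch(std::vector<uint8_t>* plane, int width, int height,
                       int border) {
  const int stride = width + 2 * border;
  uint8_t* const start = plane->data() + border * stride + border;
  // Replicate the leftmost and rightmost visible pixels of every row.
  for (int y = 0; y < height; ++y) {
    uint8_t* const row = start + y * stride;
    memset(row - border, row[0], border);
    memset(row + width, row[width - 1], border);
  }
  // Copy the fully extended last row into each bottom border row.
  const uint8_t* const last = start + (height - 1) * stride - border;
  for (int y = 0; y < border; ++y) {
    memcpy(start + (height + y) * stride - border, last, stride);
  }
  // Copy the fully extended first row into each top border row.
  const uint8_t* const first = start - border;
  for (int y = 1; y <= border; ++y) {
    memcpy(start - y * stride - border, first, stride);
  }
}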
+
+PostFilter::PostFilter(const ObuFrameHeader& frame_header,
+ const ObuSequenceHeader& sequence_header,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ YuvBuffer* const frame_buffer, const dsp::Dsp* dsp,
+ int do_post_filter_mask)
+ : frame_header_(frame_header),
+ loop_restoration_(frame_header.loop_restoration),
+ dsp_(*dsp),
+ // Deblocking filter always uses 64x64 as step size.
+ num_64x64_blocks_per_row_(DivideBy64(frame_header.width + 63)),
+ upscaled_width_(frame_header.upscaled_width),
+ width_(frame_header.width),
+ height_(frame_header.height),
+ bitdepth_(sequence_header.color_config.bitdepth),
+ subsampling_x_{0, sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_x},
+ subsampling_y_{0, sequence_header.color_config.subsampling_y,
+ sequence_header.color_config.subsampling_y},
+ planes_(sequence_header.color_config.is_monochrome ? kMaxPlanesMonochrome
+ : kMaxPlanes),
+ pixel_size_log2_(static_cast<int>((bitdepth_ == 8) ? sizeof(uint8_t)
+ : sizeof(uint16_t)) -
+ 1),
+ inner_thresh_(kInnerThresh[frame_header.loop_filter.sharpness]),
+ outer_thresh_(kOuterThresh[frame_header.loop_filter.sharpness]),
+ needs_chroma_deblock_(frame_header.loop_filter.level[kPlaneU + 1] != 0 ||
+ frame_header.loop_filter.level[kPlaneV + 1] != 0),
+ cdef_index_(frame_scratch_buffer->cdef_index),
+ inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes),
+ restoration_info_(&frame_scratch_buffer->loop_restoration_info),
+ superres_coefficients_{
+ frame_scratch_buffer->superres_coefficients[kPlaneTypeY].get(),
+ frame_scratch_buffer
+ ->superres_coefficients
+ [(sequence_header.color_config.is_monochrome ||
+ sequence_header.color_config.subsampling_x == 0)
+ ? kPlaneTypeY
+ : kPlaneTypeUV]
+ .get()},
+ superres_line_buffer_(frame_scratch_buffer->superres_line_buffer),
+ block_parameters_(frame_scratch_buffer->block_parameters_holder),
+ frame_buffer_(*frame_buffer),
+ cdef_border_(frame_scratch_buffer->cdef_border),
+ loop_restoration_border_(frame_scratch_buffer->loop_restoration_border),
+ do_post_filter_mask_(do_post_filter_mask),
+ thread_pool_(
+ frame_scratch_buffer->threading_strategy.post_filter_thread_pool()) {
+ const int8_t zero_delta_lf[kFrameLfCount] = {};
+ ComputeDeblockFilterLevels(zero_delta_lf, deblock_filter_levels_);
+ if (DoSuperRes()) {
+ int plane = kPlaneY;
+ do {
+ const int downscaled_width =
+ SubsampledValue(width_, subsampling_x_[plane]);
+ const int upscaled_width =
+ SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+ const int superres_width = downscaled_width << kSuperResScaleBits;
+ super_res_info_[plane].step =
+ (superres_width + upscaled_width / 2) / upscaled_width;
+ const int error =
+ super_res_info_[plane].step * upscaled_width - superres_width;
+ super_res_info_[plane].initial_subpixel_x =
+ ((-((upscaled_width - downscaled_width) << (kSuperResScaleBits - 1)) +
+ DivideBy2(upscaled_width)) /
+ upscaled_width +
+ (1 << (kSuperResExtraBits - 1)) - error / 2) &
+ kSuperResScaleMask;
+ super_res_info_[plane].upscaled_width = upscaled_width;
+ } while (++plane < planes_);
+ if (dsp->super_res_coefficients != nullptr) {
+ int plane = kPlaneY;
+ const int number_loops = (superres_coefficients_[kPlaneTypeY] ==
+ superres_coefficients_[kPlaneTypeUV])
+ ? kMaxPlanesMonochrome
+ : static_cast<int>(kNumPlaneTypes);
+ do {
+ dsp->super_res_coefficients(
+ SubsampledValue(upscaled_width_, subsampling_x_[plane]),
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step, superres_coefficients_[plane]);
+ } while (++plane < number_loops);
+ }
+ }
+ int plane = kPlaneY;
+ do {
+ loop_restoration_buffer_[plane] = frame_buffer_.data(plane);
+ cdef_buffer_[plane] = frame_buffer_.data(plane);
+ superres_buffer_[plane] = frame_buffer_.data(plane);
+ source_buffer_[plane] = frame_buffer_.data(plane);
+ } while (++plane < planes_);
+ if (DoCdef() || DoRestoration() || DoSuperRes()) {
+ plane = kPlaneY;
+ const int pixel_size_log2 = pixel_size_log2_;
+ do {
+ int horizontal_shift = 0;
+ int vertical_shift = 0;
+ if (DoRestoration() &&
+ loop_restoration_.type[plane] != kLoopRestorationTypeNone) {
+ horizontal_shift += frame_buffer_.alignment();
+ if (!DoCdef() && thread_pool_ == nullptr) {
+ vertical_shift += kRestorationVerticalBorder;
+ }
+ superres_buffer_[plane] +=
+ vertical_shift * frame_buffer_.stride(plane) +
+ (horizontal_shift << pixel_size_log2);
+ }
+ if (DoSuperRes()) {
+ vertical_shift += kSuperResVerticalBorder;
+ }
+ cdef_buffer_[plane] += vertical_shift * frame_buffer_.stride(plane) +
+ (horizontal_shift << pixel_size_log2);
+ if (DoCdef() && thread_pool_ == nullptr) {
+ horizontal_shift += frame_buffer_.alignment();
+ vertical_shift += kCdefBorder;
+ }
+ assert(horizontal_shift <= frame_buffer_.right_border(plane));
+ assert(vertical_shift <= frame_buffer_.bottom_border(plane));
+ source_buffer_[plane] += vertical_shift * frame_buffer_.stride(plane) +
+ (horizontal_shift << pixel_size_log2);
+ } while (++plane < planes_);
+ }
+}
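The SuperRes setup in the constructor above reduces to a rounded fixed-point ratio between the downscaled and upscaled widths. A minimal sketch of that arithmetic, assuming kSuperResScaleBits is 14; the helper name and the example widths are illustrative:

// Source step per upscaled output pixel: downscaled/upscaled rounded to
// nearest, in units of 2^-14 input pixels.
constexpr int kScaleBits = 14;  // assumed value of kSuperResScaleBits

constexpr int SuperResStep(int downscaled_width, int upscaled_width) {
  return (downscaled_width * (1 << kScaleBits) + upscaled_width / 2) /
         upscaled_width;
}

// Example: upscaling a 480-pixel row back to 640 pixels advances the source
// position by 3/4 of an input pixel per output pixel.
static_assert(SuperResStep(480, 640) == 3 * (1 << kScaleBits) / 4,
              "step is 0.75 input pixels in Q14");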
+
+void PostFilter::ExtendFrameBoundary(uint8_t* const frame_start,
+ const int width, const int height,
+ const ptrdiff_t stride, const int left,
+ const int right, const int top,
+ const int bottom) const {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ ExtendFrame<uint16_t>(reinterpret_cast<uint16_t*>(frame_start), width,
+ height, stride / sizeof(uint16_t), left, right, top,
+ bottom);
+ return;
+ }
+#endif
+ ExtendFrame<uint8_t>(frame_start, width, height, stride, left, right, top,
+ bottom);
+}
+
+void PostFilter::ExtendBordersForReferenceFrame() {
+ if (frame_header_.refresh_frame_flags == 0) return;
+ int plane = kPlaneY;
+ do {
+ const int plane_width =
+ SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+ const int plane_height = SubsampledValue(height_, subsampling_y_[plane]);
+ assert(frame_buffer_.left_border(plane) >= kMinLeftBorderPixels &&
+ frame_buffer_.right_border(plane) >= kMinRightBorderPixels &&
+ frame_buffer_.top_border(plane) >= kMinTopBorderPixels &&
+ frame_buffer_.bottom_border(plane) >= kMinBottomBorderPixels);
+ // plane subsampling_x_ left_border
+ // Y N/A 64, 48
+ // U,V 0 64, 48
+ // U,V 1 32, 16
+ assert(frame_buffer_.left_border(plane) >= 16);
+ // The |left| argument to ExtendFrameBoundary() must be at least
+ // kMinLeftBorderPixels (13) for warp.
+ static_assert(16 >= kMinLeftBorderPixels, "");
+ ExtendFrameBoundary(
+ frame_buffer_.data(plane), plane_width, plane_height,
+ frame_buffer_.stride(plane), frame_buffer_.left_border(plane),
+ frame_buffer_.right_border(plane), frame_buffer_.top_border(plane),
+ frame_buffer_.bottom_border(plane));
+ } while (++plane < planes_);
+}
+
+void PostFilter::CopyDeblockedPixels(Plane plane, int row4x4) {
+ assert(frame_buffer_.stride(plane) == loop_restoration_border_.stride(plane));
+ const ptrdiff_t stride = frame_buffer_.stride(plane);
+ const uint8_t* const src = GetSourceBuffer(plane, row4x4, 0);
+ const int row_offset = DivideBy4(row4x4);
+ uint8_t* dst = loop_restoration_border_.data(plane) + row_offset * stride;
+ const int num_pixels = SubsampledValue(MultiplyBy4(frame_header_.columns4x4),
+ subsampling_x_[plane]);
+ const int row_width = num_pixels << pixel_size_log2_;
+ int last_valid_row = -1;
+ const int plane_height =
+ SubsampledValue(frame_header_.height, subsampling_y_[plane]);
+ int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
+ const int absolute_row = (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+ for (int i = 0; i < 4; ++i, ++row) {
+ if (absolute_row + i >= plane_height) {
+ if (last_valid_row == -1) break;
+ // If we run out of rows, copy the last valid row (mimics the bottom
+ // border extension).
+ row = last_valid_row;
+ }
+ memcpy(dst, src + row * stride, row_width);
+ last_valid_row = row;
+ dst += stride;
+ }
+}
+
+void PostFilter::CopyBordersForOneSuperBlockRow(int row4x4, int sb4x4,
+ bool for_loop_restoration) {
+ // Number of rows to be subtracted from the start position described by
+ // row4x4. We always lag by 8 rows (to account for in-loop post filters).
+ const int row_offset = (row4x4 == 0) ? 0 : 8;
+ // Number of rows to be subtracted from the height described by sb4x4.
+ const int height_offset = (row4x4 == 0) ? 8 : 0;
+ // If cdef is off and post filter multithreading is off, then loop restoration
+ // needs 2 extra rows for the bottom border in each plane.
+ const int extra_rows =
+ (for_loop_restoration && thread_pool_ == nullptr && !DoCdef()) ? 2 : 0;
+ int plane = kPlaneY;
+ do {
+ const int plane_width =
+ SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+ const int plane_height = SubsampledValue(height_, subsampling_y_[plane]);
+ const int row = (MultiplyBy4(row4x4) - row_offset) >> subsampling_y_[plane];
+ assert(row >= 0);
+ if (row >= plane_height) break;
+ const int num_rows =
+ std::min(SubsampledValue(MultiplyBy4(sb4x4) - height_offset,
+ subsampling_y_[plane]) +
+ extra_rows,
+ plane_height - row);
+ // We only need to track the progress of the Y plane since the progress of
+ // the U and V planes will be inferred from the progress of the Y plane.
+ if (!for_loop_restoration && plane == kPlaneY) {
+ progress_row_ = row + num_rows;
+ }
+ const bool copy_bottom = row + num_rows == plane_height;
+ const int stride = frame_buffer_.stride(plane);
+ uint8_t* const start = (for_loop_restoration ? superres_buffer_[plane]
+ : frame_buffer_.data(plane)) +
+ row * stride;
+ const int left_border = for_loop_restoration
+ ? kRestorationHorizontalBorder
+ : frame_buffer_.left_border(plane);
+ const int right_border = for_loop_restoration
+ ? kRestorationHorizontalBorder
+ : frame_buffer_.right_border(plane);
+ const int top_border =
+ (row == 0) ? (for_loop_restoration ? kRestorationVerticalBorder
+ : frame_buffer_.top_border(plane))
+ : 0;
+ const int bottom_border =
+ copy_bottom
+ ? (for_loop_restoration ? kRestorationVerticalBorder
+ : frame_buffer_.bottom_border(plane))
+ : 0;
+ ExtendFrameBoundary(start, plane_width, num_rows, stride, left_border,
+ right_border, top_border, bottom_border);
+ } while (++plane < planes_);
+}
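The row_offset/height_offset bookkeeping above amounts to tiling the plane with an 8-row lag. A small sketch for a non-subsampled plane, ignoring the loop-restoration extra rows and the clamp against the plane height; the helper names are made up for illustration:

// Start row and row count covered by one call for a non-subsampled plane.
constexpr int StartRow(int row4x4) {
  return row4x4 * 4 - ((row4x4 == 0) ? 0 : 8);
}
constexpr int RowCount(int row4x4, int sb4x4) {
  return sb4x4 * 4 - ((row4x4 == 0) ? 8 : 0);
}

// With 64x64 superblocks (sb4x4 == 16) the first call covers rows [0, 56),
// the second covers [56, 120), the third [120, 184), and so on, so
// successive calls tile the plane without gaps or overlap.
static_assert(StartRow(0) == 0 && RowCount(0, 16) == 56, "");
static_assert(StartRow(16) == 56 && RowCount(16, 16) == 64, "");
static_assert(StartRow(32) == 120, "");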
+
+void PostFilter::SetupLoopRestorationBorder(const int row4x4) {
+ assert(row4x4 >= 0);
+ assert(!DoCdef());
+ assert(DoRestoration());
+ int plane = kPlaneY;
+ do {
+ if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+ continue;
+ }
+ assert(frame_buffer_.stride(plane) ==
+ loop_restoration_border_.stride(plane));
+ const ptrdiff_t stride = frame_buffer_.stride(plane);
+ const int row_offset = DivideBy4(row4x4);
+ const int num_pixels =
+ SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+ const int row_width = num_pixels << pixel_size_log2_;
+ const int plane_height = SubsampledValue(height_, subsampling_y_[plane]);
+ const int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
+ const int absolute_row =
+ (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+ const uint8_t* src =
+ GetSuperResBuffer(static_cast<Plane>(plane), row4x4, 0) + row * stride;
+ uint8_t* dst = loop_restoration_border_.data(plane) + row_offset * stride;
+ for (int i = 0; i < 4; ++i) {
+ memcpy(dst, src, row_width);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ ExtendLine<uint16_t>(dst, num_pixels, kRestorationHorizontalBorder,
+ kRestorationHorizontalBorder);
+ } else // NOLINT.
+#endif
+ ExtendLine<uint8_t>(dst, num_pixels, kRestorationHorizontalBorder,
+ kRestorationHorizontalBorder);
+ // If we run out of rows, copy the last valid row (mimics the bottom
+ // border extension).
+ if (absolute_row + i < plane_height - 1) src += stride;
+ dst += stride;
+ }
+ } while (++plane < planes_);
+}
+
+void PostFilter::SetupLoopRestorationBorder(int row4x4_start, int sb4x4) {
+ assert(row4x4_start >= 0);
+ assert(DoCdef());
+ assert(DoRestoration());
+ for (int sb_y = 0; sb_y < sb4x4; sb_y += 16) {
+ const int row4x4 = row4x4_start + sb_y;
+ const int row_offset_start = DivideBy4(row4x4);
+ std::array<uint8_t*, kMaxPlanes> dst = {
+ loop_restoration_border_.data(kPlaneY) +
+ row_offset_start * loop_restoration_border_.stride(kPlaneY),
+ loop_restoration_border_.data(kPlaneU) +
+ row_offset_start * loop_restoration_border_.stride(kPlaneU),
+ loop_restoration_border_.data(kPlaneV) +
+ row_offset_start * loop_restoration_border_.stride(kPlaneV)};
+ // If SuperRes is enabled, then we apply SuperRes for the rows to be copied
+ // directly with |loop_restoration_border_| as the destination. Otherwise,
+ // we simply copy the rows.
+ if (DoSuperRes()) {
+ std::array<uint8_t*, kMaxPlanes> src;
+ std::array<int, kMaxPlanes> rows;
+ int plane = kPlaneY;
+ do {
+ if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+ rows[plane] = 0;
+ continue;
+ }
+ const int plane_height =
+ SubsampledValue(frame_header_.height, subsampling_y_[plane]);
+ const int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
+ const int absolute_row =
+ (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+ src[plane] = GetSourceBuffer(static_cast<Plane>(plane), row4x4, 0) +
+ row * frame_buffer_.stride(plane);
+ rows[plane] = Clip3(plane_height - absolute_row, 0, 4);
+ } while (++plane < planes_);
+ ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst);
+ // If we run out of rows, copy the last valid row (mimics the bottom
+ // border extension).
+ plane = kPlaneY;
+ do {
+ if (rows[plane] == 0 || rows[plane] >= 4) continue;
+ const ptrdiff_t stride = frame_buffer_.stride(plane);
+ uint8_t* dst_line = dst[plane] + rows[plane] * stride;
+ const uint8_t* const src_line = dst_line - stride;
+ const int upscaled_width = super_res_info_[plane].upscaled_width
+ << pixel_size_log2_;
+ for (int i = rows[plane]; i < 4; ++i) {
+ memcpy(dst_line, src_line, upscaled_width);
+ dst_line += stride;
+ }
+ } while (++plane < planes_);
+ } else {
+ int plane = kPlaneY;
+ do {
+ CopyDeblockedPixels(static_cast<Plane>(plane), row4x4);
+ } while (++plane < planes_);
+ }
+ // Extend the left and right boundaries needed for loop restoration.
+ int plane = kPlaneY;
+ do {
+ if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+ continue;
+ }
+ uint8_t* dst_line = dst[plane];
+ const int plane_width =
+ SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+ for (int i = 0; i < 4; ++i) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ ExtendLine<uint16_t>(dst_line, plane_width,
+ kRestorationHorizontalBorder,
+ kRestorationHorizontalBorder);
+ } else // NOLINT.
+#endif
+ {
+ ExtendLine<uint8_t>(dst_line, plane_width,
+ kRestorationHorizontalBorder,
+ kRestorationHorizontalBorder);
+ }
+ dst_line += loop_restoration_border_.stride(plane);
+ }
+ } while (++plane < planes_);
+ }
+}
+
+void PostFilter::RunJobs(WorkerFunction worker) {
+ std::atomic<int> row4x4(0);
+ const int num_workers = thread_pool_->num_threads();
+ BlockingCounter pending_workers(num_workers);
+ for (int i = 0; i < num_workers; ++i) {
+ thread_pool_->Schedule([this, &row4x4, &pending_workers, worker]() {
+ (this->*worker)(&row4x4);
+ pending_workers.Decrement();
+ });
+ }
+ // Run the jobs on the current thread.
+ (this->*worker)(&row4x4);
+ // Wait for the threadpool jobs to finish.
+ pending_workers.Wait();
+}
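RunJobs() and the worker functions above follow a claim-rows-with-fetch_add pattern: every pool thread plus the calling thread repeatedly grabs the next chunk of rows until the frame is exhausted. The sketch below reproduces the pattern with plain std::thread in place of the library's ThreadPool and BlockingCounter, which are assumed here to behave like a fixed pool plus a countdown latch.

#include <atomic>
#include <thread>
#include <vector>

void RunRowJobs(int num_rows, int rows_per_job, int num_threads) {
  std::atomic<int> next_row(0);
  const auto worker = [&]() {
    int row;
    while ((row = next_row.fetch_add(rows_per_job,
                                     std::memory_order_relaxed)) < num_rows) {
      // Process rows [row, row + rows_per_job) here.
    }
  };
  std::vector<std::thread> threads;
  // Fan out to the pool threads, then participate on the calling thread.
  for (int i = 1; i < num_threads; ++i) threads.emplace_back(worker);
  worker();
  for (auto& t : threads) t.join();
}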
+
+void PostFilter::ApplyFilteringThreaded() {
+ if (DoDeblock()) {
+ RunJobs(&PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>);
+ RunJobs(&PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>);
+ }
+ if (DoCdef() && DoRestoration()) {
+ for (int row4x4 = 0; row4x4 < frame_header_.rows4x4;
+ row4x4 += kNum4x4InLoopFilterUnit) {
+ SetupLoopRestorationBorder(row4x4, kNum4x4InLoopFilterUnit);
+ }
+ }
+ if (DoCdef()) {
+ for (int row4x4 = 0; row4x4 < frame_header_.rows4x4;
+ row4x4 += kNum4x4InLoopFilterUnit) {
+ SetupCdefBorder(row4x4);
+ }
+ RunJobs(&PostFilter::ApplyCdefWorker);
+ }
+ if (DoSuperRes()) ApplySuperResThreaded();
+ if (DoRestoration()) {
+ if (!DoCdef()) {
+ int row4x4 = 0;
+ do {
+ SetupLoopRestorationBorder(row4x4);
+ row4x4 += kNum4x4InLoopFilterUnit;
+ } while (row4x4 < frame_header_.rows4x4);
+ }
+ RunJobs(&PostFilter::ApplyLoopRestorationWorker);
+ }
+ ExtendBordersForReferenceFrame();
+}
+
+int PostFilter::ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4,
+ bool is_last_row,
+ bool do_deblock) {
+ if (row4x4 < 0) return -1;
+ if (DoDeblock() && do_deblock) {
+ ApplyDeblockFilterForOneSuperBlockRow(row4x4, sb4x4);
+ }
+ if (DoRestoration() && DoCdef()) {
+ SetupLoopRestorationBorder(row4x4, sb4x4);
+ }
+ if (DoCdef()) {
+ ApplyCdefForOneSuperBlockRow(row4x4, sb4x4, is_last_row);
+ }
+ if (DoSuperRes()) {
+ ApplySuperResForOneSuperBlockRow(row4x4, sb4x4, is_last_row);
+ }
+ if (DoRestoration()) {
+ CopyBordersForOneSuperBlockRow(row4x4, sb4x4, true);
+ ApplyLoopRestoration(row4x4, sb4x4);
+ if (is_last_row) {
+ // Loop restoration operates with a lag of 8 rows. So make sure to cover
+ // all the rows of the last superblock row.
+ CopyBordersForOneSuperBlockRow(row4x4 + sb4x4, 16, true);
+ ApplyLoopRestoration(row4x4 + sb4x4, 16);
+ }
+ }
+ if (frame_header_.refresh_frame_flags != 0 && DoBorderExtensionInLoop()) {
+ CopyBordersForOneSuperBlockRow(row4x4, sb4x4, false);
+ if (is_last_row) {
+ CopyBordersForOneSuperBlockRow(row4x4 + sb4x4, 16, false);
+ }
+ }
+ if (is_last_row && !DoBorderExtensionInLoop()) {
+ ExtendBordersForReferenceFrame();
+ }
+ return is_last_row ? height_ : progress_row_;
+}
+
+} // namespace libgav1
diff --git a/src/post_filter/super_res.cc b/src/post_filter/super_res.cc
new file mode 100644
index 0000000..a70e4ed
--- /dev/null
+++ b/src/post_filter/super_res.cc
@@ -0,0 +1,199 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "src/post_filter.h"
+#include "src/utils/blocking_counter.h"
+
+namespace libgav1 {
+
+void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& src,
+ const std::array<int, kMaxPlanes>& rows,
+ const int line_buffer_row,
+ const std::array<uint8_t*, kMaxPlanes>& dst) {
+ int plane = kPlaneY;
+ do {
+ const int plane_width =
+ MultiplyBy4(frame_header_.columns4x4) >> subsampling_x_[plane];
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ auto* input = reinterpret_cast<uint16_t*>(src[plane]);
+ auto* output = reinterpret_cast<uint16_t*>(dst[plane]);
+ const ptrdiff_t stride = frame_buffer_.stride(plane) / sizeof(uint16_t);
+ if (rows[plane] > 0) {
+ dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+ input, stride, rows[plane], plane_width,
+ super_res_info_[plane].upscaled_width,
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step, output);
+ }
+ // In the multi-threaded case, the |superres_line_buffer_| holds the last
+ // input row. Apply SuperRes for that row.
+ if (line_buffer_row >= 0) {
+ auto* const line_buffer_start =
+ reinterpret_cast<uint16_t*>(superres_line_buffer_.data(plane)) +
+ line_buffer_row * superres_line_buffer_.stride(plane) /
+ sizeof(uint16_t) +
+ kSuperResHorizontalBorder;
+ dsp_.super_res(
+ superres_coefficients_[static_cast<int>(plane != 0)],
+ line_buffer_start, /*stride=*/0,
+ /*height=*/1, plane_width, super_res_info_[plane].upscaled_width,
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step, output + rows[plane] * stride);
+ }
+ continue;
+ }
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ uint8_t* input = src[plane];
+ uint8_t* output = dst[plane];
+ if (rows[plane] > 0) {
+ dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+ input, frame_buffer_.stride(plane), rows[plane],
+ plane_width, super_res_info_[plane].upscaled_width,
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step, output);
+ }
+ // In the multi-threaded case, the |superres_line_buffer_| holds the last
+ // input row. Apply SuperRes for that row.
+ if (line_buffer_row >= 0) {
+ uint8_t* const line_buffer_start =
+ superres_line_buffer_.data(plane) +
+ line_buffer_row * superres_line_buffer_.stride(plane) +
+ kSuperResHorizontalBorder;
+ dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+ line_buffer_start, /*stride=*/0,
+ /*height=*/1, plane_width,
+ super_res_info_[plane].upscaled_width,
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step,
+ output + rows[plane] * frame_buffer_.stride(plane));
+ }
+ } while (++plane < planes_);
+}
+
+void PostFilter::ApplySuperResForOneSuperBlockRow(int row4x4_start, int sb4x4,
+ bool is_last_row) {
+ assert(row4x4_start >= 0);
+ assert(DoSuperRes());
+ // If not doing cdef, then LR needs two rows of border with superres applied.
+ const int num_rows_extra = (DoCdef() || !DoRestoration()) ? 0 : 2;
+ std::array<uint8_t*, kMaxPlanes> src;
+ std::array<uint8_t*, kMaxPlanes> dst;
+ std::array<int, kMaxPlanes> rows;
+ const int num_rows4x4 =
+ std::min(sb4x4, frame_header_.rows4x4 - row4x4_start) -
+ (is_last_row ? 0 : 2);
+ if (row4x4_start > 0) {
+ const int row4x4 = row4x4_start - 2;
+ int plane = kPlaneY;
+ do {
+ const int row =
+ (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + num_rows_extra;
+ const ptrdiff_t row_offset = row * frame_buffer_.stride(plane);
+ src[plane] = cdef_buffer_[plane] + row_offset;
+ dst[plane] = superres_buffer_[plane] + row_offset;
+ // Note that the |num_rows_extra| subtraction is done after the value is
+ // subsampled since we always need to work on |num_rows_extra| extra rows
+ // irrespective of the plane subsampling.
+ // Apply superres for the last 8-|num_rows_extra| rows of the previous
+ // superblock.
+ rows[plane] = (8 >> subsampling_y_[plane]) - num_rows_extra;
+ // Apply superres for the current superblock row (except for the last
+ // 8-|num_rows_extra| rows).
+ rows[plane] += (MultiplyBy4(num_rows4x4) >> subsampling_y_[plane]) +
+ (is_last_row ? 0 : num_rows_extra);
+ } while (++plane < planes_);
+ } else {
+ // Apply superres for the current superblock row (except for the last
+ // 8-|num_rows_extra| rows).
+ int plane = kPlaneY;
+ do {
+ const ptrdiff_t row_offset =
+ (MultiplyBy4(row4x4_start) >> subsampling_y_[plane]) *
+ frame_buffer_.stride(plane);
+ src[plane] = cdef_buffer_[plane] + row_offset;
+ dst[plane] = superres_buffer_[plane] + row_offset;
+ // Note that the |num_rows_extra| addition is done after the value is
+ // subsampled since we always need to work on |num_rows_extra| extra rows
+ // irrespective of the plane subsampling.
+ rows[plane] = (MultiplyBy4(num_rows4x4) >> subsampling_y_[plane]) +
+ (is_last_row ? 0 : num_rows_extra);
+ } while (++plane < planes_);
+ }
+ ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst);
+}
+
+void PostFilter::ApplySuperResThreaded() {
+ int num_threads = thread_pool_->num_threads() + 1;
+ // The number of rows that will be processed by each thread in the thread pool
+ // (other than the current thread).
+ int thread_pool_rows = height_ / num_threads;
+ thread_pool_rows = std::max(thread_pool_rows, 1);
+ // Keep the Y plane row count even when the other planes are vertically
+ // subsampled.
+ if ((thread_pool_rows & 1) != 0 && subsampling_y_[kPlaneU] != 0) {
+ ++thread_pool_rows;
+ }
+ // Adjust the number of threads to what we really need.
+ num_threads = Clip3(height_ / thread_pool_rows, 1, num_threads);
+ // For the current thread, we round up to process all the remaining rows.
+ int current_thread_rows = height_ - thread_pool_rows * (num_threads - 1);
+ // Keep the Y plane row count even when the other planes are vertically
+ // subsampled.
+ if ((current_thread_rows & 1) != 0 && subsampling_y_[kPlaneU] != 0) {
+ ++current_thread_rows;
+ }
+ assert(current_thread_rows > 0);
+ BlockingCounter pending_workers(num_threads - 1);
+ for (int line_buffer_row = 0, row_start = 0; line_buffer_row < num_threads;
+ ++line_buffer_row, row_start += thread_pool_rows) {
+ std::array<uint8_t*, kMaxPlanes> src;
+ std::array<uint8_t*, kMaxPlanes> dst;
+ std::array<int, kMaxPlanes> rows;
+ int plane = kPlaneY;
+ const int pixel_size_log2 = pixel_size_log2_;
+ do {
+ src[plane] =
+ GetBufferOffset(cdef_buffer_[plane], frame_buffer_.stride(plane),
+ static_cast<Plane>(plane), row_start, 0);
+ dst[plane] =
+ GetBufferOffset(superres_buffer_[plane], frame_buffer_.stride(plane),
+ static_cast<Plane>(plane), row_start, 0);
+ rows[plane] =
+ (((line_buffer_row < num_threads - 1) ? thread_pool_rows
+ : current_thread_rows) >>
+ subsampling_y_[plane]) -
+ 1;
+ const int plane_width =
+ MultiplyBy4(frame_header_.columns4x4) >> subsampling_x_[plane];
+ uint8_t* const input =
+ src[plane] + rows[plane] * frame_buffer_.stride(plane);
+ uint8_t* const line_buffer_start =
+ superres_line_buffer_.data(plane) +
+ line_buffer_row * superres_line_buffer_.stride(plane) +
+ (kSuperResHorizontalBorder << pixel_size_log2);
+ memcpy(line_buffer_start, input, plane_width << pixel_size_log2);
+ } while (++plane < planes_);
+ if (line_buffer_row < num_threads - 1) {
+ thread_pool_->Schedule(
+ [this, src, rows, line_buffer_row, dst, &pending_workers]() {
+ ApplySuperRes(src, rows, line_buffer_row, dst);
+ pending_workers.Decrement();
+ });
+ } else {
+ ApplySuperRes(src, rows, line_buffer_row, dst);
+ }
+ }
+ // Wait for the threadpool jobs to finish.
+ pending_workers.Wait();
+}
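The row partition above is easier to follow with concrete numbers. A worked example, assuming a 726-row frame with 4:2:0 subsampling and a pool of three threads plus the calling thread; the values are illustrative only.

constexpr int kHeight = 726;
constexpr int kThreads = 4;     // three pool threads + the calling thread
constexpr int kPoolRows = 182;  // 726 / 4 == 181, bumped to the next even value
constexpr int kUsedThreads = kHeight / kPoolRows;  // 3, already within [1, 4]
constexpr int kCurrentRows = kHeight - kPoolRows * (kUsedThreads - 1);  // 362
static_assert(kUsedThreads == 3 && kCurrentRows == 362,
              "two pool threads take 182 rows each; the calling thread takes "
              "the larger, already even remainder");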
+
+} // namespace libgav1
diff --git a/src/prediction_mask.cc b/src/prediction_mask.cc
new file mode 100644
index 0000000..ab4d849
--- /dev/null
+++ b/src/prediction_mask.cc
@@ -0,0 +1,236 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/prediction_mask.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kWedgeDirectionTypes = 16;
+
+enum kWedgeDirection : uint8_t {
+ kWedgeHorizontal,
+ kWedgeVertical,
+ kWedgeOblique27,
+ kWedgeOblique63,
+ kWedgeOblique117,
+ kWedgeOblique153,
+};
+
+constexpr uint8_t kWedgeCodebook[3][16][3] = {{{kWedgeOblique27, 4, 4},
+ {kWedgeOblique63, 4, 4},
+ {kWedgeOblique117, 4, 4},
+ {kWedgeOblique153, 4, 4},
+ {kWedgeHorizontal, 4, 2},
+ {kWedgeHorizontal, 4, 4},
+ {kWedgeHorizontal, 4, 6},
+ {kWedgeVertical, 4, 4},
+ {kWedgeOblique27, 4, 2},
+ {kWedgeOblique27, 4, 6},
+ {kWedgeOblique153, 4, 2},
+ {kWedgeOblique153, 4, 6},
+ {kWedgeOblique63, 2, 4},
+ {kWedgeOblique63, 6, 4},
+ {kWedgeOblique117, 2, 4},
+ {kWedgeOblique117, 6, 4}},
+ {{kWedgeOblique27, 4, 4},
+ {kWedgeOblique63, 4, 4},
+ {kWedgeOblique117, 4, 4},
+ {kWedgeOblique153, 4, 4},
+ {kWedgeVertical, 2, 4},
+ {kWedgeVertical, 4, 4},
+ {kWedgeVertical, 6, 4},
+ {kWedgeHorizontal, 4, 4},
+ {kWedgeOblique27, 4, 2},
+ {kWedgeOblique27, 4, 6},
+ {kWedgeOblique153, 4, 2},
+ {kWedgeOblique153, 4, 6},
+ {kWedgeOblique63, 2, 4},
+ {kWedgeOblique63, 6, 4},
+ {kWedgeOblique117, 2, 4},
+ {kWedgeOblique117, 6, 4}},
+ {{kWedgeOblique27, 4, 4},
+ {kWedgeOblique63, 4, 4},
+ {kWedgeOblique117, 4, 4},
+ {kWedgeOblique153, 4, 4},
+ {kWedgeHorizontal, 4, 2},
+ {kWedgeHorizontal, 4, 6},
+ {kWedgeVertical, 2, 4},
+ {kWedgeVertical, 6, 4},
+ {kWedgeOblique27, 4, 2},
+ {kWedgeOblique27, 4, 6},
+ {kWedgeOblique153, 4, 2},
+ {kWedgeOblique153, 4, 6},
+ {kWedgeOblique63, 2, 4},
+ {kWedgeOblique63, 6, 4},
+ {kWedgeOblique117, 2, 4},
+ {kWedgeOblique117, 6, 4}}};
+
+constexpr BitMaskSet kWedgeFlipSignMasks[9] = {
+ BitMaskSet(0xBBFF), // kBlock8x8
+ BitMaskSet(0xBBEF), // kBlock8x16
+ BitMaskSet(0xBAEF), // kBlock8x32
+ BitMaskSet(0xBBEF), // kBlock16x8
+ BitMaskSet(0xBBFF), // kBlock16x16
+ BitMaskSet(0xBBEF), // kBlock16x32
+ BitMaskSet(0xABEF), // kBlock32x8
+ BitMaskSet(0xBBEF), // kBlock32x16
+ BitMaskSet(0xBBFF) // kBlock32x32
+};
+
+// This table (and the one below) contains a few leading zeros and trailing 64s
+// to avoid some additional memcpys where it is actually used.
+constexpr uint8_t kWedgeMasterObliqueOdd[kWedgeMaskMasterSize * 3 / 2] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 6, 18, 37,
+ 53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64};
+
+constexpr uint8_t kWedgeMasterObliqueEven[kWedgeMaskMasterSize * 3 / 2] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 11, 27,
+ 46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64};
+
+constexpr uint8_t kWedgeMasterVertical[kWedgeMaskMasterSize] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 7, 21,
+ 43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64};
+
+int BlockShape(BlockSize block_size) {
+ const int width = kNum4x4BlocksWide[block_size];
+ const int height = kNum4x4BlocksHigh[block_size];
+ if (height > width) return 0;
+ if (height < width) return 1;
+ return 2;
+}
+
+uint8_t GetWedgeDirection(BlockSize block_size, int index) {
+ return kWedgeCodebook[BlockShape(block_size)][index][0];
+}
+
+uint8_t GetWedgeOffsetX(BlockSize block_size, int index) {
+ return kWedgeCodebook[BlockShape(block_size)][index][1];
+}
+
+uint8_t GetWedgeOffsetY(BlockSize block_size, int index) {
+ return kWedgeCodebook[BlockShape(block_size)][index][2];
+}
+
+} // namespace
+
+bool GenerateWedgeMask(WedgeMaskArray* const wedge_masks) {
+ // Generate master masks.
+ uint8_t master_mask[6][kWedgeMaskMasterSize][kWedgeMaskMasterSize];
+ for (int y = 0; y < kWedgeMaskMasterSize; ++y) {
+ memcpy(master_mask[kWedgeVertical][y], kWedgeMasterVertical,
+ kWedgeMaskMasterSize);
+ }
+
+ for (int y = 0, shift = 0; y < kWedgeMaskMasterSize; y += 2, ++shift) {
+ memcpy(master_mask[kWedgeOblique63][y], kWedgeMasterObliqueEven + shift,
+ kWedgeMaskMasterSize);
+ memcpy(master_mask[kWedgeOblique63][y + 1], kWedgeMasterObliqueOdd + shift,
+ kWedgeMaskMasterSize);
+ }
+
+ for (int y = 0; y < kWedgeMaskMasterSize; ++y) {
+ for (int x = 0; x < kWedgeMaskMasterSize; ++x) {
+ const uint8_t mask_value = master_mask[kWedgeOblique63][y][x];
+ master_mask[kWedgeHorizontal][x][y] = master_mask[kWedgeVertical][y][x];
+ master_mask[kWedgeOblique27][x][y] = mask_value;
+ master_mask[kWedgeOblique117][y][kWedgeMaskMasterSize - 1 - x] =
+ 64 - mask_value;
+ master_mask[kWedgeOblique153][(kWedgeMaskMasterSize - 1 - x)][y] =
+ 64 - mask_value;
+ }
+ }
+
+ // Generate wedge masks.
+ int block_size_index = 0;
+ for (int size = kBlock8x8; size <= kBlock32x32; ++size) {
+ if (!kIsWedgeCompoundModeAllowed.Contains(size)) continue;
+
+ const int width = kBlockWidthPixels[size];
+ const int height = kBlockHeightPixels[size];
+ assert(width >= 8);
+ assert(width <= 32);
+ assert(height >= 8);
+ assert(height <= 32);
+
+ const auto block_size = static_cast<BlockSize>(size);
+ for (int wedge_index = 0; wedge_index < kWedgeDirectionTypes;
+ ++wedge_index) {
+ const uint8_t direction = GetWedgeDirection(block_size, wedge_index);
+ const uint8_t offset_x =
+ DivideBy2(kWedgeMaskMasterSize) -
+ ((GetWedgeOffsetX(block_size, wedge_index) * width) >> 3);
+ const uint8_t offset_y =
+ DivideBy2(kWedgeMaskMasterSize) -
+ ((GetWedgeOffsetY(block_size, wedge_index) * height) >> 3);
+
+ // Allocate the 2d array.
+ for (int flip_sign = 0; flip_sign < 2; ++flip_sign) {
+ if (!((*wedge_masks)[block_size_index][flip_sign][wedge_index].Reset(
+ height, width, /*zero_initialize=*/false))) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate memory for wedge masks.");
+ return false;
+ }
+ }
+
+ const auto flip_sign = static_cast<uint8_t>(
+ kWedgeFlipSignMasks[block_size_index].Contains(wedge_index));
+ uint8_t* wedge_masks_row =
+ (*wedge_masks)[block_size_index][flip_sign][wedge_index][0];
+ uint8_t* wedge_masks_row_flip =
+ (*wedge_masks)[block_size_index][1 - flip_sign][wedge_index][0];
+ uint8_t* master_mask_row = &master_mask[direction][offset_y][offset_x];
+ for (int y = 0; y < height; ++y) {
+ memcpy(wedge_masks_row, master_mask_row, width);
+ for (int x = 0; x < width; ++x) {
+ wedge_masks_row_flip[x] = 64 - wedge_masks_row[x];
+ }
+ wedge_masks_row += width;
+ wedge_masks_row_flip += width;
+ master_mask_row += kWedgeMaskMasterSize;
+ }
+ }
+
+ block_size_index++;
+ }
+ return true;
+}
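The masks generated above hold weights in the range [0, 64], and the flipped copy stores 64 minus each weight. The per-pixel idea behind the blend they feed into can be sketched as below; the real dsp mask-blend functions operate on whole blocks of higher-precision predictions, so this is only an illustration of the weighting.

#include <cstdint>

// Weights two predictors by a wedge mask value in [0, 64]. Swapping the two
// predictors together with the flipped mask (64 - mask) gives the same
// result, which is why only the flip sign needs to be tracked above.
inline uint8_t BlendPixelSketch(uint8_t pred0, uint8_t pred1, uint8_t mask) {
  return static_cast<uint8_t>((mask * pred0 + (64 - mask) * pred1 + 32) >> 6);
}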
+
+} // namespace libgav1
diff --git a/src/prediction_mask.h b/src/prediction_mask.h
new file mode 100644
index 0000000..0134a0d
--- /dev/null
+++ b/src/prediction_mask.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_PREDICTION_MASK_H_
+#define LIBGAV1_SRC_PREDICTION_MASK_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+constexpr BitMaskSet kIsWedgeCompoundModeAllowed(kBlock8x8, kBlock8x16,
+ kBlock8x32, kBlock16x8,
+ kBlock16x16, kBlock16x32,
+ kBlock32x8, kBlock32x16,
+ kBlock32x32);
+
+// This function generates wedge masks. It should be called only once per
+// decoder instance. If the video contains only key frames, this function does
+// not need to be called. Returns true on success, false on allocation
+// failure. Section 7.11.3.11.
+bool GenerateWedgeMask(WedgeMaskArray* wedge_masks);
+
+} // namespace libgav1
+#endif // LIBGAV1_SRC_PREDICTION_MASK_H_
diff --git a/src/quantizer.cc b/src/quantizer.cc
new file mode 100644
index 0000000..cd720d6
--- /dev/null
+++ b/src/quantizer.cc
@@ -0,0 +1,269 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/quantizer.h"
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+#if LIBGAV1_MAX_BITDEPTH != 8 && LIBGAV1_MAX_BITDEPTH != 10
+#error LIBGAV1_MAX_BITDEPTH must be 8 or 10
+#endif
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/quantizer_tables.inc"
+
+// Format the kDcLookup and kAcLookup arrays manually for easier comparison
+// with the Dc_Qlookup and Ac_Qlookup arrays in Section 7.12.2.
+
+// clang-format off
+constexpr int16_t kDcLookup[][256] = {
+ // Lookup table for 8 bit.
+ {
+ 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16,
+ 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 26,
+ 27, 28, 29, 30, 31, 32, 32, 33, 34, 35, 36, 37,
+ 38, 38, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47,
+ 48, 48, 49, 50, 51, 52, 53, 53, 54, 55, 56, 57,
+ 57, 58, 59, 60, 61, 62, 62, 63, 64, 65, 66, 66,
+ 67, 68, 69, 70, 70, 71, 72, 73, 74, 74, 75, 76,
+ 77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85,
+ 87, 88, 90, 92, 93, 95, 96, 98, 99, 101, 102, 104,
+ 105, 107, 108, 110, 111, 113, 114, 116, 117, 118, 120, 121,
+ 123, 125, 127, 129, 131, 134, 136, 138, 140, 142, 144, 146,
+ 148, 150, 152, 154, 156, 158, 161, 164, 166, 169, 172, 174,
+ 177, 180, 182, 185, 187, 190, 192, 195, 199, 202, 205, 208,
+ 211, 214, 217, 220, 223, 226, 230, 233, 237, 240, 243, 247,
+ 250, 253, 257, 261, 265, 269, 272, 276, 280, 284, 288, 292,
+ 296, 300, 304, 309, 313, 317, 322, 326, 330, 335, 340, 344,
+ 349, 354, 359, 364, 369, 374, 379, 384, 389, 395, 400, 406,
+ 411, 417, 423, 429, 435, 441, 447, 454, 461, 467, 475, 482,
+ 489, 497, 505, 513, 522, 530, 539, 549, 559, 569, 579, 590,
+ 602, 614, 626, 640, 654, 668, 684, 700, 717, 736, 755, 775,
+ 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139,
+ 1184, 1232, 1282, 1336
+ },
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ // Lookup table for 10 bit.
+ {
+ 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34,
+ 37, 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75,
+ 78, 82, 86, 90, 93, 97, 101, 105, 109, 113, 116, 120,
+ 124, 128, 132, 136, 140, 143, 147, 151, 155, 159, 163, 166,
+ 170, 174, 178, 182, 185, 189, 193, 197, 200, 204, 208, 212,
+ 215, 219, 223, 226, 230, 233, 237, 241, 244, 248, 251, 255,
+ 259, 262, 266, 269, 273, 276, 280, 283, 287, 290, 293, 297,
+ 300, 304, 307, 310, 314, 317, 321, 324, 327, 331, 334, 337,
+ 343, 350, 356, 362, 369, 375, 381, 387, 394, 400, 406, 412,
+ 418, 424, 430, 436, 442, 448, 454, 460, 466, 472, 478, 484,
+ 490, 499, 507, 516, 525, 533, 542, 550, 559, 567, 576, 584,
+ 592, 601, 609, 617, 625, 634, 644, 655, 666, 676, 687, 698,
+ 708, 718, 729, 739, 749, 759, 770, 782, 795, 807, 819, 831,
+ 844, 856, 868, 880, 891, 906, 920, 933, 947, 961, 975, 988,
+ 1001, 1015, 1030, 1045, 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170,
+ 1186, 1202, 1218, 1236, 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379,
+ 1398, 1416, 1436, 1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624,
+ 1647, 1670, 1692, 1717, 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929,
+ 1958, 1990, 2021, 2054, 2088, 2123, 2159, 2197, 2236, 2276, 2319, 2363,
+ 2410, 2458, 2508, 2561, 2616, 2675, 2737, 2802, 2871, 2944, 3020, 3102,
+ 3188, 3280, 3375, 3478, 3586, 3702, 3823, 3953, 4089, 4236, 4394, 4559,
+ 4737, 4929, 5130, 5347
+ },
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+};
+
+constexpr int16_t kAcLookup[][256] = {
+ // Lookup table for 8 bit.
+ {
+ 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
+ 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
+ 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78,
+ 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102,
+ 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126,
+ 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150,
+ 152, 155, 158, 161, 164, 167, 170, 173, 176, 179, 182, 185,
+ 188, 191, 194, 197, 200, 203, 207, 211, 215, 219, 223, 227,
+ 231, 235, 239, 243, 247, 251, 255, 260, 265, 270, 275, 280,
+ 285, 290, 295, 300, 305, 311, 317, 323, 329, 335, 341, 347,
+ 353, 359, 366, 373, 380, 387, 394, 401, 408, 416, 424, 432,
+ 440, 448, 456, 465, 474, 483, 492, 501, 510, 520, 530, 540,
+ 550, 560, 571, 582, 593, 604, 615, 627, 639, 651, 663, 676,
+ 689, 702, 715, 729, 743, 757, 771, 786, 801, 816, 832, 848,
+ 864, 881, 898, 915, 933, 951, 969, 988, 1007, 1026, 1046, 1066,
+ 1087, 1108, 1129, 1151, 1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343,
+ 1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567, 1597, 1628, 1660, 1692,
+ 1725, 1759, 1793, 1828
+ },
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ // Lookup table for 10 bit.
+ {
+ 4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37,
+ 40, 44, 48, 51, 55, 59, 63, 67, 71, 75, 79, 83,
+ 88, 92, 96, 100, 105, 109, 114, 118, 122, 127, 131, 136,
+ 140, 145, 149, 154, 158, 163, 168, 172, 177, 181, 186, 190,
+ 195, 199, 204, 208, 213, 217, 222, 226, 231, 235, 240, 244,
+ 249, 253, 258, 262, 267, 271, 275, 280, 284, 289, 293, 297,
+ 302, 306, 311, 315, 319, 324, 328, 332, 337, 341, 345, 349,
+ 354, 358, 362, 367, 371, 375, 379, 384, 388, 392, 396, 401,
+ 409, 417, 425, 433, 441, 449, 458, 466, 474, 482, 490, 498,
+ 506, 514, 523, 531, 539, 547, 555, 563, 571, 579, 588, 596,
+ 604, 616, 628, 640, 652, 664, 676, 688, 700, 713, 725, 737,
+ 749, 761, 773, 785, 797, 809, 825, 841, 857, 873, 889, 905,
+ 922, 938, 954, 970, 986, 1002, 1018, 1038, 1058, 1078, 1098, 1118,
+ 1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386,
+ 1411, 1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727,
+ 1759, 1791, 1823, 1859, 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159,
+ 2199, 2239, 2283, 2327, 2371, 2415, 2459, 2507, 2555, 2603, 2651, 2703,
+ 2755, 2807, 2859, 2915, 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391,
+ 3455, 3523, 3591, 3659, 3731, 3803, 3876, 3952, 4028, 4104, 4184, 4264,
+ 4348, 4432, 4516, 4604, 4692, 4784, 4876, 4972, 5068, 5168, 5268, 5372,
+ 5476, 5584, 5692, 5804, 5916, 6032, 6148, 6268, 6388, 6512, 6640, 6768,
+ 6900, 7036, 7172, 7312
+ },
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+};
+// clang-format on
+
+void Transpose(uint8_t* const dst, const uint8_t* const src, int src_width,
+ int src_height) {
+ const int dst_width = src_height;
+ const int dst_height = src_width;
+ Array2DView<const uint8_t> source(src_height, src_width, src);
+ Array2DView<uint8_t> dest(dst_height, dst_width, dst);
+ for (int y = 0; y < dst_height; ++y) {
+ for (int x = 0; x < dst_width; ++x) {
+ dest[y][x] = source[x][y];
+ }
+ }
+}
+
+// Copies the packed lower-triangle values in |src| into |dst| and mirrors
+// them across the main diagonal to fill the upper triangle.
+void FillUpperTriangle(uint8_t* dst, const uint8_t* src, int size) {
+ Array2DView<uint8_t> dest(size, size, dst);
+ int k = 0;
+ for (int y = 0; y < size; ++y) {
+ for (int x = 0; x <= y; ++x) {
+ dest[y][x] = dest[x][y] = src[k++];
+ }
+ }
+}
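A worked example of the packed layout FillUpperTriangle() consumes, restated standalone rather than by calling into the library:

#include <cassert>
#include <cstdint>

// The six values of a 3x3 symmetric matrix, stored lower-triangle-first,
// expand to:
//   1 2 4
//   2 3 5
//   4 5 6
inline void FillUpperTriangleExample() {
  const uint8_t src[6] = {1, 2, 3, 4, 5, 6};
  uint8_t dst[3][3];
  int k = 0;
  for (int y = 0; y < 3; ++y) {
    for (int x = 0; x <= y; ++x) dst[y][x] = dst[x][y] = src[k++];
  }
  assert(dst[0][2] == 4 && dst[2][0] == 4);  // mirrored across the diagonal
  assert(dst[2][1] == 5 && dst[2][2] == 6);
  static_cast<void>(dst);
}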
+
+} // namespace
+
+bool InitializeQuantizerMatrix(QuantizerMatrix* quantizer_matrix_ptr) {
+ for (int level = 0; level < kNumQuantizerLevelsForQuantizerMatrix; ++level) {
+ for (int plane_type = kPlaneTypeY; plane_type < kNumPlaneTypes;
+ ++plane_type) {
+ auto& quantizer_matrix = (*quantizer_matrix_ptr)[level][plane_type];
+ // Notes about how these matrices are populated:
+ // * For square transforms, we store only the lower left triangle (the
+ // matrix is symmetric about the main diagonal), so when populating the
+ // matrix we have to fill in the upper right triangle.
+ // * For rectangular transforms, the w x h matrix is the transpose of the
+ // h x w matrix. So we populate with memcpy when w < h and by transposing
+ // when w > h.
+ // * There is a special case for 16x16 where the matrix is the same as
+ // 32x32 with some offsets.
+ // * We use the "adjusted transform size" when using these matrices, so we
+ // won't have to populate them for transform sizes with one of the
+ // dimensions equal to 64.
+ for (int tx_size = 0; tx_size < kNumTransformSizes; ++tx_size) {
+ if (kTransformWidth[tx_size] == 64 || kTransformHeight[tx_size] == 64) {
+ continue;
+ }
+ const int size = kTransformWidth[tx_size] * kTransformHeight[tx_size];
+ if (!quantizer_matrix[tx_size].Resize(size)) {
+ return false;
+ }
+ }
+#define QUANTIZER_MEMCPY(W, H) \
+ memcpy(quantizer_matrix[kTransformSize##W##x##H].get(), \
+ kQuantizerMatrix##W##x##H[level][plane_type], (W) * (H))
+#define QUANTIZER_TRANSPOSE(W, H) \
+ Transpose(quantizer_matrix[kTransformSize##W##x##H].get(), \
+ kQuantizerMatrix##H##x##W[level][plane_type], H, W)
+#define QUANTIZER_FILL_UPPER_TRIANGLE(SIZE) \
+ FillUpperTriangle(quantizer_matrix[kTransformSize##SIZE##x##SIZE].get(), \
+ kQuantizerMatrix##SIZE##x##SIZE[level][plane_type], SIZE)
+ QUANTIZER_FILL_UPPER_TRIANGLE(4); // 4x4
+ QUANTIZER_MEMCPY(4, 8); // 4x8
+ QUANTIZER_MEMCPY(4, 16); // 4x16
+ QUANTIZER_TRANSPOSE(8, 4); // 8x4
+ QUANTIZER_FILL_UPPER_TRIANGLE(8); // 8x8
+ QUANTIZER_MEMCPY(8, 16); // 8x16
+ QUANTIZER_MEMCPY(8, 32); // 8x32
+ QUANTIZER_TRANSPOSE(16, 4); // 16x4
+ QUANTIZER_TRANSPOSE(16, 8); // 16x8
+ QUANTIZER_MEMCPY(16, 32); // 16x32
+ QUANTIZER_TRANSPOSE(32, 8); // 32x8
+ QUANTIZER_TRANSPOSE(32, 16); // 32x16
+ QUANTIZER_FILL_UPPER_TRIANGLE(32); // 32x32
+ // 16x16.
+ Array2DView<uint8_t> dst16x16(
+ 16, 16, quantizer_matrix[kTransformSize16x16].get());
+ Array2DView<const uint8_t> src32x32(
+ 32, 32, quantizer_matrix[kTransformSize32x32].get());
+ for (int y = 0; y < 16; ++y) {
+ for (int x = 0; x < 16; ++x) {
+ dst16x16[y][x] = src32x32[MultiplyBy2(y)][MultiplyBy2(x)];
+ }
+ }
+#undef QUANTIZER_FILL_UPPER_TRIANGLE
+#undef QUANTIZER_TRANSPOSE
+#undef QUANTIZER_MEMCPY
+ }
+ }
+ return true;
+}
+
+int GetQIndex(const Segmentation& segmentation, int index, int base_qindex) {
+ if (segmentation.FeatureActive(index, kSegmentFeatureQuantizer)) {
+ const int segment_qindex =
+ base_qindex +
+ segmentation.feature_data[index][kSegmentFeatureQuantizer];
+ return Clip3(segment_qindex, kMinQuantizer, kMaxQuantizer);
+ }
+ return base_qindex;
+}
+
+Quantizer::Quantizer(int bitdepth, const QuantizerParameters* params)
+ : params_(*params) {
+ assert(bitdepth >= 8 && bitdepth <= LIBGAV1_MAX_BITDEPTH);
+ const int index = BitdepthToArrayIndex(bitdepth);
+ dc_lookup_ = kDcLookup[index];
+ ac_lookup_ = kAcLookup[index];
+}
+
+int Quantizer::GetDcValue(Plane plane, int qindex) const {
+ return dc_lookup_[Clip3(qindex + params_.delta_dc[plane], kMinQuantizer,
+ kMaxQuantizer)];
+}
+
+int Quantizer::GetAcValue(Plane plane, int qindex) const {
+ return ac_lookup_[Clip3(qindex + params_.delta_ac[plane], kMinQuantizer,
+ kMaxQuantizer)];
+}
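A sketch of the calling pattern described by the comments in quantizer.h: derive the effective qindex for a block's segment, then look up the DC and AC scale values. The wrapper name and its in/out parameters are illustrative, and the declarations from src/quantizer.h are assumed to be in scope.

#include "src/quantizer.h"

namespace libgav1 {

// |segment_id| and |base_qindex| stand in for the values a Tile would supply.
inline void LookupQuantizerValues(const Segmentation& segmentation,
                                  const Quantizer& quantizer, int segment_id,
                                  int base_qindex, int* dc_scale,
                                  int* ac_scale) {
  const int qindex = GetQIndex(segmentation, segment_id, base_qindex);
  *dc_scale = quantizer.GetDcValue(kPlaneY, qindex);
  *ac_scale = quantizer.GetAcValue(kPlaneY, qindex);
}

}  // namespace libgav1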
+
+} // namespace libgav1
diff --git a/src/quantizer.h b/src/quantizer.h
new file mode 100644
index 0000000..00c53ab
--- /dev/null
+++ b/src/quantizer.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_QUANTIZER_H_
+#define LIBGAV1_SRC_QUANTIZER_H_
+
+#include <cstdint>
+
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+using QuantizerMatrix = std::array<
+ std::array<std::array<DynamicBuffer<uint8_t>, kNumTransformSizes>,
+ kNumPlaneTypes>,
+ kNumQuantizerLevelsForQuantizerMatrix>;
+
+// Implements the dequantization functions of Section 7.12.2.
+class Quantizer {
+ public:
+ Quantizer(int bitdepth, const QuantizerParameters* params);
+
+ // Returns the quantizer value for the dc coefficient for the given plane.
+ // The caller should call GetQIndex() with Tile::current_quantizer_index_ as
+ // the |base_qindex| argument, and pass the return value as the |qindex|
+ // argument to this method.
+ int GetDcValue(Plane plane, int qindex) const;
+
+ // Returns the quantizer value for the ac coefficient for the given plane.
+ // The caller should call GetQIndex() with Tile::current_quantizer_index_ as
+ // the |base_qindex| argument, and pass the return value as the |qindex|
+ // argument to this method.
+ int GetAcValue(Plane plane, int qindex) const;
+
+ private:
+ const QuantizerParameters& params_;
+ const int16_t* dc_lookup_;
+ const int16_t* ac_lookup_;
+};
+
+// Initializes the quantizer matrix. Returns true on success, false on
+// allocation failure.
+bool InitializeQuantizerMatrix(QuantizerMatrix* quantizer_matrix);
+
+// Get the quantizer index for the |index|th segment.
+//
+// This function has two use cases. What should be passed as the |base_qindex|
+// argument depends on the use case.
+// 1. While parsing the uncompressed header or transform type, pass
+// Quantizer::base_index.
+// Note: In this use case, the caller only cares about whether the return
+// value is zero.
+// 2. To generate the |qindex| argument to Quantizer::GetDcValue() or
+// Quantizer::GetAcValue(), pass Tile::current_quantizer_index_.
+int GetQIndex(const Segmentation& segmentation, int index, int base_qindex);
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_QUANTIZER_H_
diff --git a/src/quantizer_tables.inc b/src/quantizer_tables.inc
new file mode 100644
index 0000000..34342c4
--- /dev/null
+++ b/src/quantizer_tables.inc
@@ -0,0 +1,3080 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file is just a convenience to separate out all the quantizer table
+// definitions from the quantizer functions.
+
+constexpr uint8_t kQuantizerMatrix4x8
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][32] = {
+ {{32, 42, 75, 91, 33, 42, 69, 86, 37, 58, 84,
+ 91, 49, 71, 103, 110, 65, 84, 125, 128, 80, 97,
+ 142, 152, 91, 100, 145, 178, 104, 112, 146, 190},
+ {31, 47, 60, 66, 40, 45, 54, 61, 46, 56, 64, 64, 48, 61, 75, 73,
+ 54, 65, 85, 82, 61, 69, 92, 92, 64, 68, 90, 102, 68, 71, 87, 105}},
+ {{32, 42, 69, 88, 33, 42, 64, 83, 36, 56, 77,
+ 88, 46, 67, 93, 105, 60, 79, 112, 122, 75, 92,
+ 130, 144, 86, 95, 136, 167, 98, 105, 136, 177},
+ {31, 47, 57, 65, 40, 45, 52, 61, 46, 55, 61, 63, 47, 60, 70, 72,
+ 52, 64, 79, 81, 59, 68, 87, 90, 63, 66, 88, 99, 66, 69, 85, 102}},
+ {{32, 38, 62, 86, 32, 40, 58, 80, 34, 51, 68,
+ 85, 44, 61, 85, 101, 54, 69, 98, 117, 72, 84,
+ 118, 136, 82, 89, 129, 157, 92, 98, 127, 165},
+ {31, 47, 54, 64, 38, 46, 50, 60, 46, 53, 57, 62, 46, 56, 66, 71,
+ 50, 59, 74, 79, 57, 64, 82, 88, 61, 65, 85, 97, 65, 67, 82, 99}},
+ {{32, 35, 59, 83, 32, 36, 57, 78, 34, 47, 65,
+ 82, 41, 53, 78, 97, 51, 61, 92, 111, 65, 73,
+ 108, 129, 75, 81, 117, 148, 86, 92, 119, 154},
+ {31, 47, 53, 63, 36, 47, 50, 59, 46, 52, 55, 61, 45, 53, 63, 70,
+ 49, 55, 71, 77, 54, 58, 77, 86, 59, 61, 81, 94, 63, 65, 80, 95}},
+ {{32, 35, 51, 77, 32, 36, 50, 72, 34, 42, 54, 75, 38, 51, 67, 87,
+ 48, 59, 80, 103, 60, 68, 92, 119, 72, 79, 104, 135, 81, 86, 112, 144},
+ {31, 47, 50, 61, 36, 47, 47, 57, 43, 50, 50, 58, 45, 53, 58, 65,
+ 47, 54, 66, 74, 52, 56, 70, 82, 57, 60, 75, 90, 61, 63, 77, 93}},
+ {{32, 35, 51, 75, 32, 36, 50, 71, 34, 42, 54, 73, 37, 50, 65, 84,
+ 45, 56, 76, 96, 54, 63, 87, 110, 65, 73, 97, 125, 75, 81, 106, 136},
+ {31, 47, 50, 60, 36, 47, 47, 56, 43, 50, 50, 57, 46, 53, 57, 64,
+ 46, 54, 64, 71, 50, 55, 68, 78, 54, 58, 72, 85, 59, 61, 75, 90}},
+ {{32, 34, 43, 62, 32, 34, 42, 59, 33, 37, 44, 58, 35, 43, 54, 68,
+ 41, 48, 64, 79, 49, 54, 71, 91, 57, 60, 78, 101, 66, 68, 86, 111},
+ {31, 42, 47, 54, 33, 44, 45, 51, 40, 47, 46, 50, 47, 50, 54, 57,
+ 45, 49, 59, 64, 48, 50, 61, 70, 51, 52, 63, 75, 55, 55, 66, 79}},
+ {{32, 32, 42, 56, 32, 33, 41, 53, 32, 35, 42, 52, 34, 37, 50, 59,
+ 38, 40, 58, 68, 44, 45, 66, 78, 50, 50, 71, 86, 61, 58, 79, 97},
+ {31, 38, 47, 52, 32, 40, 45, 49, 39, 47, 45, 48, 44, 47, 51, 53,
+ 46, 47, 56, 58, 47, 46, 59, 64, 48, 47, 61, 68, 53, 50, 64, 73}},
+ {{32, 32, 37, 52, 32, 33, 36, 49, 32, 34, 38, 49, 34, 37, 44, 54,
+ 35, 38, 49, 60, 40, 42, 55, 69, 46, 46, 59, 76, 52, 51, 64, 83},
+ {31, 38, 47, 50, 31, 40, 46, 48, 36, 44, 47, 47, 42, 47, 50, 50,
+ 47, 48, 53, 54, 46, 46, 54, 60, 48, 46, 55, 64, 50, 48, 56, 67}},
+ {{31, 32, 35, 43, 32, 33, 34, 41, 32, 34, 36, 42, 32, 35, 38, 42,
+ 34, 37, 43, 49, 37, 40, 49, 56, 42, 43, 53, 63, 46, 46, 56, 67},
+ {31, 38, 47, 48, 31, 40, 46, 45, 35, 43, 47, 46, 39, 47, 47, 45,
+ 43, 47, 50, 50, 47, 47, 53, 55, 46, 46, 53, 58, 48, 46, 54, 59}},
+ {{31, 32, 34, 39, 32, 32, 34, 38, 32, 33, 34, 38, 32, 33, 36, 40,
+ 33, 34, 38, 42, 34, 36, 41, 47, 37, 38, 44, 52, 40, 40, 46, 56},
+ {31, 34, 42, 48, 31, 35, 42, 46, 33, 37, 44, 46, 36, 41, 46, 46,
+ 40, 44, 48, 48, 45, 46, 49, 51, 47, 47, 50, 54, 47, 46, 49, 55}},
+ {{31, 31, 32, 35, 32, 32, 32, 35, 32, 32, 33, 34, 32, 32, 34, 36,
+ 32, 33, 35, 38, 33, 33, 36, 40, 34, 34, 37, 42, 35, 34, 38, 48},
+ {31, 31, 37, 48, 31, 31, 38, 47, 31, 32, 40, 46, 34, 36, 43, 47,
+ 37, 39, 46, 47, 39, 41, 47, 48, 42, 43, 47, 50, 48, 46, 48, 53}},
+ {{31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 32, 32, 33, 34, 32, 32, 34, 34, 32, 33, 34, 35, 33, 33, 35, 36},
+ {31, 31, 35, 37, 31, 31, 36, 38, 31, 32, 37, 39, 31, 32, 37, 40,
+ 34, 36, 40, 43, 35, 37, 42, 44, 38, 40, 45, 47, 41, 42, 45, 47}},
+ {{31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33},
+ {31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36,
+ 31, 32, 32, 36, 31, 33, 33, 37, 34, 36, 36, 40, 34, 36, 36, 40}},
+ {{31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32},
+ {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 31, 32, 32}}};
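Each table in this file is flattened: the inner array holds one width-by-height block of byte weights per (quantizer level, plane type) pair, so kQuantizerMatrix4x8 stores 32 bytes per pair. The helper below is an editorial sketch rather than upstream code; it assumes the 32 values are laid out row by row with 4 entries per row, which matches how the values appear to be grouped above, but the authoritative traversal order is defined by the code that consumes this .inc file.

  // Editorial sketch, not upstream code: index into the flattened 4x8 table.
  // Assumes row-major storage with 4 columns per row (an assumption; the
  // consuming dequantization code defines the real layout).
  inline uint8_t QuantizerMatrix4x8Entry(int level, int plane_type, int row,
                                         int column) {
    constexpr int kColumns = 4;  // assumed row width
    return kQuantizerMatrix4x8[level][plane_type][row * kColumns + column];
  }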
+constexpr uint8_t kQuantizerMatrix4x16
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][64] = {
+ {{31, 44, 79, 96, 32, 41, 72, 90, 32, 42, 71, 86, 34,
+ 48, 73, 83, 34, 54, 78, 89, 41, 63, 90, 95, 45, 67,
+ 96, 102, 54, 75, 110, 111, 60, 79, 118, 123, 72, 90, 133,
+ 135, 75, 92, 136, 149, 83, 100, 142, 160, 88, 100, 140, 173,
+ 94, 101, 144, 180, 101, 108, 141, 188, 108, 115, 151, 197},
+ {31, 49, 63, 69, 32, 45, 57, 65, 36, 46, 56, 62, 43, 49, 57, 60,
+ 46, 53, 60, 63, 45, 58, 67, 66, 46, 59, 71, 70, 50, 62, 78, 74,
+ 52, 64, 82, 80, 57, 67, 89, 85, 59, 68, 90, 91, 62, 71, 91, 96,
+ 63, 69, 89, 101, 65, 68, 89, 103, 67, 70, 86, 105, 69, 72, 88, 107}},
+ {{31, 44, 73, 93, 32, 41, 67, 87, 32, 42, 65, 83, 33,
+ 44, 66, 81, 34, 54, 74, 86, 37, 58, 79, 92, 44, 66,
+ 90, 98, 49, 71, 99, 107, 56, 77, 107, 117, 65, 84, 119,
+ 129, 72, 90, 127, 141, 78, 95, 133, 151, 84, 95, 132, 163,
+ 89, 95, 136, 169, 95, 101, 132, 175, 101, 108, 141, 183},
+ {31, 49, 61, 69, 32, 45, 55, 64, 36, 46, 54, 61, 41, 47, 54, 59,
+ 46, 53, 59, 62, 46, 56, 62, 65, 46, 59, 68, 68, 48, 61, 73, 73,
+ 51, 63, 77, 78, 54, 65, 82, 84, 57, 67, 86, 89, 60, 69, 88, 93,
+ 62, 67, 86, 98, 64, 66, 87, 100, 65, 68, 83, 102, 67, 70, 86, 103}},
+ {{31, 39, 65, 90, 32, 38, 60, 84, 32, 39, 59, 81, 33,
+ 40, 58, 78, 34, 47, 65, 83, 37, 54, 73, 89, 41, 58,
+ 79, 94, 46, 62, 86, 102, 53, 68, 97, 112, 60, 73, 105,
+ 123, 65, 78, 111, 134, 74, 85, 120, 143, 79, 90, 125, 154,
+ 84, 90, 128, 158, 89, 95, 124, 164, 94, 101, 131, 170},
+ {31, 48, 57, 68, 32, 46, 53, 63, 36, 46, 51, 60, 40, 46, 50, 58,
+ 44, 51, 54, 61, 46, 54, 60, 64, 45, 56, 64, 67, 47, 57, 68, 71,
+ 49, 58, 73, 77, 52, 60, 76, 82, 54, 62, 79, 87, 58, 64, 82, 91,
+ 60, 66, 84, 95, 62, 64, 84, 97, 64, 66, 81, 99, 65, 68, 83, 100}},
+ {{31, 36, 62, 88, 32, 35, 58, 82, 32, 36, 57, 79, 33,
+ 38, 56, 76, 34, 42, 61, 81, 34, 48, 66, 85, 39, 51,
+ 74, 91, 44, 56, 82, 98, 49, 60, 90, 107, 54, 63, 95,
+ 117, 60, 68, 102, 127, 68, 75, 110, 135, 75, 81, 117, 145,
+ 79, 85, 120, 148, 84, 89, 116, 153, 88, 94, 123, 159},
+ {31, 48, 56, 67, 32, 46, 52, 62, 35, 47, 50, 60, 40, 47, 49, 57,
+ 43, 50, 53, 60, 46, 53, 56, 63, 45, 53, 61, 66, 46, 54, 65, 70,
+ 48, 54, 70, 75, 50, 55, 72, 80, 52, 56, 75, 85, 56, 59, 79, 89,
+ 58, 61, 81, 93, 60, 63, 82, 94, 62, 64, 79, 96, 63, 66, 81, 97}},
+ {{31, 36, 53, 81, 32, 35, 51, 76, 32, 35, 49, 73, 32,
+ 37, 49, 71, 33, 41, 53, 74, 34, 48, 60, 80, 37, 50,
+ 65, 85, 41, 53, 71, 91, 45, 56, 76, 98, 49, 60, 82,
+ 105, 54, 63, 87, 112, 61, 69, 93, 121, 68, 75, 100, 130,
+ 74, 80, 105, 137, 78, 84, 109, 142, 83, 88, 114, 148},
+ {31, 48, 52, 64, 31, 47, 49, 60, 33, 46, 48, 57, 38, 47, 47, 56,
+ 42, 49, 50, 57, 46, 53, 54, 61, 46, 53, 57, 64, 45, 53, 61, 68,
+ 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, 52, 57, 71, 83,
+ 56, 59, 73, 87, 58, 61, 75, 90, 60, 62, 76, 92, 62, 64, 78, 94}},
+ {{31, 36, 53, 79, 32, 35, 51, 75, 32, 34, 49, 72, 32, 36, 50, 71,
+ 33, 38, 49, 69, 34, 42, 54, 73, 34, 48, 60, 78, 37, 50, 65, 84,
+ 41, 53, 71, 90, 45, 56, 76, 96, 49, 60, 82, 103, 54, 63, 87, 110,
+ 60, 68, 92, 118, 65, 73, 97, 125, 72, 79, 104, 133, 75, 81, 106, 136},
+ {31, 48, 52, 63, 31, 47, 50, 60, 32, 46, 48, 57, 36, 47, 47, 56,
+ 40, 47, 47, 54, 43, 50, 50, 57, 46, 53, 54, 60, 46, 53, 57, 64,
+ 45, 53, 61, 67, 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78,
+ 52, 56, 70, 82, 54, 58, 72, 85, 57, 60, 75, 89, 59, 61, 75, 90}},
+ {{31, 34, 44, 65, 32, 34, 43, 62, 32, 33, 41, 59, 32, 35, 43, 59,
+ 32, 37, 43, 58, 34, 39, 48, 63, 34, 42, 53, 67, 36, 44, 57, 71,
+ 39, 46, 60, 76, 42, 48, 64, 81, 45, 51, 67, 85, 50, 54, 72, 92,
+ 54, 58, 76, 98, 60, 63, 80, 105, 66, 68, 85, 111, 73, 74, 91, 118},
+ {31, 42, 49, 57, 31, 42, 47, 54, 32, 42, 45, 52, 35, 45, 46, 51,
+ 40, 47, 46, 50, 43, 48, 49, 53, 46, 50, 53, 56, 46, 50, 55, 58,
+ 46, 49, 57, 61, 46, 49, 59, 64, 47, 50, 60, 67, 48, 50, 61, 71,
+ 50, 52, 63, 73, 52, 53, 64, 76, 55, 55, 66, 79, 58, 58, 68, 82}},
+ {{31, 32, 44, 58, 32, 32, 42, 55, 32, 33, 41, 53, 32, 34, 42, 53,
+ 32, 34, 42, 53, 32, 35, 42, 52, 34, 37, 48, 57, 35, 38, 54, 63,
+ 37, 40, 57, 67, 39, 41, 60, 70, 41, 43, 63, 74, 45, 46, 67, 79,
+ 50, 50, 71, 86, 54, 53, 74, 90, 57, 56, 77, 93, 61, 58, 79, 97},
+ {31, 37, 49, 54, 31, 38, 47, 51, 32, 40, 45, 49, 34, 42, 45, 49,
+ 37, 44, 45, 48, 39, 47, 45, 48, 42, 47, 49, 51, 47, 48, 53, 55,
+ 46, 47, 55, 58, 46, 46, 57, 60, 46, 46, 58, 62, 47, 46, 59, 65,
+ 48, 47, 61, 68, 50, 48, 62, 70, 51, 49, 63, 71, 53, 50, 64, 73}},
+ {{31, 32, 38, 53, 32, 32, 37, 51, 32, 32, 36, 49, 32, 33, 36, 49,
+ 32, 34, 38, 50, 32, 35, 39, 49, 33, 36, 41, 51, 34, 37, 44, 54,
+ 35, 38, 49, 60, 37, 40, 51, 63, 38, 40, 52, 65, 42, 43, 56, 71,
+ 45, 45, 58, 75, 47, 47, 60, 77, 51, 50, 63, 82, 55, 54, 67, 87},
+ {31, 37, 48, 52, 31, 38, 47, 50, 31, 39, 46, 48, 32, 40, 46, 48,
+ 35, 43, 46, 47, 39, 47, 47, 47, 40, 47, 48, 48, 42, 47, 50, 50,
+ 47, 48, 53, 54, 47, 47, 53, 56, 46, 47, 54, 57, 46, 46, 55, 61,
+ 47, 46, 55, 63, 48, 47, 55, 64, 49, 47, 56, 66, 51, 49, 57, 68}},
+ {{31, 32, 36, 44, 32, 32, 35, 42, 32, 32, 35, 41, 32, 33, 34, 41,
+ 32, 34, 36, 42, 32, 34, 36, 42, 32, 35, 38, 42, 33, 36, 40, 44,
+ 34, 37, 42, 48, 35, 38, 47, 52, 35, 38, 48, 54, 38, 40, 50, 58,
+ 40, 41, 51, 60, 42, 43, 53, 63, 45, 45, 56, 66, 46, 46, 56, 67},
+ {31, 37, 48, 49, 31, 38, 47, 47, 31, 39, 46, 46, 31, 40, 46, 45,
+ 34, 42, 47, 45, 35, 43, 47, 46, 39, 47, 47, 45, 40, 47, 48, 47,
+ 42, 47, 50, 49, 46, 48, 52, 53, 47, 48, 53, 53, 47, 47, 53, 56,
+ 47, 46, 53, 57, 46, 46, 53, 58, 48, 46, 54, 59, 48, 46, 54, 59}},
+ {{31, 32, 34, 39, 32, 32, 34, 38, 32, 32, 34, 38, 32, 32, 33, 37,
+ 32, 32, 33, 37, 32, 33, 35, 39, 32, 33, 35, 39, 32, 34, 37, 40,
+ 32, 34, 37, 40, 34, 35, 39, 45, 34, 35, 39, 45, 35, 36, 43, 51,
+ 35, 36, 43, 51, 38, 39, 45, 54, 38, 39, 45, 54, 42, 42, 48, 58},
+ {31, 33, 42, 48, 31, 34, 42, 47, 31, 34, 42, 47, 31, 35, 42, 45,
+ 31, 35, 42, 45, 34, 39, 45, 46, 34, 39, 45, 46, 38, 43, 47, 46,
+ 38, 43, 47, 46, 42, 45, 48, 50, 42, 45, 48, 50, 48, 47, 50, 53,
+ 48, 47, 50, 53, 47, 46, 50, 54, 47, 46, 50, 54, 47, 45, 49, 56}},
+ {{31, 31, 32, 36, 31, 32, 32, 35, 32, 32, 32, 35, 32, 32, 32, 35,
+ 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 36, 32, 32, 34, 36,
+ 32, 32, 34, 37, 32, 33, 35, 38, 32, 33, 35, 38, 33, 33, 36, 41,
+ 34, 34, 37, 42, 34, 34, 37, 44, 35, 34, 38, 48, 35, 34, 38, 48},
+ {31, 31, 37, 48, 31, 31, 38, 47, 31, 31, 38, 47, 31, 32, 39, 46,
+ 31, 32, 40, 46, 31, 32, 40, 46, 34, 35, 42, 47, 34, 36, 43, 47,
+ 36, 37, 44, 47, 38, 40, 47, 47, 38, 40, 47, 47, 41, 42, 47, 49,
+ 42, 43, 47, 50, 44, 44, 47, 51, 48, 46, 48, 53, 48, 46, 48, 53}},
+ {{31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 33, 33,
+ 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 35,
+ 32, 33, 34, 35, 32, 33, 34, 35, 33, 33, 35, 36, 34, 34, 36, 37},
+ {31, 31, 35, 37, 31, 31, 35, 38, 31, 31, 36, 38, 31, 31, 36, 38,
+ 31, 32, 36, 39, 31, 32, 37, 40, 31, 32, 37, 40, 31, 33, 38, 40,
+ 33, 35, 40, 42, 34, 36, 40, 43, 34, 36, 40, 43, 36, 38, 43, 45,
+ 38, 40, 45, 47, 38, 40, 45, 47, 39, 41, 45, 47, 42, 43, 46, 47}},
+ {{31, 31, 31, 32, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33,
+ 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33},
+ {31, 31, 31, 34, 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35,
+ 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, 32, 36,
+ 31, 32, 32, 36, 31, 32, 32, 36, 31, 32, 32, 36, 32, 33, 33, 37,
+ 33, 35, 35, 39, 34, 36, 36, 40, 34, 36, 36, 40, 34, 36, 36, 40}},
+ {{31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32},
+ {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 30, 31, 32, 32}}};
+constexpr uint8_t kQuantizerMatrix8x16
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][128] = {
+ {{32, 32, 36, 53, 65, 87, 93, 99, 31, 33, 34, 49, 59,
+ 78, 86, 93, 32, 34, 36, 50, 59, 77, 82, 89, 34, 37,
+ 42, 54, 63, 79, 80, 88, 36, 38, 48, 60, 68, 84, 86,
+ 90, 44, 43, 53, 71, 79, 95, 94, 97, 48, 46, 56, 76,
+ 85, 102, 105, 105, 58, 54, 63, 87, 98, 116, 112, 115, 65,
+ 58, 68, 92, 105, 124, 122, 124, 79, 70, 79, 104, 118, 141,
+ 135, 135, 82, 72, 81, 106, 121, 144, 149, 146, 91, 80, 88,
+ 106, 130, 148, 162, 159, 97, 86, 94, 107, 128, 157, 167, 171,
+ 103, 93, 98, 114, 131, 150, 174, 186, 110, 100, 101, 117, 138,
+ 161, 183, 193, 118, 107, 105, 118, 136, 157, 182, 203},
+ {32, 37, 48, 52, 57, 66, 68, 71, 30, 40, 46, 48, 52, 60, 63, 66,
+ 33, 43, 47, 47, 51, 59, 60, 63, 42, 47, 50, 50, 53, 60, 59, 62,
+ 49, 48, 53, 54, 57, 62, 62, 62, 49, 46, 53, 61, 64, 69, 66, 66,
+ 50, 46, 54, 64, 67, 73, 72, 70, 54, 49, 55, 68, 73, 80, 76, 75,
+ 57, 50, 56, 70, 76, 84, 80, 79, 63, 55, 60, 75, 82, 92, 87, 84,
+ 64, 56, 61, 75, 83, 93, 93, 89, 68, 59, 64, 74, 86, 94, 98, 94,
+ 70, 62, 66, 73, 83, 96, 99, 98, 72, 64, 66, 75, 83, 92, 101, 104,
+ 74, 67, 66, 74, 84, 94, 103, 106, 76, 69, 67, 73, 82, 91, 101, 109}},
+ {{32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 44, 60,
+ 72, 84, 90, 32, 34, 36, 45, 59, 71, 80, 87, 32, 35,
+ 40, 47, 60, 71, 78, 85, 36, 37, 48, 56, 68, 78, 83,
+ 87, 39, 40, 50, 60, 73, 84, 91, 94, 47, 45, 56, 69,
+ 84, 95, 101, 101, 53, 50, 60, 75, 92, 103, 108, 110, 61,
+ 56, 65, 81, 100, 113, 116, 118, 71, 64, 73, 89, 111, 125,
+ 129, 129, 79, 70, 79, 95, 118, 133, 142, 138, 86, 76, 84,
+ 100, 124, 140, 153, 150, 92, 82, 89, 101, 121, 148, 157, 161,
+ 98, 88, 93, 108, 124, 141, 163, 174, 104, 94, 95, 110, 129,
+ 151, 171, 181, 110, 100, 98, 111, 127, 147, 169, 188},
+ {32, 35, 48, 50, 57, 63, 68, 70, 30, 38, 46, 46, 52, 58, 63, 65,
+ 33, 41, 47, 46, 51, 56, 60, 63, 39, 46, 48, 47, 51, 55, 58, 61,
+ 49, 48, 53, 54, 57, 60, 61, 61, 48, 46, 53, 56, 60, 64, 65, 65,
+ 50, 46, 54, 61, 66, 70, 71, 69, 52, 47, 54, 63, 71, 75, 75, 74,
+ 55, 49, 56, 65, 74, 79, 79, 78, 60, 53, 58, 68, 79, 85, 85, 82,
+ 63, 55, 60, 70, 82, 89, 91, 87, 66, 58, 62, 72, 84, 91, 95, 91,
+ 68, 60, 64, 71, 81, 94, 97, 96, 70, 62, 65, 73, 81, 89, 98, 101,
+ 72, 65, 65, 72, 82, 92, 100, 103, 74, 67, 65, 71, 79, 89, 98, 105}},
+ {{32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 41, 54,
+ 73, 81, 88, 32, 33, 36, 42, 53, 71, 78, 84, 32, 34,
+ 38, 42, 52, 69, 76, 82, 34, 36, 44, 50, 59, 75, 81,
+ 84, 39, 39, 50, 58, 68, 84, 88, 90, 44, 42, 53, 63,
+ 74, 90, 97, 97, 49, 46, 57, 67, 81, 97, 104, 105, 57,
+ 53, 63, 74, 90, 108, 111, 113, 65, 59, 68, 79, 97, 118,
+ 123, 122, 71, 64, 73, 84, 102, 125, 135, 131, 81, 72, 80,
+ 91, 110, 135, 145, 141, 87, 77, 85, 96, 114, 140, 148, 151,
+ 92, 83, 88, 102, 117, 133, 153, 163, 98, 88, 89, 103, 121,
+ 141, 160, 169, 103, 94, 92, 103, 119, 137, 158, 175},
+ {32, 34, 48, 49, 54, 63, 67, 69, 31, 36, 46, 46, 50, 58, 62, 65,
+ 33, 40, 47, 46, 49, 56, 59, 62, 37, 44, 47, 45, 48, 54, 57, 60,
+ 44, 46, 51, 51, 53, 59, 60, 61, 48, 46, 53, 56, 58, 64, 64, 64,
+ 49, 45, 53, 58, 62, 67, 70, 68, 51, 47, 54, 60, 65, 71, 73, 72,
+ 54, 49, 55, 62, 70, 77, 77, 76, 57, 51, 56, 64, 73, 82, 83, 81,
+ 60, 53, 58, 65, 75, 85, 89, 85, 64, 57, 61, 68, 78, 89, 93, 89,
+ 66, 59, 63, 69, 79, 91, 94, 93, 68, 61, 63, 71, 79, 87, 96, 98,
+ 70, 63, 63, 70, 80, 89, 97, 100, 72, 65, 63, 69, 77, 86, 95, 102}},
+ {{32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 34, 41, 50, 61, 76,
+ 85, 31, 33, 35, 42, 49, 59, 73, 81, 32, 34, 37, 42, 49, 58,
+ 71, 79, 34, 35, 41, 48, 54, 63, 76, 81, 36, 36, 46, 54, 60,
+ 68, 80, 87, 41, 40, 49, 60, 67, 76, 88, 93, 47, 44, 53, 66,
+ 75, 84, 97, 101, 53, 50, 57, 71, 82, 92, 106, 108, 58, 54, 61,
+ 75, 87, 98, 112, 116, 65, 59, 66, 79, 92, 105, 120, 124, 74, 67,
+ 73, 86, 100, 113, 131, 134, 82, 73, 79, 92, 105, 120, 139, 142, 87,
+ 78, 83, 96, 110, 125, 144, 153, 92, 83, 84, 97, 114, 132, 150, 157,
+ 97, 88, 86, 97, 111, 128, 147, 163},
+ {32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 46, 49, 53, 60, 64,
+ 33, 37, 46, 45, 47, 51, 57, 61, 37, 43, 47, 45, 47, 50, 55, 59,
+ 42, 44, 49, 49, 50, 53, 58, 60, 49, 47, 52, 53, 54, 57, 61, 63,
+ 48, 46, 51, 57, 59, 61, 66, 67, 50, 46, 52, 59, 63, 66, 71, 71,
+ 52, 47, 53, 61, 66, 71, 75, 74, 54, 49, 54, 62, 68, 73, 79, 79,
+ 57, 51, 55, 64, 70, 76, 83, 83, 61, 55, 58, 66, 73, 80, 87, 87,
+ 64, 57, 60, 68, 75, 83, 91, 91, 66, 59, 61, 69, 77, 84, 93, 95,
+ 68, 61, 61, 68, 77, 86, 94, 97, 70, 63, 61, 67, 75, 83, 92, 98}},
+ {{32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 61, 74,
+ 82, 31, 32, 34, 38, 47, 59, 71, 79, 32, 33, 36, 40, 48, 58,
+ 69, 77, 33, 34, 38, 44, 52, 62, 72, 78, 36, 35, 42, 51, 58,
+ 68, 78, 84, 39, 38, 44, 54, 63, 73, 84, 89, 44, 41, 46, 59,
+ 69, 79, 90, 96, 48, 45, 50, 62, 74, 85, 96, 103, 53, 49, 53,
+ 66, 79, 92, 103, 111, 58, 54, 57, 70, 84, 98, 110, 118, 66, 60,
+ 63, 75, 90, 106, 119, 126, 74, 67, 69, 81, 97, 113, 128, 134, 81,
+ 73, 75, 86, 102, 120, 135, 143, 86, 78, 78, 90, 106, 124, 140, 147,
+ 91, 82, 80, 90, 103, 119, 137, 151},
+ {32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 59, 63,
+ 31, 35, 43, 46, 47, 51, 57, 60, 35, 39, 46, 46, 47, 50, 55, 58,
+ 41, 43, 48, 49, 49, 52, 57, 59, 49, 47, 50, 53, 54, 57, 60, 62,
+ 48, 46, 49, 54, 57, 60, 64, 65, 49, 45, 48, 56, 61, 64, 67, 69,
+ 50, 46, 49, 57, 63, 67, 71, 73, 52, 48, 50, 58, 65, 71, 75, 77,
+ 54, 50, 51, 59, 67, 73, 78, 81, 57, 52, 53, 61, 69, 77, 82, 85,
+ 61, 55, 56, 63, 72, 80, 86, 88, 64, 58, 58, 65, 73, 82, 89, 92,
+ 66, 59, 59, 66, 75, 84, 91, 94, 68, 61, 59, 65, 72, 81, 89, 95}},
+ {{32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75,
+ 31, 32, 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71,
+ 32, 33, 35, 38, 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73,
+ 36, 34, 38, 48, 54, 60, 68, 78, 39, 37, 40, 50, 58, 65, 73, 84,
+ 44, 41, 43, 53, 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96,
+ 53, 49, 50, 60, 71, 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110,
+ 65, 60, 58, 68, 79, 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125,
+ 79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136},
+ {32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60,
+ 30, 32, 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56,
+ 37, 40, 47, 47, 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57,
+ 49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, 56, 57, 60, 64,
+ 49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71,
+ 52, 48, 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78,
+ 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, 65, 72, 79, 85,
+ 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, 83, 90}},
+ {{32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 59, 69,
+ 31, 32, 33, 34, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50, 57, 65,
+ 32, 33, 35, 38, 42, 49, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69,
+ 35, 34, 38, 47, 52, 59, 65, 73, 38, 36, 40, 49, 56, 63, 69, 77,
+ 41, 39, 41, 51, 60, 67, 74, 81, 44, 42, 43, 54, 64, 72, 79, 86,
+ 48, 45, 46, 56, 67, 76, 83, 91, 53, 49, 50, 60, 71, 82, 90, 99,
+ 58, 54, 54, 63, 75, 87, 95, 105, 65, 60, 58, 68, 79, 92, 102, 112,
+ 71, 65, 63, 73, 84, 97, 108, 119, 79, 72, 70, 79, 90, 104, 115, 127},
+ {32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 53, 57,
+ 30, 32, 40, 46, 45, 48, 51, 55, 33, 36, 43, 47, 46, 47, 50, 54,
+ 37, 40, 47, 47, 45, 47, 49, 52, 42, 43, 47, 50, 49, 50, 53, 56,
+ 47, 46, 48, 52, 53, 53, 55, 58, 48, 46, 47, 53, 55, 56, 58, 61,
+ 48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 62, 64, 66,
+ 50, 46, 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 70, 73,
+ 54, 50, 49, 55, 62, 68, 72, 76, 57, 52, 50, 56, 64, 70, 75, 79,
+ 60, 54, 52, 58, 65, 72, 77, 82, 63, 57, 55, 60, 67, 75, 80, 86}},
+ {{32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 51, 62,
+ 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42, 49, 59,
+ 32, 32, 34, 36, 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58,
+ 34, 34, 37, 41, 44, 48, 54, 63, 36, 34, 38, 46, 50, 54, 60, 68,
+ 38, 37, 40, 47, 52, 57, 64, 72, 41, 39, 41, 49, 54, 60, 67, 76,
+ 44, 41, 43, 51, 57, 63, 71, 79, 48, 45, 46, 54, 60, 67, 76, 85,
+ 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60, 67, 74, 86, 97,
+ 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, 92, 105},
+ {32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 54,
+ 30, 32, 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, 47, 51,
+ 35, 37, 44, 46, 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50,
+ 42, 43, 47, 49, 50, 49, 50, 53, 49, 46, 48, 52, 53, 53, 54, 57,
+ 48, 46, 47, 51, 54, 55, 57, 59, 48, 45, 46, 51, 54, 57, 59, 61,
+ 49, 45, 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 64, 67,
+ 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54, 58, 62, 68, 73,
+ 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, 70, 76}},
+ {{32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 42, 45, 51,
+ 31, 32, 32, 33, 35, 41, 44, 49, 31, 32, 33, 33, 35, 41, 44, 49,
+ 32, 32, 34, 34, 36, 42, 45, 50, 32, 33, 35, 36, 38, 42, 45, 49,
+ 32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, 42, 48, 50, 54,
+ 36, 34, 37, 40, 48, 54, 56, 60, 38, 36, 39, 41, 49, 56, 58, 63,
+ 39, 37, 40, 42, 50, 58, 60, 65, 44, 41, 42, 45, 53, 63, 66, 71,
+ 47, 44, 45, 47, 56, 66, 69, 75, 49, 46, 47, 48, 57, 67, 71, 77,
+ 53, 49, 50, 51, 60, 71, 75, 82, 58, 54, 54, 55, 63, 75, 79, 87},
+ {32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 37, 40, 47, 47, 48, 50,
+ 30, 32, 38, 40, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, 46, 48,
+ 33, 36, 41, 44, 47, 46, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47,
+ 39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, 50, 49, 50, 50,
+ 49, 46, 48, 49, 53, 53, 54, 54, 48, 46, 47, 48, 53, 55, 55, 56,
+ 48, 46, 46, 48, 53, 56, 56, 57, 49, 45, 45, 47, 53, 58, 59, 61,
+ 50, 46, 46, 48, 54, 59, 61, 63, 51, 47, 47, 48, 54, 60, 61, 64,
+ 52, 48, 47, 48, 54, 61, 63, 66, 54, 50, 49, 50, 55, 62, 65, 68}},
+ {{32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 42, 45,
+ 31, 32, 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44,
+ 31, 32, 33, 34, 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45,
+ 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 36, 39, 40, 44, 47,
+ 34, 34, 35, 37, 41, 42, 48, 50, 35, 34, 36, 38, 45, 47, 52, 55,
+ 36, 34, 36, 38, 46, 48, 54, 56, 39, 37, 39, 40, 48, 50, 58, 60,
+ 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, 51, 53, 63, 66,
+ 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, 67, 70},
+ {32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 34, 38, 45, 47, 47, 48,
+ 31, 32, 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46,
+ 33, 35, 37, 42, 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46,
+ 37, 40, 43, 47, 47, 47, 45, 46, 39, 41, 43, 47, 48, 48, 47, 47,
+ 42, 43, 44, 47, 49, 50, 49, 50, 47, 46, 46, 48, 51, 52, 53, 53,
+ 49, 46, 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 51, 53, 56, 56,
+ 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, 51, 53, 58, 59,
+ 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61}},
+ {{32, 31, 31, 32, 32, 36, 36, 44, 31, 32, 32, 32, 32, 35, 35, 42,
+ 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 33, 33, 34, 34, 41,
+ 31, 32, 32, 33, 33, 34, 34, 41, 32, 32, 32, 34, 34, 36, 36, 42,
+ 32, 32, 32, 34, 34, 36, 36, 42, 32, 33, 33, 35, 35, 38, 38, 42,
+ 32, 33, 33, 35, 35, 38, 38, 42, 34, 34, 34, 37, 37, 42, 42, 48,
+ 34, 34, 34, 37, 37, 42, 42, 48, 36, 34, 34, 38, 38, 48, 48, 54,
+ 36, 34, 34, 38, 38, 48, 48, 54, 39, 37, 37, 40, 40, 50, 50, 58,
+ 39, 37, 37, 40, 40, 50, 50, 58, 44, 41, 41, 43, 43, 53, 53, 63},
+ {32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 38, 38, 47, 47, 47,
+ 31, 31, 31, 38, 38, 47, 47, 47, 30, 32, 32, 40, 40, 46, 46, 45,
+ 30, 32, 32, 40, 40, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46,
+ 33, 36, 36, 43, 43, 47, 47, 46, 37, 40, 40, 47, 47, 47, 47, 45,
+ 37, 40, 40, 47, 47, 47, 47, 45, 42, 43, 43, 47, 47, 50, 50, 49,
+ 42, 43, 43, 47, 47, 50, 50, 49, 49, 46, 46, 48, 48, 53, 53, 53,
+ 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, 47, 53, 53, 56,
+ 48, 46, 46, 47, 47, 53, 53, 56, 49, 45, 45, 46, 46, 53, 53, 58}},
+ {{32, 31, 31, 31, 32, 32, 35, 36, 31, 32, 32, 32, 32, 32, 35, 35,
+ 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 34, 35,
+ 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34,
+ 31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, 34, 34, 36, 36,
+ 32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 34, 35, 35, 37, 38,
+ 32, 32, 33, 34, 35, 35, 37, 38, 33, 33, 33, 35, 36, 36, 40, 41,
+ 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 43, 44,
+ 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48},
+ {32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 34, 38, 38, 45, 47,
+ 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, 45, 46,
+ 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46,
+ 33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47,
+ 35, 37, 37, 40, 44, 44, 46, 47, 37, 39, 40, 43, 47, 47, 47, 47,
+ 37, 39, 40, 43, 47, 47, 47, 47, 41, 42, 42, 44, 47, 47, 49, 49,
+ 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45, 47, 47, 50, 51,
+ 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53}},
+ {{32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34,
+ 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 35, 35, 35,
+ 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36,
+ 32, 33, 33, 33, 34, 36, 36, 36, 34, 34, 34, 34, 35, 37, 37, 38},
+ {32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 38, 38, 39,
+ 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40,
+ 31, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41,
+ 30, 31, 32, 32, 35, 40, 40, 41, 31, 32, 33, 33, 35, 40, 40, 41,
+ 33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44,
+ 33, 35, 36, 36, 38, 43, 43, 44, 35, 37, 38, 38, 41, 45, 45, 46,
+ 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+ 39, 40, 41, 41, 43, 47, 47, 47, 42, 42, 43, 43, 44, 47, 47, 48}},
+ {{32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 33, 33,
+ 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34},
+ {32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 36,
+ 31, 31, 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 32, 32, 32, 34, 37, 30, 31, 31, 32, 32, 32, 34, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 31, 32, 33, 33, 33, 33, 36, 39,
+ 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, 36, 36, 38, 41,
+ 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+ 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32}}};
+constexpr uint8_t kQuantizerMatrix8x32
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][256] = {
+ {{32, 32, 36, 53, 65, 87, 93, 99, 31, 32, 35, 51, 62, 82,
+ 88, 94, 31, 33, 34, 49, 59, 78, 86, 93, 31, 33, 35, 49,
+ 59, 78, 84, 90, 32, 34, 36, 50, 59, 77, 82, 89, 32, 35,
+ 38, 49, 58, 75, 82, 89, 34, 37, 42, 54, 63, 79, 80, 88,
+ 35, 37, 45, 57, 65, 82, 84, 87, 36, 38, 48, 60, 68, 84,
+ 86, 90, 39, 40, 50, 65, 73, 89, 91, 93, 44, 43, 53, 71,
+ 79, 95, 94, 97, 46, 44, 55, 73, 82, 98, 98, 99, 48, 46,
+ 56, 76, 85, 102, 105, 105, 53, 50, 60, 82, 92, 109, 107, 107,
+ 58, 54, 63, 87, 98, 116, 112, 115, 61, 56, 66, 89, 101, 120,
+ 119, 116, 65, 58, 68, 92, 105, 124, 122, 124, 71, 63, 73, 97,
+ 111, 132, 130, 127, 79, 70, 79, 104, 118, 141, 135, 135, 81, 71,
+ 80, 105, 119, 142, 140, 139, 82, 72, 81, 106, 121, 144, 149, 146,
+ 88, 77, 85, 108, 126, 149, 153, 152, 91, 80, 88, 106, 130, 148,
+ 162, 159, 94, 83, 91, 105, 131, 153, 165, 166, 97, 86, 94, 107,
+ 128, 157, 167, 171, 100, 89, 97, 111, 127, 152, 173, 182, 103, 93,
+ 98, 114, 131, 150, 174, 186, 107, 96, 100, 117, 136, 155, 177, 191,
+ 110, 100, 101, 117, 138, 161, 183, 193, 114, 104, 103, 117, 137, 159,
+ 185, 201, 118, 107, 105, 118, 136, 157, 182, 203, 122, 111, 107, 119,
+ 136, 156, 179, 204},
+ {32, 37, 48, 52, 57, 66, 68, 71, 31, 38, 47, 50, 54, 63, 65, 67,
+ 30, 40, 46, 48, 52, 60, 63, 66, 32, 41, 46, 48, 51, 59, 62, 64,
+ 33, 43, 47, 47, 51, 59, 60, 63, 37, 47, 47, 47, 50, 57, 60, 62,
+ 42, 47, 50, 50, 53, 60, 59, 62, 45, 47, 51, 52, 55, 61, 61, 61,
+ 49, 48, 53, 54, 57, 62, 62, 62, 48, 47, 53, 57, 60, 66, 65, 64,
+ 49, 46, 53, 61, 64, 69, 66, 66, 49, 46, 53, 62, 65, 71, 68, 67,
+ 50, 46, 54, 64, 67, 73, 72, 70, 52, 47, 54, 66, 71, 77, 73, 71,
+ 54, 49, 55, 68, 73, 80, 76, 75, 55, 49, 56, 69, 75, 82, 79, 76,
+ 57, 50, 56, 70, 76, 84, 80, 79, 60, 52, 58, 72, 79, 88, 84, 81,
+ 63, 55, 60, 75, 82, 92, 87, 84, 64, 55, 61, 75, 82, 92, 89, 86,
+ 64, 56, 61, 75, 83, 93, 93, 89, 67, 58, 63, 76, 85, 95, 94, 91,
+ 68, 59, 64, 74, 86, 94, 98, 94, 69, 60, 65, 72, 85, 95, 99, 97,
+ 70, 62, 66, 73, 83, 96, 99, 98, 71, 63, 67, 74, 82, 93, 102, 102,
+ 72, 64, 66, 75, 83, 92, 101, 104, 73, 65, 66, 75, 84, 93, 102, 106,
+ 74, 67, 66, 74, 84, 94, 103, 106, 75, 68, 66, 74, 83, 93, 103, 109,
+ 76, 69, 67, 73, 82, 91, 101, 109, 77, 70, 67, 73, 81, 90, 99, 108}},
+ {{32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 45, 62, 75,
+ 86, 91, 31, 32, 35, 44, 60, 72, 84, 90, 31, 33, 35, 44,
+ 59, 71, 82, 87, 32, 34, 36, 45, 59, 71, 80, 87, 32, 35,
+ 38, 45, 58, 69, 80, 86, 32, 35, 40, 47, 60, 71, 78, 85,
+ 34, 36, 42, 50, 63, 73, 82, 84, 36, 37, 48, 56, 68, 78,
+ 83, 87, 38, 39, 49, 58, 71, 81, 88, 90, 39, 40, 50, 60,
+ 73, 84, 91, 94, 44, 42, 53, 66, 79, 90, 94, 96, 47, 45,
+ 56, 69, 84, 95, 101, 101, 49, 47, 57, 71, 86, 97, 103, 102,
+ 53, 50, 60, 75, 92, 103, 108, 110, 58, 54, 63, 79, 98, 110,
+ 114, 111, 61, 56, 65, 81, 100, 113, 116, 118, 65, 59, 68, 84,
+ 105, 118, 124, 121, 71, 64, 73, 89, 111, 125, 129, 129, 76, 68,
+ 76, 92, 115, 130, 134, 132, 79, 70, 79, 95, 118, 133, 142, 138,
+ 82, 73, 81, 97, 121, 136, 145, 144, 86, 76, 84, 100, 124, 140,
+ 153, 150, 89, 79, 87, 99, 124, 145, 156, 156, 92, 82, 89, 101,
+ 121, 148, 157, 161, 95, 85, 92, 105, 120, 143, 163, 171, 98, 88,
+ 93, 108, 124, 141, 163, 174, 101, 91, 94, 110, 128, 146, 166, 179,
+ 104, 94, 95, 110, 129, 151, 171, 181, 107, 97, 96, 110, 128, 149,
+ 173, 188, 110, 100, 98, 111, 127, 147, 169, 188, 114, 104, 100, 111,
+ 127, 145, 166, 190},
+ {32, 35, 48, 50, 57, 63, 68, 70, 31, 37, 47, 48, 54, 60, 64, 66,
+ 30, 38, 46, 46, 52, 58, 63, 65, 31, 38, 46, 46, 52, 57, 61, 63,
+ 33, 41, 47, 46, 51, 56, 60, 63, 37, 45, 47, 46, 50, 54, 59, 62,
+ 39, 46, 48, 47, 51, 55, 58, 61, 42, 46, 50, 50, 53, 57, 60, 60,
+ 49, 48, 53, 54, 57, 60, 61, 61, 48, 47, 53, 55, 58, 62, 64, 63,
+ 48, 46, 53, 56, 60, 64, 65, 65, 49, 45, 53, 59, 64, 67, 67, 66,
+ 50, 46, 54, 61, 66, 70, 71, 69, 51, 47, 54, 61, 68, 71, 72, 70,
+ 52, 47, 54, 63, 71, 75, 75, 74, 54, 49, 55, 65, 73, 78, 78, 74,
+ 55, 49, 56, 65, 74, 79, 79, 78, 57, 50, 56, 66, 76, 82, 83, 79,
+ 60, 53, 58, 68, 79, 85, 85, 82, 62, 54, 60, 69, 81, 87, 87, 84,
+ 63, 55, 60, 70, 82, 89, 91, 87, 64, 56, 61, 71, 83, 90, 92, 89,
+ 66, 58, 62, 72, 84, 91, 95, 91, 67, 59, 63, 71, 83, 93, 96, 94,
+ 68, 60, 64, 71, 81, 94, 97, 96, 69, 61, 65, 72, 80, 91, 99, 100,
+ 70, 62, 65, 73, 81, 89, 98, 101, 71, 64, 65, 73, 82, 90, 99, 103,
+ 72, 65, 65, 72, 82, 92, 100, 103, 73, 66, 65, 72, 81, 90, 100, 105,
+ 74, 67, 65, 71, 79, 89, 98, 105, 75, 68, 65, 71, 78, 87, 96, 105}},
+ {{32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 42, 55, 75,
+ 83, 88, 31, 32, 35, 41, 54, 73, 81, 88, 31, 32, 34, 41,
+ 53, 72, 79, 84, 32, 33, 36, 42, 53, 71, 78, 84, 32, 34,
+ 37, 42, 53, 70, 77, 83, 32, 34, 38, 42, 52, 69, 76, 82,
+ 34, 35, 42, 48, 57, 73, 79, 81, 34, 36, 44, 50, 59, 75,
+ 81, 84, 36, 37, 48, 54, 63, 78, 85, 86, 39, 39, 50, 58,
+ 68, 84, 88, 90, 40, 40, 51, 59, 70, 85, 91, 92, 44, 42,
+ 53, 63, 74, 90, 97, 97, 47, 45, 56, 66, 79, 95, 99, 98,
+ 49, 46, 57, 67, 81, 97, 104, 105, 53, 50, 60, 71, 86, 103,
+ 109, 106, 57, 53, 63, 74, 90, 108, 111, 113, 59, 54, 64, 75,
+ 91, 111, 119, 115, 65, 59, 68, 79, 97, 118, 123, 122, 69, 62,
+ 71, 83, 100, 122, 127, 125, 71, 64, 73, 84, 102, 125, 135, 131,
+ 79, 71, 79, 90, 109, 133, 137, 136, 81, 72, 80, 91, 110, 135,
+ 145, 141, 82, 73, 81, 92, 111, 136, 147, 147, 87, 77, 85, 96,
+ 114, 140, 148, 151, 90, 80, 87, 99, 113, 135, 153, 160, 92, 83,
+ 88, 102, 117, 133, 153, 163, 95, 85, 88, 103, 120, 137, 155, 168,
+ 98, 88, 89, 103, 121, 141, 160, 169, 100, 91, 90, 103, 120, 139,
+ 161, 175, 103, 94, 92, 103, 119, 137, 158, 175, 106, 97, 93, 104,
+ 118, 135, 155, 176},
+ {32, 34, 48, 49, 54, 63, 67, 69, 31, 35, 47, 47, 51, 60, 63, 65,
+ 31, 36, 46, 46, 50, 58, 62, 65, 30, 36, 46, 45, 49, 57, 60, 62,
+ 33, 40, 47, 46, 49, 56, 59, 62, 35, 42, 47, 45, 48, 55, 58, 61,
+ 37, 44, 47, 45, 48, 54, 57, 60, 42, 45, 50, 49, 51, 57, 59, 59,
+ 44, 46, 51, 51, 53, 59, 60, 61, 49, 47, 53, 53, 55, 60, 63, 62,
+ 48, 46, 53, 56, 58, 64, 64, 64, 48, 46, 53, 56, 59, 65, 66, 65,
+ 49, 45, 53, 58, 62, 67, 70, 68, 50, 46, 54, 59, 65, 70, 70, 68,
+ 51, 47, 54, 60, 65, 71, 73, 72, 52, 47, 54, 61, 68, 75, 76, 73,
+ 54, 49, 55, 62, 70, 77, 77, 76, 54, 49, 55, 62, 70, 78, 81, 77,
+ 57, 51, 56, 64, 73, 82, 83, 81, 59, 52, 58, 65, 74, 84, 85, 82,
+ 60, 53, 58, 65, 75, 85, 89, 85, 63, 56, 60, 67, 77, 89, 90, 87,
+ 64, 57, 61, 68, 78, 89, 93, 89, 64, 57, 61, 68, 78, 90, 94, 92,
+ 66, 59, 63, 69, 79, 91, 94, 93, 67, 60, 63, 70, 78, 88, 96, 97,
+ 68, 61, 63, 71, 79, 87, 96, 98, 69, 62, 63, 71, 80, 88, 96, 100,
+ 70, 63, 63, 70, 80, 89, 97, 100, 71, 64, 63, 70, 78, 88, 97, 102,
+ 72, 65, 63, 69, 77, 86, 95, 102, 73, 66, 63, 69, 76, 84, 93, 101}},
+ {{32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 35, 42, 51, 62,
+ 78, 86, 31, 32, 34, 41, 50, 61, 76, 85, 31, 32, 34, 41,
+ 49, 59, 74, 82, 31, 33, 35, 42, 49, 59, 73, 81, 32, 33,
+ 36, 42, 50, 59, 73, 80, 32, 34, 37, 42, 49, 58, 71, 79,
+ 32, 34, 39, 44, 51, 60, 73, 78, 34, 35, 41, 48, 54, 63,
+ 76, 81, 35, 36, 45, 52, 59, 67, 79, 83, 36, 36, 46, 54,
+ 60, 68, 80, 87, 39, 39, 48, 58, 65, 73, 86, 88, 41, 40,
+ 49, 60, 67, 76, 88, 93, 44, 42, 51, 63, 71, 79, 92, 94,
+ 47, 44, 53, 66, 75, 84, 97, 101, 48, 45, 54, 67, 76, 85,
+ 98, 101, 53, 50, 57, 71, 82, 92, 106, 108, 55, 51, 59, 72,
+ 84, 94, 108, 110, 58, 54, 61, 75, 87, 98, 112, 116, 63, 58,
+ 65, 78, 91, 103, 118, 119, 65, 59, 66, 79, 92, 105, 120, 124,
+ 71, 64, 71, 84, 97, 111, 127, 129, 74, 67, 73, 86, 100, 113,
+ 131, 134, 79, 71, 77, 90, 104, 118, 136, 139, 82, 73, 79, 92,
+ 105, 120, 139, 142, 82, 74, 79, 92, 106, 121, 139, 150, 87, 78,
+ 83, 96, 110, 125, 144, 153, 89, 81, 83, 97, 113, 128, 145, 157,
+ 92, 83, 84, 97, 114, 132, 150, 157, 94, 85, 85, 97, 112, 130,
+ 151, 163, 97, 88, 86, 97, 111, 128, 147, 163, 99, 91, 87, 97,
+ 110, 126, 144, 163},
+ {32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 47, 50, 54, 61, 64,
+ 31, 34, 45, 46, 49, 53, 60, 64, 30, 35, 44, 45, 48, 52, 58, 61,
+ 33, 37, 46, 45, 47, 51, 57, 61, 33, 38, 46, 46, 47, 51, 57, 60,
+ 37, 43, 47, 45, 47, 50, 55, 59, 39, 43, 48, 47, 48, 51, 56, 58,
+ 42, 44, 49, 49, 50, 53, 58, 60, 47, 46, 51, 53, 53, 56, 61, 61,
+ 49, 47, 52, 53, 54, 57, 61, 63, 48, 46, 51, 56, 57, 60, 64, 64,
+ 48, 46, 51, 57, 59, 61, 66, 67, 49, 45, 51, 58, 61, 64, 68, 67,
+ 50, 46, 52, 59, 63, 66, 71, 71, 50, 46, 52, 59, 64, 67, 71, 71,
+ 52, 47, 53, 61, 66, 71, 75, 74, 53, 48, 53, 61, 67, 72, 77, 75,
+ 54, 49, 54, 62, 68, 73, 79, 79, 56, 51, 55, 63, 70, 76, 82, 80,
+ 57, 51, 55, 64, 70, 76, 83, 83, 60, 54, 57, 65, 72, 79, 86, 85,
+ 61, 55, 58, 66, 73, 80, 87, 87, 63, 56, 59, 67, 75, 82, 90, 89,
+ 64, 57, 60, 68, 75, 83, 91, 91, 64, 58, 60, 68, 75, 83, 91, 94,
+ 66, 59, 61, 69, 77, 84, 93, 95, 67, 60, 61, 69, 78, 85, 93, 97,
+ 68, 61, 61, 68, 77, 86, 94, 97, 69, 62, 61, 68, 76, 85, 94, 99,
+ 70, 63, 61, 67, 75, 83, 92, 98, 70, 64, 61, 67, 74, 82, 90, 98}},
+ {{32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 62,
+ 75, 83, 31, 32, 33, 39, 49, 61, 74, 82, 31, 32, 33, 38,
+ 47, 59, 72, 79, 31, 32, 34, 38, 47, 59, 71, 79, 32, 33,
+ 35, 39, 48, 59, 71, 78, 32, 33, 36, 40, 48, 58, 69, 77,
+ 32, 33, 36, 41, 48, 58, 69, 75, 33, 34, 38, 44, 52, 62,
+ 72, 78, 34, 34, 39, 45, 53, 63, 73, 80, 36, 35, 42, 51,
+ 58, 68, 78, 84, 36, 35, 42, 51, 59, 68, 79, 85, 39, 38,
+ 44, 54, 63, 73, 84, 89, 40, 39, 45, 56, 65, 75, 85, 90,
+ 44, 41, 46, 59, 69, 79, 90, 96, 46, 43, 48, 60, 72, 82,
+ 93, 97, 48, 45, 50, 62, 74, 85, 96, 103, 52, 48, 52, 65,
+ 78, 90, 101, 105, 53, 49, 53, 66, 79, 92, 103, 111, 58, 53,
+ 57, 69, 83, 97, 109, 113, 58, 54, 57, 70, 84, 98, 110, 118,
+ 65, 59, 62, 74, 89, 105, 118, 122, 66, 60, 63, 75, 90, 106,
+ 119, 126, 71, 65, 67, 79, 94, 111, 125, 131, 74, 67, 69, 81,
+ 97, 113, 128, 134, 79, 72, 73, 85, 101, 118, 133, 141, 81, 73,
+ 75, 86, 102, 120, 135, 143, 82, 74, 75, 87, 103, 121, 136, 147,
+ 86, 78, 78, 90, 106, 124, 140, 147, 88, 80, 80, 90, 105, 122,
+ 140, 152, 91, 82, 80, 90, 103, 119, 137, 151, 93, 85, 81, 90,
+ 103, 117, 134, 152},
+ {32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 60, 63,
+ 31, 33, 41, 47, 49, 54, 59, 63, 30, 33, 42, 45, 47, 52, 57, 60,
+ 31, 35, 43, 46, 47, 51, 57, 60, 33, 37, 44, 46, 47, 51, 56, 59,
+ 35, 39, 46, 46, 47, 50, 55, 58, 37, 41, 47, 46, 46, 50, 54, 57,
+ 41, 43, 48, 49, 49, 52, 57, 59, 42, 43, 48, 49, 50, 53, 57, 60,
+ 49, 47, 50, 53, 54, 57, 60, 62, 49, 47, 50, 53, 54, 57, 61, 63,
+ 48, 46, 49, 54, 57, 60, 64, 65, 48, 46, 49, 55, 58, 61, 65, 66,
+ 49, 45, 48, 56, 61, 64, 67, 69, 49, 46, 49, 57, 62, 65, 69, 70,
+ 50, 46, 49, 57, 63, 67, 71, 73, 51, 47, 49, 58, 64, 69, 73, 74,
+ 52, 48, 50, 58, 65, 71, 75, 77, 54, 49, 51, 59, 67, 73, 77, 78,
+ 54, 50, 51, 59, 67, 73, 78, 81, 57, 52, 52, 60, 69, 76, 82, 83,
+ 57, 52, 53, 61, 69, 77, 82, 85, 60, 54, 55, 62, 71, 79, 85, 87,
+ 61, 55, 56, 63, 72, 80, 86, 88, 63, 57, 57, 64, 73, 82, 89, 92,
+ 64, 58, 58, 65, 73, 82, 89, 92, 64, 58, 58, 65, 74, 83, 90, 94,
+ 66, 59, 59, 66, 75, 84, 91, 94, 67, 60, 59, 66, 74, 82, 91, 96,
+ 68, 61, 59, 65, 72, 81, 89, 95, 68, 62, 59, 65, 71, 79, 87, 95}},
+ {{32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75,
+ 31, 32, 32, 35, 42, 51, 62, 75, 31, 32, 33, 34, 41, 49, 59, 72,
+ 31, 32, 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71,
+ 32, 32, 34, 36, 42, 50, 59, 71, 32, 33, 35, 38, 42, 49, 58, 69,
+ 32, 33, 35, 38, 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73,
+ 34, 34, 37, 42, 48, 54, 63, 73, 36, 34, 38, 48, 54, 60, 68, 78,
+ 36, 34, 38, 48, 54, 60, 68, 78, 39, 37, 40, 50, 58, 65, 73, 84,
+ 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71, 79, 90,
+ 44, 41, 43, 53, 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96,
+ 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103,
+ 53, 49, 50, 60, 71, 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110,
+ 58, 54, 54, 63, 75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118,
+ 65, 60, 58, 68, 79, 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125,
+ 71, 65, 63, 73, 84, 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133,
+ 79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136,
+ 82, 75, 72, 81, 92, 106, 121, 136, 87, 79, 76, 84, 96, 109, 124, 141},
+ {32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60,
+ 31, 31, 38, 47, 47, 50, 54, 60, 30, 32, 40, 46, 45, 48, 52, 57,
+ 30, 32, 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56,
+ 33, 36, 43, 47, 46, 47, 51, 56, 37, 40, 47, 47, 45, 47, 50, 54,
+ 37, 40, 47, 47, 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57,
+ 42, 43, 47, 50, 49, 50, 53, 57, 49, 46, 48, 53, 53, 54, 57, 60,
+ 49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, 56, 57, 60, 64,
+ 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61, 64, 67,
+ 49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71,
+ 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75,
+ 52, 48, 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78,
+ 54, 50, 49, 55, 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82,
+ 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, 65, 72, 79, 85,
+ 60, 54, 52, 58, 65, 72, 79, 85, 63, 57, 55, 60, 67, 75, 82, 89,
+ 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, 83, 90,
+ 64, 59, 56, 61, 68, 75, 83, 90, 66, 60, 57, 63, 69, 77, 84, 92}},
+ {{32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 60, 70,
+ 31, 32, 32, 35, 42, 51, 59, 69, 31, 32, 32, 35, 41, 50, 58, 67,
+ 31, 32, 33, 34, 41, 49, 57, 66, 31, 32, 33, 35, 41, 49, 57, 66,
+ 32, 32, 34, 36, 42, 50, 57, 65, 32, 32, 34, 37, 42, 49, 56, 65,
+ 32, 33, 35, 38, 42, 49, 56, 64, 32, 33, 35, 39, 43, 50, 56, 64,
+ 34, 34, 37, 42, 48, 54, 61, 69, 34, 34, 37, 42, 48, 54, 61, 69,
+ 35, 34, 38, 47, 52, 59, 65, 73, 36, 34, 38, 48, 54, 60, 66, 74,
+ 38, 36, 40, 49, 56, 63, 69, 77, 39, 37, 40, 50, 58, 65, 71, 79,
+ 41, 39, 41, 51, 60, 67, 74, 81, 44, 41, 43, 53, 63, 71, 78, 85,
+ 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67, 76, 83, 91,
+ 48, 45, 46, 56, 67, 76, 83, 91, 53, 49, 49, 59, 71, 81, 89, 98,
+ 53, 49, 50, 60, 71, 82, 90, 99, 57, 52, 52, 62, 74, 85, 94, 103,
+ 58, 54, 54, 63, 75, 87, 95, 105, 61, 57, 56, 66, 77, 89, 98, 108,
+ 65, 60, 58, 68, 79, 92, 102, 112, 67, 61, 60, 69, 81, 94, 103, 114,
+ 71, 65, 63, 73, 84, 97, 108, 119, 72, 66, 64, 73, 85, 98, 108, 119,
+ 79, 72, 70, 79, 90, 104, 115, 127, 79, 72, 70, 79, 90, 104, 115, 127},
+ {32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 54, 58,
+ 31, 31, 38, 47, 47, 50, 53, 57, 30, 32, 39, 46, 46, 48, 52, 56,
+ 30, 32, 40, 46, 45, 48, 51, 55, 32, 34, 41, 46, 45, 48, 51, 54,
+ 33, 36, 43, 47, 46, 47, 50, 54, 34, 37, 44, 47, 45, 47, 50, 53,
+ 37, 40, 47, 47, 45, 47, 49, 52, 37, 40, 47, 48, 46, 47, 49, 53,
+ 42, 43, 47, 50, 49, 50, 53, 56, 42, 43, 47, 50, 49, 50, 53, 56,
+ 47, 46, 48, 52, 53, 53, 55, 58, 49, 46, 48, 53, 53, 54, 56, 59,
+ 48, 46, 47, 53, 55, 56, 58, 61, 48, 46, 47, 53, 56, 57, 59, 62,
+ 48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 61, 63, 66,
+ 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69,
+ 50, 46, 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 69, 72,
+ 52, 48, 47, 54, 61, 66, 70, 73, 53, 49, 48, 55, 62, 68, 71, 75,
+ 54, 50, 49, 55, 62, 68, 72, 76, 55, 51, 49, 56, 63, 69, 74, 78,
+ 57, 52, 50, 56, 64, 70, 75, 79, 58, 53, 51, 57, 64, 71, 76, 80,
+ 60, 54, 52, 58, 65, 72, 77, 82, 60, 55, 53, 59, 65, 73, 78, 83,
+ 63, 57, 55, 60, 67, 75, 80, 86, 63, 57, 55, 60, 67, 75, 80, 86}},
+ {{32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 52, 63,
+ 31, 32, 32, 35, 38, 42, 51, 62, 31, 32, 32, 34, 37, 41, 50, 61,
+ 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 33, 34, 37, 41, 49, 59,
+ 31, 32, 34, 35, 38, 42, 49, 59, 32, 32, 34, 36, 38, 42, 50, 59,
+ 32, 32, 34, 36, 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58,
+ 32, 33, 35, 37, 40, 42, 49, 58, 33, 33, 36, 40, 43, 46, 53, 62,
+ 34, 34, 37, 41, 44, 48, 54, 63, 34, 34, 37, 43, 46, 50, 56, 65,
+ 36, 34, 38, 46, 50, 54, 60, 68, 36, 34, 38, 46, 50, 54, 60, 68,
+ 38, 37, 40, 47, 52, 57, 64, 72, 39, 37, 40, 48, 53, 58, 65, 73,
+ 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79,
+ 44, 41, 43, 51, 57, 63, 71, 79, 47, 44, 45, 53, 59, 66, 75, 84,
+ 48, 45, 46, 54, 60, 67, 76, 85, 50, 46, 47, 55, 61, 68, 78, 88,
+ 53, 49, 50, 57, 64, 71, 82, 92, 53, 49, 50, 57, 64, 71, 82, 92,
+ 57, 53, 53, 60, 67, 74, 86, 97, 58, 54, 54, 61, 68, 75, 87, 98,
+ 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, 92, 105,
+ 65, 60, 58, 66, 72, 79, 92, 105, 70, 64, 62, 70, 76, 83, 96, 109},
+ {32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 55,
+ 31, 31, 38, 45, 47, 47, 50, 54, 31, 32, 39, 45, 46, 46, 49, 53,
+ 30, 32, 40, 44, 45, 45, 48, 52, 30, 32, 40, 44, 45, 45, 48, 52,
+ 33, 35, 42, 46, 46, 45, 47, 51, 33, 36, 43, 46, 46, 46, 47, 51,
+ 35, 37, 44, 46, 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50,
+ 37, 40, 47, 47, 47, 45, 47, 50, 41, 42, 47, 49, 49, 48, 50, 52,
+ 42, 43, 47, 49, 50, 49, 50, 53, 44, 44, 47, 50, 51, 51, 52, 54,
+ 49, 46, 48, 52, 53, 53, 54, 57, 49, 46, 48, 52, 53, 53, 54, 57,
+ 48, 46, 47, 51, 54, 55, 57, 59, 48, 46, 47, 51, 54, 56, 57, 60,
+ 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64,
+ 49, 45, 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 63, 66,
+ 50, 46, 46, 52, 56, 59, 64, 67, 51, 47, 47, 52, 56, 60, 65, 68,
+ 52, 48, 47, 53, 57, 61, 66, 71, 52, 48, 47, 53, 57, 61, 66, 71,
+ 54, 49, 48, 54, 58, 62, 68, 73, 54, 50, 49, 54, 58, 62, 68, 73,
+ 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, 70, 76,
+ 57, 52, 50, 55, 59, 64, 70, 76, 59, 54, 52, 57, 61, 65, 72, 78}},
+ {{32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 43, 46, 52,
+ 31, 32, 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, 35, 42, 45, 51,
+ 31, 32, 32, 33, 35, 41, 44, 49, 31, 32, 32, 33, 34, 41, 44, 49,
+ 31, 32, 33, 33, 35, 41, 44, 49, 32, 32, 33, 34, 36, 42, 45, 49,
+ 32, 32, 34, 34, 36, 42, 45, 50, 32, 32, 34, 35, 37, 42, 45, 49,
+ 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 38, 42, 45, 49,
+ 32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, 42, 48, 50, 54,
+ 34, 34, 36, 38, 42, 48, 50, 54, 35, 34, 37, 39, 45, 50, 53, 57,
+ 36, 34, 37, 40, 48, 54, 56, 60, 36, 34, 37, 40, 48, 54, 56, 60,
+ 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65,
+ 39, 37, 40, 42, 50, 58, 60, 65, 42, 40, 42, 44, 52, 61, 64, 69,
+ 44, 41, 42, 45, 53, 63, 66, 71, 44, 41, 43, 45, 54, 63, 66, 72,
+ 47, 44, 45, 47, 56, 66, 69, 75, 48, 45, 46, 48, 56, 67, 70, 76,
+ 49, 46, 47, 48, 57, 67, 71, 77, 53, 49, 49, 51, 59, 71, 74, 81,
+ 53, 49, 50, 51, 60, 71, 75, 82, 55, 51, 51, 53, 61, 72, 76, 83,
+ 58, 54, 54, 55, 63, 75, 79, 87, 58, 54, 54, 55, 63, 75, 79, 87},
+ {32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 36, 39, 47, 48, 49, 50,
+ 31, 31, 37, 40, 47, 47, 48, 50, 31, 31, 37, 40, 47, 47, 48, 50,
+ 30, 32, 38, 40, 46, 45, 46, 48, 30, 32, 38, 41, 46, 45, 46, 48,
+ 31, 33, 38, 41, 46, 45, 46, 48, 33, 35, 41, 43, 47, 45, 46, 47,
+ 33, 36, 41, 44, 47, 46, 46, 47, 34, 37, 42, 45, 47, 45, 46, 47,
+ 37, 40, 45, 47, 47, 45, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47,
+ 39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, 50, 49, 50, 50,
+ 42, 43, 46, 48, 50, 49, 50, 50, 45, 44, 47, 48, 51, 51, 52, 52,
+ 49, 46, 48, 49, 53, 53, 54, 54, 49, 46, 48, 49, 53, 53, 54, 54,
+ 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57,
+ 48, 46, 46, 48, 53, 56, 56, 57, 49, 45, 46, 47, 53, 57, 58, 60,
+ 49, 45, 45, 47, 53, 58, 59, 61, 49, 45, 46, 47, 53, 58, 60, 61,
+ 50, 46, 46, 48, 54, 59, 61, 63, 50, 46, 46, 48, 54, 59, 61, 64,
+ 51, 47, 47, 48, 54, 60, 61, 64, 52, 48, 47, 48, 54, 61, 63, 66,
+ 52, 48, 47, 48, 54, 61, 63, 66, 53, 48, 48, 49, 54, 61, 63, 67,
+ 54, 50, 49, 50, 55, 62, 65, 68, 54, 50, 49, 50, 55, 62, 65, 68}},
+ {{32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 43, 46,
+ 31, 32, 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, 35, 35, 42, 45,
+ 31, 32, 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44,
+ 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 32, 33, 34, 35, 41, 44,
+ 31, 32, 33, 34, 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45,
+ 32, 32, 33, 34, 36, 36, 42, 45, 32, 32, 33, 35, 37, 37, 42, 45,
+ 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 35, 37, 38, 42, 45,
+ 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42, 48, 50,
+ 34, 34, 35, 37, 41, 42, 48, 50, 34, 34, 35, 37, 42, 43, 49, 51,
+ 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56,
+ 36, 34, 36, 38, 46, 48, 54, 56, 38, 36, 37, 40, 47, 49, 56, 58,
+ 39, 37, 39, 40, 48, 50, 58, 60, 39, 37, 39, 40, 48, 50, 58, 60,
+ 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, 51, 53, 63, 66,
+ 44, 41, 42, 43, 51, 53, 63, 66, 44, 42, 42, 43, 51, 54, 64, 67,
+ 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, 67, 70,
+ 48, 45, 45, 46, 54, 56, 67, 70, 51, 47, 48, 48, 56, 58, 69, 73},
+ {32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 33, 38, 45, 47, 48, 49,
+ 31, 31, 34, 38, 45, 47, 47, 48, 31, 31, 34, 38, 45, 47, 47, 48,
+ 31, 32, 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46,
+ 30, 32, 35, 40, 44, 46, 45, 46, 31, 33, 35, 40, 45, 46, 45, 46,
+ 33, 35, 37, 42, 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46,
+ 33, 36, 38, 43, 46, 47, 46, 46, 35, 38, 41, 45, 47, 47, 45, 46,
+ 37, 40, 43, 47, 47, 47, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46,
+ 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50, 49, 50,
+ 42, 43, 44, 47, 49, 50, 49, 50, 43, 43, 45, 47, 50, 50, 50, 50,
+ 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54,
+ 49, 46, 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 52, 53, 55, 55,
+ 48, 46, 46, 47, 51, 53, 56, 56, 48, 46, 46, 47, 51, 53, 56, 56,
+ 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, 51, 53, 58, 59,
+ 49, 45, 45, 46, 51, 53, 58, 59, 49, 45, 45, 46, 52, 53, 58, 60,
+ 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61,
+ 50, 46, 46, 46, 52, 54, 59, 61, 51, 47, 47, 47, 52, 54, 60, 62}},
+ {{32, 31, 31, 32, 32, 36, 36, 44, 31, 31, 31, 32, 32, 35, 35, 43,
+ 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 42,
+ 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 41,
+ 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 34, 34, 41,
+ 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 35, 35, 41,
+ 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, 36, 42,
+ 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 37, 37, 42,
+ 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, 38, 42,
+ 32, 33, 33, 35, 35, 38, 38, 42, 33, 33, 33, 36, 36, 40, 40, 45,
+ 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48,
+ 34, 34, 34, 37, 37, 42, 42, 48, 35, 34, 34, 37, 37, 45, 45, 50,
+ 36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54,
+ 36, 34, 34, 38, 38, 48, 48, 54, 37, 36, 36, 39, 39, 49, 49, 56,
+ 39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58,
+ 39, 37, 37, 40, 40, 50, 50, 58, 41, 39, 39, 42, 42, 52, 52, 60,
+ 44, 41, 41, 43, 43, 53, 53, 63, 44, 41, 41, 43, 43, 53, 53, 63},
+ {32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 37, 37, 47, 47, 48,
+ 31, 31, 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, 38, 47, 47, 47,
+ 31, 31, 31, 38, 38, 47, 47, 47, 31, 32, 32, 39, 39, 46, 46, 46,
+ 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, 40, 46, 46, 45,
+ 30, 32, 32, 40, 40, 46, 46, 45, 32, 34, 34, 41, 41, 46, 46, 45,
+ 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, 47, 46,
+ 33, 36, 36, 43, 43, 47, 47, 46, 35, 38, 38, 45, 45, 47, 47, 45,
+ 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45,
+ 37, 40, 40, 47, 47, 47, 47, 45, 39, 41, 41, 47, 47, 49, 49, 47,
+ 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49,
+ 42, 43, 43, 47, 47, 50, 50, 49, 45, 44, 44, 47, 47, 51, 51, 51,
+ 49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53,
+ 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, 47, 53, 53, 54,
+ 48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56,
+ 48, 46, 46, 47, 47, 53, 53, 56, 48, 45, 45, 46, 46, 53, 53, 57,
+ 49, 45, 45, 46, 46, 53, 53, 58, 49, 45, 45, 46, 46, 53, 53, 58}},
+ {{32, 31, 31, 31, 32, 32, 35, 36, 31, 31, 31, 32, 32, 32, 35, 35,
+ 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35,
+ 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35,
+ 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, 32, 32, 34, 35,
+ 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34,
+ 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 33, 33, 33, 35, 35,
+ 31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, 34, 34, 36, 36,
+ 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 36,
+ 32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 33, 35, 35, 37, 38,
+ 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38,
+ 32, 32, 33, 34, 35, 35, 37, 38, 32, 33, 33, 34, 36, 36, 39, 40,
+ 33, 33, 33, 35, 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42,
+ 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 41, 42,
+ 34, 34, 34, 35, 37, 37, 43, 44, 35, 34, 34, 36, 38, 38, 45, 47,
+ 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48,
+ 36, 35, 34, 36, 38, 38, 46, 48, 37, 36, 36, 37, 39, 39, 46, 49},
+ {32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 33, 37, 37, 45, 48,
+ 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47,
+ 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47,
+ 31, 31, 32, 34, 39, 39, 45, 46, 30, 31, 32, 34, 39, 39, 44, 46,
+ 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46,
+ 30, 32, 32, 35, 40, 40, 44, 46, 31, 33, 33, 36, 41, 41, 45, 46,
+ 33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47,
+ 33, 35, 36, 38, 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47,
+ 35, 37, 37, 40, 44, 44, 46, 47, 36, 38, 39, 42, 46, 46, 47, 47,
+ 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47,
+ 37, 39, 40, 43, 47, 47, 47, 47, 39, 40, 41, 43, 47, 47, 48, 48,
+ 41, 42, 42, 44, 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50,
+ 42, 42, 43, 44, 47, 47, 49, 50, 42, 42, 43, 44, 47, 47, 49, 50,
+ 44, 44, 44, 45, 47, 47, 50, 51, 47, 46, 46, 46, 48, 48, 51, 52,
+ 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53,
+ 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 47, 47, 52, 53}},
+ {{32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 33,
+ 31, 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 33, 33, 34,
+ 31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34,
+ 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34,
+ 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 35,
+ 32, 32, 32, 32, 33, 35, 35, 35, 32, 32, 33, 33, 33, 35, 35, 36,
+ 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36,
+ 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36,
+ 32, 33, 33, 33, 34, 36, 36, 36, 33, 33, 33, 33, 34, 36, 36, 37,
+ 34, 34, 34, 34, 35, 37, 37, 38, 34, 34, 34, 34, 35, 37, 37, 38},
+ {32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 37, 37, 39,
+ 31, 31, 31, 31, 33, 38, 38, 39, 31, 31, 31, 31, 34, 38, 38, 40,
+ 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40,
+ 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40,
+ 31, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 34, 39, 39, 40,
+ 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41,
+ 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41,
+ 31, 32, 33, 33, 35, 40, 40, 41, 32, 33, 34, 34, 36, 41, 41, 42,
+ 33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44,
+ 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44,
+ 33, 35, 36, 36, 38, 43, 43, 44, 34, 36, 37, 37, 39, 44, 44, 45,
+ 35, 37, 38, 38, 41, 45, 45, 46, 36, 38, 39, 39, 42, 47, 47, 47,
+ 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+ 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+ 39, 40, 41, 41, 43, 47, 47, 47, 40, 41, 42, 42, 44, 47, 47, 47,
+ 42, 42, 43, 43, 44, 47, 47, 48, 42, 42, 43, 43, 44, 47, 47, 48}},
+ {{32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33,
+ 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34},
+ {32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 35,
+ 31, 31, 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, 31, 31, 33, 36,
+ 31, 31, 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 32, 32, 32, 34, 37, 31, 31, 31, 32, 32, 32, 34, 37,
+ 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 31, 31, 32, 33, 33, 33, 35, 38,
+ 31, 32, 33, 33, 33, 33, 36, 39, 32, 33, 34, 34, 34, 34, 37, 40,
+ 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, 36, 36, 38, 41,
+ 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41,
+ 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41,
+ 33, 34, 35, 36, 36, 36, 38, 41, 34, 35, 36, 36, 36, 36, 39, 42}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32,
+ 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32,
+ 30, 31, 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32}}};
+constexpr uint8_t kQuantizerMatrix16x32
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][512] = {
+ {{32, 31, 32, 34, 36, 44, 53, 59, 65, 79, 87, 90, 93, 96,
+ 99, 102, 31, 32, 32, 34, 35, 42, 51, 56, 62, 75, 82, 85,
+ 88, 91, 94, 97, 31, 32, 33, 33, 34, 41, 49, 54, 59, 72,
+ 78, 82, 86, 90, 93, 97, 31, 32, 33, 34, 35, 41, 49, 54,
+ 59, 71, 78, 81, 84, 87, 90, 93, 32, 32, 34, 35, 36, 42,
+ 50, 54, 59, 71, 77, 80, 82, 86, 89, 93, 32, 33, 35, 37,
+ 38, 42, 49, 53, 58, 69, 75, 78, 82, 86, 89, 92, 34, 34,
+ 37, 39, 42, 48, 54, 58, 63, 73, 79, 78, 80, 83, 88, 92,
+ 35, 34, 37, 41, 45, 50, 57, 61, 65, 76, 82, 83, 84, 84,
+ 87, 90, 36, 34, 38, 43, 48, 54, 60, 64, 68, 78, 84, 87,
+ 86, 89, 90, 90, 39, 37, 40, 45, 50, 58, 65, 69, 73, 84,
+ 89, 89, 91, 91, 93, 96, 44, 41, 43, 48, 53, 63, 71, 75,
+ 79, 90, 95, 93, 94, 95, 97, 97, 46, 43, 44, 49, 55, 65,
+ 73, 78, 82, 93, 98, 100, 98, 100, 99, 103, 48, 45, 46, 51,
+ 56, 67, 76, 80, 85, 96, 102, 102, 105, 102, 105, 104, 53, 49,
+ 50, 54, 60, 71, 82, 87, 92, 103, 109, 107, 107, 110, 107, 111,
+ 58, 54, 54, 58, 63, 75, 87, 92, 98, 110, 116, 115, 112, 111,
+ 115, 112, 61, 57, 56, 60, 66, 77, 89, 95, 101, 114, 120, 118,
+ 119, 118, 116, 120, 65, 60, 58, 63, 68, 79, 92, 98, 105, 118,
+ 124, 123, 122, 123, 124, 121, 71, 65, 63, 68, 73, 84, 97, 103,
+ 111, 125, 132, 132, 130, 128, 127, 130, 79, 72, 70, 74, 79, 90,
+ 104, 110, 118, 133, 141, 136, 135, 135, 135, 131, 81, 74, 71, 75,
+ 80, 91, 105, 112, 119, 135, 142, 140, 140, 138, 139, 142, 82, 75,
+ 72, 76, 81, 92, 106, 113, 121, 136, 144, 151, 149, 149, 146, 143,
+ 88, 80, 77, 80, 85, 97, 108, 115, 126, 142, 149, 153, 153, 152,
+ 152, 154, 91, 83, 80, 81, 88, 100, 106, 114, 130, 142, 148, 155,
+ 162, 160, 159, 155, 94, 85, 83, 82, 91, 100, 105, 118, 131, 137,
+ 153, 160, 165, 167, 166, 168, 97, 88, 86, 85, 94, 100, 107, 123,
+ 128, 140, 157, 161, 167, 173, 171, 169, 100, 91, 89, 87, 97, 100,
+ 111, 121, 127, 145, 152, 164, 173, 178, 182, 181, 103, 94, 93, 90,
+ 98, 101, 114, 120, 131, 144, 150, 170, 174, 180, 186, 183, 107, 97,
+ 96, 93, 100, 104, 117, 119, 136, 142, 155, 168, 177, 187, 191, 198,
+ 110, 101, 100, 97, 101, 108, 117, 123, 138, 141, 161, 165, 183, 188,
+ 193, 200, 114, 104, 104, 100, 103, 112, 117, 127, 137, 146, 159, 167,
+ 185, 190, 201, 206, 118, 108, 107, 103, 105, 115, 118, 131, 136, 151,
+ 157, 172, 182, 197, 203, 208, 122, 111, 111, 107, 107, 119, 119, 136,
+ 136, 156, 156, 178, 179, 203, 204, 217},
+ {32, 31, 37, 42, 48, 49, 52, 54, 57, 63, 66, 67, 68, 69, 71, 72,
+ 31, 31, 38, 42, 47, 47, 50, 52, 54, 60, 63, 64, 65, 66, 67, 68,
+ 30, 32, 40, 42, 46, 45, 48, 50, 52, 57, 60, 62, 63, 65, 66, 68,
+ 32, 34, 41, 44, 46, 45, 48, 49, 51, 57, 59, 61, 62, 63, 64, 65,
+ 33, 36, 43, 45, 47, 46, 47, 49, 51, 56, 59, 60, 60, 62, 63, 65,
+ 37, 40, 47, 47, 47, 45, 47, 48, 50, 54, 57, 58, 60, 61, 62, 63,
+ 42, 43, 47, 48, 50, 49, 50, 52, 53, 57, 60, 58, 59, 60, 62, 63,
+ 45, 44, 47, 49, 51, 51, 52, 54, 55, 59, 61, 61, 61, 60, 61, 61,
+ 49, 46, 48, 50, 53, 53, 54, 55, 57, 60, 62, 63, 62, 63, 62, 62,
+ 48, 46, 47, 50, 53, 56, 57, 59, 60, 64, 66, 65, 65, 64, 64, 65,
+ 49, 45, 46, 49, 53, 58, 61, 62, 64, 67, 69, 67, 66, 66, 66, 65,
+ 49, 46, 46, 49, 53, 59, 62, 64, 65, 69, 71, 70, 68, 68, 67, 68,
+ 50, 46, 46, 50, 54, 59, 64, 65, 67, 71, 73, 72, 72, 70, 70, 69,
+ 52, 48, 47, 50, 54, 61, 66, 68, 71, 75, 77, 74, 73, 73, 71, 72,
+ 54, 50, 49, 52, 55, 62, 68, 71, 73, 78, 80, 78, 76, 74, 75, 73,
+ 55, 51, 49, 52, 56, 63, 69, 72, 75, 80, 82, 80, 79, 78, 76, 77,
+ 57, 52, 50, 53, 56, 64, 70, 73, 76, 82, 84, 82, 80, 80, 79, 77,
+ 60, 54, 52, 55, 58, 65, 72, 75, 79, 85, 88, 86, 84, 82, 81, 81,
+ 63, 57, 55, 58, 60, 67, 75, 78, 82, 89, 92, 88, 87, 85, 84, 81,
+ 64, 58, 55, 58, 61, 68, 75, 78, 82, 89, 92, 90, 89, 87, 86, 86,
+ 64, 59, 56, 58, 61, 68, 75, 79, 83, 90, 93, 95, 93, 91, 89, 87,
+ 67, 61, 58, 60, 63, 69, 76, 79, 85, 92, 95, 96, 94, 92, 91, 91,
+ 68, 62, 59, 60, 64, 71, 74, 78, 86, 91, 94, 96, 98, 96, 94, 91,
+ 69, 62, 60, 60, 65, 70, 72, 79, 85, 88, 95, 98, 99, 98, 97, 96,
+ 70, 63, 62, 60, 66, 69, 73, 81, 83, 89, 96, 97, 99, 101, 98, 97,
+ 71, 64, 63, 61, 67, 68, 74, 79, 82, 90, 93, 98, 102, 102, 102, 101,
+ 72, 65, 64, 62, 66, 68, 75, 78, 83, 89, 92, 100, 101, 103, 104, 102,
+ 73, 66, 65, 63, 66, 69, 75, 76, 84, 87, 93, 98, 102, 105, 106, 107,
+ 74, 67, 67, 64, 66, 70, 74, 77, 84, 86, 94, 96, 103, 105, 106, 107,
+ 75, 68, 68, 65, 66, 71, 74, 78, 83, 87, 93, 96, 103, 105, 109, 109,
+ 76, 69, 69, 66, 67, 72, 73, 80, 82, 88, 91, 97, 101, 107, 109, 110,
+ 77, 70, 70, 67, 67, 73, 73, 81, 81, 90, 90, 99, 99, 108, 108, 113}},
+ {{32, 31, 32, 32, 36, 44, 47, 53, 65, 73, 79, 87, 90, 93,
+ 96, 99, 31, 32, 32, 33, 35, 42, 45, 51, 62, 69, 75, 83,
+ 86, 88, 91, 94, 31, 32, 32, 33, 35, 41, 44, 49, 60, 67,
+ 72, 80, 84, 87, 90, 94, 31, 32, 33, 33, 35, 41, 44, 49,
+ 59, 66, 71, 79, 82, 84, 87, 90, 32, 32, 34, 34, 36, 42,
+ 45, 50, 59, 65, 71, 78, 80, 83, 87, 90, 32, 33, 35, 36,
+ 38, 42, 45, 49, 58, 64, 69, 76, 80, 83, 86, 88, 32, 33,
+ 35, 36, 40, 44, 47, 51, 60, 66, 71, 76, 78, 81, 85, 89,
+ 34, 34, 36, 38, 42, 48, 50, 54, 63, 69, 73, 80, 82, 81,
+ 84, 86, 36, 34, 37, 40, 48, 54, 56, 60, 68, 74, 78, 84,
+ 83, 86, 87, 87, 38, 36, 39, 41, 49, 56, 58, 63, 71, 77,
+ 81, 86, 88, 88, 90, 93, 39, 37, 40, 42, 50, 58, 60, 65,
+ 73, 79, 84, 90, 91, 92, 94, 93, 44, 41, 42, 45, 53, 63,
+ 66, 71, 79, 85, 90, 96, 94, 96, 96, 99, 47, 44, 45, 47,
+ 56, 66, 69, 75, 84, 90, 95, 99, 101, 98, 101, 99, 49, 46,
+ 47, 48, 57, 67, 71, 77, 86, 93, 97, 103, 103, 105, 102, 106,
+ 53, 49, 50, 51, 60, 71, 75, 82, 92, 99, 103, 111, 108, 107,
+ 110, 107, 58, 54, 54, 55, 63, 75, 79, 87, 98, 105, 110, 114,
+ 114, 113, 111, 115, 61, 56, 56, 57, 65, 77, 81, 89, 100, 107,
+ 113, 118, 116, 117, 118, 116, 65, 60, 59, 60, 68, 79, 84, 92,
+ 105, 112, 118, 126, 124, 122, 121, 124, 71, 65, 64, 65, 73, 84,
+ 89, 97, 111, 119, 125, 130, 129, 129, 129, 125, 76, 69, 68, 69,
+ 76, 88, 92, 101, 115, 123, 130, 134, 134, 131, 132, 135, 79, 72,
+ 70, 71, 79, 90, 95, 104, 118, 127, 133, 143, 142, 141, 138, 136,
+ 82, 75, 73, 74, 81, 92, 97, 106, 121, 130, 136, 146, 145, 144,
+ 144, 145, 86, 78, 76, 77, 84, 95, 100, 109, 124, 133, 140, 147,
+ 153, 151, 150, 146, 89, 81, 79, 78, 87, 95, 99, 112, 124, 130,
+ 145, 152, 156, 157, 156, 158, 92, 84, 82, 80, 89, 95, 101, 116,
+ 121, 132, 148, 151, 157, 163, 161, 159, 95, 86, 85, 83, 92, 95,
+ 105, 114, 120, 136, 143, 155, 163, 167, 171, 170, 98, 89, 88, 85,
+ 93, 95, 108, 113, 124, 136, 141, 160, 163, 169, 174, 171, 101, 92,
+ 91, 88, 94, 98, 110, 112, 128, 133, 146, 158, 166, 175, 179, 185,
+ 104, 95, 94, 91, 95, 101, 110, 115, 129, 132, 151, 154, 171, 175,
+ 181, 186, 107, 98, 97, 94, 96, 105, 110, 119, 128, 136, 149, 156,
+ 173, 177, 188, 192, 110, 101, 100, 97, 98, 108, 111, 123, 127, 141,
+ 147, 161, 169, 183, 188, 193, 114, 104, 104, 100, 100, 111, 111, 126,
+ 127, 145, 145, 166, 166, 189, 190, 201},
+ {32, 31, 35, 38, 48, 49, 50, 52, 57, 61, 63, 67, 68, 69, 70, 71,
+ 31, 31, 37, 40, 47, 47, 48, 50, 54, 57, 60, 63, 64, 65, 66, 67,
+ 30, 32, 38, 40, 46, 45, 46, 48, 52, 55, 58, 61, 63, 64, 65, 67,
+ 31, 33, 38, 41, 46, 45, 46, 48, 52, 55, 57, 60, 61, 62, 63, 64,
+ 33, 36, 41, 44, 47, 46, 46, 47, 51, 54, 56, 59, 60, 61, 63, 64,
+ 37, 40, 45, 47, 47, 45, 46, 47, 50, 52, 54, 57, 59, 61, 62, 62,
+ 39, 41, 46, 47, 48, 47, 47, 48, 51, 54, 55, 57, 58, 59, 61, 62,
+ 42, 43, 46, 48, 50, 49, 50, 50, 53, 56, 57, 60, 60, 59, 60, 60,
+ 49, 46, 48, 49, 53, 53, 54, 54, 57, 59, 60, 63, 61, 62, 61, 61,
+ 48, 46, 47, 48, 53, 55, 55, 56, 58, 61, 62, 64, 64, 63, 63, 64,
+ 48, 46, 46, 48, 53, 56, 56, 57, 60, 62, 64, 66, 65, 65, 65, 64,
+ 49, 45, 45, 47, 53, 58, 59, 61, 64, 66, 67, 69, 67, 67, 66, 67,
+ 50, 46, 46, 48, 54, 59, 61, 63, 66, 68, 70, 71, 71, 68, 69, 67,
+ 51, 47, 47, 48, 54, 60, 61, 64, 68, 70, 71, 73, 72, 72, 70, 71,
+ 52, 48, 47, 48, 54, 61, 63, 66, 71, 73, 75, 77, 75, 73, 74, 71,
+ 54, 50, 49, 50, 55, 62, 65, 68, 73, 76, 78, 79, 78, 76, 74, 75,
+ 55, 51, 49, 50, 56, 63, 65, 69, 74, 77, 79, 81, 79, 78, 78, 75,
+ 57, 52, 50, 51, 56, 64, 66, 70, 76, 79, 82, 85, 83, 81, 79, 79,
+ 60, 54, 53, 53, 58, 65, 68, 72, 79, 82, 85, 87, 85, 84, 82, 80,
+ 62, 56, 54, 55, 60, 66, 69, 74, 81, 84, 87, 88, 87, 85, 84, 84,
+ 63, 57, 55, 56, 60, 67, 70, 75, 82, 86, 89, 92, 91, 89, 87, 84,
+ 64, 59, 56, 57, 61, 68, 71, 75, 83, 87, 90, 93, 92, 90, 89, 89,
+ 66, 60, 58, 58, 62, 69, 72, 76, 84, 88, 91, 94, 95, 93, 91, 89,
+ 67, 61, 59, 58, 63, 68, 71, 78, 83, 86, 93, 96, 96, 96, 94, 94,
+ 68, 62, 60, 59, 64, 67, 71, 79, 81, 86, 94, 95, 97, 98, 96, 94,
+ 69, 63, 61, 60, 65, 66, 72, 77, 80, 88, 91, 96, 99, 99, 100, 98,
+ 70, 64, 62, 60, 65, 66, 73, 76, 81, 87, 89, 97, 98, 100, 101, 99,
+ 71, 65, 64, 61, 65, 67, 73, 74, 82, 85, 90, 95, 99, 102, 103, 104,
+ 72, 65, 65, 62, 65, 68, 72, 75, 82, 83, 92, 93, 100, 102, 103, 104,
+ 73, 66, 66, 63, 65, 69, 72, 76, 81, 85, 90, 93, 100, 102, 105, 106,
+ 74, 67, 67, 64, 65, 70, 71, 77, 79, 86, 89, 94, 98, 103, 105, 106,
+ 75, 68, 68, 65, 65, 71, 71, 78, 78, 87, 87, 96, 96, 105, 105, 109}},
+ {{32, 31, 32, 32, 36, 39, 44, 53, 58, 65, 79, 81, 88, 90,
+ 93, 96, 31, 32, 32, 32, 35, 38, 42, 51, 55, 62, 75, 77,
+ 83, 86, 88, 91, 31, 32, 32, 32, 35, 38, 41, 50, 54, 60,
+ 73, 75, 81, 84, 88, 91, 31, 32, 32, 33, 34, 37, 41, 49,
+ 53, 59, 72, 74, 79, 82, 84, 87, 32, 32, 33, 34, 36, 39,
+ 42, 50, 53, 59, 71, 72, 78, 81, 84, 87, 32, 32, 34, 34,
+ 37, 40, 42, 49, 53, 58, 70, 71, 77, 80, 83, 85, 32, 33,
+ 34, 35, 38, 40, 42, 49, 52, 58, 69, 70, 76, 78, 82, 86,
+ 34, 34, 35, 37, 42, 45, 48, 54, 57, 63, 73, 75, 79, 79,
+ 81, 83, 34, 34, 36, 37, 44, 47, 50, 56, 59, 65, 75, 77,
+ 81, 83, 84, 84, 36, 34, 37, 38, 48, 51, 54, 60, 63, 68,
+ 78, 80, 85, 85, 86, 89, 39, 37, 39, 40, 50, 54, 58, 65,
+ 68, 73, 84, 85, 88, 89, 90, 89, 40, 38, 40, 41, 51, 55,
+ 59, 67, 70, 75, 85, 87, 91, 92, 92, 95, 44, 41, 42, 43,
+ 53, 58, 63, 71, 74, 79, 90, 91, 97, 94, 97, 95, 47, 44,
+ 45, 46, 56, 61, 66, 75, 79, 85, 95, 97, 99, 101, 98, 102,
+ 49, 46, 46, 47, 57, 62, 67, 77, 81, 86, 97, 99, 104, 102,
+ 105, 102, 53, 49, 50, 50, 60, 65, 71, 82, 86, 92, 103, 105,
+ 109, 108, 106, 110, 57, 53, 53, 53, 63, 68, 74, 86, 90, 97,
+ 108, 110, 111, 112, 113, 110, 59, 54, 54, 54, 64, 69, 75, 87,
+ 91, 98, 111, 112, 119, 117, 115, 118, 65, 60, 59, 58, 68, 73,
+ 79, 92, 97, 105, 118, 119, 123, 123, 122, 119, 69, 63, 62, 62,
+ 71, 76, 83, 96, 100, 109, 122, 124, 127, 125, 125, 128, 71, 65,
+ 64, 63, 73, 78, 84, 97, 102, 111, 125, 127, 135, 134, 131, 129,
+ 79, 72, 71, 70, 79, 84, 90, 104, 109, 118, 133, 135, 137, 136,
+ 136, 137, 81, 74, 72, 71, 80, 85, 91, 105, 110, 120, 135, 137,
+ 145, 143, 141, 138, 82, 75, 73, 72, 81, 86, 92, 106, 111, 121,
+ 136, 139, 147, 148, 147, 149, 87, 79, 77, 76, 85, 90, 96, 110,
+ 114, 125, 140, 143, 148, 154, 151, 149, 90, 82, 80, 78, 87, 89,
+ 99, 108, 113, 129, 135, 146, 153, 157, 160, 159, 92, 84, 83, 81,
+ 88, 90, 102, 106, 117, 128, 133, 150, 153, 158, 163, 160, 95, 87,
+ 85, 83, 88, 92, 103, 105, 120, 125, 137, 148, 155, 164, 168, 173,
+ 98, 89, 88, 85, 89, 95, 103, 108, 121, 124, 141, 144, 160, 164,
+ 169, 174, 100, 92, 91, 88, 90, 98, 103, 111, 120, 127, 139, 146,
+ 161, 165, 175, 179, 103, 94, 94, 90, 92, 101, 103, 114, 119, 131,
+ 137, 150, 158, 170, 175, 180, 106, 97, 97, 93, 93, 104, 104, 118,
+ 118, 135, 135, 154, 155, 175, 176, 187},
+ {32, 31, 34, 37, 48, 48, 49, 52, 54, 57, 63, 64, 67, 68, 69, 69,
+ 31, 31, 35, 38, 47, 47, 47, 50, 51, 54, 60, 61, 63, 64, 65, 66,
+ 31, 32, 36, 39, 46, 46, 46, 48, 50, 53, 58, 59, 62, 63, 65, 66,
+ 30, 32, 36, 40, 46, 45, 45, 48, 49, 52, 57, 58, 60, 61, 62, 63,
+ 33, 36, 40, 43, 47, 46, 46, 47, 49, 51, 56, 57, 59, 60, 62, 63,
+ 35, 38, 42, 45, 47, 46, 45, 47, 48, 50, 55, 56, 58, 60, 61, 61,
+ 37, 40, 44, 47, 47, 46, 45, 47, 48, 50, 54, 55, 57, 58, 60, 61,
+ 42, 43, 45, 47, 50, 50, 49, 50, 51, 53, 57, 58, 59, 58, 59, 59,
+ 44, 44, 46, 47, 51, 51, 51, 52, 53, 54, 59, 59, 60, 61, 61, 60,
+ 49, 46, 47, 48, 53, 53, 53, 54, 55, 57, 60, 61, 63, 62, 62, 63,
+ 48, 46, 46, 47, 53, 54, 56, 57, 58, 60, 64, 64, 64, 64, 64, 63,
+ 48, 45, 46, 46, 53, 55, 56, 58, 59, 61, 65, 65, 66, 66, 65, 66,
+ 49, 45, 45, 46, 53, 56, 58, 61, 62, 64, 67, 68, 70, 67, 68, 66,
+ 50, 46, 46, 46, 54, 56, 59, 63, 65, 66, 70, 71, 70, 71, 68, 70,
+ 51, 47, 47, 47, 54, 57, 60, 64, 65, 68, 71, 72, 73, 71, 72, 70,
+ 52, 48, 47, 47, 54, 57, 61, 66, 68, 71, 75, 75, 76, 75, 73, 73,
+ 54, 49, 49, 48, 55, 58, 62, 68, 70, 73, 77, 78, 77, 77, 76, 74,
+ 54, 50, 49, 49, 55, 59, 62, 68, 70, 74, 78, 79, 81, 79, 77, 78,
+ 57, 52, 51, 50, 56, 60, 64, 70, 73, 76, 82, 82, 83, 82, 81, 78,
+ 59, 54, 52, 52, 58, 61, 65, 72, 74, 78, 84, 85, 85, 83, 82, 82,
+ 60, 54, 53, 52, 58, 62, 65, 72, 75, 79, 85, 86, 89, 87, 85, 82,
+ 63, 57, 56, 55, 60, 64, 67, 75, 77, 82, 89, 90, 90, 88, 87, 86,
+ 64, 58, 57, 55, 61, 64, 68, 75, 78, 82, 89, 90, 93, 91, 89, 87,
+ 64, 59, 57, 56, 61, 65, 68, 75, 78, 83, 90, 91, 94, 93, 92, 91,
+ 66, 60, 59, 57, 63, 66, 69, 77, 79, 84, 91, 93, 94, 95, 93, 91,
+ 67, 61, 60, 58, 63, 65, 70, 75, 78, 85, 88, 93, 96, 97, 97, 95,
+ 68, 62, 61, 59, 63, 64, 71, 74, 79, 84, 87, 94, 96, 97, 98, 96,
+ 69, 63, 62, 60, 63, 65, 71, 72, 80, 82, 88, 93, 96, 99, 100, 101,
+ 70, 64, 63, 60, 63, 66, 70, 73, 80, 81, 89, 90, 97, 99, 100, 101,
+ 71, 65, 64, 61, 63, 67, 70, 74, 78, 82, 88, 90, 97, 99, 102, 103,
+ 72, 65, 65, 62, 63, 68, 69, 75, 77, 83, 86, 92, 95, 100, 102, 103,
+ 73, 66, 66, 63, 63, 69, 69, 76, 76, 84, 84, 93, 93, 101, 101, 105}},
+ {{32, 31, 31, 32, 35, 36, 44, 47, 53, 62, 65, 79, 82, 88,
+ 90, 93, 31, 32, 32, 32, 35, 35, 42, 45, 51, 59, 62, 75,
+ 78, 83, 86, 88, 31, 32, 32, 32, 34, 35, 41, 45, 50, 58,
+ 61, 74, 76, 82, 85, 88, 31, 32, 32, 33, 34, 34, 41, 44,
+ 49, 57, 59, 72, 74, 79, 82, 84, 31, 32, 33, 34, 35, 36,
+ 42, 44, 49, 57, 59, 71, 73, 79, 81, 84, 32, 32, 33, 34,
+ 36, 36, 42, 45, 50, 57, 59, 71, 73, 78, 80, 82, 32, 33,
+ 34, 35, 37, 38, 42, 45, 49, 56, 58, 69, 71, 76, 79, 83,
+ 32, 33, 34, 36, 39, 40, 44, 47, 51, 58, 60, 71, 73, 76,
+ 78, 80, 34, 34, 35, 37, 41, 42, 48, 50, 54, 61, 63, 73,
+ 76, 81, 81, 80, 35, 34, 36, 38, 45, 47, 52, 55, 59, 65,
+ 67, 77, 79, 82, 83, 86, 36, 34, 36, 38, 46, 48, 54, 56,
+ 60, 66, 68, 78, 80, 85, 87, 86, 39, 37, 39, 40, 48, 50,
+ 58, 60, 65, 71, 73, 84, 86, 89, 88, 91, 41, 39, 40, 41,
+ 49, 51, 60, 62, 67, 74, 76, 86, 88, 91, 93, 91, 44, 41,
+ 42, 43, 51, 53, 63, 66, 71, 78, 79, 90, 92, 97, 94, 97,
+ 47, 44, 44, 45, 53, 56, 66, 69, 75, 82, 84, 95, 97, 98,
+ 101, 98, 48, 45, 45, 46, 54, 56, 67, 70, 76, 83, 85, 96,
+ 98, 104, 101, 105, 53, 49, 50, 50, 57, 60, 71, 75, 82, 90,
+ 92, 103, 106, 107, 108, 105, 55, 51, 51, 51, 59, 61, 72, 77,
+ 84, 92, 94, 106, 108, 111, 110, 112, 58, 54, 54, 54, 61, 63,
+ 75, 79, 87, 95, 98, 110, 112, 117, 116, 113, 63, 58, 58, 57,
+ 65, 67, 78, 83, 91, 100, 103, 116, 118, 119, 119, 121, 65, 60,
+ 59, 58, 66, 68, 79, 84, 92, 102, 105, 118, 120, 127, 124, 122,
+ 71, 65, 64, 63, 71, 73, 84, 89, 97, 108, 111, 125, 127, 129,
+ 129, 130, 74, 68, 67, 66, 73, 75, 86, 91, 100, 110, 113, 128,
+ 131, 135, 134, 130, 79, 72, 71, 70, 77, 79, 90, 95, 104, 115,
+ 118, 133, 136, 140, 139, 140, 82, 75, 73, 72, 79, 81, 92, 97,
+ 105, 117, 120, 136, 139, 145, 142, 140, 82, 75, 74, 72, 79, 81,
+ 92, 97, 106, 117, 121, 136, 139, 148, 150, 149, 87, 79, 78, 76,
+ 83, 85, 96, 100, 110, 120, 125, 141, 144, 148, 153, 150, 89, 82,
+ 81, 78, 83, 87, 97, 99, 113, 118, 128, 139, 145, 153, 157, 161,
+ 92, 84, 83, 80, 84, 89, 97, 101, 114, 116, 132, 135, 150, 153,
+ 157, 162, 94, 86, 85, 82, 85, 92, 97, 104, 112, 119, 130, 136,
+ 151, 154, 163, 166, 97, 88, 88, 85, 86, 94, 97, 107, 111, 123,
+ 128, 140, 147, 159, 163, 167, 99, 91, 91, 87, 87, 97, 97, 110,
+ 110, 126, 126, 144, 144, 163, 163, 173},
+ {32, 31, 33, 37, 45, 48, 49, 50, 52, 56, 57, 63, 64, 67, 68, 68, 31,
+ 31, 34, 38, 45, 47, 47, 48, 50, 53, 54, 60, 61, 63, 64, 65, 31, 32,
+ 34, 39, 45, 46, 46, 47, 49, 52, 53, 59, 60, 62, 64, 65, 30, 32, 35,
+ 40, 44, 46, 45, 46, 48, 51, 52, 57, 58, 60, 61, 62, 33, 35, 37, 42,
+ 46, 47, 45, 46, 47, 50, 51, 56, 57, 60, 61, 62, 33, 36, 38, 43, 46,
+ 47, 46, 46, 47, 50, 51, 56, 57, 59, 60, 60, 37, 40, 43, 47, 47, 47,
+ 45, 46, 47, 49, 50, 54, 55, 57, 59, 61, 39, 41, 43, 47, 48, 48, 47,
+ 47, 48, 50, 51, 55, 56, 57, 58, 59, 42, 43, 44, 47, 49, 50, 49, 50,
+ 50, 53, 53, 57, 58, 60, 60, 59, 47, 46, 46, 48, 51, 52, 53, 53, 53,
+ 55, 56, 60, 61, 61, 61, 62, 49, 46, 47, 48, 52, 53, 53, 54, 54, 56,
+ 57, 60, 61, 63, 63, 62, 48, 46, 46, 47, 51, 53, 56, 56, 57, 59, 60,
+ 64, 64, 65, 64, 65, 48, 45, 46, 46, 51, 53, 57, 57, 59, 61, 61, 65,
+ 66, 66, 67, 65, 49, 45, 45, 46, 51, 53, 58, 59, 61, 63, 64, 67, 68,
+ 70, 67, 68, 50, 46, 46, 46, 52, 54, 59, 61, 63, 65, 66, 70, 71, 70,
+ 71, 68, 50, 46, 46, 46, 52, 54, 59, 61, 64, 66, 67, 71, 71, 73, 71,
+ 72, 52, 48, 47, 47, 53, 54, 61, 63, 66, 70, 71, 75, 75, 75, 74, 72,
+ 53, 49, 48, 48, 53, 55, 61, 64, 67, 71, 72, 76, 77, 77, 75, 76, 54,
+ 50, 49, 49, 54, 55, 62, 65, 68, 72, 73, 78, 79, 80, 79, 76, 56, 51,
+ 51, 50, 55, 56, 63, 66, 70, 74, 76, 81, 82, 81, 80, 80, 57, 52, 51,
+ 50, 55, 56, 64, 66, 70, 75, 76, 82, 83, 85, 83, 80, 60, 54, 54, 52,
+ 57, 58, 65, 68, 72, 77, 79, 85, 86, 86, 85, 84, 61, 56, 55, 53, 58,
+ 59, 66, 69, 73, 79, 80, 86, 87, 89, 87, 84, 63, 57, 56, 55, 59, 60,
+ 67, 70, 75, 80, 82, 89, 90, 91, 89, 89, 64, 58, 57, 56, 60, 61, 68,
+ 71, 75, 81, 83, 90, 91, 93, 91, 89, 64, 59, 58, 56, 60, 61, 68, 71,
+ 75, 81, 83, 90, 91, 94, 94, 93, 66, 60, 59, 57, 61, 63, 69, 72, 77,
+ 82, 84, 92, 93, 94, 95, 93, 67, 61, 60, 58, 61, 63, 69, 70, 78, 80,
+ 85, 90, 93, 96, 97, 97, 68, 62, 61, 59, 61, 64, 68, 71, 77, 79, 86,
+ 88, 94, 96, 97, 98, 69, 63, 62, 59, 61, 65, 68, 72, 76, 80, 85, 88,
+ 94, 95, 99, 99, 70, 63, 63, 60, 61, 66, 67, 73, 75, 81, 83, 89, 92,
+ 97, 98, 99, 70, 64, 64, 61, 61, 67, 67, 74, 74, 82, 82, 90, 90, 98,
+ 98, 102}},
+ {{32, 31, 31, 32, 33, 36, 40, 44, 51, 53, 65, 66, 79, 81,
+ 87, 90, 31, 32, 32, 32, 33, 35, 39, 42, 49, 51, 62, 63,
+ 75, 77, 83, 85, 31, 32, 32, 32, 33, 35, 39, 42, 49, 51,
+ 61, 62, 74, 76, 82, 85, 31, 32, 32, 33, 33, 34, 38, 41,
+ 47, 49, 59, 60, 72, 74, 79, 81, 31, 32, 32, 33, 34, 35,
+ 38, 41, 47, 49, 59, 60, 71, 73, 79, 81, 32, 32, 33, 34,
+ 35, 36, 39, 42, 48, 50, 59, 60, 71, 72, 78, 80, 32, 32,
+ 33, 35, 36, 37, 40, 42, 48, 49, 58, 59, 69, 71, 77, 80,
+ 32, 33, 33, 35, 36, 38, 41, 42, 48, 49, 58, 59, 69, 70,
+ 75, 77, 33, 33, 34, 36, 38, 41, 44, 46, 52, 53, 62, 63,
+ 72, 74, 78, 78, 34, 34, 34, 37, 39, 42, 45, 48, 53, 54,
+ 63, 64, 73, 75, 80, 83, 36, 34, 35, 38, 42, 48, 51, 54,
+ 58, 60, 68, 69, 78, 80, 84, 83, 36, 35, 35, 38, 42, 48,
+ 51, 54, 59, 60, 68, 69, 79, 80, 85, 87, 39, 37, 38, 40,
+ 44, 50, 54, 58, 63, 65, 73, 74, 84, 85, 89, 88, 40, 38,
+ 39, 41, 45, 51, 56, 59, 65, 67, 75, 76, 85, 87, 90, 93,
+ 44, 41, 41, 43, 46, 53, 59, 63, 69, 71, 79, 80, 90, 91,
+ 96, 93, 46, 43, 43, 44, 48, 55, 60, 65, 72, 73, 82, 83,
+ 93, 94, 97, 100, 48, 45, 45, 46, 50, 56, 62, 67, 74, 76,
+ 85, 86, 96, 98, 103, 100, 52, 48, 48, 49, 52, 59, 65, 70,
+ 78, 80, 90, 91, 101, 103, 105, 107, 53, 49, 49, 50, 53, 60,
+ 66, 71, 79, 82, 92, 93, 103, 105, 111, 107, 58, 53, 53, 53,
+ 57, 63, 69, 74, 83, 86, 97, 98, 109, 111, 113, 115, 58, 54,
+ 54, 54, 57, 63, 70, 75, 84, 87, 98, 99, 110, 112, 118, 115,
+ 65, 60, 59, 58, 62, 68, 74, 79, 89, 92, 105, 106, 118, 119,
+ 122, 123, 66, 61, 60, 59, 63, 69, 75, 80, 90, 93, 106, 107,
+ 119, 121, 126, 123, 71, 65, 65, 63, 67, 73, 79, 84, 94, 97,
+ 111, 112, 125, 127, 131, 132, 74, 68, 67, 66, 69, 75, 81, 86,
+ 97, 100, 113, 115, 128, 130, 134, 132, 79, 72, 72, 70, 73, 79,
+ 85, 90, 101, 104, 118, 119, 133, 135, 141, 140, 81, 74, 73, 71,
+ 75, 80, 86, 91, 102, 105, 120, 121, 135, 137, 143, 140, 82, 75,
+ 74, 72, 75, 81, 87, 92, 103, 106, 121, 122, 136, 139, 147, 151,
+ 86, 78, 78, 75, 78, 84, 90, 95, 106, 109, 124, 125, 140, 142,
+ 147, 151, 88, 81, 80, 77, 80, 86, 90, 98, 105, 112, 122, 127,
+ 140, 144, 152, 155, 91, 83, 82, 79, 80, 88, 90, 100, 103, 114,
+ 119, 130, 137, 148, 151, 155, 93, 85, 85, 81, 81, 90, 90, 102,
+ 103, 117, 117, 134, 134, 151, 152, 160},
+ {32, 31, 32, 37, 40, 48, 49, 49, 51, 52, 57, 58, 63, 64, 67, 67, 31,
+ 31, 33, 38, 41, 47, 47, 47, 49, 50, 54, 55, 60, 61, 63, 64, 31, 31,
+ 33, 38, 41, 47, 47, 47, 49, 49, 54, 54, 59, 60, 63, 64, 30, 32, 33,
+ 40, 42, 46, 45, 45, 47, 48, 52, 52, 57, 58, 60, 61, 31, 33, 35, 41,
+ 43, 46, 46, 45, 47, 48, 51, 52, 57, 57, 60, 61, 33, 36, 37, 43, 44,
+ 47, 46, 46, 47, 47, 51, 52, 56, 57, 59, 60, 35, 38, 39, 45, 46, 47,
+ 46, 45, 47, 47, 50, 51, 55, 56, 58, 60, 37, 40, 41, 47, 47, 47, 46,
+ 45, 46, 47, 50, 50, 54, 55, 57, 58, 41, 42, 43, 47, 48, 49, 49, 48,
+ 49, 50, 52, 53, 57, 57, 59, 58, 42, 43, 43, 47, 48, 50, 49, 49, 50,
+ 50, 53, 54, 57, 58, 60, 61, 49, 46, 47, 48, 50, 53, 53, 53, 54, 54,
+ 57, 57, 60, 61, 62, 61, 49, 46, 47, 48, 50, 53, 53, 54, 54, 55, 57,
+ 57, 61, 61, 63, 64, 48, 46, 46, 47, 49, 53, 54, 56, 57, 57, 60, 60,
+ 64, 64, 65, 64, 48, 45, 46, 46, 49, 53, 55, 56, 58, 58, 61, 61, 65,
+ 65, 66, 67, 49, 45, 45, 46, 48, 53, 56, 58, 61, 61, 64, 64, 67, 68,
+ 69, 67, 49, 46, 46, 46, 49, 53, 57, 59, 62, 62, 65, 66, 69, 69, 70,
+ 70, 50, 46, 46, 46, 49, 54, 57, 59, 63, 64, 67, 67, 71, 71, 73, 71,
+ 51, 47, 47, 47, 49, 54, 58, 61, 64, 66, 69, 70, 73, 74, 74, 74, 52,
+ 48, 48, 47, 50, 54, 58, 61, 65, 66, 71, 71, 75, 75, 77, 74, 54, 50,
+ 49, 48, 51, 55, 59, 62, 67, 68, 73, 73, 77, 78, 78, 78, 54, 50, 50,
+ 49, 51, 55, 59, 62, 67, 68, 73, 74, 78, 78, 81, 78, 57, 52, 52, 50,
+ 52, 56, 60, 64, 69, 70, 76, 77, 82, 82, 83, 82, 57, 52, 52, 51, 53,
+ 57, 61, 64, 69, 71, 77, 77, 82, 83, 85, 82, 60, 54, 54, 52, 55, 58,
+ 62, 65, 71, 72, 79, 79, 85, 86, 87, 86, 61, 56, 55, 53, 56, 59, 63,
+ 66, 72, 73, 80, 81, 86, 87, 88, 86, 63, 57, 57, 55, 57, 60, 64, 67,
+ 73, 75, 82, 82, 89, 90, 92, 90, 64, 58, 58, 55, 58, 61, 65, 68, 73,
+ 75, 82, 83, 89, 90, 92, 90, 64, 59, 58, 56, 58, 61, 65, 68, 74, 75,
+ 83, 83, 90, 91, 94, 95, 66, 60, 59, 57, 59, 62, 66, 69, 75, 76, 84,
+ 85, 91, 92, 94, 95, 67, 61, 60, 58, 59, 63, 66, 70, 74, 77, 82, 85,
+ 91, 93, 96, 96, 68, 62, 61, 58, 59, 64, 65, 71, 72, 78, 81, 86, 89,
+ 94, 95, 96, 68, 62, 62, 59, 59, 65, 65, 71, 71, 79, 79, 87, 87, 95,
+ 95, 98}},
+ {{32, 31, 31, 32, 32, 36, 36, 44, 44, 53, 53, 65, 65, 79,
+ 79, 87, 31, 32, 32, 32, 32, 35, 35, 42, 42, 51, 51, 62,
+ 62, 75, 75, 82, 31, 32, 32, 32, 32, 35, 35, 42, 42, 51,
+ 51, 62, 62, 75, 75, 82, 31, 32, 32, 33, 33, 34, 34, 41,
+ 41, 49, 49, 59, 59, 72, 72, 78, 31, 32, 32, 33, 33, 34,
+ 34, 41, 41, 49, 49, 59, 59, 72, 72, 78, 32, 32, 32, 34,
+ 34, 36, 36, 42, 42, 50, 50, 59, 59, 71, 71, 77, 32, 32,
+ 32, 34, 34, 36, 36, 42, 42, 50, 50, 59, 59, 71, 71, 77,
+ 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69,
+ 69, 75, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58,
+ 58, 69, 69, 75, 34, 34, 34, 37, 37, 42, 42, 48, 48, 54,
+ 54, 63, 63, 73, 73, 79, 34, 34, 34, 37, 37, 42, 42, 48,
+ 48, 54, 54, 63, 63, 73, 73, 79, 36, 34, 34, 38, 38, 48,
+ 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 36, 34, 34, 38,
+ 38, 48, 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 39, 37,
+ 37, 40, 40, 50, 50, 58, 58, 65, 65, 73, 73, 84, 84, 89,
+ 39, 37, 37, 40, 40, 50, 50, 58, 58, 65, 65, 73, 73, 84,
+ 84, 89, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79,
+ 79, 90, 90, 95, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71,
+ 71, 79, 79, 90, 90, 95, 48, 45, 45, 46, 46, 56, 56, 67,
+ 67, 76, 76, 85, 85, 96, 96, 102, 48, 45, 45, 46, 46, 56,
+ 56, 67, 67, 76, 76, 85, 85, 96, 96, 102, 53, 49, 49, 50,
+ 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 53, 49,
+ 49, 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109,
+ 58, 54, 54, 54, 54, 63, 63, 75, 75, 87, 87, 98, 98, 110,
+ 110, 116, 58, 54, 54, 54, 54, 63, 63, 75, 75, 87, 87, 98,
+ 98, 110, 110, 116, 65, 60, 60, 58, 58, 68, 68, 79, 79, 92,
+ 92, 105, 105, 118, 118, 124, 65, 60, 60, 58, 58, 68, 68, 79,
+ 79, 92, 92, 105, 105, 118, 118, 124, 71, 65, 65, 63, 63, 73,
+ 73, 84, 84, 97, 97, 111, 111, 125, 125, 132, 71, 65, 65, 63,
+ 63, 73, 73, 84, 84, 97, 97, 111, 111, 125, 125, 132, 79, 72,
+ 72, 70, 70, 79, 79, 90, 90, 104, 104, 118, 118, 133, 133, 141,
+ 79, 72, 72, 70, 70, 79, 79, 90, 90, 104, 104, 118, 118, 133,
+ 133, 141, 82, 75, 75, 72, 72, 81, 81, 92, 92, 106, 106, 121,
+ 121, 136, 136, 144, 82, 75, 75, 72, 72, 81, 81, 92, 92, 106,
+ 106, 121, 121, 136, 136, 144, 87, 79, 79, 76, 76, 84, 84, 96,
+ 96, 109, 109, 124, 124, 141, 141, 149},
+ {32, 31, 31, 37, 37, 48, 48, 49, 49, 52, 52, 57, 57, 63, 63, 66, 31,
+ 31, 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 31, 31,
+ 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 30, 32, 32,
+ 40, 40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 30, 32, 32, 40,
+ 40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 33, 36, 36, 43, 43,
+ 47, 47, 46, 46, 47, 47, 51, 51, 56, 56, 59, 33, 36, 36, 43, 43, 47,
+ 47, 46, 46, 47, 47, 51, 51, 56, 56, 59, 37, 40, 40, 47, 47, 47, 47,
+ 45, 45, 47, 47, 50, 50, 54, 54, 57, 37, 40, 40, 47, 47, 47, 47, 45,
+ 45, 47, 47, 50, 50, 54, 54, 57, 42, 43, 43, 47, 47, 50, 50, 49, 49,
+ 50, 50, 53, 53, 57, 57, 60, 42, 43, 43, 47, 47, 50, 50, 49, 49, 50,
+ 50, 53, 53, 57, 57, 60, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54,
+ 57, 57, 60, 60, 62, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54, 57,
+ 57, 60, 60, 62, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60,
+ 64, 64, 66, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60, 64,
+ 64, 66, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67,
+ 69, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67, 69,
+ 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 50,
+ 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 52, 48,
+ 48, 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 52, 48, 48,
+ 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 54, 50, 50, 49,
+ 49, 55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 54, 50, 50, 49, 49,
+ 55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 57, 52, 52, 50, 50, 56,
+ 56, 64, 64, 70, 70, 76, 76, 82, 82, 84, 57, 52, 52, 50, 50, 56, 56,
+ 64, 64, 70, 70, 76, 76, 82, 82, 84, 60, 54, 54, 52, 52, 58, 58, 65,
+ 65, 72, 72, 79, 79, 85, 85, 88, 60, 54, 54, 52, 52, 58, 58, 65, 65,
+ 72, 72, 79, 79, 85, 85, 88, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75,
+ 75, 82, 82, 89, 89, 92, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75, 75,
+ 82, 82, 89, 89, 92, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83,
+ 83, 90, 90, 93, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83, 83,
+ 90, 90, 93, 66, 60, 60, 57, 57, 63, 63, 69, 69, 77, 77, 84, 84, 92,
+ 92, 95}},
+ {{32, 31, 31, 32, 32, 34, 36, 38, 44, 44, 53, 53, 62, 65, 73, 79,
+ 31, 32, 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 60, 62, 70, 75,
+ 31, 32, 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 59, 62, 69, 75,
+ 31, 32, 32, 32, 32, 33, 35, 36, 41, 42, 50, 50, 58, 60, 67, 73,
+ 31, 32, 32, 32, 33, 33, 34, 36, 41, 41, 49, 49, 57, 59, 66, 72,
+ 31, 32, 32, 33, 33, 34, 35, 37, 41, 42, 49, 49, 57, 59, 66, 71,
+ 32, 32, 32, 33, 34, 35, 36, 38, 42, 43, 50, 50, 57, 59, 65, 71,
+ 32, 32, 32, 34, 34, 35, 37, 38, 42, 43, 49, 49, 56, 59, 65, 70,
+ 32, 32, 33, 34, 35, 37, 38, 39, 42, 43, 49, 49, 56, 58, 64, 69,
+ 32, 33, 33, 34, 35, 37, 39, 40, 43, 44, 50, 50, 56, 58, 64, 69,
+ 34, 34, 34, 36, 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73,
+ 34, 34, 34, 36, 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73,
+ 35, 34, 34, 37, 38, 42, 47, 48, 52, 53, 59, 59, 65, 67, 73, 77,
+ 36, 35, 34, 37, 38, 43, 48, 49, 54, 54, 60, 60, 66, 68, 74, 78,
+ 38, 36, 36, 38, 40, 44, 49, 51, 56, 57, 63, 63, 69, 71, 77, 81,
+ 39, 38, 37, 40, 40, 45, 50, 52, 58, 58, 65, 65, 71, 73, 79, 84,
+ 41, 39, 39, 41, 41, 46, 51, 54, 60, 60, 67, 67, 74, 76, 81, 86,
+ 44, 41, 41, 42, 43, 48, 53, 56, 63, 64, 71, 71, 78, 79, 85, 90,
+ 44, 42, 42, 43, 43, 48, 54, 56, 64, 64, 72, 72, 79, 81, 86, 91,
+ 48, 45, 45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96,
+ 48, 45, 45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96,
+ 53, 49, 49, 49, 49, 54, 59, 62, 71, 71, 81, 81, 89, 91, 98, 103,
+ 53, 50, 49, 50, 50, 54, 60, 63, 71, 72, 82, 82, 90, 92, 99, 103,
+ 57, 53, 52, 52, 52, 57, 62, 65, 74, 75, 85, 85, 94, 96, 103, 108,
+ 58, 54, 54, 54, 54, 58, 63, 67, 75, 76, 87, 87, 95, 98, 105, 110,
+ 61, 57, 57, 56, 56, 60, 66, 69, 77, 78, 89, 89, 98, 101, 108, 114,
+ 65, 60, 60, 59, 58, 63, 68, 71, 79, 80, 92, 92, 102, 105, 112, 118,
+ 67, 62, 61, 60, 60, 64, 69, 72, 81, 82, 94, 94, 103, 106, 114, 120,
+ 71, 66, 65, 64, 63, 68, 73, 76, 84, 85, 97, 97, 108, 111, 119, 125,
+ 72, 66, 66, 64, 64, 68, 73, 76, 85, 86, 98, 98, 108, 111, 119, 125,
+ 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133,
+ 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133},
+ {32, 31, 31, 35, 37, 42, 48, 48, 49, 49, 52, 52, 56, 57, 61, 63, 31,
+ 31, 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 54, 54, 58, 60, 31, 31,
+ 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 53, 54, 57, 60, 30, 32, 32,
+ 37, 39, 42, 46, 46, 46, 46, 48, 48, 52, 52, 56, 58, 30, 32, 32, 37,
+ 40, 42, 46, 46, 45, 45, 48, 48, 51, 52, 55, 57, 32, 33, 34, 39, 41,
+ 44, 46, 46, 45, 45, 48, 48, 51, 51, 54, 57, 33, 35, 36, 40, 43, 45,
+ 47, 46, 46, 46, 47, 47, 50, 51, 54, 56, 34, 37, 37, 42, 44, 45, 47,
+ 47, 45, 46, 47, 47, 50, 51, 53, 55, 37, 40, 40, 45, 47, 47, 47, 47,
+ 45, 46, 47, 47, 49, 50, 52, 54, 37, 40, 40, 45, 47, 47, 48, 47, 46,
+ 46, 47, 47, 49, 50, 53, 55, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49,
+ 50, 50, 53, 53, 56, 57, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49, 50,
+ 50, 53, 53, 56, 57, 47, 46, 46, 47, 48, 50, 52, 52, 53, 53, 53, 53,
+ 55, 56, 58, 60, 49, 47, 46, 47, 48, 50, 53, 53, 53, 54, 54, 54, 56,
+ 57, 59, 60, 48, 46, 46, 47, 47, 50, 53, 53, 55, 55, 56, 56, 58, 58,
+ 61, 62, 48, 46, 46, 46, 47, 50, 53, 54, 56, 56, 57, 57, 59, 60, 62,
+ 64, 48, 46, 45, 46, 46, 49, 53, 54, 57, 57, 59, 59, 61, 61, 63, 65,
+ 49, 45, 45, 45, 46, 49, 53, 55, 58, 59, 61, 61, 63, 64, 66, 67, 49,
+ 46, 45, 46, 46, 49, 53, 55, 58, 59, 62, 62, 64, 64, 66, 68, 50, 47,
+ 46, 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 50, 47, 46,
+ 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 52, 48, 48, 47,
+ 47, 50, 54, 56, 61, 61, 66, 66, 69, 70, 72, 74, 52, 48, 48, 47, 47,
+ 50, 54, 56, 61, 61, 66, 66, 70, 71, 73, 75, 53, 50, 49, 48, 48, 51,
+ 55, 57, 62, 62, 68, 68, 71, 72, 75, 77, 54, 50, 50, 49, 49, 52, 55,
+ 57, 62, 63, 68, 68, 72, 73, 76, 78, 55, 51, 51, 50, 49, 52, 56, 58,
+ 63, 63, 69, 69, 74, 75, 78, 80, 57, 52, 52, 51, 50, 53, 56, 58, 64,
+ 64, 70, 70, 75, 76, 79, 82, 58, 53, 53, 51, 51, 54, 57, 59, 64, 65,
+ 71, 71, 76, 77, 80, 83, 60, 55, 54, 53, 52, 55, 58, 60, 65, 66, 72,
+ 72, 77, 79, 82, 85, 60, 55, 55, 53, 53, 55, 59, 60, 65, 66, 73, 73,
+ 78, 79, 83, 85, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80,
+ 82, 86, 89, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80, 82,
+ 86, 89}},
+ {{32, 31, 31, 31, 32, 32, 35, 36, 39, 44, 44, 51, 53, 58, 65, 65,
+ 31, 32, 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 52, 56, 63, 63,
+ 31, 32, 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 51, 55, 62, 62,
+ 31, 32, 32, 32, 32, 32, 34, 35, 37, 41, 41, 48, 50, 54, 61, 61,
+ 31, 32, 32, 32, 33, 33, 34, 34, 37, 41, 41, 47, 49, 53, 59, 59,
+ 31, 32, 32, 32, 33, 33, 34, 34, 37, 41, 41, 47, 49, 53, 59, 59,
+ 31, 32, 32, 33, 34, 34, 35, 36, 38, 42, 42, 48, 49, 53, 59, 59,
+ 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 48, 50, 53, 59, 59,
+ 32, 32, 32, 33, 34, 34, 36, 37, 39, 42, 42, 48, 49, 53, 58, 58,
+ 32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58,
+ 32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58,
+ 33, 33, 33, 35, 36, 36, 40, 41, 43, 46, 46, 52, 53, 56, 62, 62,
+ 34, 34, 34, 35, 37, 37, 41, 42, 44, 48, 48, 53, 54, 57, 63, 63,
+ 34, 34, 34, 35, 37, 37, 43, 44, 46, 50, 50, 55, 56, 59, 65, 65,
+ 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63, 68, 68,
+ 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63, 68, 68,
+ 38, 37, 37, 38, 40, 40, 47, 50, 52, 57, 57, 62, 64, 67, 72, 72,
+ 39, 38, 37, 39, 40, 40, 48, 50, 53, 58, 58, 63, 65, 68, 73, 73,
+ 41, 39, 39, 40, 41, 41, 49, 51, 54, 60, 60, 66, 67, 70, 76, 76,
+ 44, 41, 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79,
+ 44, 41, 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79,
+ 47, 44, 44, 44, 45, 45, 53, 56, 59, 66, 66, 73, 75, 78, 84, 84,
+ 48, 45, 45, 45, 46, 46, 54, 56, 60, 67, 67, 74, 76, 79, 85, 85,
+ 50, 47, 46, 47, 47, 47, 55, 58, 61, 68, 68, 76, 78, 82, 88, 88,
+ 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86, 92, 92,
+ 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86, 92, 92,
+ 57, 54, 53, 53, 53, 53, 60, 63, 67, 74, 74, 83, 86, 90, 97, 97,
+ 58, 55, 54, 54, 54, 54, 61, 63, 68, 75, 75, 84, 87, 91, 98, 98,
+ 61, 57, 56, 56, 56, 56, 63, 65, 69, 77, 77, 86, 89, 93, 100, 100,
+ 65, 61, 60, 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105,
+ 65, 61, 60, 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105,
+ 70, 65, 64, 63, 62, 62, 70, 72, 76, 83, 83, 93, 96, 101, 109, 109},
+ {32, 31, 31, 33, 37, 37, 45, 48, 48, 49, 49, 51, 52, 54, 57, 57, 31,
+ 31, 31, 34, 38, 38, 45, 47, 47, 47, 47, 50, 50, 52, 55, 55, 31, 31,
+ 31, 34, 38, 38, 45, 47, 47, 47, 47, 49, 50, 51, 54, 54, 31, 31, 32,
+ 34, 39, 39, 45, 46, 46, 46, 46, 48, 49, 51, 53, 53, 30, 32, 32, 35,
+ 40, 40, 44, 46, 45, 45, 45, 47, 48, 49, 52, 52, 30, 32, 32, 35, 40,
+ 40, 44, 46, 45, 45, 45, 47, 48, 49, 52, 52, 33, 34, 35, 37, 42, 42,
+ 46, 47, 46, 45, 45, 47, 47, 49, 51, 51, 33, 35, 36, 38, 43, 43, 46,
+ 47, 46, 46, 46, 47, 47, 49, 51, 51, 35, 37, 37, 40, 44, 44, 46, 47,
+ 46, 45, 45, 47, 47, 48, 51, 51, 37, 39, 40, 43, 47, 47, 47, 47, 47,
+ 45, 45, 46, 47, 48, 50, 50, 37, 39, 40, 43, 47, 47, 47, 47, 47, 45,
+ 45, 46, 47, 48, 50, 50, 41, 42, 42, 44, 47, 47, 49, 49, 49, 48, 48,
+ 49, 50, 51, 52, 52, 42, 42, 43, 44, 47, 47, 49, 50, 50, 49, 49, 50,
+ 50, 51, 53, 53, 44, 44, 44, 45, 47, 47, 50, 51, 51, 51, 51, 52, 52,
+ 53, 54, 54, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55,
+ 57, 57, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55, 57,
+ 57, 48, 46, 46, 46, 47, 47, 51, 53, 54, 55, 55, 56, 57, 58, 59, 59,
+ 48, 46, 46, 46, 47, 47, 51, 53, 54, 56, 56, 57, 57, 58, 60, 60, 48,
+ 46, 45, 46, 46, 46, 51, 53, 54, 57, 57, 58, 59, 60, 61, 61, 49, 46,
+ 45, 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 49, 46, 45,
+ 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 50, 47, 46, 46,
+ 46, 46, 52, 54, 56, 59, 59, 62, 63, 64, 66, 66, 50, 47, 46, 46, 46,
+ 46, 52, 54, 56, 59, 59, 63, 64, 65, 67, 67, 51, 48, 47, 47, 47, 47,
+ 52, 54, 56, 60, 60, 64, 65, 66, 68, 68, 52, 48, 48, 47, 47, 47, 53,
+ 54, 57, 61, 61, 65, 66, 68, 71, 71, 52, 48, 48, 47, 47, 47, 53, 54,
+ 57, 61, 61, 65, 66, 68, 71, 71, 54, 50, 49, 49, 48, 48, 54, 55, 58,
+ 62, 62, 67, 68, 70, 73, 73, 54, 51, 50, 49, 49, 49, 54, 55, 58, 62,
+ 62, 67, 68, 70, 73, 73, 55, 51, 51, 50, 49, 49, 54, 56, 58, 63, 63,
+ 68, 69, 71, 74, 74, 57, 53, 52, 51, 50, 50, 55, 56, 59, 64, 64, 69,
+ 70, 73, 76, 76, 57, 53, 52, 51, 50, 50, 55, 56, 59, 64, 64, 69, 70,
+ 73, 76, 76, 59, 55, 54, 53, 52, 52, 57, 58, 61, 65, 65, 70, 72, 74,
+ 78, 78}},
+ {{32, 31, 31, 31, 32, 32, 32, 35, 36, 38, 44, 44, 47, 53, 53, 59, 31,
+ 32, 32, 32, 32, 32, 33, 35, 35, 37, 43, 43, 46, 52, 52, 57, 31, 32,
+ 32, 32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32,
+ 32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32,
+ 32, 32, 33, 34, 35, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 32,
+ 33, 33, 34, 34, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 33, 33,
+ 33, 35, 35, 36, 41, 41, 44, 49, 49, 54, 32, 32, 32, 32, 33, 34, 34,
+ 36, 36, 38, 42, 42, 45, 49, 49, 54, 32, 32, 32, 33, 34, 34, 34, 36,
+ 36, 38, 42, 42, 45, 50, 50, 54, 32, 32, 32, 33, 34, 34, 35, 37, 37,
+ 38, 42, 42, 45, 49, 49, 54, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39,
+ 42, 42, 45, 49, 49, 53, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39, 42,
+ 42, 45, 49, 49, 53, 32, 33, 33, 33, 35, 36, 36, 39, 40, 41, 44, 44,
+ 47, 51, 51, 55, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50,
+ 54, 54, 58, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50, 54,
+ 54, 58, 35, 34, 34, 34, 37, 37, 39, 44, 45, 46, 50, 50, 53, 57, 57,
+ 61, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64,
+ 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64, 38,
+ 37, 36, 37, 39, 40, 41, 48, 49, 51, 56, 56, 58, 63, 63, 67, 39, 38,
+ 37, 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 39, 38, 37,
+ 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 42, 40, 40, 40,
+ 42, 42, 44, 51, 52, 55, 61, 61, 64, 69, 69, 73, 44, 42, 41, 41, 42,
+ 43, 45, 52, 53, 56, 63, 63, 66, 71, 71, 75, 44, 42, 41, 41, 43, 43,
+ 45, 52, 54, 56, 63, 63, 66, 72, 72, 76, 47, 45, 44, 44, 45, 45, 47,
+ 54, 56, 58, 66, 66, 69, 75, 75, 79, 48, 46, 45, 45, 46, 46, 48, 55,
+ 56, 59, 67, 67, 70, 76, 76, 80, 49, 47, 46, 46, 47, 47, 48, 56, 57,
+ 60, 67, 67, 71, 77, 77, 81, 53, 50, 49, 49, 49, 49, 51, 58, 59, 62,
+ 71, 71, 74, 81, 81, 86, 53, 51, 49, 49, 50, 50, 51, 59, 60, 63, 71,
+ 71, 75, 82, 82, 87, 55, 52, 51, 51, 51, 51, 53, 60, 61, 64, 72, 72,
+ 76, 83, 83, 88, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79,
+ 87, 87, 92, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79, 87,
+ 87, 92},
+ {32, 31, 31, 31, 35, 37, 38, 47, 48, 48, 49, 49, 50, 52, 52, 54, 31,
+ 31, 31, 32, 36, 38, 39, 46, 47, 47, 48, 48, 49, 50, 50, 53, 31, 31,
+ 31, 32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 31, 31, 31,
+ 32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 30, 31, 32, 32,
+ 38, 39, 40, 45, 46, 46, 45, 45, 46, 48, 48, 50, 30, 31, 32, 33, 38,
+ 40, 41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 31, 32, 33, 33, 38, 40,
+ 41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 33, 35, 35, 36, 41, 43, 43,
+ 46, 47, 46, 45, 45, 46, 47, 47, 49, 33, 35, 36, 36, 41, 43, 44, 46,
+ 47, 46, 46, 46, 46, 47, 47, 49, 34, 36, 37, 37, 42, 44, 45, 47, 47,
+ 47, 45, 45, 46, 47, 47, 49, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47,
+ 45, 45, 46, 47, 47, 48, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47, 45,
+ 45, 46, 47, 47, 48, 39, 40, 41, 42, 46, 47, 47, 48, 48, 48, 47, 47,
+ 47, 48, 48, 50, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50,
+ 50, 50, 52, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50, 50,
+ 50, 52, 45, 45, 44, 45, 47, 47, 48, 51, 51, 51, 51, 51, 52, 52, 52,
+ 54, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55,
+ 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55, 48,
+ 47, 46, 46, 47, 47, 48, 52, 53, 53, 55, 55, 55, 56, 56, 57, 48, 46,
+ 46, 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 48, 46, 46,
+ 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 49, 46, 45, 45,
+ 46, 46, 47, 52, 53, 54, 57, 57, 58, 60, 60, 61, 49, 46, 45, 45, 45,
+ 46, 47, 52, 53, 55, 58, 58, 59, 61, 61, 62, 49, 46, 45, 45, 46, 46,
+ 47, 52, 53, 55, 58, 58, 60, 61, 61, 63, 50, 47, 46, 46, 46, 46, 48,
+ 53, 54, 55, 59, 59, 61, 63, 63, 65, 50, 48, 46, 46, 46, 46, 48, 53,
+ 54, 55, 59, 59, 61, 64, 64, 65, 51, 48, 47, 47, 47, 47, 48, 53, 54,
+ 55, 60, 60, 61, 64, 64, 66, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56,
+ 61, 61, 63, 66, 66, 68, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56, 61,
+ 61, 63, 66, 66, 68, 53, 50, 48, 48, 48, 48, 49, 54, 54, 56, 61, 61,
+ 63, 67, 67, 69, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65,
+ 68, 68, 71, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65, 68,
+ 68, 71}},
+ {{32, 31, 31, 31, 31, 32, 32, 32, 35, 36, 36, 40, 44, 44, 47, 53, 31,
+ 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 43, 43, 46, 52, 31, 32,
+ 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32,
+ 32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32,
+ 32, 32, 32, 33, 34, 35, 35, 39, 41, 41, 45, 50, 31, 32, 32, 32, 32,
+ 33, 33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33,
+ 33, 34, 35, 35, 38, 41, 41, 44, 49, 31, 32, 32, 32, 33, 34, 34, 34,
+ 35, 36, 36, 39, 42, 42, 44, 49, 32, 32, 32, 32, 33, 34, 34, 34, 36,
+ 36, 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, 33, 34, 34, 34, 36, 36,
+ 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, 33, 35, 35, 35, 37, 37, 37,
+ 40, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41,
+ 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41, 42,
+ 42, 45, 49, 32, 33, 33, 33, 34, 36, 36, 36, 39, 40, 40, 42, 44, 44,
+ 47, 51, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50,
+ 54, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50, 54,
+ 34, 34, 34, 34, 35, 37, 37, 38, 42, 43, 43, 46, 49, 49, 51, 55, 35,
+ 35, 34, 34, 36, 38, 38, 39, 45, 47, 47, 50, 52, 52, 55, 59, 36, 35,
+ 34, 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 36, 35, 34,
+ 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 38, 37, 36, 36,
+ 37, 40, 40, 41, 47, 49, 49, 53, 56, 56, 58, 63, 39, 38, 37, 37, 39,
+ 40, 40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 39, 38, 37, 37, 39, 40,
+ 40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 41, 40, 39, 39, 40, 41, 41,
+ 43, 49, 51, 51, 56, 60, 60, 62, 67, 44, 42, 41, 41, 42, 43, 43, 45,
+ 51, 53, 53, 59, 63, 63, 66, 71, 44, 42, 41, 41, 42, 43, 43, 45, 51,
+ 53, 53, 59, 63, 63, 66, 71, 44, 43, 42, 42, 42, 43, 43, 45, 51, 54,
+ 54, 59, 64, 64, 67, 72, 47, 45, 44, 44, 44, 45, 45, 47, 53, 56, 56,
+ 61, 66, 66, 69, 75, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62,
+ 67, 67, 70, 76, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62, 67,
+ 67, 70, 76, 51, 49, 47, 47, 48, 48, 48, 50, 56, 58, 58, 64, 69, 69,
+ 73, 79},
+ {32, 31, 31, 31, 33, 37, 37, 38, 45, 48, 48, 49, 49, 49, 50, 52, 31,
+ 31, 31, 31, 33, 38, 38, 39, 45, 47, 47, 48, 48, 48, 49, 51, 31, 31,
+ 31, 31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 31,
+ 31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 32, 32,
+ 34, 39, 39, 40, 45, 46, 46, 46, 46, 46, 47, 49, 30, 31, 32, 32, 35,
+ 40, 40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 30, 31, 32, 32, 35, 40,
+ 40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 31, 32, 33, 33, 35, 40, 40,
+ 41, 45, 46, 46, 45, 45, 45, 46, 48, 33, 34, 35, 35, 37, 42, 42, 43,
+ 46, 47, 47, 46, 45, 45, 46, 47, 33, 35, 36, 36, 38, 43, 43, 44, 46,
+ 47, 47, 46, 46, 46, 46, 47, 33, 35, 36, 36, 38, 43, 43, 44, 46, 47,
+ 47, 46, 46, 46, 46, 47, 35, 37, 38, 38, 41, 45, 45, 46, 47, 47, 47,
+ 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46,
+ 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46, 45,
+ 45, 46, 47, 39, 40, 41, 41, 43, 47, 47, 47, 48, 48, 48, 47, 47, 47,
+ 47, 48, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50,
+ 50, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50, 50,
+ 43, 43, 43, 43, 45, 47, 47, 48, 50, 50, 50, 50, 50, 50, 50, 51, 47,
+ 46, 46, 46, 46, 48, 48, 48, 51, 52, 52, 52, 53, 53, 53, 53, 49, 47,
+ 46, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 49, 47, 46,
+ 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 48, 47, 46, 46,
+ 46, 47, 47, 48, 52, 53, 53, 54, 55, 55, 55, 56, 48, 47, 46, 46, 46,
+ 47, 47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 46, 46, 46, 47,
+ 47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 45, 45, 46, 46, 46,
+ 47, 51, 53, 53, 55, 57, 57, 57, 59, 49, 46, 45, 45, 45, 46, 46, 47,
+ 51, 53, 53, 56, 58, 58, 59, 61, 49, 46, 45, 45, 45, 46, 46, 47, 51,
+ 53, 53, 56, 58, 58, 59, 61, 49, 47, 45, 45, 45, 46, 46, 47, 52, 53,
+ 53, 56, 58, 58, 60, 62, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54,
+ 57, 59, 59, 61, 63, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57,
+ 59, 59, 61, 64, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, 59,
+ 59, 61, 64, 51, 49, 47, 47, 47, 47, 47, 48, 52, 54, 54, 58, 60, 60,
+ 62, 65}},
+ {{32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 39, 44, 44, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 34, 35, 35, 35, 39, 43, 43, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 34, 35, 35, 35, 38, 41, 41, 31, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 33, 33, 33, 33,
+ 34, 35, 35, 35, 38, 41, 41, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35,
+ 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36,
+ 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36,
+ 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 36, 37, 37, 37,
+ 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40,
+ 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42,
+ 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42,
+ 33, 33, 33, 33, 33, 34, 36, 36, 36, 38, 40, 40, 40, 42, 45, 45, 34,
+ 34, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34,
+ 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, 34,
+ 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 35, 34, 34, 34,
+ 34, 36, 37, 37, 37, 41, 45, 45, 45, 47, 50, 50, 36, 35, 34, 34, 34,
+ 36, 38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36,
+ 38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38,
+ 38, 38, 43, 48, 48, 48, 51, 54, 54, 37, 37, 36, 36, 36, 38, 39, 39,
+ 39, 44, 49, 49, 49, 52, 56, 56, 39, 38, 37, 37, 37, 39, 40, 40, 40,
+ 45, 50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45,
+ 50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50,
+ 50, 50, 54, 58, 58, 41, 40, 39, 39, 39, 40, 42, 42, 42, 46, 52, 52,
+ 52, 56, 60, 60, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53,
+ 58, 63, 63, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53, 58,
+ 63, 63},
+ {32, 31, 31, 31, 31, 33, 37, 37, 37, 42, 48, 48, 48, 48, 49, 49, 31,
+ 31, 31, 31, 31, 34, 37, 37, 37, 42, 47, 47, 47, 48, 48, 48, 31, 31,
+ 31, 31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31,
+ 31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31,
+ 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 32, 32, 32,
+ 35, 39, 39, 39, 42, 46, 46, 46, 46, 46, 46, 30, 31, 32, 32, 32, 35,
+ 40, 40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40,
+ 40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40, 40,
+ 40, 42, 46, 46, 46, 45, 45, 45, 32, 33, 34, 34, 34, 37, 41, 41, 41,
+ 44, 46, 46, 46, 46, 45, 45, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45,
+ 47, 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47,
+ 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47, 47,
+ 47, 46, 46, 46, 35, 36, 38, 38, 38, 41, 45, 45, 45, 46, 47, 47, 47,
+ 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46,
+ 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45,
+ 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45, 45,
+ 39, 40, 41, 41, 41, 44, 47, 47, 47, 48, 49, 49, 49, 48, 47, 47, 42,
+ 42, 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42,
+ 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, 43,
+ 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 45, 45, 44, 44,
+ 44, 46, 47, 47, 47, 49, 51, 51, 51, 51, 51, 51, 49, 48, 46, 46, 46,
+ 47, 48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47,
+ 48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48,
+ 48, 48, 50, 53, 53, 53, 53, 53, 53, 48, 47, 46, 46, 46, 47, 47, 47,
+ 47, 50, 53, 53, 53, 54, 54, 54, 48, 47, 46, 46, 46, 46, 47, 47, 47,
+ 50, 53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50,
+ 53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53,
+ 53, 53, 54, 56, 56, 48, 47, 45, 45, 45, 46, 46, 46, 46, 49, 53, 53,
+ 53, 55, 57, 57, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53,
+ 56, 58, 58, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53, 56,
+ 58, 58}},
+ {{32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 34, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34,
+ 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36,
+ 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36,
+ 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36,
+ 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 37, 37, 37,
+ 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 37, 38, 38, 38, 32,
+ 32, 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32,
+ 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, 32,
+ 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 33, 33, 33,
+ 33, 33, 34, 35, 36, 36, 36, 37, 39, 40, 40, 40, 33, 33, 33, 33, 33,
+ 33, 35, 36, 36, 36, 36, 38, 40, 41, 41, 41, 34, 34, 34, 34, 34, 34,
+ 35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35,
+ 36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36,
+ 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 37, 37,
+ 37, 37, 40, 43, 44, 44, 44, 35, 35, 34, 34, 34, 34, 36, 37, 38, 38,
+ 38, 41, 45, 47, 47, 47, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38,
+ 42, 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42,
+ 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, 46,
+ 48, 48, 48, 37, 36, 36, 36, 36, 36, 37, 38, 39, 39, 39, 42, 46, 49,
+ 49, 49},
+ {32, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 40, 45, 48, 48, 48, 31,
+ 31, 31, 31, 31, 31, 33, 36, 37, 37, 37, 41, 45, 48, 48, 48, 31, 31,
+ 31, 31, 31, 31, 34, 36, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31,
+ 31, 31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31,
+ 31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31,
+ 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 32, 32, 32,
+ 34, 37, 39, 39, 39, 41, 45, 46, 46, 46, 30, 31, 31, 32, 32, 32, 34,
+ 38, 39, 39, 39, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38,
+ 40, 40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40,
+ 40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, 40,
+ 40, 42, 44, 46, 46, 46, 31, 32, 33, 33, 33, 33, 36, 39, 41, 41, 41,
+ 43, 45, 46, 46, 46, 33, 34, 34, 35, 35, 35, 37, 40, 42, 42, 42, 44,
+ 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46,
+ 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47,
+ 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47, 47,
+ 47, 35, 36, 37, 37, 37, 37, 40, 43, 44, 44, 44, 45, 46, 47, 47, 47,
+ 36, 37, 38, 39, 39, 39, 42, 44, 46, 46, 46, 47, 47, 47, 47, 47, 37,
+ 38, 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38,
+ 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, 39,
+ 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 39, 39, 40, 41,
+ 41, 41, 43, 46, 47, 47, 47, 48, 48, 48, 48, 48, 41, 41, 42, 42, 42,
+ 42, 44, 46, 47, 47, 47, 48, 49, 49, 49, 49, 42, 42, 42, 43, 43, 43,
+ 44, 46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44,
+ 46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46,
+ 47, 47, 47, 48, 49, 50, 50, 50, 44, 44, 44, 44, 44, 44, 45, 47, 47,
+ 47, 47, 49, 50, 51, 51, 51, 47, 46, 46, 46, 46, 46, 46, 47, 48, 48,
+ 48, 49, 51, 52, 52, 52, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48,
+ 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50,
+ 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, 52,
+ 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 52, 53,
+ 53, 53}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34,
+ 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 34, 35, 35, 35, 35, 35, 36, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 33,
+ 33, 34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33,
+ 34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34,
+ 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34,
+ 35, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 35, 36,
+ 36, 36, 36, 36, 38, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36,
+ 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37,
+ 37, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 37,
+ 38, 39},
+ {32, 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 38, 42, 31,
+ 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 39, 42, 31, 31,
+ 31, 31, 31, 31, 31, 32, 33, 35, 38, 38, 38, 38, 39, 42, 31, 31, 31,
+ 31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31,
+ 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31,
+ 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31,
+ 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31,
+ 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 32, 32, 32, 32,
+ 34, 36, 39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 32, 34,
+ 37, 39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37,
+ 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40,
+ 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40,
+ 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, 40,
+ 40, 41, 42, 31, 31, 32, 32, 33, 33, 33, 33, 35, 38, 40, 40, 40, 40,
+ 41, 43, 32, 32, 33, 33, 34, 34, 34, 34, 36, 39, 41, 41, 41, 41, 42,
+ 44, 33, 33, 34, 35, 35, 35, 35, 35, 37, 40, 42, 42, 42, 42, 43, 44,
+ 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33,
+ 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34,
+ 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, 35,
+ 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 34, 35, 36, 37,
+ 37, 37, 37, 37, 39, 42, 44, 44, 44, 44, 45, 45, 35, 36, 37, 38, 38,
+ 38, 38, 39, 41, 43, 45, 45, 45, 45, 46, 46, 36, 37, 38, 39, 39, 39,
+ 39, 40, 42, 44, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40,
+ 41, 43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41,
+ 43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43,
+ 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45,
+ 47, 47, 47, 47, 47, 47, 39, 39, 40, 41, 41, 41, 41, 42, 43, 45, 47,
+ 47, 47, 47, 47, 48, 40, 41, 41, 42, 42, 42, 42, 42, 44, 45, 47, 47,
+ 47, 47, 47, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47,
+ 47, 48, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47,
+ 48, 48}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 34, 34},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 36, 37, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 35, 36, 38, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 36, 38, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33,
+ 34, 36, 37, 39, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34,
+ 36, 37, 39, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 36,
+ 38, 39, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38,
+ 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40,
+ 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 31, 31, 31, 32,
+ 32, 33, 33, 33, 33, 33, 33, 34, 35, 37, 38, 40, 31, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 35, 36, 37, 39, 41, 32, 32, 33, 33, 34, 34,
+ 34, 34, 34, 34, 34, 35, 37, 38, 40, 41, 33, 33, 34, 34, 34, 35, 35,
+ 35, 35, 35, 35, 36, 37, 39, 40, 42, 33, 34, 34, 35, 35, 36, 36, 36,
+ 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36,
+ 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36,
+ 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36,
+ 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37,
+ 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38,
+ 40, 41, 43, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 36, 38, 39, 40,
+ 42, 44}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32}}};
+constexpr uint8_t
+ kQuantizerMatrix4x4[kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes]
+ [10] = {{{32, 43, 67, 73, 94, 137, 97, 110, 150, 200},
+ {35, 46, 60, 57, 69, 90, 66, 71, 90, 109}},
+ {{32, 41, 63, 69, 88, 127, 92, 103, 140, 184},
+ {33, 45, 58, 56, 66, 86, 64, 69, 87, 105}},
+ {{32, 38, 56, 63, 78, 113, 86, 97, 130, 169},
+ {32, 45, 55, 53, 62, 80, 63, 67, 84, 101}},
+ {{32, 37, 54, 58, 72, 102, 81, 91, 121, 156},
+ {32, 45, 54, 51, 59, 75, 61, 65, 81, 97}},
+ {{32, 34, 49, 53, 64, 91, 75, 81, 112, 140},
+ {32, 46, 53, 49, 55, 70, 58, 62, 78, 91}},
+ {{32, 34, 48, 49, 60, 82, 72, 79, 104, 134},
+ {32, 46, 53, 47, 54, 66, 57, 60, 75, 89}},
+ {{32, 33, 39, 45, 51, 71, 62, 64, 87, 108},
+ {31, 42, 48, 47, 50, 61, 53, 54, 67, 78}},
+ {{32, 33, 38, 42, 46, 63, 55, 57, 75, 92},
+ {31, 41, 48, 46, 48, 58, 51, 51, 62, 71}},
+ {{32, 32, 35, 38, 40, 54, 51, 49, 64, 81},
+ {31, 38, 47, 47, 46, 54, 49, 46, 57, 66}},
+ {{32, 32, 34, 35, 37, 48, 43, 43, 54, 65},
+ {31, 37, 44, 47, 47, 53, 47, 45, 53, 59}},
+ {{32, 32, 33, 34, 35, 39, 38, 39, 45, 54},
+ {31, 34, 39, 42, 45, 48, 47, 46, 49, 54}},
+ {{32, 32, 32, 32, 33, 35, 35, 35, 38, 46},
+ {31, 32, 34, 38, 41, 47, 46, 46, 47, 52}},
+ {{31, 32, 32, 32, 32, 33, 32, 33, 34, 35},
+ {31, 31, 32, 34, 35, 39, 38, 40, 43, 47}},
+ {{31, 31, 32, 31, 32, 32, 32, 32, 32, 33},
+ {31, 31, 31, 31, 31, 32, 34, 35, 35, 39}},
+ {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32},
+ {31, 31, 31, 31, 31, 31, 31, 31, 31, 31}}};
+constexpr uint8_t kQuantizerMatrix8x8
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][36] = {
+ {{32, 32, 35, 38, 40, 54, 51, 49, 65, 82, 68, 63,
+ 78, 97, 117, 84, 76, 91, 111, 134, 152, 95, 89, 98,
+ 113, 138, 159, 183, 109, 102, 106, 121, 142, 168, 199, 220},
+ {31, 38, 47, 47, 46, 54, 50, 47, 57, 66, 57, 52,
+ 61, 72, 82, 63, 57, 66, 77, 88, 96, 67, 62, 67,
+ 75, 86, 95, 104, 71, 67, 68, 75, 84, 95, 107, 113}},
+ {{32, 32, 35, 37, 39, 51, 47, 46, 60, 73, 62, 58,
+ 71, 87, 105, 78, 72, 84, 100, 121, 140, 90, 84, 93,
+ 106, 129, 148, 169, 102, 96, 100, 113, 132, 155, 183, 201},
+ {31, 38, 47, 47, 47, 53, 48, 46, 55, 62, 54, 50,
+ 58, 67, 76, 61, 55, 63, 72, 83, 91, 66, 61, 65,
+ 73, 84, 92, 101, 69, 65, 66, 73, 82, 92, 103, 109}},
+ {{32, 32, 34, 35, 37, 48, 46, 45, 56, 70, 57, 54,
+ 64, 80, 93, 76, 70, 79, 96, 111, 134, 85, 79, 87,
+ 100, 121, 138, 156, 96, 90, 93, 105, 122, 144, 168, 184},
+ {31, 36, 43, 47, 47, 53, 48, 46, 54, 61, 52, 49,
+ 55, 65, 71, 60, 55, 60, 70, 78, 89, 64, 59, 63,
+ 71, 81, 89, 97, 67, 63, 64, 71, 79, 89, 99, 104}},
+ {{32, 32, 33, 35, 36, 46, 42, 42, 52, 63, 53, 51,
+ 60, 73, 86, 68, 64, 72, 84, 100, 117, 78, 74, 80,
+ 92, 109, 128, 140, 90, 84, 87, 98, 114, 133, 155, 168},
+ {31, 34, 39, 46, 47, 52, 47, 45, 52, 58, 50, 48,
+ 54, 62, 68, 57, 53, 58, 65, 73, 82, 61, 57, 61,
+ 68, 77, 86, 91, 65, 61, 62, 68, 76, 86, 95, 100}},
+ {{32, 32, 33, 34, 35, 39, 39, 40, 46, 56, 50, 48,
+ 53, 65, 78, 62, 59, 63, 75, 90, 105, 76, 71, 74,
+ 86, 101, 118, 134, 84, 79, 81, 92, 106, 123, 142, 153},
+ {31, 34, 39, 42, 45, 48, 47, 46, 49, 55, 49, 47,
+ 50, 58, 65, 54, 51, 53, 61, 69, 76, 60, 56, 57,
+ 65, 73, 82, 89, 64, 59, 60, 66, 74, 83, 92, 96}},
+ {{32, 32, 33, 34, 35, 39, 38, 39, 45, 54, 46, 45,
+ 51, 61, 71, 56, 54, 58, 69, 80, 92, 68, 64, 68,
+ 78, 90, 103, 117, 78, 74, 76, 86, 99, 113, 128, 140},
+ {31, 34, 39, 42, 45, 48, 47, 46, 49, 54, 48, 46,
+ 50, 56, 61, 52, 49, 52, 58, 65, 71, 57, 53, 55,
+ 61, 68, 75, 82, 61, 57, 58, 64, 71, 79, 86, 91}},
+ {{31, 32, 32, 32, 33, 35, 35, 35, 38, 48, 42, 41,
+ 43, 54, 63, 51, 49, 49, 59, 71, 81, 59, 56, 56,
+ 66, 77, 89, 98, 69, 65, 64, 73, 85, 97, 108, 119},
+ {31, 32, 35, 38, 42, 47, 48, 47, 48, 53, 47, 45,
+ 45, 53, 58, 50, 47, 47, 54, 61, 66, 53, 50, 49,
+ 56, 63, 69, 73, 57, 54, 52, 58, 65, 72, 77, 82}},
+ {{31, 32, 32, 32, 32, 35, 34, 34, 37, 42, 38, 37,
+ 40, 47, 54, 46, 44, 45, 52, 60, 69, 52, 49, 49,
+ 56, 65, 75, 82, 63, 59, 58, 65, 73, 84, 92, 105},
+ {31, 31, 32, 38, 40, 47, 44, 44, 47, 50, 47, 45,
+ 46, 51, 54, 48, 46, 46, 51, 56, 61, 50, 47, 47,
+ 52, 57, 63, 66, 55, 52, 50, 54, 60, 66, 70, 76}},
+ {{31, 32, 32, 32, 32, 34, 34, 33, 35, 39, 35, 34,
+ 37, 42, 48, 41, 40, 41, 47, 53, 60, 47, 44, 45,
+ 51, 57, 65, 71, 53, 50, 51, 55, 61, 70, 77, 85},
+ {31, 31, 32, 35, 36, 41, 42, 42, 45, 48, 48, 46,
+ 47, 50, 53, 47, 45, 45, 49, 53, 57, 49, 46, 46,
+ 50, 54, 59, 61, 51, 48, 48, 51, 54, 60, 64, 68}},
+ {{31, 31, 32, 32, 32, 33, 32, 32, 34, 35, 34, 34,
+ 35, 37, 41, 37, 36, 38, 39, 45, 51, 43, 41, 42,
+ 42, 49, 56, 63, 47, 44, 45, 46, 52, 59, 67, 71},
+ {31, 31, 32, 34, 35, 39, 37, 40, 43, 47, 43, 43,
+ 45, 47, 49, 48, 46, 46, 47, 50, 53, 47, 45, 45,
+ 45, 50, 55, 58, 49, 46, 46, 46, 50, 55, 60, 61}},
+ {{31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 33, 33,
+ 34, 35, 37, 34, 34, 35, 36, 39, 43, 37, 36, 37,
+ 38, 41, 46, 51, 41, 39, 40, 41, 44, 49, 54, 58},
+ {31, 31, 31, 32, 33, 35, 35, 37, 39, 43, 39, 41,
+ 42, 45, 47, 45, 44, 45, 47, 48, 50, 48, 46, 46,
+ 47, 48, 51, 53, 48, 46, 45, 46, 47, 51, 54, 56}},
+ {{31, 31, 32, 31, 32, 32, 32, 32, 32, 33, 32, 32,
+ 32, 34, 35, 32, 33, 33, 34, 35, 36, 34, 34, 33,
+ 35, 36, 38, 39, 35, 35, 34, 36, 38, 40, 42, 48},
+ {31, 31, 31, 30, 31, 32, 34, 34, 35, 39, 36, 37,
+ 39, 42, 46, 39, 40, 41, 44, 47, 47, 42, 42, 42,
+ 45, 47, 48, 48, 48, 47, 46, 47, 47, 49, 50, 53}},
+ {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 33, 32, 32, 32, 32, 33, 34, 32, 32, 32,
+ 32, 34, 34, 35, 33, 33, 33, 33, 35, 35, 36, 38},
+ {31, 31, 31, 31, 31, 31, 30, 31, 31, 32, 34, 34,
+ 35, 35, 39, 35, 35, 36, 36, 40, 41, 37, 38, 39,
+ 40, 43, 44, 47, 40, 41, 41, 42, 44, 45, 47, 48}},
+ {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32,
+ 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 33, 33},
+ {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31,
+ 31, 31, 32, 31, 32, 32, 32, 32, 33, 33, 34, 34,
+ 35, 35, 36, 39, 33, 34, 34, 35, 35, 36, 39, 39}},
+ {{31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+ 32, 32, 32, 31, 31, 32, 32, 32, 32, 31, 31, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32},
+ {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31}}};
+constexpr uint8_t kQuantizerMatrix32x32
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][528] = {
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 33,
+ 33, 32, 32, 32, 33, 34, 35, 34, 34, 33, 34, 35, 37, 39,
+ 35, 34, 34, 35, 36, 37, 41, 43, 36, 35, 34, 35, 36, 38,
+ 42, 45, 48, 39, 38, 37, 38, 39, 40, 45, 47, 50, 54, 44,
+ 42, 41, 41, 42, 42, 47, 50, 54, 58, 63, 46, 44, 42, 43,
+ 44, 44, 49, 52, 55, 59, 65, 67, 48, 46, 44, 45, 45, 46,
+ 51, 53, 57, 61, 67, 69, 71, 54, 51, 49, 49, 50, 49, 54,
+ 57, 60, 65, 71, 74, 76, 82, 59, 56, 54, 54, 54, 53, 58,
+ 61, 64, 69, 75, 78, 80, 87, 92, 62, 59, 56, 56, 56, 55,
+ 60, 63, 66, 71, 77, 80, 83, 89, 95, 98, 65, 62, 59, 59,
+ 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98, 101, 105, 71,
+ 68, 65, 64, 64, 63, 68, 70, 73, 78, 84, 87, 90, 97, 103,
+ 107, 111, 117, 80, 76, 72, 72, 71, 69, 74, 76, 79, 84, 90,
+ 93, 96, 104, 110, 114, 118, 125, 134, 81, 77, 73, 73, 72, 70,
+ 75, 77, 80, 85, 91, 94, 97, 105, 111, 115, 119, 126, 135, 137,
+ 83, 78, 75, 74, 74, 72, 76, 79, 81, 86, 92, 95, 99, 106,
+ 113, 117, 121, 128, 137, 138, 140, 88, 84, 80, 79, 78, 76, 80,
+ 82, 85, 91, 95, 98, 103, 111, 115, 119, 126, 134, 139, 144, 147,
+ 152, 91, 86, 83, 82, 81, 79, 81, 84, 88, 92, 95, 100, 107,
+ 110, 115, 123, 127, 132, 140, 147, 151, 154, 159, 94, 89, 86, 85,
+ 84, 82, 82, 86, 90, 92, 97, 103, 105, 111, 119, 121, 128, 136,
+ 139, 146, 156, 158, 161, 166, 97, 92, 90, 88, 86, 85, 84, 89,
+ 91, 95, 100, 102, 108, 114, 116, 125, 130, 133, 143, 148, 152, 163,
+ 166, 168, 174, 101, 95, 93, 91, 89, 89, 87, 91, 93, 98, 101,
+ 105, 111, 113, 120, 126, 130, 138, 142, 149, 157, 159, 171, 174, 176,
+ 183, 104, 99, 97, 94, 93, 93, 90, 92, 96, 100, 102, 108, 111,
+ 116, 122, 125, 134, 137, 144, 151, 155, 165, 169, 179, 182, 184, 191,
+ 107, 102, 101, 97, 96, 96, 93, 93, 99, 101, 105, 110, 113, 120,
+ 122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191, 193, 200,
+ 111, 105, 104, 101, 100, 99, 97, 96, 102, 103, 109, 111, 117, 120,
+ 125, 131, 135, 143, 146, 156, 158, 168, 173, 180, 189, 195, 200, 202,
+ 210, 115, 109, 108, 104, 104, 102, 101, 100, 103, 106, 111, 113, 119,
+ 121, 129, 131, 140, 142, 151, 155, 162, 168, 176, 183, 188, 199, 204,
+ 210, 212, 220, 119, 113, 112, 107, 107, 106, 105, 103, 105, 110, 112,
+ 117, 120, 125, 130, 135, 140, 145, 152, 157, 165, 169, 179, 183, 193,
+ 197, 210, 214, 220, 222, 231, 123, 116, 116, 111, 111, 109, 110, 107,
+ 107, 114, 114, 121, 122, 130, 130, 140, 140, 150, 151, 163, 164, 176,
+ 177, 190, 191, 204, 206, 222, 224, 230, 232, 242},
+ {32, 31, 31, 30, 31, 32, 32, 33, 33, 35, 33, 34, 35, 37,
+ 39, 36, 38, 40, 41, 43, 47, 41, 42, 42, 43, 45, 47, 48,
+ 45, 45, 44, 45, 46, 47, 49, 50, 49, 47, 46, 47, 47, 48,
+ 50, 51, 53, 48, 47, 45, 46, 46, 46, 49, 51, 53, 54, 49,
+ 47, 45, 45, 45, 45, 49, 51, 53, 55, 58, 50, 47, 45, 46,
+ 46, 46, 49, 51, 54, 56, 59, 60, 50, 48, 46, 46, 46, 46,
+ 50, 52, 54, 56, 60, 60, 61, 52, 50, 47, 47, 47, 47, 50,
+ 52, 54, 57, 61, 62, 63, 66, 54, 52, 49, 49, 49, 48, 52,
+ 53, 55, 58, 62, 64, 65, 68, 71, 56, 53, 51, 50, 50, 49,
+ 52, 54, 56, 59, 63, 64, 66, 69, 72, 73, 57, 54, 52, 51,
+ 51, 50, 53, 55, 56, 60, 63, 65, 67, 70, 73, 75, 76, 60,
+ 57, 54, 54, 53, 52, 55, 57, 58, 61, 65, 67, 68, 72, 75,
+ 77, 79, 82, 63, 60, 57, 57, 56, 54, 57, 59, 60, 63, 67,
+ 69, 71, 75, 78, 80, 82, 85, 89, 64, 61, 58, 57, 57, 55,
+ 58, 59, 61, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 90,
+ 65, 61, 58, 58, 57, 55, 58, 60, 61, 64, 68, 70, 71, 75,
+ 79, 81, 83, 86, 90, 91, 91, 67, 63, 61, 60, 59, 57, 60,
+ 61, 63, 66, 69, 70, 73, 77, 79, 81, 85, 88, 90, 92, 94,
+ 96, 68, 64, 62, 61, 60, 58, 59, 61, 64, 66, 67, 71, 74,
+ 75, 78, 82, 84, 86, 90, 93, 94, 96, 98, 69, 65, 63, 62,
+ 61, 59, 59, 62, 64, 65, 68, 71, 72, 75, 79, 80, 83, 87,
+ 89, 92, 96, 97, 98, 100, 70, 66, 64, 63, 62, 61, 60, 63,
+ 64, 66, 69, 70, 73, 76, 77, 81, 84, 85, 89, 92, 93, 98,
+ 99, 100, 102, 71, 67, 66, 64, 63, 62, 61, 63, 64, 67, 68,
+ 70, 74, 75, 78, 81, 83, 86, 88, 91, 94, 95, 100, 101, 102,
+ 104, 72, 68, 67, 65, 64, 64, 61, 63, 65, 67, 68, 71, 73,
+ 75, 78, 79, 84, 85, 88, 91, 93, 97, 98, 102, 103, 104, 106,
+ 73, 69, 68, 66, 65, 65, 63, 63, 66, 67, 69, 71, 73, 76,
+ 77, 81, 82, 85, 88, 90, 94, 95, 99, 101, 104, 105, 106, 109,
+ 74, 70, 70, 67, 66, 66, 64, 63, 66, 67, 70, 71, 74, 75,
+ 78, 80, 82, 86, 87, 91, 92, 96, 98, 101, 104, 106, 108, 108,
+ 111, 75, 71, 71, 68, 68, 67, 66, 64, 66, 68, 70, 71, 74,
+ 75, 79, 79, 84, 84, 88, 90, 93, 95, 98, 101, 103, 107, 108,
+ 110, 111, 113, 76, 72, 72, 69, 69, 68, 67, 65, 66, 69, 70,
+ 72, 74, 76, 78, 81, 83, 85, 88, 90, 93, 95, 98, 100, 104,
+ 105, 109, 111, 112, 113, 116, 78, 74, 74, 70, 70, 69, 69, 66,
+ 66, 70, 70, 74, 74, 77, 78, 82, 82, 86, 87, 92, 92, 96,
+ 97, 102, 102, 107, 107, 112, 113, 115, 115, 118}},
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 33, 32, 32, 32, 33, 34, 35, 32, 33, 33, 33, 34, 36, 36,
+ 34, 34, 33, 34, 35, 37, 38, 39, 36, 35, 34, 35, 36, 38,
+ 40, 42, 48, 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 39,
+ 38, 37, 38, 39, 40, 42, 45, 50, 52, 54, 44, 42, 41, 41,
+ 42, 42, 44, 47, 54, 56, 58, 63, 47, 45, 44, 44, 45, 45,
+ 47, 50, 56, 58, 60, 66, 69, 49, 47, 46, 45, 46, 46, 48,
+ 51, 57, 60, 62, 68, 71, 73, 54, 51, 50, 49, 50, 49, 51,
+ 54, 60, 63, 65, 71, 75, 77, 82, 59, 56, 54, 54, 54, 53,
+ 55, 58, 64, 67, 69, 75, 79, 81, 87, 92, 61, 58, 56, 56,
+ 56, 55, 57, 60, 65, 68, 70, 77, 81, 83, 89, 94, 97, 65,
+ 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 87, 92,
+ 98, 101, 105, 71, 68, 65, 65, 64, 63, 65, 68, 73, 76, 78,
+ 84, 89, 92, 97, 103, 106, 111, 117, 76, 72, 70, 69, 68, 66,
+ 68, 71, 76, 79, 81, 88, 92, 95, 101, 107, 110, 115, 122, 127,
+ 80, 76, 73, 72, 71, 69, 71, 74, 79, 82, 84, 90, 95, 98,
+ 104, 110, 113, 118, 125, 130, 134, 83, 78, 76, 75, 74, 72, 73,
+ 76, 81, 84, 86, 92, 97, 100, 106, 113, 116, 121, 128, 133, 137,
+ 140, 86, 82, 79, 78, 77, 74, 76, 79, 84, 87, 89, 95, 100,
+ 103, 109, 116, 119, 124, 131, 136, 140, 144, 147, 89, 85, 82, 81,
+ 79, 78, 78, 82, 86, 87, 92, 97, 100, 105, 112, 114, 120, 128,
+ 131, 136, 146, 147, 150, 155, 92, 88, 85, 84, 82, 81, 80, 85,
+ 86, 90, 95, 97, 102, 107, 110, 117, 122, 125, 134, 138, 142, 152,
+ 154, 156, 162, 95, 90, 88, 86, 85, 84, 82, 86, 88, 93, 95,
+ 99, 105, 106, 113, 118, 121, 129, 132, 139, 146, 148, 159, 161, 163,
+ 169, 98, 93, 91, 89, 88, 87, 85, 87, 90, 94, 96, 102, 104,
+ 109, 114, 117, 126, 128, 134, 141, 145, 154, 157, 166, 168, 170, 176,
+ 101, 96, 95, 92, 91, 90, 88, 88, 93, 95, 99, 103, 106, 112,
+ 114, 121, 124, 131, 136, 140, 149, 151, 160, 165, 173, 176, 178, 184,
+ 104, 99, 98, 95, 94, 93, 91, 90, 95, 96, 102, 103, 109, 112,
+ 117, 122, 125, 133, 136, 145, 146, 156, 160, 167, 174, 180, 184, 186,
+ 193, 108, 102, 101, 98, 97, 96, 95, 93, 97, 100, 104, 106, 111,
+ 113, 121, 122, 130, 132, 140, 143, 150, 155, 162, 169, 174, 183, 188,
+ 192, 194, 201, 111, 105, 105, 101, 100, 99, 98, 96, 98, 103, 105,
+ 109, 112, 117, 121, 125, 130, 135, 141, 146, 152, 156, 165, 169, 178,
+ 181, 193, 196, 201, 202, 210, 114, 109, 109, 104, 104, 102, 102, 99,
+ 100, 106, 106, 113, 113, 120, 121, 129, 130, 139, 140, 151, 151, 162,
+ 162, 175, 176, 187, 188, 203, 204, 210, 211, 219},
+ {32, 31, 31, 30, 31, 31, 31, 32, 32, 33, 33, 34, 35, 36, 39,
+ 36, 38, 39, 40, 43, 47, 38, 40, 41, 41, 44, 47, 47, 41, 42,
+ 42, 43, 45, 47, 48, 48, 49, 47, 46, 46, 47, 48, 49, 50, 53,
+ 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 48, 47, 46, 45, 46,
+ 46, 48, 49, 53, 54, 54, 49, 47, 45, 45, 45, 45, 47, 49, 53,
+ 55, 55, 58, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59,
+ 61, 51, 48, 47, 46, 47, 46, 47, 50, 54, 55, 56, 60, 61, 62,
+ 52, 50, 48, 47, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66,
+ 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62, 64, 66, 68,
+ 71, 55, 53, 51, 50, 50, 49, 50, 52, 56, 58, 59, 63, 65, 66,
+ 69, 72, 73, 57, 54, 52, 51, 51, 50, 51, 53, 56, 58, 60, 63,
+ 66, 67, 70, 73, 74, 76, 60, 57, 55, 54, 53, 52, 53, 55, 58,
+ 60, 61, 65, 68, 69, 72, 75, 77, 79, 82, 62, 59, 57, 56, 55,
+ 53, 54, 56, 59, 61, 63, 66, 69, 70, 74, 77, 78, 80, 84, 86,
+ 63, 60, 58, 57, 56, 54, 55, 57, 60, 62, 63, 67, 70, 71, 75,
+ 78, 79, 82, 85, 87, 89, 65, 61, 59, 58, 57, 55, 56, 58, 61,
+ 63, 64, 68, 71, 72, 75, 79, 80, 83, 86, 88, 90, 91, 66, 63,
+ 60, 59, 58, 56, 58, 59, 62, 64, 65, 69, 72, 73, 76, 80, 81,
+ 84, 87, 90, 91, 93, 94, 67, 64, 62, 61, 59, 58, 58, 60, 63,
+ 64, 66, 69, 71, 73, 77, 78, 81, 85, 86, 89, 93, 94, 95, 97,
+ 68, 65, 63, 62, 60, 59, 58, 61, 62, 64, 67, 68, 71, 74, 75,
+ 79, 81, 83, 87, 89, 91, 95, 96, 97, 99, 69, 66, 64, 63, 61,
+ 61, 59, 61, 62, 65, 66, 68, 72, 73, 76, 78, 80, 84, 85, 88,
+ 91, 92, 97, 98, 98, 101, 70, 67, 65, 63, 62, 62, 60, 61, 63,
+ 65, 66, 69, 71, 73, 76, 77, 81, 83, 85, 88, 90, 94, 95, 99,
+ 100, 100, 103, 71, 67, 67, 64, 63, 63, 61, 61, 64, 65, 67, 69,
+ 71, 74, 75, 78, 80, 83, 85, 87, 91, 92, 95, 97, 100, 102, 102,
+ 105, 72, 68, 68, 65, 65, 64, 62, 62, 64, 65, 68, 69, 72, 73,
+ 76, 78, 80, 83, 84, 88, 89, 93, 95, 97, 100, 102, 104, 104, 107,
+ 73, 69, 69, 66, 66, 65, 64, 63, 64, 66, 68, 69, 72, 73, 77,
+ 77, 81, 82, 86, 87, 90, 92, 95, 97, 99, 103, 104, 106, 106, 109,
+ 74, 70, 70, 67, 67, 66, 65, 63, 64, 67, 68, 70, 72, 74, 76,
+ 78, 80, 82, 85, 87, 90, 91, 95, 96, 100, 101, 105, 106, 108, 108,
+ 111, 75, 71, 71, 68, 68, 66, 66, 64, 64, 68, 68, 71, 71, 75,
+ 75, 79, 79, 83, 84, 88, 89, 93, 93, 98, 98, 102, 103, 108, 108,
+ 110, 110, 113}},
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 33, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 34, 34, 35,
+ 34, 34, 33, 33, 35, 36, 37, 39, 34, 34, 34, 34, 36, 36,
+ 37, 41, 42, 36, 35, 34, 34, 36, 37, 38, 42, 45, 48, 39,
+ 38, 38, 37, 39, 40, 40, 45, 47, 50, 54, 41, 39, 39, 38,
+ 40, 40, 41, 46, 48, 51, 55, 56, 44, 42, 41, 41, 42, 42,
+ 42, 47, 50, 54, 58, 59, 63, 48, 46, 45, 44, 45, 45, 45,
+ 50, 53, 56, 61, 62, 66, 70, 49, 47, 46, 45, 46, 46, 46,
+ 51, 53, 57, 62, 63, 68, 71, 73, 54, 51, 50, 49, 50, 49,
+ 49, 54, 56, 60, 65, 67, 71, 76, 77, 82, 58, 55, 54, 53,
+ 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 59,
+ 57, 55, 54, 54, 54, 54, 59, 61, 64, 69, 71, 75, 80, 82,
+ 87, 91, 93, 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73,
+ 75, 79, 85, 87, 92, 97, 99, 105, 69, 66, 64, 63, 63, 62,
+ 61, 66, 68, 71, 76, 78, 83, 88, 90, 96, 100, 102, 109, 113,
+ 71, 68, 66, 65, 64, 63, 63, 68, 70, 73, 78, 80, 84, 90,
+ 92, 97, 102, 104, 111, 115, 117, 80, 76, 73, 72, 71, 70, 69,
+ 74, 76, 79, 84, 86, 90, 96, 98, 104, 109, 111, 118, 123, 125,
+ 134, 81, 77, 75, 74, 73, 72, 71, 75, 77, 80, 85, 87, 91,
+ 97, 99, 105, 110, 112, 120, 125, 127, 136, 137, 83, 78, 76, 75,
+ 74, 73, 72, 76, 78, 81, 86, 88, 92, 98, 100, 106, 111, 113,
+ 121, 126, 128, 137, 139, 140, 87, 83, 81, 79, 78, 77, 75, 80,
+ 82, 85, 90, 91, 96, 101, 103, 110, 114, 117, 125, 129, 133, 142,
+ 143, 145, 150, 90, 85, 83, 81, 80, 79, 78, 81, 83, 87, 89,
+ 93, 98, 100, 106, 110, 114, 121, 124, 130, 136, 138, 148, 149, 151,
+ 156, 93, 88, 86, 84, 83, 82, 80, 82, 85, 89, 90, 96, 98,
+ 102, 107, 109, 118, 120, 125, 131, 134, 143, 145, 153, 156, 157, 163,
+ 95, 90, 89, 86, 85, 85, 83, 83, 88, 89, 93, 97, 99, 105,
+ 106, 113, 116, 122, 127, 130, 139, 140, 148, 153, 159, 162, 164, 169,
+ 98, 93, 92, 89, 88, 87, 86, 85, 89, 90, 96, 97, 102, 105,
+ 109, 114, 117, 124, 126, 134, 136, 144, 148, 154, 160, 166, 169, 170,
+ 176, 101, 96, 95, 91, 91, 90, 89, 87, 90, 93, 97, 99, 104,
+ 105, 112, 113, 121, 122, 130, 133, 139, 144, 150, 155, 160, 168, 172,
+ 176, 177, 184, 104, 99, 98, 94, 94, 92, 92, 90, 92, 96, 98,
+ 102, 104, 109, 112, 116, 121, 125, 130, 135, 141, 144, 152, 155, 163,
+ 166, 177, 179, 184, 185, 191, 107, 101, 101, 97, 97, 95, 95, 93,
+ 93, 99, 99, 105, 105, 112, 112, 120, 120, 129, 129, 139, 140, 149,
+ 149, 161, 161, 172, 172, 185, 186, 191, 192, 199},
+ {32, 31, 31, 30, 31, 31, 30, 31, 31, 32, 33, 34, 35, 35, 39,
+ 35, 36, 37, 37, 41, 43, 36, 38, 39, 40, 43, 45, 47, 41, 42,
+ 42, 42, 45, 46, 47, 48, 44, 44, 44, 44, 46, 46, 47, 49, 50,
+ 49, 47, 47, 46, 47, 47, 48, 50, 51, 53, 48, 47, 46, 45, 46,
+ 46, 46, 49, 51, 53, 54, 48, 47, 46, 45, 46, 46, 46, 49, 51,
+ 53, 54, 55, 49, 47, 46, 45, 45, 45, 45, 49, 51, 53, 55, 56,
+ 58, 50, 48, 47, 46, 46, 46, 46, 50, 51, 54, 56, 57, 59, 61,
+ 51, 48, 47, 46, 47, 46, 46, 50, 51, 54, 56, 57, 60, 62, 62,
+ 52, 50, 48, 47, 47, 47, 47, 50, 52, 54, 57, 58, 61, 63, 64,
+ 66, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 64,
+ 65, 68, 70, 55, 52, 51, 50, 49, 49, 48, 52, 53, 55, 59, 60,
+ 62, 65, 66, 68, 70, 71, 57, 54, 53, 52, 51, 50, 50, 53, 54,
+ 56, 60, 61, 63, 66, 67, 70, 73, 73, 76, 59, 56, 54, 53, 53,
+ 52, 51, 54, 56, 58, 61, 62, 65, 68, 69, 72, 74, 75, 78, 80,
+ 60, 57, 55, 54, 53, 53, 52, 55, 56, 58, 61, 63, 65, 68, 69,
+ 72, 75, 76, 79, 81, 82, 63, 60, 58, 57, 56, 55, 54, 57, 59,
+ 60, 63, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 64, 61,
+ 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78,
+ 79, 82, 85, 86, 89, 90, 65, 61, 60, 58, 57, 56, 55, 58, 59,
+ 61, 64, 65, 68, 71, 72, 75, 78, 79, 83, 85, 86, 90, 91, 91,
+ 67, 63, 61, 60, 59, 58, 57, 60, 61, 63, 65, 66, 69, 72, 73,
+ 77, 79, 80, 84, 86, 88, 92, 93, 93, 95, 68, 64, 63, 61, 60,
+ 59, 58, 60, 61, 63, 65, 67, 70, 71, 74, 76, 78, 81, 83, 86,
+ 88, 89, 94, 94, 95, 97, 68, 65, 64, 62, 61, 60, 58, 59, 61,
+ 64, 64, 68, 69, 71, 74, 75, 79, 80, 83, 86, 87, 91, 92, 95,
+ 96, 97, 99, 69, 66, 65, 63, 62, 61, 59, 59, 62, 63, 65, 67,
+ 69, 72, 72, 76, 78, 80, 83, 84, 88, 89, 92, 94, 97, 98, 99,
+ 101, 70, 67, 66, 63, 63, 62, 61, 60, 63, 63, 66, 67, 69, 71,
+ 73, 76, 77, 81, 82, 85, 86, 90, 91, 94, 96, 99, 100, 100, 103,
+ 71, 67, 67, 64, 64, 63, 62, 61, 62, 64, 66, 67, 70, 71, 74,
+ 74, 78, 79, 83, 84, 87, 89, 91, 94, 95, 99, 100, 102, 102, 104,
+ 72, 68, 68, 65, 65, 64, 63, 61, 62, 65, 66, 68, 69, 71, 73,
+ 75, 77, 79, 82, 84, 87, 88, 92, 93, 96, 97, 101, 102, 104, 104,
+ 106, 73, 69, 69, 66, 66, 64, 64, 62, 62, 66, 66, 69, 69, 72,
+ 73, 76, 77, 81, 81, 85, 85, 89, 90, 94, 94, 99, 99, 104, 104,
+ 106, 106, 108}},
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 33, 31, 32, 32, 32, 33, 33, 32, 32, 32, 32, 33, 34, 35,
+ 32, 33, 33, 33, 34, 34, 36, 36, 34, 34, 34, 33, 35, 35,
+ 37, 38, 39, 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 36,
+ 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 39, 38, 38, 37,
+ 39, 39, 40, 42, 45, 49, 50, 54, 41, 40, 39, 38, 40, 40,
+ 41, 43, 46, 50, 52, 55, 57, 44, 42, 42, 41, 42, 42, 42,
+ 44, 47, 52, 54, 58, 60, 63, 47, 45, 45, 44, 44, 45, 45,
+ 47, 50, 55, 56, 60, 62, 66, 69, 48, 46, 45, 44, 45, 45,
+ 46, 47, 51, 55, 57, 61, 63, 67, 70, 71, 54, 51, 50, 49,
+ 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 56,
+ 53, 52, 51, 51, 51, 51, 53, 56, 60, 61, 66, 69, 73, 77,
+ 78, 84, 86, 59, 56, 55, 54, 54, 54, 53, 55, 58, 62, 64,
+ 69, 71, 75, 79, 80, 87, 89, 92, 64, 61, 60, 58, 58, 58,
+ 57, 59, 62, 66, 67, 72, 75, 79, 83, 84, 91, 93, 97, 102,
+ 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 75, 79,
+ 84, 85, 92, 94, 98, 103, 105, 71, 68, 67, 65, 64, 64, 63,
+ 65, 68, 72, 73, 78, 80, 84, 89, 90, 97, 100, 103, 109, 111,
+ 117, 74, 71, 69, 68, 67, 67, 65, 67, 70, 74, 75, 80, 83,
+ 86, 91, 93, 100, 102, 106, 112, 114, 120, 123, 80, 76, 74, 72,
+ 71, 71, 69, 71, 74, 78, 79, 84, 86, 90, 95, 96, 104, 106,
+ 110, 116, 118, 125, 128, 134, 82, 78, 76, 74, 73, 73, 71, 73,
+ 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127,
+ 131, 136, 139, 83, 78, 77, 75, 74, 74, 72, 73, 76, 80, 81,
+ 86, 89, 92, 97, 99, 106, 109, 113, 119, 121, 128, 131, 137, 139,
+ 140, 87, 83, 81, 79, 78, 78, 75, 77, 80, 83, 85, 90, 92,
+ 96, 100, 102, 110, 112, 117, 122, 125, 133, 135, 142, 144, 145, 150,
+ 90, 85, 84, 81, 80, 80, 78, 78, 82, 84, 87, 91, 93, 98,
+ 99, 106, 108, 113, 118, 121, 129, 130, 137, 141, 147, 150, 151, 156,
+ 92, 88, 87, 84, 83, 82, 80, 80, 84, 85, 90, 91, 95, 98,
+ 102, 106, 109, 115, 117, 125, 126, 134, 137, 142, 148, 152, 155, 156,
+ 162, 95, 90, 89, 86, 85, 84, 83, 82, 85, 87, 91, 92, 97,
+ 98, 105, 105, 112, 114, 121, 123, 129, 133, 138, 143, 147, 155, 158,
+ 161, 162, 168, 97, 92, 92, 88, 88, 86, 86, 84, 85, 90, 91,
+ 95, 97, 101, 104, 108, 112, 116, 121, 125, 130, 133, 140, 143, 150,
+ 152, 162, 164, 168, 168, 174, 100, 95, 95, 90, 90, 89, 89, 86,
+ 86, 92, 92, 97, 98, 104, 104, 111, 111, 119, 119, 128, 129, 137,
+ 137, 147, 148, 157, 158, 169, 170, 174, 175, 181},
+ {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 33, 34, 34, 34, 37,
+ 33, 34, 35, 35, 38, 39, 36, 38, 39, 40, 42, 43, 47, 38, 40,
+ 40, 41, 43, 44, 47, 47, 41, 42, 42, 42, 44, 45, 47, 48, 48,
+ 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 49, 47, 47, 46, 47,
+ 47, 48, 49, 50, 52, 53, 48, 47, 46, 45, 46, 46, 46, 48, 49,
+ 52, 53, 54, 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53, 55,
+ 55, 49, 47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57, 58,
+ 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61,
+ 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 58, 60, 61,
+ 61, 52, 50, 49, 47, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61,
+ 63, 63, 66, 53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55, 58,
+ 59, 62, 64, 64, 67, 68, 54, 52, 51, 49, 49, 49, 48, 49, 52,
+ 55, 55, 58, 60, 62, 64, 65, 68, 69, 71, 56, 54, 53, 51, 51,
+ 51, 49, 51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73, 75,
+ 57, 54, 53, 52, 51, 51, 50, 51, 53, 56, 56, 60, 61, 63, 66,
+ 67, 70, 71, 73, 76, 76, 60, 57, 56, 54, 53, 53, 52, 53, 55,
+ 58, 58, 61, 63, 65, 68, 68, 72, 73, 75, 78, 79, 82, 61, 58,
+ 57, 55, 55, 54, 53, 54, 56, 58, 59, 62, 64, 66, 69, 69, 73,
+ 74, 76, 79, 80, 83, 84, 63, 60, 59, 57, 56, 56, 54, 55, 57,
+ 60, 60, 63, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89,
+ 64, 61, 60, 58, 57, 57, 55, 56, 58, 60, 61, 64, 66, 68, 70,
+ 71, 75, 77, 79, 82, 82, 86, 87, 90, 91, 65, 61, 60, 58, 57,
+ 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82,
+ 83, 86, 88, 90, 91, 91, 67, 63, 62, 60, 59, 59, 57, 58, 60,
+ 62, 63, 66, 67, 69, 72, 73, 77, 78, 80, 83, 84, 88, 89, 92,
+ 93, 93, 95, 67, 64, 63, 61, 60, 60, 58, 58, 61, 61, 63, 65,
+ 67, 70, 70, 74, 75, 78, 80, 81, 85, 86, 89, 91, 93, 94, 95,
+ 97, 68, 65, 64, 62, 61, 60, 59, 58, 61, 61, 64, 65, 67, 69,
+ 71, 73, 75, 78, 79, 83, 83, 87, 88, 91, 93, 95, 96, 97, 99,
+ 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72,
+ 72, 76, 76, 80, 81, 84, 86, 88, 90, 92, 95, 96, 98, 98, 100,
+ 70, 66, 66, 63, 63, 62, 61, 60, 60, 63, 64, 66, 67, 69, 71,
+ 73, 75, 77, 79, 81, 84, 85, 88, 89, 93, 93, 97, 98, 100, 100,
+ 102, 71, 67, 67, 64, 64, 62, 62, 60, 60, 64, 64, 67, 67, 70,
+ 70, 74, 74, 78, 78, 82, 82, 86, 86, 91, 91, 95, 95, 100, 100,
+ 101, 101, 104}},
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 32, 31, 32, 32, 32, 33, 33, 32, 32, 32, 32, 33, 33, 34,
+ 32, 32, 32, 32, 33, 34, 35, 35, 33, 33, 33, 33, 34, 35,
+ 36, 36, 38, 34, 34, 34, 33, 34, 35, 36, 37, 39, 39, 36,
+ 35, 35, 34, 35, 36, 37, 38, 42, 42, 48, 36, 35, 35, 34,
+ 35, 36, 38, 38, 42, 43, 48, 49, 39, 38, 38, 37, 38, 39,
+ 40, 40, 44, 45, 50, 51, 54, 41, 39, 39, 38, 39, 40, 40,
+ 41, 45, 46, 51, 52, 55, 56, 44, 42, 42, 41, 41, 42, 42,
+ 42, 46, 47, 54, 54, 58, 59, 63, 46, 44, 44, 42, 43, 44,
+ 44, 44, 48, 49, 55, 55, 59, 61, 65, 67, 48, 46, 46, 44,
+ 45, 45, 45, 46, 50, 51, 57, 57, 61, 63, 67, 69, 71, 52,
+ 50, 49, 48, 48, 48, 48, 48, 52, 53, 59, 59, 64, 65, 70,
+ 72, 74, 78, 54, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60,
+ 60, 65, 67, 71, 74, 76, 80, 82, 58, 56, 55, 53, 53, 53,
+ 53, 53, 57, 58, 63, 64, 68, 70, 75, 77, 80, 84, 86, 91,
+ 59, 56, 56, 54, 54, 54, 53, 53, 57, 58, 64, 64, 69, 70,
+ 75, 78, 80, 85, 87, 91, 92, 65, 62, 61, 59, 59, 59, 58,
+ 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98,
+ 105, 66, 63, 63, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74,
+ 76, 80, 83, 86, 91, 93, 98, 99, 106, 107, 71, 68, 67, 65,
+ 65, 64, 63, 63, 67, 68, 73, 73, 78, 80, 84, 87, 90, 95,
+ 97, 103, 103, 111, 112, 117, 74, 71, 70, 68, 67, 67, 66, 65,
+ 69, 70, 75, 75, 80, 82, 86, 89, 93, 97, 100, 105, 106, 114,
+ 115, 120, 123, 80, 76, 75, 72, 72, 71, 70, 69, 73, 74, 79,
+ 79, 84, 86, 90, 93, 96, 101, 104, 110, 110, 118, 119, 125, 128,
+ 134, 81, 77, 77, 74, 73, 73, 71, 71, 74, 75, 80, 80, 85,
+ 87, 91, 94, 98, 103, 105, 111, 112, 120, 121, 127, 130, 136, 137,
+ 83, 78, 78, 75, 74, 74, 72, 72, 75, 76, 81, 81, 86, 88,
+ 92, 95, 99, 104, 106, 112, 113, 121, 122, 128, 131, 137, 139, 140,
+ 86, 82, 81, 78, 77, 77, 75, 74, 78, 79, 84, 84, 89, 91,
+ 95, 98, 101, 106, 109, 115, 116, 124, 125, 131, 135, 140, 142, 144,
+ 147, 89, 84, 84, 80, 80, 79, 78, 77, 79, 81, 85, 86, 91,
+ 92, 97, 98, 104, 106, 112, 114, 119, 123, 128, 132, 135, 142, 145,
+ 148, 149, 153, 91, 86, 86, 82, 82, 81, 80, 79, 80, 84, 85,
+ 88, 91, 94, 97, 100, 104, 107, 112, 115, 120, 123, 129, 132, 138,
+ 140, 148, 150, 153, 154, 159, 93, 88, 88, 84, 84, 83, 83, 80,
+ 81, 86, 86, 91, 91, 96, 97, 103, 103, 110, 110, 118, 119, 126,
+ 126, 135, 136, 144, 144, 155, 155, 159, 159, 164},
+ {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 31, 32, 32, 33, 34, 33, 34,
+ 35, 35, 37, 39, 35, 37, 37, 38, 39, 41, 44, 36, 38, 39, 40, 41, 43,
+ 46, 47, 40, 41, 41, 42, 43, 44, 46, 47, 48, 41, 42, 42, 42, 43, 45,
+ 46, 47, 48, 48, 49, 47, 47, 46, 46, 47, 47, 48, 50, 50, 53, 49, 47,
+ 47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 48, 47, 47, 45, 46, 46, 46,
+ 46, 49, 49, 53, 53, 54, 48, 47, 46, 45, 45, 46, 46, 46, 49, 49, 53,
+ 53, 54, 55, 49, 47, 46, 45, 45, 45, 45, 45, 48, 49, 53, 54, 55, 56,
+ 58, 50, 47, 47, 45, 46, 46, 46, 46, 49, 49, 54, 54, 56, 57, 59, 60,
+ 50, 48, 48, 46, 46, 46, 46, 46, 49, 50, 54, 54, 56, 57, 60, 60, 61,
+ 52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63,
+ 65, 52, 50, 49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62,
+ 63, 65, 66, 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55, 55, 58, 59,
+ 62, 63, 65, 67, 68, 70, 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55,
+ 56, 58, 60, 62, 64, 65, 67, 68, 70, 71, 57, 54, 54, 52, 51, 51, 50,
+ 50, 52, 53, 56, 57, 60, 61, 63, 65, 67, 69, 70, 73, 73, 76, 57, 55,
+ 54, 52, 52, 51, 51, 50, 53, 53, 57, 57, 60, 61, 64, 65, 67, 70, 71,
+ 73, 74, 77, 77, 60, 57, 56, 54, 54, 53, 52, 52, 54, 55, 58, 59, 61,
+ 63, 65, 67, 68, 71, 72, 75, 75, 79, 79, 82, 61, 58, 57, 55, 55, 54,
+ 53, 53, 55, 56, 59, 59, 62, 63, 66, 68, 69, 72, 73, 76, 76, 80, 80,
+ 83, 84, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 63, 65, 67,
+ 69, 71, 73, 75, 78, 78, 82, 82, 85, 86, 89, 64, 61, 60, 58, 57, 57,
+ 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, 71, 74, 75, 78, 78, 82, 83,
+ 86, 87, 89, 90, 65, 61, 61, 58, 58, 57, 56, 55, 58, 58, 61, 62, 64,
+ 65, 68, 70, 71, 74, 75, 78, 79, 83, 83, 86, 88, 90, 91, 91, 66, 63,
+ 62, 60, 59, 58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76,
+ 79, 80, 84, 84, 87, 89, 91, 92, 93, 94, 67, 64, 63, 61, 60, 59, 58,
+ 57, 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87,
+ 89, 92, 93, 94, 94, 96, 68, 64, 64, 61, 61, 60, 59, 58, 59, 61, 62,
+ 64, 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90, 94, 94,
+ 96, 96, 98, 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62, 65, 65, 68,
+ 68, 71, 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96, 96, 97, 97,
+ 99}},
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 32, 31, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 34, 34, 35, 32, 32, 32, 32, 32, 34,
+ 34, 35, 35, 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 34,
+ 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 36, 35, 35, 34,
+ 34, 36, 36, 38, 38, 42, 42, 48, 36, 35, 35, 34, 34, 36,
+ 36, 38, 38, 42, 42, 48, 48, 39, 38, 38, 37, 37, 39, 39,
+ 40, 40, 45, 45, 50, 50, 54, 39, 38, 38, 37, 37, 39, 39,
+ 40, 40, 45, 45, 50, 50, 54, 54, 44, 42, 42, 41, 41, 42,
+ 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 44, 42, 42, 41,
+ 41, 42, 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 63, 48,
+ 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61,
+ 67, 67, 71, 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51,
+ 57, 57, 61, 61, 67, 67, 71, 71, 54, 51, 51, 49, 49, 50,
+ 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82,
+ 54, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65,
+ 65, 71, 71, 76, 76, 82, 82, 59, 56, 56, 54, 54, 54, 54,
+ 53, 53, 58, 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87,
+ 92, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, 64,
+ 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 65, 62, 62, 59,
+ 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85,
+ 85, 92, 92, 98, 98, 105, 65, 62, 62, 59, 59, 59, 59, 58,
+ 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98,
+ 98, 105, 105, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68,
+ 73, 73, 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111,
+ 117, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73,
+ 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111, 117, 117,
+ 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84,
+ 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134,
+ 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84,
+ 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134,
+ 134, 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76, 81, 81,
+ 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121, 128, 128,
+ 137, 137, 140, 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76,
+ 81, 81, 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121,
+ 128, 128, 137, 137, 140, 140, 87, 83, 83, 79, 79, 77, 77, 75,
+ 75, 80, 80, 84, 84, 90, 90, 96, 96, 102, 102, 109, 109, 116,
+ 116, 124, 124, 132, 132, 141, 141, 144, 144, 149},
+ {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 30, 31, 31, 32, 32, 33, 34,
+ 34, 35, 35, 39, 33, 34, 34, 35, 35, 39, 39, 36, 38, 38, 40, 40, 43,
+ 43, 47, 36, 38, 38, 40, 40, 43, 43, 47, 47, 41, 42, 42, 42, 42, 45,
+ 45, 47, 47, 48, 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 49, 47,
+ 47, 46, 46, 47, 47, 48, 48, 50, 50, 53, 49, 47, 47, 46, 46, 47, 47,
+ 48, 48, 50, 50, 53, 53, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49,
+ 53, 53, 54, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54,
+ 54, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58,
+ 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 58,
+ 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60,
+ 61, 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60,
+ 60, 61, 61, 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50, 54, 54, 57,
+ 57, 61, 61, 63, 63, 66, 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50,
+ 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 54, 52, 52, 49, 49, 49, 49,
+ 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 54, 52,
+ 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65,
+ 68, 68, 71, 71, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56,
+ 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, 57, 54, 54, 52, 52, 51,
+ 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73,
+ 76, 76, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61,
+ 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 60, 57, 57, 54, 54, 53,
+ 53, 52, 52, 55, 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75,
+ 79, 79, 82, 82, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60,
+ 63, 63, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 63, 60,
+ 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71,
+ 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 65, 61, 61, 58, 58, 57, 57,
+ 55, 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83,
+ 83, 86, 86, 90, 90, 91, 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58,
+ 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90,
+ 90, 91, 91, 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66,
+ 66, 69, 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93,
+ 95}},
+ {{32, 31, 31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 32, 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 33, 33, 34, 32, 32, 32, 32, 32, 33,
+ 34, 34, 35, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 34,
+ 34, 34, 33, 33, 34, 35, 35, 37, 37, 39, 34, 34, 34, 33,
+ 33, 34, 35, 35, 37, 37, 39, 39, 35, 35, 35, 34, 34, 35,
+ 36, 36, 38, 38, 42, 42, 46, 36, 35, 35, 34, 34, 35, 36,
+ 37, 38, 38, 42, 42, 47, 48, 38, 37, 37, 36, 36, 37, 38,
+ 38, 39, 40, 44, 44, 48, 50, 51, 39, 38, 38, 38, 37, 38,
+ 39, 39, 40, 41, 45, 45, 49, 50, 52, 54, 41, 40, 40, 39,
+ 38, 39, 40, 40, 41, 41, 46, 46, 50, 52, 54, 55, 57, 44,
+ 42, 42, 41, 41, 41, 42, 42, 42, 43, 47, 47, 52, 54, 56,
+ 58, 60, 63, 45, 43, 43, 42, 41, 42, 42, 43, 43, 43, 48,
+ 48, 53, 54, 57, 58, 60, 64, 65, 48, 46, 46, 45, 44, 45,
+ 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71,
+ 48, 46, 46, 45, 44, 45, 45, 45, 46, 46, 51, 51, 55, 57,
+ 59, 61, 63, 67, 68, 71, 71, 53, 51, 51, 49, 49, 49, 49,
+ 49, 49, 49, 54, 54, 58, 59, 62, 64, 67, 71, 72, 75, 75,
+ 81, 54, 52, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59,
+ 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 57, 55, 55, 53,
+ 52, 52, 52, 52, 52, 52, 57, 57, 61, 62, 65, 67, 70, 74,
+ 75, 79, 79, 85, 85, 89, 59, 56, 56, 54, 54, 54, 54, 54,
+ 53, 54, 58, 58, 62, 64, 67, 69, 71, 75, 76, 80, 80, 86,
+ 87, 90, 92, 62, 59, 59, 57, 56, 56, 56, 56, 55, 56, 60,
+ 60, 64, 66, 69, 71, 73, 77, 78, 83, 83, 89, 89, 93, 95,
+ 98, 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63, 67,
+ 68, 71, 73, 75, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105,
+ 67, 64, 64, 62, 61, 61, 60, 60, 59, 60, 64, 64, 68, 69,
+ 72, 74, 77, 81, 82, 87, 87, 93, 94, 98, 99, 103, 106, 108,
+ 71, 68, 68, 66, 65, 64, 64, 64, 63, 63, 68, 68, 72, 73,
+ 76, 78, 80, 84, 85, 90, 90, 97, 97, 102, 103, 107, 111, 113,
+ 117, 72, 69, 69, 66, 65, 65, 65, 64, 63, 64, 68, 68, 72,
+ 73, 76, 78, 81, 85, 86, 91, 91, 97, 98, 102, 104, 108, 111,
+ 113, 118, 119, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74,
+ 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110,
+ 114, 118, 120, 125, 126, 134, 80, 76, 76, 73, 72, 72, 71, 70,
+ 69, 70, 74, 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103,
+ 104, 108, 110, 114, 118, 120, 125, 126, 134, 134},
+ {32, 31, 31, 31, 31, 31, 30, 31, 31, 31, 30, 31, 31, 31, 32, 32, 32,
+ 33, 33, 33, 35, 33, 34, 34, 35, 35, 37, 39, 34, 35, 35, 36, 36, 38,
+ 40, 41, 36, 38, 38, 39, 40, 41, 43, 44, 47, 37, 38, 39, 40, 40, 42,
+ 43, 44, 47, 47, 41, 42, 42, 42, 42, 43, 45, 45, 47, 47, 48, 41, 42,
+ 42, 42, 42, 43, 45, 45, 47, 47, 48, 48, 47, 46, 46, 46, 45, 46, 47,
+ 47, 47, 48, 50, 50, 52, 49, 48, 47, 47, 46, 47, 47, 47, 48, 48, 50,
+ 50, 52, 53, 49, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53,
+ 53, 48, 47, 47, 46, 45, 46, 46, 46, 46, 47, 49, 49, 52, 53, 54, 54,
+ 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 52, 53, 54, 55, 55,
+ 49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53, 55, 55, 57,
+ 58, 49, 47, 47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56,
+ 57, 59, 59, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50, 50, 53, 54,
+ 55, 56, 58, 60, 60, 61, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50,
+ 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 52, 50, 49, 48, 47, 47, 47,
+ 47, 46, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 52, 50,
+ 50, 48, 47, 47, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61,
+ 63, 63, 66, 66, 54, 51, 51, 50, 49, 49, 49, 48, 48, 48, 51, 51, 54,
+ 55, 57, 58, 60, 62, 62, 65, 65, 67, 68, 69, 54, 52, 52, 50, 49, 49,
+ 49, 49, 48, 48, 52, 52, 55, 55, 57, 58, 60, 62, 63, 65, 65, 68, 68,
+ 70, 71, 56, 53, 53, 51, 51, 50, 50, 50, 49, 49, 52, 52, 55, 56, 58,
+ 59, 61, 63, 63, 66, 66, 69, 69, 71, 72, 73, 57, 54, 54, 52, 52, 51,
+ 51, 51, 50, 50, 53, 53, 56, 56, 58, 60, 61, 63, 64, 67, 67, 70, 70,
+ 72, 73, 75, 76, 58, 55, 55, 53, 52, 52, 52, 51, 50, 51, 54, 54, 56,
+ 57, 59, 60, 62, 64, 65, 67, 67, 71, 71, 73, 74, 75, 77, 78, 60, 57,
+ 57, 55, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66,
+ 68, 68, 72, 72, 74, 75, 77, 79, 80, 82, 60, 57, 57, 55, 54, 54, 54,
+ 53, 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75,
+ 76, 77, 79, 80, 82, 82, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57,
+ 57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83,
+ 85, 85, 89, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60,
+ 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89,
+ 89}},
+ {{32, 31, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 33, 31, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, 32,
+ 32, 33, 34, 34, 35, 35, 33, 33, 33, 33, 33, 33, 34, 35, 35,
+ 36, 36, 38, 34, 34, 34, 34, 33, 33, 35, 35, 36, 37, 37, 39,
+ 39, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 42,
+ 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48,
+ 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48,
+ 48, 38, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46,
+ 50, 50, 52, 39, 38, 38, 38, 37, 37, 39, 39, 39, 40, 40, 44,
+ 45, 47, 50, 50, 53, 54, 41, 40, 40, 39, 38, 38, 40, 40, 40,
+ 41, 41, 45, 46, 48, 52, 52, 54, 55, 57, 44, 42, 42, 42, 41,
+ 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63,
+ 44, 42, 42, 42, 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54,
+ 54, 57, 58, 60, 63, 63, 47, 46, 45, 45, 44, 44, 44, 45, 45,
+ 45, 45, 49, 50, 52, 56, 56, 59, 60, 62, 66, 66, 69, 48, 47,
+ 46, 45, 44, 44, 45, 45, 45, 46, 46, 50, 51, 53, 57, 57, 60,
+ 61, 63, 67, 67, 70, 71, 50, 49, 48, 47, 46, 46, 47, 47, 47,
+ 47, 47, 51, 52, 54, 58, 58, 61, 62, 65, 68, 68, 72, 73, 75,
+ 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60,
+ 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 54, 52, 51, 50, 49,
+ 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71,
+ 71, 75, 76, 78, 82, 82, 58, 56, 55, 54, 53, 53, 53, 53, 53,
+ 52, 52, 56, 57, 59, 63, 63, 67, 68, 70, 74, 74, 78, 79, 82,
+ 86, 86, 90, 59, 57, 56, 55, 54, 54, 54, 54, 54, 53, 53, 57,
+ 58, 60, 64, 64, 68, 69, 71, 75, 75, 79, 80, 83, 87, 87, 91,
+ 92, 61, 59, 58, 57, 56, 56, 56, 56, 55, 55, 55, 59, 60, 62,
+ 65, 65, 69, 70, 73, 77, 77, 81, 82, 85, 89, 89, 93, 94, 97,
+ 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68,
+ 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105,
+ 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68,
+ 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105,
+ 105, 70, 67, 67, 65, 64, 64, 63, 63, 63, 62, 62, 66, 67, 69,
+ 72, 72, 76, 77, 79, 83, 83, 88, 89, 92, 96, 96, 101, 102, 105,
+ 109, 109, 114},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 32, 30, 31,
+ 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 37, 33, 34, 34, 35, 35, 35,
+ 38, 39, 34, 36, 36, 36, 37, 37, 40, 40, 42, 36, 38, 38, 39, 40, 40,
+ 42, 43, 45, 47, 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 40, 41,
+ 41, 41, 42, 42, 44, 44, 45, 47, 47, 48, 41, 42, 42, 42, 42, 42, 44,
+ 45, 46, 47, 47, 48, 48, 44, 44, 44, 44, 44, 44, 45, 46, 46, 47, 47,
+ 49, 49, 50, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51,
+ 53, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53,
+ 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54,
+ 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54,
+ 54, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53,
+ 54, 55, 55, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 48, 49, 51,
+ 53, 53, 55, 55, 57, 58, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45,
+ 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 50, 48, 48, 47, 46, 46, 46,
+ 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 57, 59, 59, 61, 50, 49,
+ 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 58,
+ 60, 60, 61, 61, 51, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50,
+ 51, 54, 54, 56, 57, 58, 60, 60, 62, 62, 63, 52, 50, 50, 49, 47, 47,
+ 47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63,
+ 65, 66, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 49, 50, 52, 54,
+ 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66, 54, 52, 51, 50, 49, 49,
+ 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65,
+ 66, 68, 68, 70, 54, 52, 52, 51, 49, 49, 49, 49, 49, 48, 48, 51, 52,
+ 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 71, 55, 53,
+ 53, 52, 50, 50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60,
+ 63, 63, 65, 66, 67, 69, 69, 71, 72, 73, 57, 55, 54, 53, 52, 52, 51,
+ 51, 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68,
+ 70, 70, 73, 73, 74, 76, 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50,
+ 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73,
+ 74, 76, 76, 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51, 54, 55, 56,
+ 58, 58, 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75, 76, 78, 78,
+ 80}},
+ {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 32, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 32, 33, 33, 33, 33, 33, 33,
+ 34, 34, 35, 36, 36, 36, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37,
+ 37, 38, 39, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39,
+ 39, 35, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 39, 41, 41, 43,
+ 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48,
+ 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48,
+ 48, 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47,
+ 50, 50, 51, 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 42, 45,
+ 45, 47, 50, 50, 52, 54, 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40,
+ 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 42, 41, 41, 41, 40, 40, 40,
+ 41, 41, 41, 42, 42, 44, 47, 47, 49, 53, 53, 55, 56, 56, 60, 44, 43,
+ 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 47, 47, 50, 54, 54, 56,
+ 58, 58, 61, 63, 44, 43, 43, 42, 41, 41, 41, 42, 42, 42, 43, 43, 45,
+ 48, 48, 51, 54, 54, 56, 58, 58, 62, 64, 64, 47, 46, 45, 45, 44, 44,
+ 44, 44, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66,
+ 66, 69, 48, 47, 46, 46, 45, 44, 45, 45, 45, 45, 46, 46, 47, 51, 51,
+ 53, 57, 57, 59, 61, 61, 65, 67, 67, 70, 71, 49, 48, 47, 47, 46, 45,
+ 45, 46, 46, 46, 46, 46, 48, 51, 51, 54, 57, 57, 60, 62, 62, 66, 68,
+ 68, 71, 72, 73, 53, 51, 51, 51, 49, 49, 49, 49, 49, 49, 49, 49, 51,
+ 54, 54, 57, 59, 59, 62, 64, 64, 69, 71, 71, 74, 75, 77, 81, 54, 52,
+ 51, 51, 50, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63,
+ 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 55, 53, 53, 52, 51, 50, 50,
+ 51, 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73,
+ 76, 77, 78, 83, 83, 85, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53,
+ 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86,
+ 87, 88, 92, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58,
+ 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92,
+ 92},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 30, 31,
+ 31, 31, 31, 32, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35,
+ 35, 38, 33, 34, 34, 34, 35, 35, 36, 38, 39, 34, 35, 35, 36, 36, 36,
+ 37, 40, 40, 41, 36, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47, 36, 38,
+ 38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 38, 39, 40, 40, 41, 41, 41,
+ 43, 44, 45, 47, 47, 47, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47,
+ 47, 48, 48, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48,
+ 48, 45, 45, 45, 45, 44, 44, 44, 46, 46, 46, 47, 47, 48, 49, 49, 50,
+ 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53,
+ 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53,
+ 53, 49, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51,
+ 53, 53, 53, 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 49,
+ 49, 51, 53, 53, 54, 54, 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46,
+ 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 49, 47, 47, 47, 45, 45, 45,
+ 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 54, 55, 55, 57, 49, 47,
+ 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55,
+ 55, 55, 57, 58, 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47,
+ 49, 49, 51, 53, 53, 55, 56, 56, 58, 58, 59, 50, 49, 48, 48, 46, 46,
+ 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59,
+ 59, 61, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50,
+ 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 61, 51, 49, 48, 48, 47, 46,
+ 46, 47, 47, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60,
+ 60, 61, 62, 62, 52, 50, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 48,
+ 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 52, 50,
+ 50, 49, 48, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56,
+ 57, 57, 60, 61, 61, 63, 63, 64, 66, 66, 53, 51, 50, 50, 48, 48, 48,
+ 48, 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62,
+ 63, 64, 64, 67, 67, 68, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48,
+ 48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68,
+ 68, 69, 71, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52,
+ 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71,
+ 71}},
+ {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34,
+ 35, 35, 35, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36,
+ 36, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39,
+ 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39,
+ 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 38, 40, 40,
+ 41, 35, 35, 35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42,
+ 42, 43, 46, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38,
+ 40, 42, 42, 44, 47, 48, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36,
+ 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 38, 37, 37, 37, 36, 36, 36,
+ 36, 37, 38, 38, 39, 39, 39, 41, 44, 44, 45, 48, 50, 50, 51, 39, 39,
+ 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49,
+ 50, 50, 52, 54, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40,
+ 40, 42, 45, 45, 46, 49, 50, 50, 52, 54, 54, 41, 40, 40, 40, 39, 38,
+ 38, 39, 40, 40, 40, 41, 41, 41, 43, 46, 46, 47, 50, 52, 52, 54, 55,
+ 55, 57, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44,
+ 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 44, 43, 42, 42, 42, 41,
+ 41, 41, 42, 42, 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58,
+ 58, 60, 63, 63, 45, 44, 43, 43, 42, 41, 41, 42, 42, 42, 42, 43, 43,
+ 43, 45, 48, 48, 49, 53, 54, 54, 57, 58, 58, 60, 64, 64, 65, 47, 46,
+ 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55,
+ 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 48, 47, 46, 46, 45, 44, 44,
+ 45, 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61,
+ 63, 67, 67, 68, 70, 71, 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45,
+ 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68,
+ 70, 71, 71, 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48, 48, 48, 48,
+ 50, 53, 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70, 73, 74, 74,
+ 77},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31,
+ 31, 31, 31, 32, 30, 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 37, 33, 34, 34, 34, 35, 35,
+ 35, 36, 38, 39, 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 35, 36,
+ 37, 37, 37, 38, 38, 38, 41, 41, 41, 44, 36, 37, 38, 38, 39, 40, 40,
+ 40, 42, 43, 43, 46, 47, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43,
+ 46, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47,
+ 47, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48,
+ 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 48,
+ 43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47, 48, 49, 49,
+ 49, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 48, 50,
+ 50, 50, 52, 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 48, 48,
+ 49, 50, 50, 51, 52, 53, 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47,
+ 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 49, 48, 47, 47, 46, 46, 46,
+ 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 48, 47,
+ 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52,
+ 53, 53, 54, 54, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46,
+ 46, 48, 49, 49, 50, 52, 53, 53, 54, 54, 54, 49, 47, 47, 47, 46, 45,
+ 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 55,
+ 55, 55, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47,
+ 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 49, 47, 47, 47, 46, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55,
+ 55, 57, 58, 58, 49, 48, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 47, 49, 49, 50, 52, 53, 53, 55, 56, 56, 57, 59, 59, 59, 50, 49,
+ 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53,
+ 54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 50, 49, 48, 48, 47, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56,
+ 58, 60, 60, 60, 61, 61, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60,
+ 61, 61, 61, 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46,
+ 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61, 62, 63, 63,
+ 64}},
+ {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34,
+ 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 36, 36, 36,
+ 37, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37,
+ 37, 38, 39, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36,
+ 37, 37, 37, 38, 39, 39, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35,
+ 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 35, 34, 34, 34, 34, 34, 34,
+ 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 39, 41, 41, 41, 43, 36, 35,
+ 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42,
+ 42, 42, 45, 48, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36,
+ 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 36, 35, 35, 35, 35, 35,
+ 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48,
+ 48, 48, 37, 37, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39,
+ 39, 39, 41, 44, 44, 44, 46, 49, 49, 49, 51, 39, 39, 38, 38, 38, 38,
+ 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50,
+ 50, 50, 52, 54, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39,
+ 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 39, 39,
+ 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45,
+ 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 41, 41, 40, 40, 40, 39, 39,
+ 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52,
+ 52, 54, 56, 56, 56, 58, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42,
+ 42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58,
+ 58, 60, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42,
+ 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63,
+ 63},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 32, 30, 31, 31, 31, 31, 31,
+ 32, 32, 30, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 35, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 33, 34,
+ 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 33, 34, 34, 34, 34, 35, 35,
+ 35, 35, 37, 39, 39, 39, 35, 35, 36, 36, 36, 37, 37, 37, 37, 39, 41,
+ 41, 41, 43, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45,
+ 47, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47,
+ 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47,
+ 39, 39, 40, 40, 40, 41, 41, 41, 41, 42, 44, 44, 44, 45, 47, 47, 47,
+ 47, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47,
+ 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46,
+ 47, 47, 47, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45,
+ 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 45, 45, 45, 45, 45, 44, 44,
+ 44, 44, 45, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 50, 49, 48,
+ 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50,
+ 50, 50, 51, 53, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47,
+ 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, 49, 48, 47, 47, 47, 47,
+ 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53,
+ 53, 53, 49, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47,
+ 47, 47, 48, 50, 50, 50, 51, 53, 53, 53, 53, 48, 48, 47, 47, 47, 46,
+ 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53,
+ 53, 53, 53, 54, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46,
+ 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 48, 48,
+ 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49,
+ 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 49, 48, 47, 47, 47, 46, 45,
+ 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53,
+ 53, 54, 55, 55, 55, 56, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55,
+ 55, 57, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58,
+ 58}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 32, 32, 31, 31,
+ 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34,
+ 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
+ 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 34, 34, 34, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 32, 32, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36,
+ 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35,
+ 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 34, 34, 34, 34, 34, 34,
+ 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39,
+ 39, 39, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35,
+ 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40,
+ 41, 41, 41, 42, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 36,
+ 36, 36, 36, 37, 37, 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 36, 35,
+ 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38,
+ 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 36, 35, 35, 35, 35, 35, 35,
+ 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42,
+ 42, 42, 45, 47, 48, 48, 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34,
+ 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47,
+ 48, 48, 48, 37, 37, 36, 36, 36, 36, 36, 35, 35, 35, 35, 36, 37, 37,
+ 37, 37, 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48, 49, 49, 49,
+ 50},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31,
+ 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 32, 30, 30, 31, 31, 31, 31,
+ 31, 31, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 33, 33, 33, 34, 34, 34, 34,
+ 34, 34, 34, 34, 36, 37, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35,
+ 37, 38, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39,
+ 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39,
+ 34, 35, 36, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, 40, 40, 40, 42,
+ 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42, 42, 42, 44,
+ 46, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43,
+ 45, 46, 47, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43,
+ 43, 43, 45, 46, 47, 47, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40,
+ 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40,
+ 41, 41, 41, 41, 42, 43, 44, 44, 44, 45, 47, 47, 47, 47, 47, 40, 41,
+ 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47,
+ 47, 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44,
+ 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 41, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48,
+ 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45,
+ 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 45, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49,
+ 49, 49, 49, 50, 47, 47, 46, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46,
+ 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 49, 48,
+ 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48,
+ 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 49, 48, 48, 47, 47, 47, 47,
+ 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50,
+ 50, 50, 51, 52, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46,
+ 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52,
+ 53, 53, 53, 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47,
+ 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53,
+ 53}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+ 31, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+ 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35,
+ 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35,
+ 35, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 32, 32,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 34, 35, 35, 35, 36, 36, 36, 36, 36, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36,
+ 36, 36, 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33,
+ 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37,
+ 38, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33,
+ 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39,
+ 39},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 30, 30,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 30, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37,
+ 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38,
+ 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37,
+ 38, 39, 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35,
+ 36, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35,
+ 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 34, 35, 35, 35, 35, 35, 35,
+ 36, 36, 36, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 35, 36,
+ 36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 41, 41, 41,
+ 41, 41, 42, 44, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39,
+ 39, 40, 41, 42, 43, 43, 43, 43, 44, 45, 46, 36, 37, 37, 38, 38, 38,
+ 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46,
+ 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40,
+ 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 36, 37, 37, 38, 38, 38,
+ 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46,
+ 47, 47, 47, 47, 37, 37, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40,
+ 40, 41, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 38, 39,
+ 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44,
+ 44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 40, 40, 40, 41, 41, 41, 41,
+ 41, 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47,
+ 47, 47, 47, 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47,
+ 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+ 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48,
+ 48}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 31, 31, 31, 31, 31, 32, 32, 31, 31, 31, 31, 31, 32,
+ 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 34, 34, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 33, 33, 33, 33, 33, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36,
+ 37, 37, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 33, 33, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37,
+ 37, 38, 39, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 33, 33,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 33, 33, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37,
+ 38, 39, 39, 39, 39, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+ 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39,
+ 39, 39, 39, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36,
+ 36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40,
+ 40}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30,
+ 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32}}};
diff --git a/src/reconstruction.cc b/src/reconstruction.cc
new file mode 100644
index 0000000..1aa1233
--- /dev/null
+++ b/src/reconstruction.cc
@@ -0,0 +1,190 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/reconstruction.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace {
+
+// Maps TransformType to dsp::Transform1D for the row transforms.
+constexpr dsp::Transform1D kRowTransform[kNumTransformTypes] = {
+ dsp::k1DTransformDct, dsp::k1DTransformAdst,
+ dsp::k1DTransformDct, dsp::k1DTransformAdst,
+ dsp::k1DTransformAdst, dsp::k1DTransformDct,
+ dsp::k1DTransformAdst, dsp::k1DTransformAdst,
+ dsp::k1DTransformAdst, dsp::k1DTransformIdentity,
+ dsp::k1DTransformIdentity, dsp::k1DTransformDct,
+ dsp::k1DTransformIdentity, dsp::k1DTransformAdst,
+ dsp::k1DTransformIdentity, dsp::k1DTransformAdst};
+
+// Maps TransformType to dsp::Transform1D for the column transforms.
+constexpr dsp::Transform1D kColumnTransform[kNumTransformTypes] = {
+ dsp::k1DTransformDct, dsp::k1DTransformDct,
+ dsp::k1DTransformAdst, dsp::k1DTransformAdst,
+ dsp::k1DTransformDct, dsp::k1DTransformAdst,
+ dsp::k1DTransformAdst, dsp::k1DTransformAdst,
+ dsp::k1DTransformAdst, dsp::k1DTransformIdentity,
+ dsp::k1DTransformDct, dsp::k1DTransformIdentity,
+ dsp::k1DTransformAdst, dsp::k1DTransformIdentity,
+ dsp::k1DTransformAdst, dsp::k1DTransformIdentity};
+
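+// Maps the log2 of a transform dimension (2 for 4-point, 3 for 8-point, and
+// so on) to the corresponding dsp::TransformSize1D value.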
+dsp::TransformSize1D Get1DTransformSize(int size_log2) {
+ return static_cast<dsp::TransformSize1D>(size_log2 - 2);
+}
+
+// Returns the number of rows to process based on |non_zero_coeff_count|. The
+// transform loops process either 4 rows or a multiple of 8 rows. The
+// TransformClass derived from |tx_type| determines the scan order and hence
+// the thresholds on |non_zero_coeff_count| used below.
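+// For example, an 8x16 block in the 2D transform class with 20 non-zero
+// coefficients only needs its first 8 rows transformed.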
+template <int tx_width>
+int GetNumRows(TransformType tx_type, int tx_height, int non_zero_coeff_count) {
+ const TransformClass tx_class = GetTransformClass(tx_type);
+
+ switch (tx_class) {
+ case kTransformClass2D:
+ if (tx_width == 4) {
+ if (non_zero_coeff_count <= 13) return 4;
+ if (non_zero_coeff_count <= 29) return 8;
+ }
+ if (tx_width == 8) {
+ if (non_zero_coeff_count <= 10) return 4;
+ if ((non_zero_coeff_count <= 14) & (tx_height > 8)) return 4;
+ if (non_zero_coeff_count <= 43) return 8;
+ if ((non_zero_coeff_count <= 107) & (tx_height > 16)) return 16;
+ if ((non_zero_coeff_count <= 171) & (tx_height > 16)) return 24;
+ }
+ if (tx_width == 16) {
+ if (non_zero_coeff_count <= 10) return 4;
+ if ((non_zero_coeff_count <= 14) & (tx_height > 16)) return 4;
+ if (non_zero_coeff_count <= 36) return 8;
+ if ((non_zero_coeff_count <= 44) & (tx_height > 16)) return 8;
+ if ((non_zero_coeff_count <= 151) & (tx_height > 16)) return 16;
+ if ((non_zero_coeff_count <= 279) & (tx_height > 16)) return 24;
+ }
+ if (tx_width == 32) {
+ if (non_zero_coeff_count <= 10) return 4;
+ if (non_zero_coeff_count <= 36) return 8;
+ if ((non_zero_coeff_count <= 136) & (tx_height > 16)) return 16;
+ if ((non_zero_coeff_count <= 300) & (tx_height > 16)) return 24;
+ }
+ break;
+
+ case kTransformClassHorizontal:
+ if (non_zero_coeff_count <= 4) return 4;
+ if (non_zero_coeff_count <= 8) return 8;
+ if ((non_zero_coeff_count <= 16) & (tx_height > 16)) return 16;
+ if ((non_zero_coeff_count <= 24) & (tx_height > 16)) return 24;
+ break;
+
+ default:
+ assert(tx_class == kTransformClassVertical);
+ if (tx_width == 4) {
+ if (non_zero_coeff_count <= 16) return 4;
+ if (non_zero_coeff_count <= 32) return 8;
+ }
+ if (tx_width == 8) {
+ if (non_zero_coeff_count <= 32) return 4;
+ if (non_zero_coeff_count <= 64) return 8;
+ // There's no need to check tx_height since the maximum values for
+ // smaller sizes are: 8x8: 63, 8x16: 127.
+ if (non_zero_coeff_count <= 128) return 16;
+ if (non_zero_coeff_count <= 192) return 24;
+ }
+ if (tx_width == 16) {
+ if (non_zero_coeff_count <= 64) return 4;
+ if (non_zero_coeff_count <= 128) return 8;
+ // There's no need to check tx_height since the maximum values for
+ // smaller sizes are: 16x8: 127, 16x16: 255.
+ if (non_zero_coeff_count <= 256) return 16;
+ if (non_zero_coeff_count <= 384) return 24;
+ }
+ if (tx_width == 32) {
+ if (non_zero_coeff_count <= 128) return 4;
+ if (non_zero_coeff_count <= 256) return 8;
+        // There's no need to check tx_height since the maximum values for
+        // smaller sizes are: 32x8: 255, 32x16: 511.
+        if (non_zero_coeff_count <= 512) return 16;
+        if (non_zero_coeff_count <= 768) return 24;
+ }
+ break;
+ }
+ return (tx_width >= 16) ? std::min(tx_height, 32) : tx_height;
+}
+
+} // namespace
+
+template <typename Residual, typename Pixel>
+void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+ TransformSize tx_size, bool lossless, Residual* const buffer,
+ int start_x, int start_y, Array2DView<Pixel>* frame,
+ int non_zero_coeff_count) {
+ static_assert(sizeof(Residual) == 2 || sizeof(Residual) == 4, "");
+ const int tx_width_log2 = kTransformWidthLog2[tx_size];
+ const int tx_height_log2 = kTransformHeightLog2[tx_size];
+
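+  // If only one coefficient is non-zero, it is the first one in scan order
+  // (the DC position for every scan table), so a single row is enough.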
+ int tx_height = (non_zero_coeff_count == 1) ? 1 : kTransformHeight[tx_size];
+ if (tx_height > 4) {
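+    // Dispatch on the transform width. The last entry covers 64-wide
+    // transforms, which reuse the 32-wide thresholds since AV1 zeroes out all
+    // coefficients beyond the first 32 columns.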
+ static constexpr int (*kGetNumRows[])(TransformType tx_type, int tx_height,
+ int non_zero_coeff_count) = {
+ &GetNumRows<4>, &GetNumRows<8>, &GetNumRows<16>, &GetNumRows<32>,
+ &GetNumRows<32>};
+ tx_height = kGetNumRows[tx_width_log2 - 2](tx_type, tx_height,
+ non_zero_coeff_count);
+ }
+ assert(tx_height <= 32);
+
+ // Row transform.
+ const dsp::TransformSize1D row_transform_size =
+ Get1DTransformSize(tx_width_log2);
+ const dsp::Transform1D row_transform =
+ lossless ? dsp::k1DTransformWht : kRowTransform[tx_type];
+ const dsp::InverseTransformAddFunc row_transform_func =
+ dsp.inverse_transforms[row_transform][row_transform_size][dsp::kRow];
+ assert(row_transform_func != nullptr);
+
+ row_transform_func(tx_type, tx_size, tx_height, buffer, start_x, start_y,
+ frame);
+
+ // Column transform.
+ const dsp::TransformSize1D column_transform_size =
+ Get1DTransformSize(tx_height_log2);
+ const dsp::Transform1D column_transform =
+ lossless ? dsp::k1DTransformWht : kColumnTransform[tx_type];
+ const dsp::InverseTransformAddFunc column_transform_func =
+ dsp.inverse_transforms[column_transform][column_transform_size]
+ [dsp::kColumn];
+ assert(column_transform_func != nullptr);
+
+ column_transform_func(tx_type, tx_size, tx_height, buffer, start_x, start_y,
+ frame);
+}
+
+template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+ TransformSize tx_size, bool lossless, int16_t* buffer,
+ int start_x, int start_y, Array2DView<uint8_t>* frame,
+ int non_zero_coeff_count);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+ TransformSize tx_size, bool lossless, int32_t* buffer,
+ int start_x, int start_y,
+ Array2DView<uint16_t>* frame,
+ int non_zero_coeff_count);
+#endif
+
+} // namespace libgav1
diff --git a/src/reconstruction.h b/src/reconstruction.h
new file mode 100644
index 0000000..6d5b115
--- /dev/null
+++ b/src/reconstruction.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_RECONSTRUCTION_H_
+#define LIBGAV1_SRC_RECONSTRUCTION_H_
+
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// Steps 2 and 3 of section 7.12.3 (contains the implementation of section
+// 7.13.3).
+// Applies the inverse transforms and adds the residual to the frame for the
+// transform block of size |tx_size| starting at position
+// (|start_x|, |start_y|).
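+//
+// A hedged call sketch for the 8-bit path (|residual| and |frame| are
+// hypothetical int16_t* and Array2DView<uint8_t>* variables):
+//   Reconstruct(dsp, kTransformTypeDctDct, kTransformSize8x8,
+//               /*lossless=*/false, residual, /*start_x=*/0, /*start_y=*/0,
+//               frame, /*non_zero_coeff_count=*/5);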
+template <typename Residual, typename Pixel>
+void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+ TransformSize tx_size, bool lossless, Residual* buffer,
+ int start_x, int start_y, Array2DView<Pixel>* frame,
+ int non_zero_coeff_count);
+
+extern template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+ TransformSize tx_size, bool lossless,
+ int16_t* buffer, int start_x, int start_y,
+ Array2DView<uint8_t>* frame,
+ int non_zero_coeff_count);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+extern template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+ TransformSize tx_size, bool lossless,
+ int32_t* buffer, int start_x, int start_y,
+ Array2DView<uint16_t>* frame,
+ int non_zero_coeff_count);
+#endif
+
+} // namespace libgav1
+#endif // LIBGAV1_SRC_RECONSTRUCTION_H_
diff --git a/src/residual_buffer_pool.cc b/src/residual_buffer_pool.cc
new file mode 100644
index 0000000..e166392
--- /dev/null
+++ b/src/residual_buffer_pool.cc
@@ -0,0 +1,142 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/residual_buffer_pool.h"
+
+#include <mutex> // NOLINT (unapproved c++11 header)
+#include <utility>
+
+namespace libgav1 {
+namespace {
+
+// The maximum queue size is derived using the following formula:
+// ((sb_size * sb_size) / 16) + (2 * (((sb_size / x) * (sb_size / y)) / 16)).
+// Where:
+// sb_size is the superblock size (64 or 128).
+// 16 is 4*4 which is kMinTransformWidth * kMinTransformHeight.
+// x is subsampling_x + 1.
+// y is subsampling_y + 1.
+// The first component is for the Y plane and the second component is for the U
+// and V planes.
+// For example, for 128x128 superblocks with 422 subsampling the size is:
+// ((128 * 128) / 16) + (2 * (((128 / 2) * (128 / 1)) / 16)) = 2048.
+//
+// First dimension: use_128x128_superblock.
+// Second dimension: subsampling_x.
+// Third dimension: subsampling_y.
+constexpr int kMaxQueueSize[2][2][2] = {
+ // 64x64 superblocks.
+ {
+ {768, 512},
+ {512, 384},
+ },
+ // 128x128 superblocks.
+ {
+ {3072, 2048},
+ {2048, 1536},
+ },
+};
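+
+// A hedged, illustrative check of the formula above (not part of the original
+// source): recompute two entries and compare them against the table. For the
+// valid subsampling values 0 and 1, sb_size >> subsampling equals
+// sb_size / (subsampling + 1).
+constexpr int MaxQueueSize(int sb_size, int subsampling_x, int subsampling_y) {
+  return (sb_size * sb_size) / 16 +
+         2 * (((sb_size >> subsampling_x) * (sb_size >> subsampling_y)) / 16);
+}
+static_assert(MaxQueueSize(64, 1, 1) == 384, "64x64 superblocks, 4:2:0");
+static_assert(MaxQueueSize(128, 1, 0) == 2048, "128x128 superblocks, 4:2:2");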
+
+} // namespace
+
+ResidualBufferStack::~ResidualBufferStack() {
+ while (top_ != nullptr) {
+ ResidualBuffer* top = top_;
+ top_ = top_->next_;
+ delete top;
+ }
+}
+
+void ResidualBufferStack::Push(std::unique_ptr<ResidualBuffer> buffer) {
+ buffer->next_ = top_;
+ top_ = buffer.release();
+ ++num_buffers_;
+}
+
+std::unique_ptr<ResidualBuffer> ResidualBufferStack::Pop() {
+ std::unique_ptr<ResidualBuffer> top;
+ if (top_ != nullptr) {
+ top.reset(top_);
+ top_ = top_->next_;
+ top->next_ = nullptr;
+ --num_buffers_;
+ }
+ return top;
+}
+
+void ResidualBufferStack::Swap(ResidualBufferStack* other) {
+ std::swap(top_, other->top_);
+ std::swap(num_buffers_, other->num_buffers_);
+}
+
+ResidualBufferPool::ResidualBufferPool(bool use_128x128_superblock,
+ int subsampling_x, int subsampling_y,
+ size_t residual_size)
+ : buffer_size_(GetResidualBufferSize(
+ use_128x128_superblock ? 128 : 64, use_128x128_superblock ? 128 : 64,
+ subsampling_x, subsampling_y, residual_size)),
+ queue_size_(kMaxQueueSize[static_cast<int>(use_128x128_superblock)]
+ [subsampling_x][subsampling_y]) {}
+
+void ResidualBufferPool::Reset(bool use_128x128_superblock, int subsampling_x,
+ int subsampling_y, size_t residual_size) {
+ const size_t buffer_size = GetResidualBufferSize(
+ use_128x128_superblock ? 128 : 64, use_128x128_superblock ? 128 : 64,
+ subsampling_x, subsampling_y, residual_size);
+ const int queue_size = kMaxQueueSize[static_cast<int>(use_128x128_superblock)]
+ [subsampling_x][subsampling_y];
+ if (buffer_size == buffer_size_ && queue_size == queue_size_) {
+ // The existing buffers (if any) are still valid, so don't do anything.
+ return;
+ }
+ buffer_size_ = buffer_size;
+ queue_size_ = queue_size;
+ // The existing buffers (if any) are no longer valid since the buffer size or
+ // the queue size has changed. Clear the stack.
+ ResidualBufferStack buffers;
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ // Move the buffers in the stack to the local variable |buffers| and clear
+ // the stack.
+ buffers.Swap(&buffers_);
+ // Release mutex_ before freeing the buffers.
+ }
+ // As the local variable |buffers| goes out of scope, its destructor frees
+ // the buffers that were in the stack.
+}
+
+std::unique_ptr<ResidualBuffer> ResidualBufferPool::Get() {
+ std::unique_ptr<ResidualBuffer> buffer = nullptr;
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ buffer = buffers_.Pop();
+ }
+ if (buffer == nullptr) {
+ buffer = ResidualBuffer::Create(buffer_size_, queue_size_);
+ }
+ return buffer;
+}
+
+void ResidualBufferPool::Release(std::unique_ptr<ResidualBuffer> buffer) {
+ buffer->transform_parameters()->Reset();
+ std::lock_guard<std::mutex> lock(mutex_);
+ buffers_.Push(std::move(buffer));
+}
+
+size_t ResidualBufferPool::Size() const {
+ std::lock_guard<std::mutex> lock(mutex_);
+ return buffers_.Size();
+}
+
+} // namespace libgav1
diff --git a/src/residual_buffer_pool.h b/src/residual_buffer_pool.h
new file mode 100644
index 0000000..f7bc75d
--- /dev/null
+++ b/src/residual_buffer_pool.h
@@ -0,0 +1,203 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_RESIDUAL_BUFFER_POOL_H_
+#define LIBGAV1_SRC_RESIDUAL_BUFFER_POOL_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex> // NOLINT (unapproved c++11 header)
+#include <new>
+
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// A simple fixed-size queue implementation to hold the transform parameters
+// when |Tile::split_parse_and_decode_| is true. No boundary checks are needed
+// since data is always pushed into the queue before it is accessed.
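+//
+// A hedged usage sketch (hypothetical values; the parse step pushes and the
+// decode step pops in the same order):
+//   TransformParameterQueue queue;
+//   if (queue.Init(/*max_size=*/16)) {
+//     queue.Push(/*non_zero_coeff_count=*/7, kTransformTypeDctDct);
+//     assert(queue.NonZeroCoeffCount() == 7);
+//     assert(queue.Type() == kTransformTypeDctDct);
+//     queue.Pop();  // The queue is empty again.
+//   }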
+class TransformParameterQueue {
+ public:
+ TransformParameterQueue() = default;
+
+ // Move only.
+ TransformParameterQueue(TransformParameterQueue&& other) = default;
+ TransformParameterQueue& operator=(TransformParameterQueue&& other) = default;
+
+ LIBGAV1_MUST_USE_RESULT bool Init(int max_size) {
+ max_size_ = max_size;
+    // No initialization is necessary since the data will always be written to
+    // before being read.
+ non_zero_coeff_count_.reset(new (std::nothrow) int16_t[max_size_]);
+ tx_type_.reset(new (std::nothrow) TransformType[max_size_]);
+ return non_zero_coeff_count_ != nullptr && tx_type_ != nullptr;
+ }
+
+ // Adds the |non_zero_coeff_count| and the |tx_type| to the back of the queue.
+ void Push(int non_zero_coeff_count, TransformType tx_type) {
+ assert(back_ < max_size_);
+ non_zero_coeff_count_[back_] = non_zero_coeff_count;
+ tx_type_[back_++] = tx_type;
+ }
+
+ // Returns the non_zero_coeff_count at the front of the queue.
+ int16_t NonZeroCoeffCount() const {
+ assert(front_ != back_);
+ return non_zero_coeff_count_[front_];
+ }
+
+ // Returns the tx_type at the front of the queue.
+ TransformType Type() const {
+ assert(front_ != back_);
+ return tx_type_[front_];
+ }
+
+ // Removes the |non_zero_coeff_count| and the |tx_type| from the front of the
+ // queue.
+ void Pop() {
+ assert(front_ != back_);
+ ++front_;
+ }
+
+ // Clears the queue.
+ void Reset() {
+ front_ = 0;
+ back_ = 0;
+ }
+
+ // Used only in the tests. Returns the number of elements in the queue.
+ int Size() const { return back_ - front_; }
+
+ private:
+ int max_size_ = 0;
+ std::unique_ptr<int16_t[]> non_zero_coeff_count_;
+ std::unique_ptr<TransformType[]> tx_type_;
+ int front_ = 0;
+ int back_ = 0;
+};
+
+// This class is used for parsing and decoding a superblock. Members of this
+// class are populated in the "parse" step and consumed in the "decode" step.
+class ResidualBuffer : public Allocable {
+ public:
+ static std::unique_ptr<ResidualBuffer> Create(size_t buffer_size,
+ int queue_size) {
+ std::unique_ptr<ResidualBuffer> buffer(new (std::nothrow) ResidualBuffer);
+ if (buffer != nullptr) {
+ buffer->buffer_ = MakeAlignedUniquePtr<uint8_t>(32, buffer_size);
+ if (buffer->buffer_ == nullptr ||
+ !buffer->transform_parameters_.Init(queue_size)) {
+ buffer = nullptr;
+ }
+ }
+ return buffer;
+ }
+
+ // Move only.
+ ResidualBuffer(ResidualBuffer&& other) = default;
+ ResidualBuffer& operator=(ResidualBuffer&& other) = default;
+
+ // Buffer used to store the residual values.
+ uint8_t* buffer() { return buffer_.get(); }
+ // Queue used to store the transform parameters.
+ TransformParameterQueue* transform_parameters() {
+ return &transform_parameters_;
+ }
+
+ private:
+ friend class ResidualBufferStack;
+
+ ResidualBuffer() = default;
+
+ AlignedUniquePtr<uint8_t> buffer_;
+ TransformParameterQueue transform_parameters_;
+ // Used by ResidualBufferStack to form a chain of ResidualBuffers.
+ ResidualBuffer* next_ = nullptr;
+};
+
+// A LIFO stack of ResidualBuffers. Owns the buffers in the stack.
+class ResidualBufferStack {
+ public:
+ ResidualBufferStack() = default;
+
+ // Not copyable or movable
+ ResidualBufferStack(const ResidualBufferStack&) = delete;
+ ResidualBufferStack& operator=(const ResidualBufferStack&) = delete;
+
+ ~ResidualBufferStack();
+
+ // Pushes |buffer| to the top of the stack.
+ void Push(std::unique_ptr<ResidualBuffer> buffer);
+
+ // If the stack is non-empty, returns the buffer at the top of the stack and
+ // removes it from the stack. If the stack is empty, returns nullptr.
+ std::unique_ptr<ResidualBuffer> Pop();
+
+ // Swaps the contents of this stack and |other|.
+ void Swap(ResidualBufferStack* other);
+
+ // Returns the number of buffers in the stack.
+ size_t Size() const { return num_buffers_; }
+
+ private:
+ // A singly-linked list of ResidualBuffers, chained together using the next_
+ // field of ResidualBuffer.
+ ResidualBuffer* top_ = nullptr;
+ size_t num_buffers_ = 0;
+};
+
+// Utility class used to manage the residual buffers (and the transform
+// parameters) used for multi-threaded decoding. This class uses a stack to
+// store the buffers for better cache locality, since buffers used more
+// recently are more likely to be in the cache. All functions in this class are
+// thread-safe.
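+//
+// A hedged usage sketch (hypothetical 4:2:0 stream with 64x64 superblocks and
+// int16_t residuals):
+//   ResidualBufferPool pool(/*use_128x128_superblock=*/false,
+//                           /*subsampling_x=*/1, /*subsampling_y=*/1,
+//                           /*residual_size=*/sizeof(int16_t));
+//   std::unique_ptr<ResidualBuffer> buffer = pool.Get();
+//   // Parse residuals into buffer->buffer() and push the transform
+//   // parameters into buffer->transform_parameters(), then decode.
+//   pool.Release(std::move(buffer));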
+class ResidualBufferPool : public Allocable {
+ public:
+ ResidualBufferPool(bool use_128x128_superblock, int subsampling_x,
+ int subsampling_y, size_t residual_size);
+
+ // Recomputes |buffer_size_| and invalidates the existing buffers if
+ // necessary.
+ void Reset(bool use_128x128_superblock, int subsampling_x, int subsampling_y,
+ size_t residual_size);
+  // Gets a residual buffer. The buffer is guaranteed to be large enough to
+  // store the residual values for one superblock whose parameters match those
+  // passed to the constructor or to the last call to Reset(). If there are
+  // free buffers in the stack, one of them is returned; otherwise a new buffer
+  // is allocated.
+ std::unique_ptr<ResidualBuffer> Get();
+ // Returns the |buffer| back to the pool (by appending it to the stack).
+ // Subsequent calls to Get() may re-use this buffer.
+ void Release(std::unique_ptr<ResidualBuffer> buffer);
+
+ // Used only in the tests. Returns the number of buffers in the stack.
+ size_t Size() const;
+
+ private:
+ mutable std::mutex mutex_;
+ ResidualBufferStack buffers_ LIBGAV1_GUARDED_BY(mutex_);
+ size_t buffer_size_;
+ int queue_size_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_RESIDUAL_BUFFER_POOL_H_
diff --git a/src/scan_tables.inc b/src/scan_tables.inc
new file mode 100644
index 0000000..f7c9231
--- /dev/null
+++ b/src/scan_tables.inc
@@ -0,0 +1,440 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file contains all the scan order tables.
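+//
+// Each table maps a position in scan (coding) order to a position in raster
+// order within the transform block. For example, kDefaultScan4x4[2] == 4, so
+// the third coefficient in scan order lands at row 1, column 0 of the 4x4
+// block.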
+
+constexpr uint16_t kDefaultScan4x4[16] = {0, 1, 4, 8, 5, 2, 3, 6,
+ 9, 12, 13, 10, 7, 11, 14, 15};
+
+constexpr uint16_t kColumnScan4x4[16] = {0, 4, 8, 12, 1, 5, 9, 13,
+ 2, 6, 10, 14, 3, 7, 11, 15};
+
+constexpr uint16_t kRowScan4x4[16] = {0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15};
+
+constexpr uint16_t kDefaultScan4x8[32] = {
+ 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
+ 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 27, 30, 31};
+
+constexpr uint16_t kColumnScan4x8[32] = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29,
+ 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31};
+
+constexpr uint16_t kRowScan4x8[32] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+
+constexpr uint16_t kDefaultScan8x4[32] = {
+ 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 4, 26, 19,
+ 12, 5, 27, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31};
+
+constexpr uint16_t kColumnScan8x4[32] = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31};
+
+constexpr uint16_t kRowScan8x4[32] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+
+constexpr uint16_t kDefaultScan8x8[64] = {
+ 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5,
+ 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28,
+ 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+ 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63};
+
+constexpr uint16_t kColumnScan8x8[64] = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57,
+ 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59,
+ 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61,
+ 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63};
+
+constexpr uint16_t kRowScan8x8[64] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+
+constexpr uint16_t kDefaultScan8x16[128] = {
+ 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32,
+ 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14,
+ 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23,
+ 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80,
+ 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89,
+ 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98,
+ 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107,
+ 114, 121, 87, 94, 101, 108, 115, 122, 95, 102, 109, 116, 123, 103, 110,
+ 117, 124, 111, 118, 125, 119, 126, 127};
+
+constexpr uint16_t kColumnScan8x16[128] = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
+ 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
+ 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
+ 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123,
+ 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124,
+ 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125,
+ 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126,
+ 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127};
+
+constexpr uint16_t kRowScan8x16[128] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127};
+
+constexpr uint16_t kDefaultScan16x8[128] = {
+ 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, 34, 19, 4, 80,
+ 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, 82, 67,
+ 52, 37, 22, 7, 113, 98, 83, 68, 53, 38, 23, 8, 114, 99, 84, 69,
+ 54, 39, 24, 9, 115, 100, 85, 70, 55, 40, 25, 10, 116, 101, 86, 71,
+ 56, 41, 26, 11, 117, 102, 87, 72, 57, 42, 27, 12, 118, 103, 88, 73,
+ 58, 43, 28, 13, 119, 104, 89, 74, 59, 44, 29, 14, 120, 105, 90, 75,
+ 60, 45, 30, 15, 121, 106, 91, 76, 61, 46, 31, 122, 107, 92, 77, 62,
+ 47, 123, 108, 93, 78, 63, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127};
+
+constexpr uint16_t kColumnScan16x8[128] = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113,
+ 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115,
+ 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117,
+ 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119,
+ 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121,
+ 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123,
+ 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125,
+ 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127};
+
+constexpr uint16_t kRowScan16x8[128] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127};
+
+constexpr uint16_t kDefaultScan16x16[256] = {
+ 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4,
+ 5, 20, 35, 50, 65, 80, 96, 81, 66, 51, 36, 21, 6, 7, 22,
+ 37, 52, 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, 8,
+ 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100,
+ 85, 70, 55, 40, 25, 10, 11, 26, 41, 56, 71, 86, 101, 116, 131,
+ 146, 161, 176, 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 42, 27,
+ 12, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208,
+ 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, 59, 44, 29, 14,
+ 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225,
+ 240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91, 76, 61, 46,
+ 31, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242,
+ 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93, 78, 63, 79, 94,
+ 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185,
+ 170, 155, 140, 125, 110, 95, 111, 126, 141, 156, 171, 186, 201, 216, 231,
+ 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188, 203,
+ 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235,
+ 250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254,
+ 255};
+
+constexpr uint16_t kColumnScan16x16[256] = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+ 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
+ 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
+ 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
+ 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
+ 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
+ 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
+ 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
+ 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
+ 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
+ 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
+ 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
+ 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
+ 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255};
+
+constexpr uint16_t kRowScan16x16[256] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255};
+
+constexpr uint16_t kDefaultScan16x32[512] = {
+ 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64,
+ 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22,
+ 37, 52, 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128,
+ 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70,
+ 85, 100, 115, 130, 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131,
+ 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177,
+ 192, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208,
+ 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224,
+ 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225,
+ 240, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226,
+ 241, 256, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227,
+ 242, 257, 272, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228,
+ 243, 258, 273, 288, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229,
+ 244, 259, 274, 289, 304, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230,
+ 245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216, 231,
+ 246, 261, 276, 291, 306, 321, 336, 127, 142, 157, 172, 187, 202, 217, 232,
+ 247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233,
+ 248, 263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234,
+ 249, 264, 279, 294, 309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235,
+ 250, 265, 280, 295, 310, 325, 340, 355, 370, 385, 400, 191, 206, 221, 236,
+ 251, 266, 281, 296, 311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237,
+ 252, 267, 282, 297, 312, 327, 342, 357, 372, 387, 402, 417, 432, 223, 238,
+ 253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239,
+ 254, 269, 284, 299, 314, 329, 344, 359, 374, 389, 404, 419, 434, 449, 464,
+ 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465,
+ 480, 271, 286, 301, 316, 331, 346, 361, 376, 391, 406, 421, 436, 451, 466,
+ 481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467,
+ 482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423, 438, 453, 468, 483,
+ 498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335,
+ 350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366, 381, 396,
+ 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472,
+ 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444,
+ 459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476,
+ 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495,
+ 510, 511};
+
+constexpr uint16_t kDefaultScan32x16[512] = {
+ 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4,
+ 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193,
+ 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8,
+ 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 320, 289, 258, 227, 196,
+ 165, 134, 103, 72, 41, 10, 352, 321, 290, 259, 228, 197, 166, 135, 104,
+ 73, 42, 11, 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43,
+ 12, 416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13,
+ 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, 45, 14,
+ 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, 77, 46,
+ 15, 481, 450, 419, 388, 357, 326, 295, 264, 233, 202, 171, 140, 109, 78,
+ 47, 16, 482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110,
+ 79, 48, 17, 483, 452, 421, 390, 359, 328, 297, 266, 235, 204, 173, 142,
+ 111, 80, 49, 18, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174,
+ 143, 112, 81, 50, 19, 485, 454, 423, 392, 361, 330, 299, 268, 237, 206,
+ 175, 144, 113, 82, 51, 20, 486, 455, 424, 393, 362, 331, 300, 269, 238,
+ 207, 176, 145, 114, 83, 52, 21, 487, 456, 425, 394, 363, 332, 301, 270,
+ 239, 208, 177, 146, 115, 84, 53, 22, 488, 457, 426, 395, 364, 333, 302,
+ 271, 240, 209, 178, 147, 116, 85, 54, 23, 489, 458, 427, 396, 365, 334,
+ 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, 490, 459, 428, 397, 366,
+ 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 491, 460, 429, 398,
+ 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, 492, 461, 430,
+ 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 493, 462,
+ 431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, 90, 59, 28, 494,
+ 463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29,
+ 495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, 61,
+ 30, 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93,
+ 62, 31, 497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125,
+ 94, 63, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126,
+ 95, 499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 500,
+ 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 501, 470, 439, 408,
+ 377, 346, 315, 284, 253, 222, 191, 502, 471, 440, 409, 378, 347, 316, 285,
+ 254, 223, 503, 472, 441, 410, 379, 348, 317, 286, 255, 504, 473, 442, 411,
+ 380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413,
+ 382, 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510,
+ 479, 511};
+
+constexpr uint16_t kDefaultScan32x32[1024] = {
+ 0, 1, 32, 64, 33, 2, 3, 34, 65, 96, 128, 97, 66,
+ 35, 4, 5, 36, 67, 98, 129, 160, 192, 161, 130, 99, 68,
+ 37, 6, 7, 38, 69, 100, 131, 162, 193, 224, 256, 225, 194,
+ 163, 132, 101, 70, 39, 8, 9, 40, 71, 102, 133, 164, 195,
+ 226, 257, 288, 320, 289, 258, 227, 196, 165, 134, 103, 72, 41,
+ 10, 11, 42, 73, 104, 135, 166, 197, 228, 259, 290, 321, 352,
+ 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, 12,
+ 13, 44, 75, 106, 137, 168, 199, 230, 261, 292, 323, 354, 385,
+ 416, 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107,
+ 76, 45, 14, 15, 46, 77, 108, 139, 170, 201, 232, 263, 294,
+ 325, 356, 387, 418, 449, 480, 512, 481, 450, 419, 388, 357, 326,
+ 295, 264, 233, 202, 171, 140, 109, 78, 47, 16, 17, 48, 79,
+ 110, 141, 172, 203, 234, 265, 296, 327, 358, 389, 420, 451, 482,
+ 513, 544, 576, 545, 514, 483, 452, 421, 390, 359, 328, 297, 266,
+ 235, 204, 173, 142, 111, 80, 49, 18, 19, 50, 81, 112, 143,
+ 174, 205, 236, 267, 298, 329, 360, 391, 422, 453, 484, 515, 546,
+ 577, 608, 640, 609, 578, 547, 516, 485, 454, 423, 392, 361, 330,
+ 299, 268, 237, 206, 175, 144, 113, 82, 51, 20, 21, 52, 83,
+ 114, 145, 176, 207, 238, 269, 300, 331, 362, 393, 424, 455, 486,
+ 517, 548, 579, 610, 641, 672, 704, 673, 642, 611, 580, 549, 518,
+ 487, 456, 425, 394, 363, 332, 301, 270, 239, 208, 177, 146, 115,
+ 84, 53, 22, 23, 54, 85, 116, 147, 178, 209, 240, 271, 302,
+ 333, 364, 395, 426, 457, 488, 519, 550, 581, 612, 643, 674, 705,
+ 736, 768, 737, 706, 675, 644, 613, 582, 551, 520, 489, 458, 427,
+ 396, 365, 334, 303, 272, 241, 210, 179, 148, 117, 86, 55, 24,
+ 25, 56, 87, 118, 149, 180, 211, 242, 273, 304, 335, 366, 397,
+ 428, 459, 490, 521, 552, 583, 614, 645, 676, 707, 738, 769, 800,
+ 832, 801, 770, 739, 708, 677, 646, 615, 584, 553, 522, 491, 460,
+ 429, 398, 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57,
+ 26, 27, 58, 89, 120, 151, 182, 213, 244, 275, 306, 337, 368,
+ 399, 430, 461, 492, 523, 554, 585, 616, 647, 678, 709, 740, 771,
+ 802, 833, 864, 896, 865, 834, 803, 772, 741, 710, 679, 648, 617,
+ 586, 555, 524, 493, 462, 431, 400, 369, 338, 307, 276, 245, 214,
+ 183, 152, 121, 90, 59, 28, 29, 60, 91, 122, 153, 184, 215,
+ 246, 277, 308, 339, 370, 401, 432, 463, 494, 525, 556, 587, 618,
+ 649, 680, 711, 742, 773, 804, 835, 866, 897, 928, 960, 929, 898,
+ 867, 836, 805, 774, 743, 712, 681, 650, 619, 588, 557, 526, 495,
+ 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92,
+ 61, 30, 31, 62, 93, 124, 155, 186, 217, 248, 279, 310, 341,
+ 372, 403, 434, 465, 496, 527, 558, 589, 620, 651, 682, 713, 744,
+ 775, 806, 837, 868, 899, 930, 961, 992, 993, 962, 931, 900, 869,
+ 838, 807, 776, 745, 714, 683, 652, 621, 590, 559, 528, 497, 466,
+ 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, 94, 63,
+ 95, 126, 157, 188, 219, 250, 281, 312, 343, 374, 405, 436, 467,
+ 498, 529, 560, 591, 622, 653, 684, 715, 746, 777, 808, 839, 870,
+ 901, 932, 963, 994, 995, 964, 933, 902, 871, 840, 809, 778, 747,
+ 716, 685, 654, 623, 592, 561, 530, 499, 468, 437, 406, 375, 344,
+ 313, 282, 251, 220, 189, 158, 127, 159, 190, 221, 252, 283, 314,
+ 345, 376, 407, 438, 469, 500, 531, 562, 593, 624, 655, 686, 717,
+ 748, 779, 810, 841, 872, 903, 934, 965, 996, 997, 966, 935, 904,
+ 873, 842, 811, 780, 749, 718, 687, 656, 625, 594, 563, 532, 501,
+ 470, 439, 408, 377, 346, 315, 284, 253, 222, 191, 223, 254, 285,
+ 316, 347, 378, 409, 440, 471, 502, 533, 564, 595, 626, 657, 688,
+ 719, 750, 781, 812, 843, 874, 905, 936, 967, 998, 999, 968, 937,
+ 906, 875, 844, 813, 782, 751, 720, 689, 658, 627, 596, 565, 534,
+ 503, 472, 441, 410, 379, 348, 317, 286, 255, 287, 318, 349, 380,
+ 411, 442, 473, 504, 535, 566, 597, 628, 659, 690, 721, 752, 783,
+ 814, 845, 876, 907, 938, 969, 1000, 1001, 970, 939, 908, 877, 846,
+ 815, 784, 753, 722, 691, 660, 629, 598, 567, 536, 505, 474, 443,
+ 412, 381, 350, 319, 351, 382, 413, 444, 475, 506, 537, 568, 599,
+ 630, 661, 692, 723, 754, 785, 816, 847, 878, 909, 940, 971, 1002,
+ 1003, 972, 941, 910, 879, 848, 817, 786, 755, 724, 693, 662, 631,
+ 600, 569, 538, 507, 476, 445, 414, 383, 415, 446, 477, 508, 539,
+ 570, 601, 632, 663, 694, 725, 756, 787, 818, 849, 880, 911, 942,
+ 973, 1004, 1005, 974, 943, 912, 881, 850, 819, 788, 757, 726, 695,
+ 664, 633, 602, 571, 540, 509, 478, 447, 479, 510, 541, 572, 603,
+ 634, 665, 696, 727, 758, 789, 820, 851, 882, 913, 944, 975, 1006,
+ 1007, 976, 945, 914, 883, 852, 821, 790, 759, 728, 697, 666, 635,
+ 604, 573, 542, 511, 543, 574, 605, 636, 667, 698, 729, 760, 791,
+ 822, 853, 884, 915, 946, 977, 1008, 1009, 978, 947, 916, 885, 854,
+ 823, 792, 761, 730, 699, 668, 637, 606, 575, 607, 638, 669, 700,
+ 731, 762, 793, 824, 855, 886, 917, 948, 979, 1010, 1011, 980, 949,
+ 918, 887, 856, 825, 794, 763, 732, 701, 670, 639, 671, 702, 733,
+ 764, 795, 826, 857, 888, 919, 950, 981, 1012, 1013, 982, 951, 920,
+ 889, 858, 827, 796, 765, 734, 703, 735, 766, 797, 828, 859, 890,
+ 921, 952, 983, 1014, 1015, 984, 953, 922, 891, 860, 829, 798, 767,
+ 799, 830, 861, 892, 923, 954, 985, 1016, 1017, 986, 955, 924, 893,
+ 862, 831, 863, 894, 925, 956, 987, 1018, 1019, 988, 957, 926, 895,
+ 927, 958, 989, 1020, 1021, 990, 959, 991, 1022, 1023};
+
+constexpr uint16_t kDefaultScan4x16[64] = {
+ 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
+ 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30,
+ 33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46,
+ 49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63};
+
+constexpr uint16_t kColumnScan4x16[64] = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
+ 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
+ 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
+ 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63};
+
+constexpr uint16_t kRowScan4x16[64] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+
+constexpr uint16_t kDefaultScan16x4[64] = {
+ 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 4, 50, 35,
+ 20, 5, 51, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 8, 54, 39,
+ 24, 9, 55, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 12, 58, 43,
+ 28, 13, 59, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63};
+
+constexpr uint16_t kColumnScan16x4[64] = {
+ 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51,
+ 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55,
+ 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59,
+ 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63};
+
+constexpr uint16_t kRowScan16x4[64] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+
+constexpr uint16_t kDefaultScan8x32[256] = {
+ 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32,
+ 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14,
+ 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23,
+ 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80,
+ 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89,
+ 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98,
+ 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107,
+ 114, 121, 128, 87, 94, 101, 108, 115, 122, 129, 136, 95, 102, 109, 116,
+ 123, 130, 137, 144, 103, 110, 117, 124, 131, 138, 145, 152, 111, 118, 125,
+ 132, 139, 146, 153, 160, 119, 126, 133, 140, 147, 154, 161, 168, 127, 134,
+ 141, 148, 155, 162, 169, 176, 135, 142, 149, 156, 163, 170, 177, 184, 143,
+ 150, 157, 164, 171, 178, 185, 192, 151, 158, 165, 172, 179, 186, 193, 200,
+ 159, 166, 173, 180, 187, 194, 201, 208, 167, 174, 181, 188, 195, 202, 209,
+ 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218,
+ 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220, 227,
+ 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243,
+ 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254,
+ 255};
+
+constexpr uint16_t kDefaultScan32x8[256] = {
+ 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4,
+ 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193,
+ 162, 131, 100, 69, 38, 7, 225, 194, 163, 132, 101, 70, 39, 8, 226,
+ 195, 164, 133, 102, 71, 40, 9, 227, 196, 165, 134, 103, 72, 41, 10,
+ 228, 197, 166, 135, 104, 73, 42, 11, 229, 198, 167, 136, 105, 74, 43,
+ 12, 230, 199, 168, 137, 106, 75, 44, 13, 231, 200, 169, 138, 107, 76,
+ 45, 14, 232, 201, 170, 139, 108, 77, 46, 15, 233, 202, 171, 140, 109,
+ 78, 47, 16, 234, 203, 172, 141, 110, 79, 48, 17, 235, 204, 173, 142,
+ 111, 80, 49, 18, 236, 205, 174, 143, 112, 81, 50, 19, 237, 206, 175,
+ 144, 113, 82, 51, 20, 238, 207, 176, 145, 114, 83, 52, 21, 239, 208,
+ 177, 146, 115, 84, 53, 22, 240, 209, 178, 147, 116, 85, 54, 23, 241,
+ 210, 179, 148, 117, 86, 55, 24, 242, 211, 180, 149, 118, 87, 56, 25,
+ 243, 212, 181, 150, 119, 88, 57, 26, 244, 213, 182, 151, 120, 89, 58,
+ 27, 245, 214, 183, 152, 121, 90, 59, 28, 246, 215, 184, 153, 122, 91,
+ 60, 29, 247, 216, 185, 154, 123, 92, 61, 30, 248, 217, 186, 155, 124,
+ 93, 62, 31, 249, 218, 187, 156, 125, 94, 63, 250, 219, 188, 157, 126,
+ 95, 251, 220, 189, 158, 127, 252, 221, 190, 159, 253, 222, 191, 254, 223,
+ 255};
+
+// 5.11.41 (implemented as a simple lookup by transform class and transform
+// size).
+const uint16_t* kScan[3][kNumTransformSizes] = {
+ // kTransformClass2D
+ {kDefaultScan4x4, kDefaultScan4x8, kDefaultScan4x16, kDefaultScan8x4,
+ kDefaultScan8x8, kDefaultScan8x16, kDefaultScan8x32, kDefaultScan16x4,
+ kDefaultScan16x8, kDefaultScan16x16, kDefaultScan16x32, kDefaultScan16x32,
+ kDefaultScan32x8, kDefaultScan32x16, kDefaultScan32x32, kDefaultScan32x32,
+ kDefaultScan32x16, kDefaultScan32x32, kDefaultScan32x32},
+ // kTransformClassHorizontal
+ {kColumnScan4x4, kColumnScan4x8, kColumnScan4x16, kColumnScan8x4,
+ kColumnScan8x8, kColumnScan8x16, kColumnScan16x4, kColumnScan16x4,
+ kColumnScan16x8, kColumnScan16x16, kColumnScan16x4, kDefaultScan16x32,
+ kColumnScan16x4, kColumnScan16x4, kColumnScan16x4, kDefaultScan32x32,
+ kDefaultScan32x16, kDefaultScan32x32, kDefaultScan32x32},
+ // kTransformClassVertical
+ {kRowScan4x4, kRowScan4x8, kRowScan4x16, kRowScan8x4, kRowScan8x8,
+ kRowScan8x16, kRowScan16x4, kRowScan16x4, kRowScan16x8, kRowScan16x16,
+ kRowScan16x4, kDefaultScan16x32, kRowScan16x4, kRowScan16x4, kRowScan16x4,
+ kDefaultScan32x32, kDefaultScan32x16, kDefaultScan32x32,
+ kDefaultScan32x32}};
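+
+// Usage sketch, for illustration only (not part of the upstream source): the
+// scan order for a block is obtained with a plain table lookup, e.g.
+//
+//   const uint16_t* const scan = kScan[tx_class][tx_size];
+//   const uint16_t position = scan[i];  // position of the i-th coefficient
+//                                       // in scan order.
+//
+// where |tx_class| selects the first index (kTransformClass2D,
+// kTransformClassHorizontal or kTransformClassVertical) and |tx_size| is one
+// of the kNumTransformSizes transform sizes. Both variable names are
+// assumptions about the caller, not identifiers defined here.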
diff --git a/src/status_code.cc b/src/status_code.cc
new file mode 100644
index 0000000..34def08
--- /dev/null
+++ b/src/status_code.cc
@@ -0,0 +1,57 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/status_code.h"
+
+extern "C" {
+
+const char* Libgav1GetErrorString(Libgav1StatusCode status) {
+ switch (status) {
+ case kLibgav1StatusOk:
+ return "Success.";
+ case kLibgav1StatusUnknownError:
+ return "Unknown error.";
+ case kLibgav1StatusInvalidArgument:
+ return "Invalid function argument.";
+ case kLibgav1StatusOutOfMemory:
+ return "Memory allocation failure.";
+ case kLibgav1StatusResourceExhausted:
+ return "Ran out of a resource (other than memory).";
+ case kLibgav1StatusNotInitialized:
+ return "The object is not initialized.";
+ case kLibgav1StatusAlready:
+ return "An operation that can only be performed once has already been "
+ "performed.";
+ case kLibgav1StatusUnimplemented:
+ return "Not implemented.";
+ case kLibgav1StatusInternalError:
+ return "Internal error in libgav1.";
+ case kLibgav1StatusBitstreamError:
+ return "The bitstream is not encoded correctly or violates a bitstream "
+ "conformance requirement.";
+ case kLibgav1StatusTryAgain:
+ return "The operation is not allowed at the moment. Try again later.";
+ case kLibgav1StatusNothingToDequeue:
+ return "There are no enqueued frames, so there is nothing to dequeue. "
+ "Try enqueuing a frame before trying to dequeue again.";
+ // This switch statement does not have a default case. This way the compiler
+ // will warn if we neglect to update this function after adding a new value
+ // to the Libgav1StatusCode enum type.
+ case kLibgav1StatusReservedForFutureExpansionUseDefaultInSwitchInstead_:
+ break;
+ }
+ return "Unrecognized status code.";
+}
+
+} // extern "C"
diff --git a/src/symbol_decoder_context.cc b/src/symbol_decoder_context.cc
new file mode 100644
index 0000000..26a281e
--- /dev/null
+++ b/src/symbol_decoder_context.cc
@@ -0,0 +1,322 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/symbol_decoder_context.h"
+
+#include <cassert>
+#include <cstring>
+#include <type_traits>
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants into the anonymous namespace.
+#include "src/symbol_decoder_context_cdfs.inc"
+
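+// Maps the frame's base quantizer index to one of the
+// kCoefficientQuantizerContexts (four) buckets used below to select the
+// default coefficient CDF tables.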
+uint8_t GetQuantizerContext(int base_quantizer_index) {
+ if (base_quantizer_index <= 20) return 0;
+ if (base_quantizer_index <= 60) return 1;
+ if (base_quantizer_index <= 120) return 2;
+ return 3;
+}
+
+// Reset*Counters() are helper functions to reset the CDF arrays where the
+// counters are not in the last element of the innermost dimension.
+
+void ResetPartitionCounters(SymbolDecoderContext* const context) {
+ int block_size_log2 = k4x4WidthLog2[kBlock8x8];
+ for (auto& d1 : context->partition_cdf) {
+ const int cdf_size =
+ SymbolDecoderContext::PartitionCdfSize(block_size_log2++);
+ for (auto& d2 : d1) {
+ d2[cdf_size] = 0;
+ }
+ }
+}
+
+void ResetPaletteColorIndexCounters(SymbolDecoderContext* const context) {
+ for (auto& d1 : context->palette_color_index_cdf) {
+ int cdf_size = kMinPaletteSize;
+ for (auto& d2 : d1) {
+ for (auto& d3 : d2) {
+ d3[cdf_size] = 0;
+ }
+ ++cdf_size;
+ }
+ }
+}
+
+void ResetTxTypeCounters(SymbolDecoderContext* const context) {
+ int set_index = kTransformSetIntra1;
+ for (auto& d1 : context->intra_tx_type_cdf) {
+ const int cdf_size = kNumTransformTypesInSet[set_index++];
+ for (auto& d2 : d1) {
+ for (auto& d3 : d2) {
+ d3[cdf_size] = 0;
+ }
+ }
+ }
+ for (auto& d1 : context->inter_tx_type_cdf) {
+ const int cdf_size = kNumTransformTypesInSet[set_index++];
+ for (auto& d2 : d1) {
+ d2[cdf_size] = 0;
+ }
+ }
+}
+
+void ResetTxDepthCounters(SymbolDecoderContext* const context) {
+ int delta = 1;
+ for (auto& d1 : context->tx_depth_cdf) {
+ const int cdf_size = kMaxTxDepthSymbolCount - delta;
+ delta = 0;
+ for (auto& d2 : d1) {
+ d2[cdf_size] = 0;
+ }
+ }
+}
+
+void ResetUVModeCounters(SymbolDecoderContext* const context) {
+ int cdf_size = kIntraPredictionModesUV - 1;
+ for (auto& d1 : context->uv_mode_cdf) {
+ for (auto& d2 : d1) {
+ d2[cdf_size] = 0;
+ }
+ ++cdf_size;
+ }
+}
+
+} // namespace
+
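+// Copies a default CDF table into the corresponding context member after a
+// compile-time check that the two arrays have the same size.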
+#define CDF_COPY(source, destination) \
+ static_assert(sizeof(source) == sizeof(destination), ""); \
+ memcpy(destination, source, sizeof(source))
+
+void SymbolDecoderContext::Initialize(int base_quantizer_index) {
+ CDF_COPY(kDefaultPartitionCdf, partition_cdf);
+ CDF_COPY(kDefaultSkipCdf, skip_cdf);
+ CDF_COPY(kDefaultSkipModeCdf, skip_mode_cdf);
+ CDF_COPY(kDefaultSegmentIdCdf, segment_id_cdf);
+ CDF_COPY(kDefaultUsePredictedSegmentIdCdf, use_predicted_segment_id_cdf);
+ CDF_COPY(kDefaultDeltaQCdf, delta_q_cdf);
+ CDF_COPY(kDefaultDeltaQCdf, delta_lf_cdf);
+ for (auto& delta_lf_multi_cdf_entry : delta_lf_multi_cdf) {
+ CDF_COPY(kDefaultDeltaQCdf, delta_lf_multi_cdf_entry);
+ }
+ CDF_COPY(kDefaultIntraBlockCopyCdf, intra_block_copy_cdf);
+ CDF_COPY(kDefaultIntraFrameYModeCdf, intra_frame_y_mode_cdf);
+ CDF_COPY(kDefaultYModeCdf, y_mode_cdf);
+ CDF_COPY(kDefaultAngleDeltaCdf, angle_delta_cdf);
+ CDF_COPY(kDefaultUVModeCdf, uv_mode_cdf);
+ CDF_COPY(kDefaultCflAlphaSignsCdf, cfl_alpha_signs_cdf);
+ CDF_COPY(kDefaultCflAlphaCdf, cfl_alpha_cdf);
+ CDF_COPY(kDefaultUseFilterIntraCdf, use_filter_intra_cdf);
+ CDF_COPY(kDefaultFilterIntraModeCdf, filter_intra_mode_cdf);
+ CDF_COPY(kDefaultTxDepthCdf, tx_depth_cdf);
+ CDF_COPY(kDefaultTxSplitCdf, tx_split_cdf);
+ CDF_COPY(kDefaultInterTxTypeCdf, inter_tx_type_cdf);
+ CDF_COPY(kDefaultIntraTxTypeCdf, intra_tx_type_cdf);
+ CDF_COPY(kDefaultRestorationTypeCdf, restoration_type_cdf);
+ CDF_COPY(kDefaultUseWienerCdf, use_wiener_cdf);
+ CDF_COPY(kDefaultUseSgrProjCdf, use_sgrproj_cdf);
+ CDF_COPY(kDefaultHasPaletteYCdf, has_palette_y_cdf);
+ CDF_COPY(kDefaultPaletteYSizeCdf, palette_y_size_cdf);
+ CDF_COPY(kDefaultHasPaletteUVCdf, has_palette_uv_cdf);
+ CDF_COPY(kDefaultPaletteUVSizeCdf, palette_uv_size_cdf);
+ CDF_COPY(kDefaultPaletteColorIndexCdf, palette_color_index_cdf);
+ CDF_COPY(kDefaultIsInterCdf, is_inter_cdf);
+ CDF_COPY(kDefaultUseCompoundReferenceCdf, use_compound_reference_cdf);
+ CDF_COPY(kDefaultCompoundReferenceTypeCdf, compound_reference_type_cdf);
+ CDF_COPY(kDefaultCompoundReferenceCdf, compound_reference_cdf);
+ CDF_COPY(kDefaultCompoundBackwardReferenceCdf,
+ compound_backward_reference_cdf);
+ CDF_COPY(kDefaultSingleReferenceCdf, single_reference_cdf);
+ CDF_COPY(kDefaultCompoundPredictionModeCdf, compound_prediction_mode_cdf);
+ CDF_COPY(kDefaultNewMvCdf, new_mv_cdf);
+ CDF_COPY(kDefaultZeroMvCdf, zero_mv_cdf);
+ CDF_COPY(kDefaultReferenceMvCdf, reference_mv_cdf);
+ CDF_COPY(kDefaultRefMvIndexCdf, ref_mv_index_cdf);
+ CDF_COPY(kDefaultIsInterIntraCdf, is_inter_intra_cdf);
+ CDF_COPY(kDefaultInterIntraModeCdf, inter_intra_mode_cdf);
+ CDF_COPY(kDefaultIsWedgeInterIntraCdf, is_wedge_inter_intra_cdf);
+ CDF_COPY(kDefaultWedgeIndexCdf, wedge_index_cdf);
+ CDF_COPY(kDefaultUseObmcCdf, use_obmc_cdf);
+ CDF_COPY(kDefaultMotionModeCdf, motion_mode_cdf);
+ CDF_COPY(kDefaultIsExplicitCompoundTypeCdf, is_explicit_compound_type_cdf);
+ CDF_COPY(kDefaultIsCompoundTypeAverageCdf, is_compound_type_average_cdf);
+ CDF_COPY(kDefaultCompoundTypeCdf, compound_type_cdf);
+ CDF_COPY(kDefaultInterpolationFilterCdf, interpolation_filter_cdf);
+ for (int i = 0; i < kMvContexts; ++i) {
+ CDF_COPY(kDefaultMvJointCdf, mv_joint_cdf[i]);
+ for (int j = 0; j < kNumMvComponents; ++j) {
+ CDF_COPY(kDefaultMvSignCdf, mv_sign_cdf[i][j]);
+ CDF_COPY(kDefaultMvClassCdf, mv_class_cdf[i][j]);
+ CDF_COPY(kDefaultMvClass0BitCdf, mv_class0_bit_cdf[i][j]);
+ CDF_COPY(kDefaultMvClass0FractionCdf, mv_class0_fraction_cdf[i][j]);
+ CDF_COPY(kDefaultMvClass0HighPrecisionCdf,
+ mv_class0_high_precision_cdf[i][j]);
+ CDF_COPY(kDefaultMvBitCdf, mv_bit_cdf[i][j]);
+ CDF_COPY(kDefaultMvFractionCdf, mv_fraction_cdf[i][j]);
+ CDF_COPY(kDefaultMvHighPrecisionCdf, mv_high_precision_cdf[i][j]);
+ }
+ }
+ const int quantizer_context = GetQuantizerContext(base_quantizer_index);
+ CDF_COPY(kDefaultAllZeroCdf[quantizer_context], all_zero_cdf);
+ CDF_COPY(kDefaultEobPt16Cdf[quantizer_context], eob_pt_16_cdf);
+ CDF_COPY(kDefaultEobPt32Cdf[quantizer_context], eob_pt_32_cdf);
+ CDF_COPY(kDefaultEobPt64Cdf[quantizer_context], eob_pt_64_cdf);
+ CDF_COPY(kDefaultEobPt128Cdf[quantizer_context], eob_pt_128_cdf);
+ CDF_COPY(kDefaultEobPt256Cdf[quantizer_context], eob_pt_256_cdf);
+ CDF_COPY(kDefaultEobPt512Cdf[quantizer_context], eob_pt_512_cdf);
+ CDF_COPY(kDefaultEobPt1024Cdf[quantizer_context], eob_pt_1024_cdf);
+ CDF_COPY(kDefaultEobExtraCdf[quantizer_context], eob_extra_cdf);
+ CDF_COPY(kDefaultCoeffBaseEobCdf[quantizer_context], coeff_base_eob_cdf);
+ CDF_COPY(kDefaultCoeffBaseCdf[quantizer_context], coeff_base_cdf);
+ CDF_COPY(kDefaultCoeffBaseRangeCdf[quantizer_context], coeff_base_range_cdf);
+ CDF_COPY(kDefaultDcSignCdf[quantizer_context], dc_sign_cdf);
+}
+
+void SymbolDecoderContext::ResetIntraFrameYModeCdf() {
+ CDF_COPY(kDefaultIntraFrameYModeCdf, intra_frame_y_mode_cdf);
+}
+
+#undef CDF_COPY
+
+// These macros set the last element in the innermost dimension of the array
+// to zero.
+#define RESET_COUNTER_1D(array) \
+ do { \
+ (array)[std::extent<decltype(array), 0>::value - 1] = 0; \
+ } while (false)
+
+#define RESET_COUNTER_2D(array) \
+ do { \
+ for (auto& d1 : (array)) { \
+ d1[std::extent<decltype(array), 1>::value - 1] = 0; \
+ } \
+ } while (false)
+
+#define RESET_COUNTER_3D(array) \
+ do { \
+ for (auto& d1 : (array)) { \
+ for (auto& d2 : d1) { \
+ d2[std::extent<decltype(array), 2>::value - 1] = 0; \
+ } \
+ } \
+ } while (false)
+
+#define RESET_COUNTER_4D(array) \
+ do { \
+ for (auto& d1 : (array)) { \
+ for (auto& d2 : d1) { \
+ for (auto& d3 : d2) { \
+ d3[std::extent<decltype(array), 3>::value - 1] = 0; \
+ } \
+ } \
+ } \
+ } while (false)
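+
+// For example (illustration only), RESET_COUNTER_2D(skip_cdf) zeroes
+// skip_cdf[i][kBooleanFieldCdfSize - 1] for every skip context i.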
+
+void SymbolDecoderContext::ResetCounters() {
+ ResetPartitionCounters(this);
+ RESET_COUNTER_2D(segment_id_cdf);
+ RESET_COUNTER_2D(use_predicted_segment_id_cdf);
+ RESET_COUNTER_2D(skip_cdf);
+ RESET_COUNTER_2D(skip_mode_cdf);
+ RESET_COUNTER_1D(delta_q_cdf);
+ RESET_COUNTER_1D(delta_lf_cdf);
+ RESET_COUNTER_2D(delta_lf_multi_cdf);
+ RESET_COUNTER_1D(intra_block_copy_cdf);
+ RESET_COUNTER_3D(intra_frame_y_mode_cdf);
+ RESET_COUNTER_2D(y_mode_cdf);
+ RESET_COUNTER_2D(angle_delta_cdf);
+ ResetUVModeCounters(this);
+ RESET_COUNTER_1D(cfl_alpha_signs_cdf);
+ RESET_COUNTER_2D(cfl_alpha_cdf);
+ RESET_COUNTER_2D(use_filter_intra_cdf);
+ RESET_COUNTER_1D(filter_intra_mode_cdf);
+ ResetTxDepthCounters(this);
+ RESET_COUNTER_2D(tx_split_cdf);
+ RESET_COUNTER_3D(all_zero_cdf);
+ ResetTxTypeCounters(this);
+ RESET_COUNTER_3D(eob_pt_16_cdf);
+ RESET_COUNTER_3D(eob_pt_32_cdf);
+ RESET_COUNTER_3D(eob_pt_64_cdf);
+ RESET_COUNTER_3D(eob_pt_128_cdf);
+ RESET_COUNTER_3D(eob_pt_256_cdf);
+ RESET_COUNTER_2D(eob_pt_512_cdf);
+ RESET_COUNTER_2D(eob_pt_1024_cdf);
+ RESET_COUNTER_4D(eob_extra_cdf);
+ RESET_COUNTER_4D(coeff_base_eob_cdf);
+ RESET_COUNTER_4D(coeff_base_cdf);
+ RESET_COUNTER_4D(coeff_base_range_cdf);
+ RESET_COUNTER_3D(dc_sign_cdf);
+ RESET_COUNTER_1D(restoration_type_cdf);
+ RESET_COUNTER_1D(use_wiener_cdf);
+ RESET_COUNTER_1D(use_sgrproj_cdf);
+ RESET_COUNTER_3D(has_palette_y_cdf);
+ RESET_COUNTER_2D(palette_y_size_cdf);
+ RESET_COUNTER_2D(has_palette_uv_cdf);
+ RESET_COUNTER_2D(palette_uv_size_cdf);
+ ResetPaletteColorIndexCounters(this);
+ RESET_COUNTER_2D(is_inter_cdf);
+ RESET_COUNTER_2D(use_compound_reference_cdf);
+ RESET_COUNTER_2D(compound_reference_type_cdf);
+ RESET_COUNTER_4D(compound_reference_cdf);
+ RESET_COUNTER_3D(compound_backward_reference_cdf);
+ RESET_COUNTER_3D(single_reference_cdf);
+ RESET_COUNTER_2D(compound_prediction_mode_cdf);
+ RESET_COUNTER_2D(new_mv_cdf);
+ RESET_COUNTER_2D(zero_mv_cdf);
+ RESET_COUNTER_2D(reference_mv_cdf);
+ RESET_COUNTER_2D(ref_mv_index_cdf);
+ RESET_COUNTER_2D(is_inter_intra_cdf);
+ RESET_COUNTER_2D(inter_intra_mode_cdf);
+ RESET_COUNTER_2D(is_wedge_inter_intra_cdf);
+ RESET_COUNTER_2D(wedge_index_cdf);
+ RESET_COUNTER_2D(use_obmc_cdf);
+ RESET_COUNTER_2D(motion_mode_cdf);
+ RESET_COUNTER_2D(is_explicit_compound_type_cdf);
+ RESET_COUNTER_2D(is_compound_type_average_cdf);
+ RESET_COUNTER_2D(compound_type_cdf);
+ RESET_COUNTER_2D(interpolation_filter_cdf);
+ RESET_COUNTER_2D(mv_joint_cdf);
+ RESET_COUNTER_3D(mv_sign_cdf);
+ RESET_COUNTER_3D(mv_class_cdf);
+ RESET_COUNTER_3D(mv_class0_bit_cdf);
+ RESET_COUNTER_4D(mv_class0_fraction_cdf);
+ RESET_COUNTER_3D(mv_class0_high_precision_cdf);
+ RESET_COUNTER_4D(mv_bit_cdf);
+ RESET_COUNTER_3D(mv_fraction_cdf);
+ RESET_COUNTER_3D(mv_high_precision_cdf);
+}
+
+#undef RESET_COUNTER_1D
+#undef RESET_COUNTER_2D
+#undef RESET_COUNTER_3D
+#undef RESET_COUNTER_4D
+
+int SymbolDecoderContext::PartitionCdfSize(int block_size_log2) {
+ assert(block_size_log2 > 0);
+ assert(block_size_log2 < 6);
+
+ switch (block_size_log2) {
+ case 1:
+ return kPartitionSplit + 1;
+ case 5:
+ return kPartitionVerticalWithRightSplit + 1;
+ default:
+ return kMaxPartitionTypes;
+ }
+}
+
+} // namespace libgav1
diff --git a/src/symbol_decoder_context.h b/src/symbol_decoder_context.h
new file mode 100644
index 0000000..1bea76c
--- /dev/null
+++ b/src/symbol_decoder_context.h
@@ -0,0 +1,301 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_
+#define LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+enum {
+ kPartitionContexts = 4,
+ kSegmentIdContexts = 3,
+ kUsePredictedSegmentIdContexts = 3,
+ kSkipContexts = 3,
+ kSkipModeContexts = 3,
+ kBooleanFieldCdfSize = 3,
+ kDeltaSymbolCount = 4, // Used for both delta_q and delta_lf.
+ kIntraModeContexts = 5,
+ kYModeContexts = 4,
+ kAngleDeltaSymbolCount = 2 * kMaxAngleDelta + 1,
+ kCflAlphaSignsSymbolCount = 8,
+ kCflAlphaContexts = 6,
+ kCflAlphaSymbolCount = 16,
+ kTxDepthContexts = 3,
+ kMaxTxDepthSymbolCount = 3,
+ kTxSplitContexts = 21,
+ kCoefficientQuantizerContexts = 4,
+ kNumSquareTransformSizes = 5,
+ kAllZeroContexts = 13,
+ kNumExtendedTransformSizes = 4,
+ kEobPtContexts = 2,
+ kEobPt16SymbolCount = 5,
+ kEobPt32SymbolCount = 6,
+ kEobPt64SymbolCount = 7,
+ kEobPt128SymbolCount = 8,
+ kEobPt256SymbolCount = 9,
+ kEobPt512SymbolCount = 10,
+ kEobPt1024SymbolCount = 11,
+ kEobExtraContexts = 9,
+ kCoeffBaseEobContexts = 4,
+ kCoeffBaseEobSymbolCount = 3,
+ kCoeffBaseContexts = 42,
+ kCoeffBaseSymbolCount = 4,
+ kCoeffBaseRangeContexts = 21,
+ kCoeffBaseRangeSymbolCount = 4,
+ kDcSignContexts = 3,
+ kPaletteBlockSizeContexts = 7,
+ kPaletteYModeContexts = 3,
+ kPaletteUVModeContexts = 2,
+ kPaletteSizeSymbolCount = 7,
+ kPaletteColorIndexContexts = 5,
+ kPaletteColorIndexSymbolCount = 8,
+ kIsInterContexts = 4,
+ kUseCompoundReferenceContexts = 5,
+ kCompoundReferenceTypeContexts = 5,
+ kReferenceContexts = 3,
+ kCompoundPredictionModeContexts = 8,
+ kNewMvContexts = 6,
+ kZeroMvContexts = 2,
+ kReferenceMvContexts = 6,
+ kRefMvIndexContexts = 3,
+ kInterIntraContexts = 3,
+ kWedgeIndexSymbolCount = 16,
+ kIsExplicitCompoundTypeContexts = 6,
+ kIsCompoundTypeAverageContexts = 6,
+ kInterpolationFilterContexts = 16,
+ kMvContexts = 2,
+ kMvClassSymbolCount = 11,
+ kMvFractionSymbolCount = 4,
+ kMvBitSymbolCount = 10,
+ kNumMvComponents = 2,
+}; // anonymous enum
+
+struct SymbolDecoderContext {
+ SymbolDecoderContext() = default;
+ explicit SymbolDecoderContext(int base_quantizer_index) {
+ Initialize(base_quantizer_index);
+ }
+
+ void Initialize(int base_quantizer_index);
+
+ // Partition related variables and functions.
+ static int PartitionCdfSize(int block_size_log2);
+
+ // Returns the cdf array index for inter_tx_type or intra_tx_type based on
+ // |tx_set|.
+ static int TxTypeIndex(TransformSet tx_set) {
+ assert(tx_set != kTransformSetDctOnly);
+ switch (tx_set) {
+ case kTransformSetInter1:
+ case kTransformSetIntra1:
+ return 0;
+ case kTransformSetInter2:
+ case kTransformSetIntra2:
+ return 1;
+ case kTransformSetInter3:
+ return 2;
+ default:
+ return -1;
+ }
+ }
+
+ // Resets the intra_frame_y_mode_cdf array to the default.
+ void ResetIntraFrameYModeCdf();
+
+  // Resets the symbol counters of all the CDF arrays to zero. The symbol
+  // counter is the last used element in the innermost dimension of each CDF
+  // array.
+ void ResetCounters();
+
+ // Note kMaxAlignment allows for aligned instructions to be used in the
+ // copies done in Initialize().
+ alignas(kMaxAlignment) uint16_t
+ partition_cdf[kBlockWidthCount][kPartitionContexts]
+ [kMaxPartitionTypes + 1];
+ alignas(kMaxAlignment) uint16_t
+ segment_id_cdf[kSegmentIdContexts][kMaxSegments + 1];
+ alignas(kMaxAlignment) uint16_t
+ use_predicted_segment_id_cdf[kUsePredictedSegmentIdContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t skip_cdf[kSkipContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ skip_mode_cdf[kSkipModeContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t delta_q_cdf[kDeltaSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t delta_lf_cdf[kDeltaSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ delta_lf_multi_cdf[kFrameLfCount][kDeltaSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t intra_block_copy_cdf[kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ intra_frame_y_mode_cdf[kIntraModeContexts][kIntraModeContexts]
+ [kIntraPredictionModesY + 1];
+ alignas(kMaxAlignment) uint16_t
+ y_mode_cdf[kYModeContexts][kIntraPredictionModesY + 1];
+ alignas(kMaxAlignment) uint16_t
+ angle_delta_cdf[kDirectionalIntraModes][kAngleDeltaSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ uv_mode_cdf[kBooleanSymbolCount][kIntraPredictionModesY]
+ [kIntraPredictionModesUV + 1];
+ alignas(kMaxAlignment) uint16_t
+ cfl_alpha_signs_cdf[kCflAlphaSignsSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ cfl_alpha_cdf[kCflAlphaContexts][kCflAlphaSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ use_filter_intra_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ filter_intra_mode_cdf[kNumFilterIntraPredictors + 1];
+ alignas(kMaxAlignment) uint16_t
+ tx_depth_cdf[4][kTxDepthContexts][kMaxTxDepthSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ tx_split_cdf[kTxSplitContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ all_zero_cdf[kNumSquareTransformSizes][kAllZeroContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ inter_tx_type_cdf[3][kNumExtendedTransformSizes][kNumTransformTypes + 1];
+ alignas(kMaxAlignment) uint16_t
+ intra_tx_type_cdf[2][kNumExtendedTransformSizes][kIntraPredictionModesY]
+ [kNumTransformTypes + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_16_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt16SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_32_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt32SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_64_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt64SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_128_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt128SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_256_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt256SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_512_cdf[kNumPlaneTypes][kEobPt512SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_1024_cdf[kNumPlaneTypes][kEobPt1024SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_extra_cdf[kNumSquareTransformSizes][kNumPlaneTypes][kEobExtraContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ coeff_base_eob_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
+ [kCoeffBaseEobContexts][kCoeffBaseEobSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ coeff_base_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
+ [kCoeffBaseContexts][kCoeffBaseSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ coeff_base_range_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
+ [kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ dc_sign_cdf[kNumPlaneTypes][kDcSignContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ restoration_type_cdf[kRestorationTypeSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t use_wiener_cdf[kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t use_sgrproj_cdf[kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ has_palette_y_cdf[kPaletteBlockSizeContexts][kPaletteYModeContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ palette_y_size_cdf[kPaletteBlockSizeContexts]
+ [kPaletteSizeSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ has_palette_uv_cdf[kPaletteUVModeContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ palette_uv_size_cdf[kPaletteBlockSizeContexts]
+ [kPaletteSizeSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ palette_color_index_cdf[kNumPlaneTypes][kPaletteSizeSymbolCount]
+ [kPaletteColorIndexContexts]
+ [kPaletteColorIndexSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ is_inter_cdf[kIsInterContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ use_compound_reference_cdf[kUseCompoundReferenceContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ compound_reference_type_cdf[kCompoundReferenceTypeContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ compound_reference_cdf[kNumCompoundReferenceTypes][kReferenceContexts][3]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ compound_backward_reference_cdf[kReferenceContexts][2]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ single_reference_cdf[kReferenceContexts][6][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ compound_prediction_mode_cdf[kCompoundPredictionModeContexts]
+ [kNumCompoundInterPredictionModes + 1];
+ alignas(kMaxAlignment) uint16_t
+ new_mv_cdf[kNewMvContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ zero_mv_cdf[kZeroMvContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ reference_mv_cdf[kReferenceMvContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ ref_mv_index_cdf[kRefMvIndexContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ is_inter_intra_cdf[kInterIntraContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ inter_intra_mode_cdf[kInterIntraContexts][kNumInterIntraModes + 1];
+ alignas(kMaxAlignment) uint16_t
+ is_wedge_inter_intra_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ wedge_index_cdf[kMaxBlockSizes][kWedgeIndexSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ use_obmc_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ motion_mode_cdf[kMaxBlockSizes][kNumMotionModes + 1];
+ alignas(kMaxAlignment) uint16_t
+ is_explicit_compound_type_cdf[kIsExplicitCompoundTypeContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ is_compound_type_average_cdf[kIsCompoundTypeAverageContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ compound_type_cdf[kMaxBlockSizes]
+ [kNumExplicitCompoundPredictionTypes + 1];
+ alignas(kMaxAlignment) uint16_t
+ interpolation_filter_cdf[kInterpolationFilterContexts]
+ [kNumExplicitInterpolationFilters + 1];
+ alignas(kMaxAlignment) uint16_t
+ mv_joint_cdf[kMvContexts][kNumMvJointTypes + 1];
+ alignas(kMaxAlignment) uint16_t
+ mv_sign_cdf[kMvContexts][kNumMvComponents][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ mv_class_cdf[kMvContexts][kNumMvComponents][kMvClassSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ mv_class0_bit_cdf[kMvContexts][kNumMvComponents][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ mv_class0_fraction_cdf[kMvContexts][kNumMvComponents][kBooleanSymbolCount]
+ [kMvFractionSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ mv_class0_high_precision_cdf[kMvContexts][kNumMvComponents]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ mv_bit_cdf[kMvContexts][kNumMvComponents][kMvBitSymbolCount]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t mv_fraction_cdf[kMvContexts][kNumMvComponents]
+ [kMvFractionSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ mv_high_precision_cdf[kMvContexts][kNumMvComponents]
+ [kBooleanFieldCdfSize];
+};
+
+} // namespace libgav1
+#endif // LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_
diff --git a/src/symbol_decoder_context_cdfs.inc b/src/symbol_decoder_context_cdfs.inc
new file mode 100644
index 0000000..509286f
--- /dev/null
+++ b/src/symbol_decoder_context_cdfs.inc
@@ -0,0 +1,2509 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file is just a convenience to separate out all the CDF constant
+// definitions from the symbol decoder context functions.
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultPartitionCdf
+ [kBlockWidthCount][kPartitionContexts][kMaxPartitionTypes + 1] = {
+ // width 8
+ {{13636, 7258, 2376, 0, 0},
+ {18840, 12913, 4228, 0, 0},
+ {20246, 9089, 4139, 0, 0},
+ {22872, 13985, 6915, 0, 0}},
+ // width 16
+ {{17171, 11839, 8197, 6062, 5104, 3947, 3167, 2197, 866, 0, 0},
+ {24843, 21725, 15983, 10298, 8797, 7725, 6117, 4067, 2934, 0, 0},
+ {27354, 19499, 17657, 12280, 10408, 8268, 7231, 6432, 651, 0, 0},
+ {30106, 26406, 24154, 11908, 9715, 7990, 6332, 4939, 1597, 0, 0}},
+ // width 32
+ {{14306, 11848, 9644, 5121, 4541, 3719, 3249, 2590, 1224, 0, 0},
+ {25079, 23708, 20712, 7776, 7108, 6586, 5817, 4727, 3716, 0, 0},
+ {26753, 23759, 22706, 8224, 7359, 6223, 5697, 5242, 721, 0, 0},
+ {31374, 30560, 29972, 4154, 3707, 3302, 2928, 2583, 869, 0, 0}},
+ // width 64
+ {{12631, 11221, 9690, 3202, 2931, 2507, 2244, 1876, 1044, 0, 0},
+ {26036, 25278, 23271, 4824, 4518, 4253, 3799, 3138, 2664, 0, 0},
+ {26823, 25105, 24420, 4085, 3651, 3019, 2704, 2470, 530, 0, 0},
+ {31898, 31556, 31281, 1570, 1374, 1194, 1025, 887, 436, 0, 0}},
+ // width 128
+ {{4869, 4549, 4239, 284, 229, 149, 129, 0, 0},
+ {26161, 25778, 24500, 708, 549, 430, 397, 0, 0},
+ {27339, 26092, 25646, 741, 541, 237, 186, 0, 0},
+ {32057, 31802, 31596, 320, 230, 151, 104, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultSegmentIdCdf[kSegmentIdContexts][kMaxSegments + 1] = {
+ {27146, 24875, 16675, 14535, 4959, 4395, 235, 0, 0},
+ {18494, 14538, 10211, 7833, 2788, 1917, 424, 0, 0},
+ {5241, 4281, 4045, 3878, 371, 121, 89, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUsePredictedSegmentIdCdf[kUsePredictedSegmentIdContexts]
+ [kBooleanFieldCdfSize] = {{16384, 0, 0},
+ {16384, 0, 0},
+ {16384, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultSkipCdf[kSkipContexts][kBooleanFieldCdfSize] = {
+ {1097, 0, 0}, {16253, 0, 0}, {28192, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultSkipModeCdf[kSkipModeContexts][kBooleanFieldCdfSize] = {
+ {147, 0, 0}, {12060, 0, 0}, {24641, 0, 0}};
+
+// This constant is also used for DeltaLf and DeltaLfMulti.
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultDeltaQCdf[kDeltaSymbolCount + 1] = {4608, 648, 91, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIntraBlockCopyCdf[kBooleanFieldCdfSize] = {2237, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIntraFrameYModeCdf[kIntraModeContexts][kIntraModeContexts]
+ [kIntraPredictionModesY + 1] = {
+ {{17180, 15741, 13430, 12550, 12086, 11658,
+ 10943, 9524, 8579, 4603, 3675, 2302, 0, 0},
+ {20752, 14702, 13252, 12465, 12049, 11324,
+ 10880, 9736, 8334, 4110, 2596, 1359, 0, 0},
+ {22716, 21997, 10472, 9980, 9713, 9529, 8635,
+ 7148, 6608, 3432, 2839, 1201, 0, 0},
+ {18677, 17362, 16326, 13960, 13632, 13222,
+ 12770, 10672, 8022, 3183, 1810, 306, 0, 0},
+ {20646, 19503, 17165, 16267, 14159, 12735,
+ 10377, 7185, 6331, 2507, 1695, 293, 0, 0}},
+ {{22745, 13183, 11920, 11328, 10936, 10008,
+ 9679, 8745, 7387, 3754, 2286, 1332, 0, 0},
+ {26785, 8669, 8208, 7882, 7702, 6973, 6855,
+ 6345, 5158, 2863, 1492, 974, 0, 0},
+ {25324, 19987, 12591, 12040, 11691, 11161,
+ 10598, 9363, 8299, 4853, 3678, 2276, 0, 0},
+ {24231, 18079, 17336, 15681, 15360, 14596,
+ 14360, 12943, 8119, 3615, 1672, 558, 0, 0},
+ {25225, 18537, 17272, 16573, 14863, 12051,
+ 10784, 8252, 6767, 3093, 1787, 774, 0, 0}},
+ {{20155, 19177, 11385, 10764, 10456, 10191,
+ 9367, 7713, 7039, 3230, 2463, 691, 0, 0},
+ {23081, 19298, 14262, 13538, 13164, 12621,
+ 12073, 10706, 9549, 5025, 3557, 1861, 0, 0},
+ {26585, 26263, 6744, 6516, 6402, 6334, 5686,
+ 4414, 4213, 2301, 1974, 682, 0, 0},
+ {22050, 21034, 17814, 15544, 15203, 14844,
+ 14207, 11245, 8890, 3793, 2481, 516, 0, 0},
+ {23574, 22910, 16267, 15505, 14344, 13597,
+ 11205, 6807, 6207, 2696, 2031, 305, 0, 0}},
+ {{20166, 18369, 17280, 14387, 13990, 13453,
+ 13044, 11349, 7708, 3072, 1851, 359, 0, 0},
+ {24565, 18947, 18244, 15663, 15329, 14637,
+ 14364, 13300, 7543, 3283, 1610, 426, 0, 0},
+ {24317, 23037, 17764, 15125, 14756, 14343,
+ 13698, 11230, 8163, 3650, 2690, 750, 0, 0},
+ {25054, 23720, 23252, 16101, 15951, 15774,
+ 15615, 14001, 6025, 2379, 1232, 240, 0, 0},
+ {23925, 22488, 21272, 17451, 16116, 14825,
+ 13660, 10050, 6999, 2815, 1785, 283, 0, 0}},
+ {{20190, 19097, 16789, 15934, 13693, 11855,
+ 9779, 7319, 6549, 2554, 1618, 291, 0, 0},
+ {23205, 19142, 17688, 16876, 15012, 11905,
+ 10561, 8532, 7388, 3115, 1625, 491, 0, 0},
+ {24412, 23867, 15152, 14512, 13418, 12662,
+ 10170, 6821, 6302, 2868, 2245, 507, 0, 0},
+ {21933, 20953, 19644, 16726, 15750, 14729,
+ 13821, 10015, 8153, 3279, 1885, 286, 0, 0},
+ {25150, 24480, 22909, 22259, 17382, 14111,
+ 9865, 3992, 3588, 1413, 966, 175, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultYModeCdf[kYModeContexts][kIntraPredictionModesY + 1] = {
+ {9967, 9279, 8475, 8012, 7167, 6645, 6162, 5350, 4823, 3540, 3083, 2419,
+ 0, 0},
+ {14095, 12923, 10137, 9450, 8818, 8119, 7241, 5404, 4616, 3067, 2784,
+ 1916, 0, 0},
+ {12998, 11789, 9372, 8829, 8527, 8114, 7632, 5695, 4938, 3408, 3038,
+ 2109, 0, 0},
+ {12613, 11467, 9930, 9590, 9507, 9235, 9065, 7964, 7416, 6193, 5752,
+ 4719, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultAngleDeltaCdf[kDirectionalIntraModes][kAngleDeltaSymbolCount + 1] =
+ {{30588, 27736, 25201, 9992, 5779, 2551, 0, 0},
+ {30467, 27160, 23967, 9281, 5794, 2438, 0, 0},
+ {28988, 21750, 19069, 13414, 9685, 1482, 0, 0},
+ {28187, 21542, 17621, 15630, 10934, 4371, 0, 0},
+ {31031, 21841, 18259, 13180, 10023, 3945, 0, 0},
+ {30104, 22592, 20283, 15118, 11168, 2273, 0, 0},
+ {30528, 21672, 17315, 12427, 10207, 3851, 0, 0},
+ {29163, 22340, 20309, 15092, 11524, 2113, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUVModeCdf[kBooleanSymbolCount][kIntraPredictionModesY]
+ [kIntraPredictionModesUV + 1] = {
+ // CFL not allowed.
+ {{10137, 8616, 7390, 7107, 6782, 6248, 5713, 4845,
+ 4524, 2709, 1827, 807, 0, 0},
+ {23255, 5887, 5795, 5722, 5650, 5104, 5029, 4944,
+ 4409, 3263, 2968, 972, 0, 0},
+ {22923, 22853, 4105, 4064, 4011, 3988, 3570, 2946,
+ 2914, 2004, 991, 739, 0, 0},
+ {19129, 18871, 18597, 7437, 7162, 7041, 6815, 5620,
+ 4191, 2156, 1413, 275, 0, 0},
+ {23004, 22933, 22838, 22814, 7382, 5715, 4810, 4620,
+ 4525, 1667, 1024, 405, 0, 0},
+ {20943, 19179, 19091, 19048, 17720, 3555, 3467, 3310,
+ 3057, 1607, 1327, 218, 0, 0},
+ {18593, 18369, 16160, 15947, 15050, 14993, 4217, 2568,
+ 2523, 931, 426, 101, 0, 0},
+ {19883, 19730, 17790, 17178, 17095, 17020, 16592,
+ 3640, 3501, 2125, 807, 307, 0, 0},
+ {20742, 19107, 18894, 17463, 17278, 17042, 16773,
+ 16495, 4325, 2380, 2001, 352, 0, 0},
+ {13716, 12928, 12189, 11852, 11618, 11301, 10883,
+ 10049, 9594, 3907, 2389, 593, 0, 0},
+ {14141, 13119, 11794, 11549, 11276, 10952, 10569,
+ 9649, 9241, 5715, 1371, 620, 0, 0},
+ {15742, 13764, 12771, 12429, 12182, 11665, 11419,
+ 10861, 10286, 6872, 6227, 949, 0, 0},
+ {20644, 19009, 17809, 17776, 17761, 17717, 17690,
+ 17602, 17513, 17015, 16729, 16162, 0, 0}},
+ // CFL allowed.
+ {{22361, 21560, 19868, 19587, 18945, 18593, 17869,
+ 17112, 16782, 12682, 11773, 10313, 8556, 0, 0},
+ {28236, 12988, 12711, 12553, 12340, 11697, 11569,
+ 11317, 10669, 8540, 8075, 5736, 3296, 0, 0},
+ {27495, 27389, 12591, 12498, 12383, 12329, 11819,
+ 11073, 10994, 9630, 8512, 8065, 6089, 0, 0},
+ {26028, 25601, 25106, 18616, 18232, 17983, 17734,
+ 16027, 14397, 11248, 10562, 9379, 8586, 0, 0},
+ {27781, 27400, 26840, 26700, 13654, 12453, 10911,
+ 10515, 10357, 7857, 7388, 6741, 6392, 0, 0},
+ {27398, 25879, 25521, 25375, 23270, 11654, 11366,
+ 11015, 10787, 7988, 7382, 6251, 5592, 0, 0},
+ {27952, 27807, 25564, 25442, 24003, 23838, 12599,
+ 12086, 11965, 9580, 9005, 8313, 7828, 0, 0},
+ {26160, 26028, 24239, 23719, 23511, 23412, 23033,
+ 13941, 13709, 10432, 9564, 8804, 7975, 0, 0},
+ {26770, 25349, 24987, 23835, 23513, 23219, 23015,
+ 22351, 13870, 10274, 9629, 8004, 6779, 0, 0},
+ {22108, 21470, 20218, 19811, 19446, 19144, 18728,
+ 17764, 17234, 12054, 10979, 9325, 7907, 0, 0},
+ {22246, 21238, 20216, 19805, 19390, 18989, 18523,
+ 17533, 16866, 12666, 10072, 8994, 6930, 0, 0},
+ {22669, 22077, 20129, 19719, 19382, 19103, 18643,
+ 17605, 17132, 13092, 12294, 9249, 7560, 0, 0},
+ {29624, 27681, 25386, 25264, 25175, 25078, 24967,
+ 24704, 24536, 23520, 22893, 22247, 3720, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultCflAlphaSignsCdf[kCflAlphaSignsSymbolCount + 1] = {
+ 31350, 30645, 19428, 14363, 5796, 4425, 474, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultCflAlphaCdf[kCflAlphaContexts][kCflAlphaSymbolCount + 1] = {
+ {25131, 12049, 1367, 287, 111, 80, 76, 72, 68, 64, 60, 56, 52, 48, 44,
+ 0, 0},
+ {18403, 9165, 4633, 1600, 601, 373, 281, 195, 148, 121, 100, 96, 92, 88,
+ 84, 0, 0},
+ {21236, 10388, 4323, 1408, 419, 245, 184, 119, 95, 91, 87, 83, 79, 75,
+ 71, 0, 0},
+ {5778, 1366, 486, 197, 76, 72, 68, 64, 60, 56, 52, 48, 44, 40, 36, 0,
+ 0},
+ {15520, 6710, 3864, 2160, 1463, 891, 642, 447, 374, 304, 252, 208, 192,
+ 175, 146, 0, 0},
+ {18030, 11090, 6989, 4867, 3744, 2466, 1788, 925, 624, 355, 248, 174,
+ 146, 112, 108, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUseFilterIntraCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = {
+ {28147, 0, 0}, {26025, 0, 0}, {19998, 0, 0}, {26875, 0, 0},
+ {24902, 0, 0}, {20217, 0, 0}, {12539, 0, 0}, {22400, 0, 0},
+ {23374, 0, 0}, {20360, 0, 0}, {18467, 0, 0}, {16384, 0, 0},
+ {14667, 0, 0}, {20012, 0, 0}, {10425, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultFilterIntraModeCdf[kNumFilterIntraPredictors + 1] = {
+ 23819, 19992, 15557, 3210, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultTxDepthCdf[4][kTxDepthContexts][kMaxTxDepthSymbolCount + 1] = {
+ {{12800, 0, 0}, {12800, 0, 0}, {8448, 0, 0}},
+ {{20496, 2596, 0, 0}, {20496, 2596, 0, 0}, {14091, 1920, 0, 0}},
+ {{19782, 17588, 0, 0}, {19782, 17588, 0, 0}, {8466, 7166, 0, 0}},
+ {{26986, 21293, 0, 0}, {26986, 21293, 0, 0}, {15965, 10009, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultTxSplitCdf[kTxSplitContexts][kBooleanFieldCdfSize] = {
+ {4187, 0, 0}, {8922, 0, 0}, {11921, 0, 0}, {8453, 0, 0},
+ {14572, 0, 0}, {20635, 0, 0}, {13977, 0, 0}, {21881, 0, 0},
+ {21763, 0, 0}, {5589, 0, 0}, {12764, 0, 0}, {21487, 0, 0},
+ {6219, 0, 0}, {13460, 0, 0}, {18544, 0, 0}, {4753, 0, 0},
+ {11222, 0, 0}, {18368, 0, 0}, {4603, 0, 0}, {10367, 0, 0},
+ {16680, 0, 0}};
+
+/* clang-format off */
+alignas(kMaxAlignment) constexpr uint16_t kDefaultAllZeroCdf[kCoefficientQuantizerContexts]
+ [kNumSquareTransformSizes][kAllZeroContexts]
+ [kBooleanFieldCdfSize] = {
+ {
+ {{919, 0, 0}, {26876, 0, 0}, {20656, 0, 0}, {10833, 0, 0}, {12479, 0, 0},
+ {5295, 0, 0}, {281, 0, 0}, {25114, 0, 0}, {13295, 0, 0}, {2784, 0, 0},
+ {22807, 0, 0}, {2526, 0, 0}, {651, 0, 0}},
+ {{1220, 0, 0}, {31219, 0, 0}, {22638, 0, 0}, {16112, 0, 0}, {14177, 0, 0},
+ {6460, 0, 0}, {231, 0, 0}, {27365, 0, 0}, {14672, 0, 0}, {2765, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{2811, 0, 0}, {27377, 0, 0}, {14729, 0, 0}, {9202, 0, 0}, {10337, 0, 0},
+ {6946, 0, 0}, {571, 0, 0}, {28990, 0, 0}, {17432, 0, 0}, {3787, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{14848, 0, 0}, {30950, 0, 0}, {25486, 0, 0}, {7495, 0, 0}, {21845, 0, 0},
+ {1214, 0, 0}, {144, 0, 0}, {31402, 0, 0}, {17140, 0, 0}, {2306, 0, 0},
+ {32622, 0, 0}, {27636, 0, 0}, {1111, 0, 0}},
+ {{26460, 0, 0}, {32651, 0, 0}, {31130, 0, 0}, {30607, 0, 0}, {16384, 0, 0},
+ {21845, 0, 0}, {2521, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}
+ },
+ {
+ {{2397, 0, 0}, {25198, 0, 0}, {19613, 0, 0}, {12017, 0, 0}, {11799, 0, 0},
+ {5701, 0, 0}, {755, 0, 0}, {27273, 0, 0}, {14826, 0, 0}, {4488, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{986, 0, 0}, {30932, 0, 0}, {22079, 0, 0}, {15164, 0, 0}, {11146, 0, 0},
+ {5250, 0, 0}, {369, 0, 0}, {28349, 0, 0}, {16474, 0, 0}, {4423, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{867, 0, 0}, {22457, 0, 0}, {14721, 0, 0}, {7962, 0, 0}, {9480, 0, 0},
+ {4854, 0, 0}, {472, 0, 0}, {28553, 0, 0}, {17012, 0, 0}, {4427, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{6042, 0, 0}, {31723, 0, 0}, {21065, 0, 0}, {12178, 0, 0}, {14214, 0, 0},
+ {6798, 0, 0}, {830, 0, 0}, {27185, 0, 0}, {11455, 0, 0}, {3378, 0, 0},
+ {32127, 0, 0}, {10503, 0, 0}, {1316, 0, 0}},
+ {{6184, 0, 0}, {32580, 0, 0}, {23921, 0, 0}, {8249, 0, 0}, {9830, 0, 0},
+ {2185, 0, 0}, {160, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}
+ },
+ {
+ {{3154, 0, 0}, {23700, 0, 0}, {19844, 0, 0}, {13230, 0, 0}, {15031, 0, 0},
+ {8149, 0, 0}, {2126, 0, 0}, {28649, 0, 0}, {16742, 0, 0}, {7111, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{811, 0, 0}, {29538, 0, 0}, {21615, 0, 0}, {14645, 0, 0}, {12625, 0, 0},
+ {6232, 0, 0}, {782, 0, 0}, {29718, 0, 0}, {18165, 0, 0}, {7613, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{405, 0, 0}, {22076, 0, 0}, {13678, 0, 0}, {8411, 0, 0}, {8326, 0, 0},
+ {4456, 0, 0}, {599, 0, 0}, {29120, 0, 0}, {17078, 0, 0}, {5953, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{2099, 0, 0}, {28936, 0, 0}, {21105, 0, 0}, {13879, 0, 0}, {12986, 0, 0},
+ {9455, 0, 0}, {1438, 0, 0}, {27644, 0, 0}, {14049, 0, 0}, {4300, 0, 0},
+ {29686, 0, 0}, {11786, 0, 0}, {3325, 0, 0}},
+ {{4195, 0, 0}, {29585, 0, 0}, {14966, 0, 0}, {6791, 0, 0}, {6091, 0, 0},
+ {4936, 0, 0}, {381, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}
+ },
+ {
+ {{5881, 0, 0}, {26039, 0, 0}, {22407, 0, 0}, {15326, 0, 0}, {17723, 0, 0},
+ {10290, 0, 0}, {3696, 0, 0}, {30055, 0, 0}, {20907, 0, 0}, {11995, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{865, 0, 0}, {30724, 0, 0}, {25240, 0, 0}, {18150, 0, 0}, {16586, 0, 0},
+ {8600, 0, 0}, {1731, 0, 0}, {29982, 0, 0}, {21574, 0, 0}, {12613, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{258, 0, 0}, {24338, 0, 0}, {15450, 0, 0}, {8614, 0, 0}, {9094, 0, 0},
+ {3979, 0, 0}, {629, 0, 0}, {29328, 0, 0}, {19651, 0, 0}, {10066, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{1097, 0, 0}, {30712, 0, 0}, {21022, 0, 0}, {15916, 0, 0}, {14133, 0, 0},
+ {8053, 0, 0}, {1284, 0, 0}, {28112, 0, 0}, {16694, 0, 0}, {8064, 0, 0},
+ {30962, 0, 0}, {18123, 0, 0}, {7432, 0, 0}},
+ {{1229, 0, 0}, {24335, 0, 0}, {12192, 0, 0}, {4864, 0, 0}, {4916, 0, 0},
+ {2742, 0, 0}, {327, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}
+ }
+};
+/* clang-format on */
+
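+// Default transform type CDFs for inter blocks. The three outer groups
+// correspond to the 16-, 12- and 2-symbol transform sets.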
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultInterTxTypeCdf[3][kNumExtendedTransformSizes][kNumTransformTypes +
+ 1] = {
+ {{28310, 27208, 25073, 23059, 19438, 17979, 15231, 12502, 11264, 9920,
+ 8834, 7294, 5041, 3853, 2137, 0, 0},
+ {31123, 30195, 27990, 27057, 24961, 24146, 22246, 17411, 15094, 12360,
+ 10251, 7758, 5652, 3912, 2019, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0}},
+ {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ // Only 16x16 is used in this case.
+ {31998, 30347, 27543, 19861, 16949, 13841, 11207, 8679, 6173, 4242,
+ 2239, 0},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}},
+ {{16384, 0, 0}, {28601, 0, 0}, {30770, 0, 0}, {32020, 0, 0}}};
+
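+// Default transform type CDFs for intra blocks, indexed by transform set,
+// transform size and intra prediction mode.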
+alignas(kMaxAlignment) constexpr uint16_t kDefaultIntraTxTypeCdf
+ [2][kNumExtendedTransformSizes][kIntraPredictionModesY]
+ [kNumTransformTypes + 1] = {
+ {{{31233, 24733, 23307, 20017, 9301, 4943, 0, 0},
+ {32204, 29433, 23059, 21898, 14625, 4674, 0, 0},
+ {32096, 29521, 29092, 20786, 13353, 9641, 0, 0},
+ {27489, 18883, 17281, 14724, 9241, 2516, 0, 0},
+ {28345, 26694, 24783, 22352, 7075, 3470, 0, 0},
+ {31282, 28527, 23308, 22106, 16312, 5074, 0, 0},
+ {32329, 29930, 29246, 26031, 14710, 9014, 0, 0},
+ {31578, 28535, 27913, 21098, 12487, 8391, 0, 0},
+ {31723, 28456, 24121, 22609, 14124, 3433, 0, 0},
+ {32566, 29034, 28021, 25470, 15641, 8752, 0, 0},
+ {32321, 28456, 25949, 23884, 16758, 8910, 0, 0},
+ {32491, 28399, 27513, 23863, 16303, 10497, 0, 0},
+ {29359, 27332, 22169, 17169, 13081, 8728, 0, 0}},
+ {{30898, 19026, 18238, 16270, 8998, 5070, 0, 0},
+ {32442, 23972, 18136, 17689, 13496, 5282, 0, 0},
+ {32284, 25192, 25056, 18325, 13609, 10177, 0, 0},
+ {31642, 17428, 16873, 15745, 11872, 2489, 0, 0},
+ {32113, 27914, 27519, 26855, 10669, 5630, 0, 0},
+ {31469, 26310, 23883, 23478, 17917, 7271, 0, 0},
+ {32457, 27473, 27216, 25883, 16661, 10096, 0, 0},
+ {31885, 24709, 24498, 21510, 15479, 11219, 0, 0},
+ {32027, 25188, 23450, 22423, 16080, 3722, 0, 0},
+ {32658, 25362, 24853, 23573, 16727, 9439, 0, 0},
+ {32405, 24794, 23411, 22095, 17139, 8294, 0, 0},
+ {32615, 25121, 24656, 22832, 17461, 12772, 0, 0},
+ {29257, 26436, 21603, 17433, 13445, 9174, 0, 0}}},
+ {{{26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0}},
+ {{26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0}},
+ {{31641, 19954, 9996, 5285, 0, 0},
+ {32623, 26007, 20788, 6101, 0, 0},
+ {32406, 26881, 21090, 16043, 0, 0},
+ {32383, 17555, 14181, 2075, 0, 0},
+ {32743, 29854, 9634, 4865, 0, 0},
+ {32708, 28298, 21019, 8777, 0, 0},
+ {32731, 29436, 18257, 11320, 0, 0},
+ {32611, 26448, 19732, 15329, 0, 0},
+ {32649, 26049, 19862, 3372, 0, 0},
+ {32721, 27231, 20192, 11269, 0, 0},
+ {32499, 26692, 21510, 9653, 0, 0},
+ {32685, 27153, 20767, 15540, 0, 0},
+ {30800, 27212, 20745, 14221, 0, 0}}}};
+
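+// The kDefaultEobPt*Cdf tables below hold the default CDFs for the
+// end-of-block position class of transforms with up to 16, 32, 64, 128, 256,
+// 512 and 1024 coefficients, indexed by quantizer context, plane type and
+// (for the smaller sizes) EOB context.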
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultEobPt16Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+ [kEobPtContexts][kEobPt16SymbolCount + 1] = {
+ {{{31928, 31729, 30788, 27873, 0, 0},
+ {32398, 32097, 30885, 28297, 0, 0}},
+ {{29521, 27818, 23080, 18205, 0, 0},
+ {30864, 29414, 25005, 18121, 0, 0}}},
+ {{{30643, 30217, 27603, 23822, 0, 0},
+ {32255, 32003, 30909, 26429, 0, 0}},
+ {{25131, 23270, 18509, 13660, 0, 0},
+ {30271, 28672, 23902, 15775, 0, 0}}},
+ {{{28752, 27871, 23887, 17800, 0, 0},
+ {32052, 31663, 30122, 22712, 0, 0}},
+ {{21629, 19498, 14527, 9202, 0, 0},
+ {29576, 27736, 22471, 13013, 0, 0}}},
+ {{{26060, 23810, 18022, 10635, 0, 0},
+ {31546, 30694, 27985, 17358, 0, 0}},
+ {{13193, 11002, 6724, 3059, 0, 0},
+ {25471, 22001, 13495, 4574, 0, 0}}}};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultEobPt32Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+ [kEobPtContexts][kEobPt32SymbolCount + 1] = {
+ {{{32368, 32248, 31791, 30666, 26226, 0, 0},
+ {32558, 32363, 31453, 29442, 25231, 0, 0}},
+ {{30132, 28495, 25180, 20974, 12367, 0, 0},
+ {30982, 29589, 25866, 21411, 13714, 0, 0}}},
+ {{{31779, 31519, 30749, 28617, 21983, 0, 0},
+ {32455, 32327, 31669, 29851, 24206, 0, 0}},
+ {{24374, 22416, 18836, 13913, 6754, 0, 0},
+ {30190, 28644, 24587, 19098, 8534, 0, 0}}},
+ {{{30253, 29765, 28316, 24606, 16727, 0, 0},
+ {32194, 31947, 30932, 27679, 19640, 0, 0}},
+ {{19300, 16465, 12407, 7663, 3487, 0, 0},
+ {29226, 27266, 22353, 16008, 7124, 0, 0}}},
+ {{{28151, 27059, 24322, 19184, 9633, 0, 0},
+ {31612, 31066, 29093, 23494, 12229, 0, 0}},
+ {{10682, 8486, 5758, 2998, 1025, 0, 0},
+ {25069, 21871, 11877, 5842, 1140, 0, 0}}}};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultEobPt64Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+ [kEobPtContexts][kEobPt64SymbolCount + 1] = {
+ {{{32439, 32270, 31667, 30984, 29503, 25010, 0, 0},
+ {32433, 32038, 31309, 27274, 24013, 19771, 0, 0}},
+ {{29263, 27464, 22682, 18954, 15084, 9398, 0, 0},
+ {31205, 30068, 27892, 21857, 18062, 10288, 0, 0}}},
+ {{{31508, 31322, 30515, 29056, 26116, 19399, 0, 0},
+ {32367, 32163, 31739, 30205, 26923, 20142, 0, 0}},
+ {{24159, 22156, 18144, 14054, 10154, 3744, 0, 0},
+ {30845, 29641, 26901, 23065, 18491, 5668, 0, 0}}},
+ {{{30394, 29996, 28185, 25492, 20480, 13062, 0, 0},
+ {32271, 31958, 31453, 29768, 25764, 17127, 0, 0}},
+ {{17718, 15642, 11358, 7882, 4612, 2042, 0, 0},
+ {28734, 26478, 22533, 17786, 11554, 4277, 0, 0}}},
+ {{{26461, 25227, 20708, 16410, 10215, 4903, 0, 0},
+ {31479, 30448, 28797, 24842, 18615, 8477, 0, 0}},
+ {{8556, 7060, 4500, 2733, 1461, 719, 0, 0},
+ {24042, 20390, 13359, 6318, 2730, 306, 0, 0}}}};
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt128Cdf
+ [kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPtContexts]
+ [kEobPt128SymbolCount + 1] = {
+ {{{32549, 32286, 31628, 30677, 29088, 26740, 20182, 0, 0},
+ {32397, 32069, 31514, 27938, 23289, 20206, 15271, 0, 0}},
+ {{27523, 25312, 19888, 16916, 12735, 8836, 5160, 0, 0},
+ {30714, 29296, 26899, 18536, 14526, 12178, 6016, 0, 0}}},
+ {{{32083, 31835, 31280, 30054, 28002, 24206, 13514, 0, 0},
+ {32551, 32416, 32150, 30465, 27507, 22799, 15296, 0, 0}},
+ {{24723, 21568, 17271, 13173, 8820, 5360, 1830, 0, 0},
+ {30458, 28608, 25297, 17771, 14837, 12000, 2528, 0, 0}}},
+ {{{31402, 31030, 30241, 27752, 23413, 16971, 8125, 0, 0},
+ {32414, 32210, 31824, 30008, 25481, 18731, 10989, 0, 0}},
+ {{19141, 16522, 12595, 8339, 4820, 2353, 905, 0, 0},
+ {26493, 22879, 17999, 9604, 4780, 2275, 496, 0, 0}}},
+ {{{29296, 27883, 25279, 20287, 14251, 8232, 3133, 0, 0},
+ {31882, 31037, 29497, 24299, 17199, 10642, 4385, 0, 0}},
+ {{8455, 6706, 4383, 2661, 1551, 870, 423, 0, 0},
+ {23603, 19486, 11618, 2482, 874, 197, 56, 0, 0}}}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt256Cdf
+ [kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPtContexts]
+ [kEobPt256SymbolCount + 1] = {
+ {{{32458, 32184, 30881, 29179, 26600, 24157, 21416, 17116, 0, 0},
+ {31770, 30918, 29770, 27164, 15427, 12880, 9869, 7185, 0, 0}},
+ {{30248, 29528, 26816, 23898, 20191, 15210, 12814, 8600, 0, 0},
+ {30565, 28638, 25333, 22029, 12116, 9087, 7159, 5507, 0, 0}}},
+ {{{31320, 30659, 28617, 26505, 23439, 19508, 14824, 9468, 0, 0},
+ {32369, 31749, 31019, 29730, 22324, 17222, 10029, 5474, 0, 0}},
+ {{26366, 24620, 20145, 17696, 14040, 9921, 6321, 3391, 0, 0},
+ {31094, 29516, 27034, 22609, 10371, 8966, 7947, 1828, 0, 0}}},
+ {{{29679, 28848, 26730, 23308, 18502, 12887, 7002, 3592, 0, 0},
+ {31684, 30410, 29280, 27646, 21285, 14665, 6745, 2969, 0, 0}},
+ {{21254, 18974, 15288, 12014, 8407, 5390, 3276, 1491, 0, 0},
+ {26197, 23158, 17252, 10942, 3676, 1939, 926, 60, 0, 0}}},
+ {{{27420, 25655, 20948, 16844, 10662, 5991, 2434, 1011, 0, 0},
+ {30315, 28294, 26461, 23991, 16294, 9793, 3768, 1221, 0, 0}},
+ {{9658, 8171, 5628, 3874, 2601, 1841, 1376, 674, 0, 0},
+ {22770, 15107, 7590, 4671, 1460, 730, 365, 73, 0, 0}}}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt512Cdf
+ [kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPt512SymbolCount + 1] =
+ {{{32127, 31785, 29061, 27338, 22534, 17810, 13980, 9356, 6707, 0, 0},
+ {27673, 26322, 22772, 19414, 16751, 14782, 11849, 6639, 3628, 0, 0}},
+ {{31538, 30490, 27733, 24992, 20897, 17422, 13178, 8184, 4019, 0, 0},
+ {25503, 22789, 16949, 13518, 10988, 8922, 6290, 4372, 957, 0, 0}},
+ {{30144, 28832, 26288, 23082, 18789, 15042, 9501, 4358, 1690, 0, 0},
+ {20753, 17999, 13180, 10716, 8546, 6956, 5468, 3549, 654, 0, 0}},
+ {{26841, 24959, 21845, 18171, 13329, 8633, 4312, 1626, 708, 0, 0},
+ {11675, 9725, 7026, 5110, 3671, 3052, 2695, 1948, 812, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultEobPt1024Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+ [kEobPt1024SymbolCount + 1] = {
+ {{32375, 32347, 32017, 31145, 29608, 26416, 19423,
+ 14721, 10197, 6938, 0, 0},
+ {30903, 30780, 29838, 28526, 22235, 16230, 11414,
+ 5513, 4222, 984, 0, 0}},
+ {{32072, 31820, 29623, 27066, 23062, 19551, 14917,
+ 10912, 7076, 4734, 0, 0},
+ {30096, 29177, 23438, 15684, 10043, 8484, 6241,
+ 4741, 4391, 1892, 0, 0}},
+ {{29984, 28937, 25727, 22247, 17921, 13924, 9613,
+ 6086, 3539, 1723, 0, 0},
+ {23191, 20302, 15029, 12018, 10707, 9553, 8167,
+ 7285, 6925, 712, 0, 0}},
+ {{26070, 24434, 20807, 17006, 12582, 8906, 5334,
+ 3442, 1686, 718, 0, 0},
+ {12199, 10342, 7199, 5909, 4715, 3855, 3282, 3044,
+ 2961, 198, 0, 0}}};
+
+/* clang-format off */
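+// Default CDFs for the end-of-block extra bit, indexed by quantizer context,
+// square transform size, plane type and context.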
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobExtraCdf[kCoefficientQuantizerContexts]
+ [kNumSquareTransformSizes][kNumPlaneTypes]
+ [kEobExtraContexts][kBooleanFieldCdfSize] = {
+ {
+ {
+ {{15807, 0, 0}, {15545, 0, 0}, {25147, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{13699, 0, 0}, {10243, 0, 0}, {19391, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{12367, 0, 0}, {15743, 0, 0}, {19923, 0, 0}, {19895, 0, 0},
+ {18674, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{12087, 0, 0}, {12067, 0, 0}, {17518, 0, 0}, {17751, 0, 0},
+ {17840, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{8863, 0, 0}, {15574, 0, 0}, {16598, 0, 0}, {15073, 0, 0},
+ {18942, 0, 0}, {16958, 0, 0}, {20732, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{8809, 0, 0}, {11969, 0, 0}, {13747, 0, 0}, {16565, 0, 0},
+ {14882, 0, 0}, {18624, 0, 0}, {20758, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{5369, 0, 0}, {16441, 0, 0}, {14697, 0, 0}, {13184, 0, 0},
+ {12047, 0, 0}, {14336, 0, 0}, {13208, 0, 0}, {22618, 0, 0},
+ {23963, 0, 0}},
+ {{7836, 0, 0}, {11935, 0, 0}, {20741, 0, 0}, {16098, 0, 0},
+ {12854, 0, 0}, {17662, 0, 0}, {15106, 0, 0}, {18985, 0, 0},
+ {4012, 0, 0}}
+ },
+ {
+ {{9362, 0, 0}, {10923, 0, 0}, {14336, 0, 0}, {16384, 0, 0},
+ {15672, 0, 0}, {20207, 0, 0}, {15448, 0, 0}, {10373, 0, 0},
+ {11398, 0, 0}},
+ {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ }
+ },
+ {
+ {
+ {{15297, 0, 0}, {12545, 0, 0}, {21411, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{12433, 0, 0}, {11101, 0, 0}, {17950, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{12338, 0, 0}, {12106, 0, 0}, {17401, 0, 0}, {15798, 0, 0},
+ {18111, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{10651, 0, 0}, {10740, 0, 0}, {14118, 0, 0}, {16726, 0, 0},
+ {16883, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{10359, 0, 0}, {11756, 0, 0}, {17118, 0, 0}, {15373, 0, 0},
+ {17299, 0, 0}, {12563, 0, 0}, {13257, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{8548, 0, 0}, {10288, 0, 0}, {15031, 0, 0}, {13852, 0, 0},
+ {13500, 0, 0}, {14356, 0, 0}, {13924, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{6777, 0, 0}, {12454, 0, 0}, {15037, 0, 0}, {13090, 0, 0},
+ {14119, 0, 0}, {15461, 0, 0}, {10970, 0, 0}, {15219, 0, 0},
+ {17138, 0, 0}},
+ {{6183, 0, 0}, {11299, 0, 0}, {12336, 0, 0}, {15033, 0, 0},
+ {13488, 0, 0}, {17533, 0, 0}, {12471, 0, 0}, {10297, 0, 0},
+ {3771, 0, 0}}
+ },
+ {
+ {{6163, 0, 0}, {21464, 0, 0}, {16042, 0, 0}, {16208, 0, 0},
+ {11902, 0, 0}, {9244, 0, 0}, {12890, 0, 0}, {19299, 0, 0},
+ {9684, 0, 0}},
+ {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ }
+ },
+ {
+ {
+ {{13785, 0, 0}, {12256, 0, 0}, {17883, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{12678, 0, 0}, {13324, 0, 0}, {15482, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{13629, 0, 0}, {11281, 0, 0}, {13809, 0, 0}, {11858, 0, 0},
+ {13679, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{12232, 0, 0}, {12104, 0, 0}, {12143, 0, 0}, {13645, 0, 0},
+ {17906, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{12935, 0, 0}, {11266, 0, 0}, {15283, 0, 0}, {12501, 0, 0},
+ {14415, 0, 0}, {9439, 0, 0}, {11290, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{10727, 0, 0}, {9334, 0, 0}, {12767, 0, 0}, {12214, 0, 0},
+ {11817, 0, 0}, {12623, 0, 0}, {17206, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{9456, 0, 0}, {11161, 0, 0}, {16242, 0, 0}, {13811, 0, 0},
+ {14734, 0, 0}, {13834, 0, 0}, {8521, 0, 0}, {15847, 0, 0},
+ {15688, 0, 0}},
+ {{6189, 0, 0}, {7858, 0, 0}, {14131, 0, 0}, {12968, 0, 0},
+ {12380, 0, 0}, {22881, 0, 0}, {17126, 0, 0}, {2570, 0, 0},
+ {8047, 0, 0}}
+ },
+ {
+ {{5770, 0, 0}, {16031, 0, 0}, {14930, 0, 0}, {13846, 0, 0},
+ {13253, 0, 0}, {14132, 0, 0}, {15435, 0, 0}, {16992, 0, 0},
+ {10110, 0, 0}},
+ {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ }
+ },
+ {
+ {
+ {{12591, 0, 0}, {11979, 0, 0}, {12506, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{11352, 0, 0}, {11913, 0, 0}, {9358, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{12530, 0, 0}, {11711, 0, 0}, {13609, 0, 0}, {10431, 0, 0},
+ {12609, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{12643, 0, 0}, {12209, 0, 0}, {11061, 0, 0}, {10472, 0, 0},
+ {15435, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{12827, 0, 0}, {12241, 0, 0}, {11298, 0, 0}, {10281, 0, 0},
+ {13210, 0, 0}, {10414, 0, 0}, {12437, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{10016, 0, 0}, {7762, 0, 0}, {10693, 0, 0}, {11192, 0, 0},
+ {15028, 0, 0}, {11078, 0, 0}, {13557, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{11326, 0, 0}, {10410, 0, 0}, {14265, 0, 0}, {12477, 0, 0},
+ {12823, 0, 0}, {11474, 0, 0}, {11590, 0, 0}, {13368, 0, 0},
+ {22212, 0, 0}},
+ {{8120, 0, 0}, {7819, 0, 0}, {12060, 0, 0}, {8863, 0, 0},
+ {12267, 0, 0}, {23210, 0, 0}, {23345, 0, 0}, {2403, 0, 0},
+ {13515, 0, 0}}
+ },
+ {
+ {{6704, 0, 0}, {10670, 0, 0}, {13155, 0, 0}, {12243, 0, 0},
+ {15173, 0, 0}, {16150, 0, 0}, {12271, 0, 0}, {13779, 0, 0},
+ {17255, 0, 0}},
+ {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ }
+ }
+};
+
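+// Default CDFs for the base level of the coefficient at the end-of-block
+// position.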
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseEobCdf[kCoefficientQuantizerContexts]
+ [kNumSquareTransformSizes][kNumPlaneTypes]
+ [kCoeffBaseEobContexts]
+ [kCoeffBaseEobSymbolCount + 1] = {
+ {
+ {
+ {{14931, 3713, 0, 0}, {3168, 1322, 0, 0}, {1924, 890, 0, 0},
+ {7842, 3820, 0, 0}},
+ {{11403, 2742, 0, 0}, {2256, 345, 0, 0}, {1110, 147, 0, 0},
+ {3138, 887, 0, 0}}
+ },
+ {
+ {{27051, 6291, 0, 0}, {2277, 1065, 0, 0}, {1218, 610, 0, 0},
+ {3120, 1277, 0, 0}},
+ {{20160, 4948, 0, 0}, {2088, 543, 0, 0}, {1959, 433, 0, 0},
+ {1469, 345, 0, 0}}
+ },
+ {
+ {{30982, 20156, 0, 0}, {2105, 1143, 0, 0}, {429, 300, 0, 0},
+ {1620, 935, 0, 0}},
+ {{13911, 8903, 0, 0}, {1340, 340, 0, 0}, {1024, 395, 0, 0},
+ {993, 242, 0, 0}}
+ },
+ {
+ {{30981, 30236, 0, 0}, {1936, 1106, 0, 0}, {944, 86, 0, 0},
+ {635, 199, 0, 0}},
+ {{19017, 10533, 0, 0}, {679, 359, 0, 0}, {5684, 4848, 0, 0},
+ {3477, 174, 0, 0}}
+ },
+ {
+ {{31043, 29319, 0, 0}, {1666, 833, 0, 0}, {311, 155, 0, 0},
+ {356, 119, 0, 0}},
+ {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+ {21845, 10923, 0, 0}}
+ }
+ },
+ {
+ {
+ {{15208, 2880, 0, 0}, {3097, 1219, 0, 0}, {1761, 712, 0, 0},
+ {5482, 2762, 0, 0}},
+ {{6174, 1556, 0, 0}, {1560, 186, 0, 0}, {933, 131, 0, 0},
+ {2173, 562, 0, 0}}
+ },
+ {
+ {{17529, 2836, 0, 0}, {1453, 673, 0, 0}, {638, 334, 0, 0},
+ {1904, 772, 0, 0}},
+ {{6489, 1800, 0, 0}, {1626, 273, 0, 0}, {1055, 228, 0, 0},
+ {839, 174, 0, 0}}
+ },
+ {
+ {{30124, 7570, 0, 0}, {730, 317, 0, 0}, {129, 73, 0, 0},
+ {602, 250, 0, 0}},
+ {{15581, 5100, 0, 0}, {1054, 218, 0, 0}, {485, 90, 0, 0},
+ {838, 205, 0, 0}}
+ },
+ {
+ {{31724, 30511, 0, 0}, {2013, 845, 0, 0}, {560, 75, 0, 0},
+ {524, 153, 0, 0}},
+ {{11451, 6561, 0, 0}, {3635, 1900, 0, 0}, {3457, 1537, 0, 0},
+ {3111, 1681, 0, 0}}
+ },
+ {
+ {{32290, 30934, 0, 0}, {1763, 781, 0, 0}, {451, 44, 0, 0},
+ {1903, 120, 0, 0}},
+ {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+ {21845, 10923, 0, 0}}
+ }
+ },
+ {
+ {
+ {{12676, 1994, 0, 0}, {2073, 748, 0, 0}, {1637, 665, 0, 0},
+ {4102, 1898, 0, 0}},
+ {{5510, 1673, 0, 0}, {964, 145, 0, 0}, {1005, 240, 0, 0},
+ {1330, 262, 0, 0}}
+ },
+ {
+ {{14719, 2279, 0, 0}, {1062, 482, 0, 0}, {605, 295, 0, 0},
+ {1218, 584, 0, 0}},
+ {{5652, 1926, 0, 0}, {797, 170, 0, 0}, {680, 192, 0, 0},
+ {701, 104, 0, 0}}
+ },
+ {
+ {{19914, 3675, 0, 0}, {496, 210, 0, 0}, {101, 39, 0, 0},
+ {462, 183, 0, 0}},
+ {{7292, 2402, 0, 0}, {599, 81, 0, 0}, {289, 79, 0, 0},
+ {1095, 134, 0, 0}}
+ },
+ {
+ {{29959, 13467, 0, 0}, {563, 146, 0, 0}, {430, 38, 0, 0},
+ {982, 152, 0, 0}},
+ {{10031, 3663, 0, 0}, {1958, 406, 0, 0}, {2754, 141, 0, 0},
+ {2240, 194, 0, 0}}
+ },
+ {
+ {{31833, 29386, 0, 0}, {1979, 859, 0, 0}, {302, 12, 0, 0},
+ {1908, 255, 0, 0}},
+ {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+ {21845, 10923, 0, 0}}
+ }
+ },
+ {
+ {
+ {{10271, 1570, 0, 0}, {1053, 273, 0, 0}, {1162, 431, 0, 0},
+ {2380, 778, 0, 0}},
+ {{4891, 1184, 0, 0}, {598, 40, 0, 0}, {613, 80, 0, 0},
+ {549, 66, 0, 0}}
+ },
+ {
+ {{11311, 1725, 0, 0}, {817, 285, 0, 0}, {615, 206, 0, 0},
+ {1295, 553, 0, 0}},
+ {{5210, 1617, 0, 0}, {748, 128, 0, 0}, {671, 193, 0, 0},
+ {526, 49, 0, 0}}
+ },
+ {
+ {{12788, 2177, 0, 0}, {549, 171, 0, 0}, {187, 62, 0, 0},
+ {965, 481, 0, 0}},
+ {{6295, 2261, 0, 0}, {337, 45, 0, 0}, {572, 157, 0, 0},
+ {1180, 240, 0, 0}}
+ },
+ {
+ {{8121, 2305, 0, 0}, {356, 73, 0, 0}, {300, 48, 0, 0},
+ {1499, 245, 0, 0}},
+ {{4286, 1263, 0, 0}, {616, 67, 0, 0}, {1036, 170, 0, 0},
+ {1001, 56, 0, 0}}
+ },
+ {
+ {{20410, 7791, 0, 0}, {1437, 383, 0, 0}, {134, 12, 0, 0},
+ {2357, 220, 0, 0}},
+ {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+ {21845, 10923, 0, 0}}
+ }
+ }
+};
+/* clang-format on */
+
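+// Default CDFs for the base level of the remaining coefficients, indexed by
+// quantizer context, square transform size, plane type and context.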
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseCdf
+ [kCoefficientQuantizerContexts][kNumSquareTransformSizes][kNumPlaneTypes]
+ [kCoeffBaseContexts][kCoeffBaseSymbolCount + 1] = {
+ {{{{28734, 23838, 20041, 0, 0}, {14686, 3027, 891, 0, 0},
+ {20172, 6644, 2275, 0, 0}, {23322, 11650, 5763, 0, 0},
+ {26460, 17627, 11489, 0, 0}, {30305, 26411, 22985, 0, 0},
+ {12101, 2222, 839, 0, 0}, {19725, 6645, 2634, 0, 0},
+ {24617, 14011, 7990, 0, 0}, {27513, 19929, 14136, 0, 0},
+ {29948, 25562, 21607, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {17032, 5215, 2164, 0, 0},
+ {21558, 8974, 3981, 0, 0}, {26821, 18894, 13067, 0, 0},
+ {28553, 23445, 18877, 0, 0}, {29935, 26306, 22709, 0, 0},
+ {13163, 2375, 1186, 0, 0}, {19245, 6516, 2520, 0, 0},
+ {24322, 14146, 8256, 0, 0}, {28950, 22425, 16794, 0, 0},
+ {31287, 28651, 25972, 0, 0}, {10119, 1466, 578, 0, 0},
+ {17939, 5641, 2319, 0, 0}, {24455, 15066, 9464, 0, 0},
+ {29746, 24467, 19982, 0, 0}, {31232, 28356, 25584, 0, 0},
+ {10414, 2994, 1396, 0, 0}, {18045, 7296, 3554, 0, 0},
+ {26095, 19023, 14106, 0, 0}, {30700, 27002, 23446, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{26466, 16324, 11007, 0, 0}, {9728, 1230, 293, 0, 0},
+ {17572, 4316, 1272, 0, 0}, {22748, 9822, 4254, 0, 0},
+ {26235, 15906, 9267, 0, 0}, {29230, 22952, 17692, 0, 0},
+ {8324, 893, 243, 0, 0}, {16887, 3844, 1133, 0, 0},
+ {22846, 9895, 4302, 0, 0}, {26241, 15802, 9077, 0, 0},
+ {28654, 21465, 15548, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {12567, 1998, 559, 0, 0},
+ {18014, 4697, 1510, 0, 0}, {24390, 12582, 6251, 0, 0},
+ {26852, 17469, 10790, 0, 0}, {28500, 21185, 14867, 0, 0},
+ {8407, 743, 187, 0, 0}, {14095, 2663, 825, 0, 0},
+ {22572, 10524, 5192, 0, 0}, {27273, 18419, 12351, 0, 0},
+ {30092, 25353, 21270, 0, 0}, {8090, 810, 183, 0, 0},
+ {14139, 2862, 937, 0, 0}, {23404, 12044, 6453, 0, 0},
+ {28127, 20450, 14674, 0, 0}, {30010, 25381, 21189, 0, 0},
+ {7335, 926, 299, 0, 0}, {13973, 3479, 1357, 0, 0},
+ {25124, 15184, 9176, 0, 0}, {29360, 23754, 17721, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{28232, 22696, 18767, 0, 0}, {7309, 1352, 562, 0, 0},
+ {16163, 4720, 1950, 0, 0}, {21760, 9911, 5049, 0, 0},
+ {25853, 16500, 10453, 0, 0}, {30143, 25956, 22231, 0, 0},
+ {8511, 980, 269, 0, 0}, {15888, 3314, 889, 0, 0},
+ {20810, 7714, 2990, 0, 0}, {24852, 14050, 7684, 0, 0},
+ {29385, 23991, 19322, 0, 0}, {10048, 1165, 375, 0, 0},
+ {17808, 4643, 1433, 0, 0}, {23037, 10558, 4840, 0, 0},
+ {26464, 16936, 10491, 0, 0}, {29858, 24950, 20602, 0, 0},
+ {12393, 2141, 637, 0, 0}, {18864, 5484, 1881, 0, 0},
+ {23400, 11210, 5624, 0, 0}, {26831, 17802, 11649, 0, 0},
+ {30101, 25543, 21449, 0, 0}, {8798, 1298, 390, 0, 0},
+ {15595, 3034, 750, 0, 0}, {19973, 7327, 2803, 0, 0},
+ {23787, 13088, 6875, 0, 0}, {28040, 21396, 15866, 0, 0},
+ {8481, 971, 329, 0, 0}, {16065, 3623, 1072, 0, 0},
+ {21935, 9214, 4043, 0, 0}, {26300, 16202, 9711, 0, 0},
+ {30353, 26206, 22490, 0, 0}, {6158, 373, 109, 0, 0},
+ {14178, 2270, 651, 0, 0}, {20348, 7012, 2818, 0, 0},
+ {25129, 14022, 8058, 0, 0}, {29767, 24682, 20421, 0, 0},
+ {7692, 704, 188, 0, 0}, {14822, 2640, 740, 0, 0},
+ {20744, 7783, 3390, 0, 0}, {25251, 14378, 8464, 0, 0},
+ {29525, 23987, 19437, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{26731, 15997, 10811, 0, 0}, {7994, 1064, 342, 0, 0},
+ {15938, 4179, 1712, 0, 0}, {22166, 9940, 5008, 0, 0},
+ {26035, 15939, 9697, 0, 0}, {29518, 23854, 19212, 0, 0},
+ {7186, 548, 100, 0, 0}, {14109, 2426, 545, 0, 0},
+ {20222, 6619, 2253, 0, 0}, {24348, 12317, 5967, 0, 0},
+ {28132, 20348, 14424, 0, 0}, {5187, 406, 129, 0, 0},
+ {13781, 2685, 790, 0, 0}, {21441, 8520, 3684, 0, 0},
+ {25504, 15049, 8648, 0, 0}, {28773, 22000, 16599, 0, 0},
+ {6875, 937, 281, 0, 0}, {16191, 4181, 1389, 0, 0},
+ {22579, 10020, 4586, 0, 0}, {25936, 15674, 9212, 0, 0},
+ {29060, 22658, 17434, 0, 0}, {6864, 486, 112, 0, 0},
+ {13047, 1976, 492, 0, 0}, {19949, 6525, 2357, 0, 0},
+ {24196, 12154, 5877, 0, 0}, {27404, 18709, 12301, 0, 0},
+ {6188, 330, 91, 0, 0}, {11916, 1543, 428, 0, 0},
+ {20333, 7068, 2801, 0, 0}, {24077, 11943, 5792, 0, 0},
+ {28322, 20559, 15499, 0, 0}, {5418, 339, 72, 0, 0},
+ {11396, 1791, 496, 0, 0}, {20095, 7498, 2915, 0, 0},
+ {23560, 11843, 6128, 0, 0}, {27750, 19417, 14036, 0, 0},
+ {5417, 289, 55, 0, 0}, {11370, 1559, 381, 0, 0},
+ {20606, 7721, 2926, 0, 0}, {24872, 14077, 7449, 0, 0},
+ {28098, 19886, 13887, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{27281, 22308, 19060, 0, 0}, {11171, 4465, 2094, 0, 0},
+ {21731, 10815, 6292, 0, 0}, {24621, 14806, 9816, 0, 0},
+ {27526, 19707, 14236, 0, 0}, {30879, 27560, 24586, 0, 0},
+ {5994, 635, 178, 0, 0}, {14924, 3204, 1001, 0, 0},
+ {21078, 8330, 3597, 0, 0}, {25226, 14553, 8309, 0, 0},
+ {29775, 24718, 20449, 0, 0}, {4745, 440, 177, 0, 0},
+ {14117, 2642, 814, 0, 0}, {20604, 7622, 3179, 0, 0},
+ {25006, 14238, 7997, 0, 0}, {29276, 23585, 18848, 0, 0},
+ {5177, 760, 277, 0, 0}, {15619, 3915, 1258, 0, 0},
+ {21283, 8765, 3908, 0, 0}, {25071, 14682, 8558, 0, 0},
+ {29693, 24769, 20550, 0, 0}, {4500, 286, 114, 0, 0},
+ {13137, 1717, 364, 0, 0}, {18908, 5508, 1748, 0, 0},
+ {23163, 11155, 5174, 0, 0}, {27892, 20606, 14860, 0, 0},
+ {5520, 452, 192, 0, 0}, {13813, 2311, 693, 0, 0},
+ {20944, 8771, 3973, 0, 0}, {25422, 14572, 8121, 0, 0},
+ {29365, 23521, 18657, 0, 0}, {3057, 113, 33, 0, 0},
+ {11599, 1374, 351, 0, 0}, {19281, 5570, 1811, 0, 0},
+ {23940, 11085, 5154, 0, 0}, {28498, 21317, 15730, 0, 0},
+ {4060, 190, 37, 0, 0}, {12648, 1527, 286, 0, 0},
+ {19076, 5218, 1447, 0, 0}, {23350, 10254, 4329, 0, 0},
+ {27769, 19485, 13306, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{27095, 18466, 13057, 0, 0}, {6517, 2067, 934, 0, 0},
+ {19986, 8985, 4965, 0, 0}, {23641, 12111, 6960, 0, 0},
+ {26400, 16560, 11306, 0, 0}, {30303, 25591, 21946, 0, 0},
+ {2807, 205, 49, 0, 0}, {14450, 2877, 819, 0, 0},
+ {21407, 8254, 3411, 0, 0}, {24868, 13165, 7161, 0, 0},
+ {28766, 22178, 17222, 0, 0}, {3131, 458, 173, 0, 0},
+ {14472, 2855, 959, 0, 0}, {22624, 11253, 5897, 0, 0},
+ {27410, 18446, 12374, 0, 0}, {29701, 24406, 19422, 0, 0},
+ {4116, 298, 92, 0, 0}, {15230, 1997, 559, 0, 0},
+ {18844, 5886, 2274, 0, 0}, {22272, 9931, 4899, 0, 0},
+ {25532, 16372, 11147, 0, 0}, {2025, 81, 22, 0, 0},
+ {9762, 1092, 279, 0, 0}, {18274, 4940, 1648, 0, 0},
+ {22594, 9967, 4416, 0, 0}, {26526, 17487, 11725, 0, 0},
+ {6951, 525, 48, 0, 0}, {14150, 1401, 443, 0, 0},
+ {18771, 4450, 890, 0, 0}, {20513, 6234, 1385, 0, 0},
+ {23207, 11180, 4318, 0, 0}, {4580, 133, 44, 0, 0},
+ {10708, 403, 40, 0, 0}, {14666, 2078, 240, 0, 0},
+ {18572, 3904, 769, 0, 0}, {20506, 6976, 1903, 0, 0},
+ {8592, 659, 140, 0, 0}, {14488, 3087, 805, 0, 0},
+ {22563, 9065, 3104, 0, 0}, {24879, 12743, 5092, 0, 0},
+ {26708, 16025, 8798, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{27627, 25672, 24508, 0, 0}, {5582, 3746, 2979, 0, 0},
+ {26100, 20200, 17086, 0, 0}, {30596, 26587, 24130, 0, 0},
+ {31642, 29389, 28237, 0, 0}, {32325, 31407, 30514, 0, 0},
+ {6685, 1615, 332, 0, 0}, {19282, 8165, 4285, 0, 0},
+ {26260, 17928, 12858, 0, 0}, {29382, 23968, 19482, 0, 0},
+ {31238, 28446, 25714, 0, 0}, {3129, 688, 220, 0, 0},
+ {16871, 5216, 2478, 0, 0}, {24180, 12721, 7385, 0, 0},
+ {27879, 19429, 13499, 0, 0}, {30528, 25897, 22270, 0, 0},
+ {4603, 571, 251, 0, 0}, {12033, 2341, 1200, 0, 0},
+ {18443, 8097, 5076, 0, 0}, {27649, 20214, 14963, 0, 0},
+ {30958, 27327, 24507, 0, 0}, {1556, 44, 20, 0, 0},
+ {9416, 1002, 223, 0, 0}, {18099, 5198, 1709, 0, 0},
+ {24276, 11874, 5496, 0, 0}, {29124, 22574, 17564, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{30307, 25755, 23397, 0, 0}, {8019, 3168, 1782, 0, 0},
+ {23302, 13731, 10351, 0, 0}, {29184, 23488, 18368, 0, 0},
+ {31263, 28839, 27335, 0, 0}, {32091, 31268, 30032, 0, 0},
+ {8781, 2066, 651, 0, 0}, {19214, 8197, 3505, 0, 0},
+ {26557, 18212, 11613, 0, 0}, {29633, 21796, 17143, 0, 0},
+ {30333, 25641, 21341, 0, 0}, {1468, 236, 218, 0, 0},
+ {18011, 2403, 814, 0, 0}, {28363, 21156, 14215, 0, 0},
+ {32188, 28636, 25446, 0, 0}, {31073, 22599, 18644, 0, 0},
+ {2760, 486, 177, 0, 0}, {13524, 2660, 1020, 0, 0},
+ {21588, 8610, 3213, 0, 0}, {27118, 17796, 13559, 0, 0},
+ {30654, 27659, 24312, 0, 0}, {912, 52, 20, 0, 0},
+ {9756, 1104, 196, 0, 0}, {19074, 6112, 2132, 0, 0},
+ {24626, 13260, 6675, 0, 0}, {28515, 21813, 16044, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{32167, 31785, 31457, 0, 0}, {14043, 9362, 4681, 0, 0},
+ {27307, 24576, 21845, 0, 0}, {28987, 17644, 11343, 0, 0},
+ {30181, 25007, 20696, 0, 0}, {32662, 32310, 31958, 0, 0},
+ {10486, 3058, 874, 0, 0}, {24260, 11842, 6784, 0, 0},
+ {29042, 20055, 14685, 0, 0}, {31148, 25656, 21875, 0, 0},
+ {32039, 30532, 29273, 0, 0}, {2605, 294, 84, 0, 0},
+ {14464, 2304, 768, 0, 0}, {21325, 6242, 3121, 0, 0},
+ {26761, 17476, 11469, 0, 0}, {30534, 26065, 23831, 0, 0},
+ {1814, 591, 197, 0, 0}, {15405, 3206, 1692, 0, 0},
+ {23082, 10304, 5358, 0, 0}, {24576, 16384, 11378, 0, 0},
+ {31013, 24722, 21504, 0, 0}, {1600, 34, 20, 0, 0},
+ {10282, 1327, 297, 0, 0}, {19935, 7141, 3030, 0, 0},
+ {25788, 15389, 9646, 0, 0}, {29657, 23881, 19289, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}},
+ {{{{26727, 20914, 16841, 0, 0}, {12442, 1863, 517, 0, 0},
+ {18604, 5937, 2043, 0, 0}, {23008, 12121, 6183, 0, 0},
+ {26352, 17815, 11549, 0, 0}, {29802, 25617, 21877, 0, 0},
+ {9201, 1394, 514, 0, 0}, {17790, 5352, 1822, 0, 0},
+ {23334, 12543, 6514, 0, 0}, {26110, 18210, 12233, 0, 0},
+ {28852, 24091, 19779, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {14680, 3223, 1181, 0, 0},
+ {19706, 6925, 2695, 0, 0}, {23828, 15941, 10517, 0, 0},
+ {25114, 19548, 14795, 0, 0}, {27035, 22452, 18312, 0, 0},
+ {9889, 1380, 654, 0, 0}, {17553, 4775, 1813, 0, 0},
+ {23371, 13323, 7790, 0, 0}, {29326, 22955, 17424, 0, 0},
+ {31400, 28832, 26236, 0, 0}, {7274, 735, 362, 0, 0},
+ {15996, 4805, 2050, 0, 0}, {23349, 14603, 9508, 0, 0},
+ {30091, 25267, 20971, 0, 0}, {31252, 28424, 25598, 0, 0},
+ {6212, 1314, 667, 0, 0}, {15640, 5733, 2660, 0, 0},
+ {24444, 17424, 12519, 0, 0}, {30865, 27072, 23299, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24313, 13765, 8400, 0, 0}, {9205, 747, 164, 0, 0},
+ {16531, 3322, 833, 0, 0}, {22044, 8769, 3410, 0, 0},
+ {26043, 15240, 8352, 0, 0}, {28841, 21841, 15943, 0, 0},
+ {6455, 480, 134, 0, 0}, {15338, 2673, 673, 0, 0},
+ {21652, 8162, 3089, 0, 0}, {25573, 14384, 7499, 0, 0},
+ {28042, 19916, 13453, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {9946, 1120, 285, 0, 0},
+ {16044, 3135, 839, 0, 0}, {22507, 9735, 4043, 0, 0},
+ {25739, 14928, 8240, 0, 0}, {27901, 18882, 11266, 0, 0},
+ {7470, 876, 277, 0, 0}, {14959, 3438, 1256, 0, 0},
+ {23100, 11439, 6189, 0, 0}, {27994, 19812, 13792, 0, 0},
+ {30446, 25738, 21228, 0, 0}, {7296, 848, 225, 0, 0},
+ {14811, 3381, 1136, 0, 0}, {23572, 12175, 6368, 0, 0},
+ {28088, 20063, 13566, 0, 0}, {29851, 24312, 19332, 0, 0},
+ {6297, 709, 194, 0, 0}, {14310, 2985, 859, 0, 0},
+ {24368, 13304, 6812, 0, 0}, {28956, 21795, 15562, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{25989, 19025, 15090, 0, 0}, {7962, 971, 311, 0, 0},
+ {15152, 3721, 1396, 0, 0}, {21705, 9593, 4765, 0, 0},
+ {26247, 16658, 10444, 0, 0}, {30004, 25264, 21114, 0, 0},
+ {7502, 401, 131, 0, 0}, {13714, 2215, 593, 0, 0},
+ {20629, 7556, 2961, 0, 0}, {25457, 14606, 8064, 0, 0},
+ {29371, 23604, 18694, 0, 0}, {6780, 560, 246, 0, 0},
+ {16515, 3856, 1242, 0, 0}, {23617, 11381, 5396, 0, 0},
+ {27080, 17853, 11272, 0, 0}, {30051, 25141, 20764, 0, 0},
+ {9624, 913, 325, 0, 0}, {16698, 4277, 1443, 0, 0},
+ {24066, 12301, 6251, 0, 0}, {27525, 18812, 12401, 0, 0},
+ {30147, 25433, 21201, 0, 0}, {6132, 428, 138, 0, 0},
+ {12778, 1718, 427, 0, 0}, {19525, 6663, 2453, 0, 0},
+ {24180, 13247, 6850, 0, 0}, {28051, 21183, 15464, 0, 0},
+ {6924, 476, 186, 0, 0}, {13678, 2133, 671, 0, 0},
+ {20805, 8222, 3829, 0, 0}, {26550, 16681, 10414, 0, 0},
+ {30428, 26160, 22342, 0, 0}, {4722, 192, 74, 0, 0},
+ {11590, 1455, 472, 0, 0}, {19282, 6584, 2898, 0, 0},
+ {25619, 14897, 9045, 0, 0}, {29935, 24810, 20509, 0, 0},
+ {5058, 240, 82, 0, 0}, {12094, 1692, 500, 0, 0},
+ {20355, 7813, 3525, 0, 0}, {26092, 15841, 9671, 0, 0},
+ {29802, 24435, 19849, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24129, 13429, 8339, 0, 0}, {8364, 931, 243, 0, 0},
+ {15771, 3343, 984, 0, 0}, {21515, 8534, 3619, 0, 0},
+ {26017, 15374, 8740, 0, 0}, {29278, 22938, 17577, 0, 0},
+ {6485, 297, 54, 0, 0}, {13169, 1600, 326, 0, 0},
+ {19622, 5814, 1875, 0, 0}, {24554, 12180, 5878, 0, 0},
+ {28069, 19687, 13468, 0, 0}, {4556, 310, 99, 0, 0},
+ {14174, 2452, 668, 0, 0}, {21549, 8360, 3534, 0, 0},
+ {25903, 15112, 8619, 0, 0}, {29090, 22406, 16762, 0, 0},
+ {6943, 632, 152, 0, 0}, {15455, 2915, 747, 0, 0},
+ {21571, 8297, 3296, 0, 0}, {25821, 14987, 8363, 0, 0},
+ {29000, 22108, 16507, 0, 0}, {5416, 268, 62, 0, 0},
+ {11918, 1300, 299, 0, 0}, {18747, 5061, 1635, 0, 0},
+ {23804, 11020, 4930, 0, 0}, {27331, 18103, 11581, 0, 0},
+ {6464, 276, 70, 0, 0}, {12359, 1388, 383, 0, 0},
+ {19086, 5546, 2136, 0, 0}, {23794, 11532, 6083, 0, 0},
+ {28534, 21103, 15834, 0, 0}, {6495, 411, 57, 0, 0},
+ {12096, 1526, 327, 0, 0}, {18596, 5514, 1866, 0, 0},
+ {22898, 10870, 5493, 0, 0}, {27604, 19262, 13498, 0, 0},
+ {6043, 309, 40, 0, 0}, {11777, 1326, 241, 0, 0},
+ {19697, 6334, 1957, 0, 0}, {24584, 12678, 6026, 0, 0},
+ {27965, 19513, 12873, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{25213, 17826, 14267, 0, 0}, {8358, 1590, 481, 0, 0},
+ {18374, 6030, 2515, 0, 0}, {24355, 13214, 7573, 0, 0},
+ {28002, 19844, 13983, 0, 0}, {30739, 26962, 23561, 0, 0},
+ {5992, 404, 105, 0, 0}, {14036, 2801, 837, 0, 0},
+ {21763, 8982, 3916, 0, 0}, {26302, 15859, 9258, 0, 0},
+ {29724, 24130, 19349, 0, 0}, {3560, 186, 64, 0, 0},
+ {12700, 1911, 560, 0, 0}, {20765, 7683, 3173, 0, 0},
+ {25821, 15018, 8579, 0, 0}, {29523, 23665, 18761, 0, 0},
+ {5409, 303, 99, 0, 0}, {13347, 2154, 594, 0, 0},
+ {20853, 7758, 3189, 0, 0}, {25818, 15092, 8694, 0, 0},
+ {29761, 24295, 19672, 0, 0}, {3766, 92, 33, 0, 0},
+ {10666, 919, 192, 0, 0}, {18360, 4759, 1363, 0, 0},
+ {23741, 11089, 4837, 0, 0}, {28074, 20090, 14020, 0, 0},
+ {4552, 240, 86, 0, 0}, {11919, 1504, 450, 0, 0},
+ {20012, 6953, 3017, 0, 0}, {25203, 13967, 7845, 0, 0},
+ {29259, 23235, 18291, 0, 0}, {2635, 81, 29, 0, 0},
+ {9705, 858, 253, 0, 0}, {18180, 4717, 1636, 0, 0},
+ {23683, 11119, 5311, 0, 0}, {28507, 21114, 15504, 0, 0},
+ {3250, 77, 20, 0, 0}, {10317, 809, 155, 0, 0},
+ {17904, 4046, 1068, 0, 0}, {23073, 9804, 4052, 0, 0},
+ {27836, 19410, 13266, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{26303, 15810, 11080, 0, 0}, {7569, 1254, 408, 0, 0},
+ {17994, 5619, 2161, 0, 0}, {23511, 11330, 5796, 0, 0},
+ {27045, 17585, 10886, 0, 0}, {29618, 23889, 19037, 0, 0},
+ {5779, 506, 86, 0, 0}, {15372, 2831, 683, 0, 0},
+ {21381, 7867, 2984, 0, 0}, {25479, 13947, 7220, 0, 0},
+ {29034, 22191, 16682, 0, 0}, {3040, 267, 73, 0, 0},
+ {15337, 3067, 865, 0, 0}, {22847, 9942, 4468, 0, 0},
+ {26872, 17334, 10700, 0, 0}, {29338, 23122, 18011, 0, 0},
+ {4154, 257, 63, 0, 0}, {13404, 2130, 505, 0, 0},
+ {19639, 6514, 2366, 0, 0}, {24014, 12284, 6328, 0, 0},
+ {28390, 21161, 15658, 0, 0}, {2476, 97, 24, 0, 0},
+ {10988, 1165, 267, 0, 0}, {18454, 4939, 1477, 0, 0},
+ {23157, 10441, 4505, 0, 0}, {27878, 19681, 13703, 0, 0},
+ {6906, 201, 35, 0, 0}, {11974, 718, 201, 0, 0},
+ {15525, 2143, 514, 0, 0}, {19485, 5140, 1294, 0, 0},
+ {23099, 10236, 3850, 0, 0}, {5333, 71, 20, 0, 0},
+ {7846, 378, 54, 0, 0}, {11319, 1264, 232, 0, 0},
+ {16376, 3039, 936, 0, 0}, {21076, 7884, 3692, 0, 0},
+ {8575, 478, 33, 0, 0}, {13859, 1664, 205, 0, 0},
+ {20532, 5927, 1365, 0, 0}, {24597, 10928, 3686, 0, 0},
+ {25544, 15488, 7493, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{29690, 25929, 22878, 0, 0}, {18931, 12318, 8289, 0, 0},
+ {26854, 18546, 13440, 0, 0}, {28902, 22501, 18006, 0, 0},
+ {30156, 25560, 21726, 0, 0}, {31701, 29777, 27992, 0, 0},
+ {6951, 1122, 239, 0, 0}, {19060, 6430, 2383, 0, 0},
+ {25440, 14183, 7898, 0, 0}, {28077, 19688, 13492, 0, 0},
+ {30943, 27515, 24416, 0, 0}, {3382, 453, 144, 0, 0},
+ {15608, 3767, 1408, 0, 0}, {23166, 10906, 5372, 0, 0},
+ {26853, 16996, 10620, 0, 0}, {29982, 24989, 20721, 0, 0},
+ {3522, 318, 105, 0, 0}, {14072, 2839, 950, 0, 0},
+ {22258, 9399, 4208, 0, 0}, {26539, 16269, 9643, 0, 0},
+ {30160, 25320, 21063, 0, 0}, {2015, 58, 20, 0, 0},
+ {11130, 1281, 265, 0, 0}, {19831, 5914, 1898, 0, 0},
+ {24586, 12172, 5798, 0, 0}, {29131, 22499, 17271, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{27524, 20618, 15862, 0, 0}, {12282, 5910, 3067, 0, 0},
+ {25012, 14451, 9033, 0, 0}, {29316, 23512, 19622, 0, 0},
+ {30748, 27562, 24539, 0, 0}, {30967, 27775, 24865, 0, 0},
+ {5717, 910, 237, 0, 0}, {16780, 5237, 2149, 0, 0},
+ {23580, 11284, 6049, 0, 0}, {26495, 15582, 8968, 0, 0},
+ {29660, 23413, 18004, 0, 0}, {1692, 248, 88, 0, 0},
+ {14649, 2731, 918, 0, 0}, {22524, 9799, 5296, 0, 0},
+ {28076, 18691, 13495, 0, 0}, {29074, 21091, 15212, 0, 0},
+ {2708, 187, 48, 0, 0}, {11757, 1993, 648, 0, 0},
+ {20837, 7948, 3479, 0, 0}, {25649, 15106, 8412, 0, 0},
+ {28935, 22062, 16464, 0, 0}, {814, 37, 20, 0, 0},
+ {8855, 1044, 279, 0, 0}, {17248, 4708, 1482, 0, 0},
+ {21251, 9760, 4197, 0, 0}, {26575, 18260, 12139, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{31733, 29961, 28612, 0, 0}, {19606, 14630, 11829, 0, 0},
+ {30072, 26135, 24013, 0, 0}, {31395, 28607, 25915, 0, 0},
+ {31669, 30022, 28052, 0, 0}, {32428, 31747, 31169, 0, 0},
+ {9942, 2349, 633, 0, 0}, {22373, 11006, 5826, 0, 0},
+ {28042, 20361, 15407, 0, 0}, {30321, 25688, 22175, 0, 0},
+ {31541, 29051, 26757, 0, 0}, {4612, 1344, 834, 0, 0},
+ {15853, 5014, 2395, 0, 0}, {23620, 11778, 6337, 0, 0},
+ {26818, 17253, 11620, 0, 0}, {30276, 25441, 21242, 0, 0},
+ {2166, 291, 98, 0, 0}, {12742, 2813, 1200, 0, 0},
+ {21548, 9140, 4663, 0, 0}, {26116, 15749, 9795, 0, 0},
+ {29704, 24232, 19725, 0, 0}, {999, 44, 20, 0, 0},
+ {10538, 1881, 395, 0, 0}, {20534, 7689, 3037, 0, 0},
+ {25442, 13952, 7415, 0, 0}, {28835, 21861, 16152, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}},
+ {{{{23872, 16541, 12138, 0, 0}, {9139, 986, 241, 0, 0},
+ {17595, 5013, 1447, 0, 0}, {22610, 11535, 5386, 0, 0},
+ {26348, 17911, 11210, 0, 0}, {29499, 24613, 20122, 0, 0},
+ {7933, 759, 272, 0, 0}, {16259, 4347, 1189, 0, 0},
+ {21811, 11254, 5350, 0, 0}, {24887, 16838, 10672, 0, 0},
+ {27380, 21808, 16850, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {12023, 1995, 675, 0, 0},
+ {17568, 5547, 1907, 0, 0}, {19736, 11895, 7101, 0, 0},
+ {20483, 14105, 9274, 0, 0}, {21205, 15287, 11279, 0, 0},
+ {6508, 786, 448, 0, 0}, {17371, 4685, 1668, 0, 0},
+ {23026, 13551, 7944, 0, 0}, {29507, 23139, 17406, 0, 0},
+ {31288, 28446, 25269, 0, 0}, {5169, 512, 308, 0, 0},
+ {15911, 5109, 1994, 0, 0}, {23217, 14478, 9020, 0, 0},
+ {29716, 23835, 18665, 0, 0}, {30747, 26858, 22981, 0, 0},
+ {3763, 753, 376, 0, 0}, {15091, 5074, 1905, 0, 0},
+ {23564, 15412, 9549, 0, 0}, {30365, 25252, 19954, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{21960, 10712, 5872, 0, 0}, {7029, 455, 92, 0, 0},
+ {15480, 2565, 547, 0, 0}, {21409, 7890, 2872, 0, 0},
+ {25819, 15001, 7875, 0, 0}, {28481, 20972, 14697, 0, 0},
+ {4888, 247, 63, 0, 0}, {13730, 1764, 354, 0, 0},
+ {20204, 6423, 2000, 0, 0}, {24499, 12821, 5989, 0, 0},
+ {27094, 18111, 11094, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {7026, 449, 97, 0, 0},
+ {13211, 1604, 314, 0, 0}, {19387, 6387, 2013, 0, 0},
+ {22667, 11302, 6046, 0, 0}, {23559, 13118, 5943, 0, 0},
+ {5661, 851, 336, 0, 0}, {14712, 3875, 1565, 0, 0},
+ {22568, 11334, 6004, 0, 0}, {28108, 19855, 13266, 0, 0},
+ {30400, 25838, 20264, 0, 0}, {5808, 610, 155, 0, 0},
+ {14140, 2763, 737, 0, 0}, {22535, 10326, 4536, 0, 0},
+ {27297, 18138, 11252, 0, 0}, {29533, 22001, 15659, 0, 0},
+ {5072, 328, 76, 0, 0}, {12736, 1601, 330, 0, 0},
+ {24068, 11427, 4326, 0, 0}, {27106, 17937, 10973, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{23064, 15474, 11636, 0, 0}, {6006, 490, 135, 0, 0},
+ {14386, 3148, 949, 0, 0}, {21877, 9293, 4045, 0, 0},
+ {26410, 16185, 9459, 0, 0}, {29520, 23650, 18627, 0, 0},
+ {5564, 195, 69, 0, 0}, {12950, 1944, 439, 0, 0},
+ {20996, 7648, 2727, 0, 0}, {25773, 14735, 7729, 0, 0},
+ {29016, 22326, 16670, 0, 0}, {5546, 512, 209, 0, 0},
+ {17412, 4369, 1293, 0, 0}, {23947, 12133, 5711, 0, 0},
+ {27257, 18364, 11529, 0, 0}, {29833, 24546, 19717, 0, 0},
+ {7893, 648, 239, 0, 0}, {17535, 4503, 1323, 0, 0},
+ {24163, 12198, 5836, 0, 0}, {27337, 18355, 11572, 0, 0},
+ {29774, 24427, 19545, 0, 0}, {4567, 164, 68, 0, 0},
+ {11727, 1322, 312, 0, 0}, {19547, 6555, 2293, 0, 0},
+ {24513, 13383, 6731, 0, 0}, {27838, 20183, 13938, 0, 0},
+ {4000, 320, 141, 0, 0}, {13063, 2207, 747, 0, 0},
+ {21196, 9179, 4548, 0, 0}, {27236, 17734, 11322, 0, 0},
+ {30308, 25618, 21312, 0, 0}, {2894, 149, 69, 0, 0},
+ {11147, 1697, 567, 0, 0}, {20257, 8021, 3776, 0, 0},
+ {26487, 16373, 10020, 0, 0}, {29522, 23490, 18271, 0, 0},
+ {3053, 143, 56, 0, 0}, {11810, 1757, 485, 0, 0},
+ {21535, 9097, 3962, 0, 0}, {26756, 16640, 9900, 0, 0},
+ {29341, 22917, 17354, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{21752, 10657, 5974, 0, 0}, {6822, 411, 91, 0, 0},
+ {14878, 2316, 516, 0, 0}, {21090, 7626, 2952, 0, 0},
+ {26048, 15234, 8184, 0, 0}, {28538, 21103, 14948, 0, 0},
+ {4368, 145, 21, 0, 0}, {11604, 1100, 193, 0, 0},
+ {19196, 5380, 1586, 0, 0}, {24534, 12018, 5410, 0, 0},
+ {27703, 18713, 11871, 0, 0}, {3787, 221, 63, 0, 0},
+ {14087, 2225, 529, 0, 0}, {21849, 8693, 3482, 0, 0},
+ {26337, 15569, 8691, 0, 0}, {28949, 22304, 16150, 0, 0},
+ {5898, 301, 75, 0, 0}, {13727, 1937, 421, 0, 0},
+ {20974, 7557, 2752, 0, 0}, {25880, 14749, 7798, 0, 0},
+ {28398, 20405, 13776, 0, 0}, {3190, 98, 24, 0, 0},
+ {9609, 761, 155, 0, 0}, {17453, 4099, 1092, 0, 0},
+ {23470, 10161, 3986, 0, 0}, {26624, 16855, 9800, 0, 0},
+ {4658, 269, 99, 0, 0}, {11194, 1831, 753, 0, 0},
+ {20009, 7950, 4041, 0, 0}, {26223, 16007, 9726, 0, 0},
+ {29119, 22171, 15935, 0, 0}, {4605, 216, 40, 0, 0},
+ {10667, 1299, 304, 0, 0}, {19608, 7296, 2625, 0, 0},
+ {25465, 14084, 7300, 0, 0}, {27527, 18793, 11813, 0, 0},
+ {4368, 137, 24, 0, 0}, {10664, 975, 165, 0, 0},
+ {19211, 6197, 1922, 0, 0}, {25019, 12907, 6093, 0, 0},
+ {27895, 18738, 11534, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{22968, 15133, 11695, 0, 0}, {6615, 883, 241, 0, 0},
+ {17730, 4916, 1762, 0, 0}, {24050, 12204, 6282, 0, 0},
+ {27640, 18692, 12254, 0, 0}, {30132, 25202, 20843, 0, 0},
+ {5217, 264, 67, 0, 0}, {14458, 2714, 668, 0, 0},
+ {22557, 9348, 3686, 0, 0}, {26546, 15892, 8852, 0, 0},
+ {29306, 22814, 17270, 0, 0}, {2777, 135, 47, 0, 0},
+ {12885, 2017, 567, 0, 0}, {21627, 8584, 3483, 0, 0},
+ {26348, 15828, 8994, 0, 0}, {29376, 23015, 17650, 0, 0},
+ {4303, 152, 56, 0, 0}, {12918, 2066, 524, 0, 0},
+ {21785, 8744, 3545, 0, 0}, {26474, 15998, 9186, 0, 0},
+ {29524, 23485, 18259, 0, 0}, {2745, 51, 20, 0, 0},
+ {9828, 736, 142, 0, 0}, {18486, 4840, 1295, 0, 0},
+ {24206, 11441, 4854, 0, 0}, {27922, 19375, 12849, 0, 0},
+ {2787, 178, 73, 0, 0}, {12303, 1805, 602, 0, 0},
+ {21289, 9189, 4573, 0, 0}, {26852, 17120, 10695, 0, 0},
+ {29737, 24163, 19370, 0, 0}, {1622, 77, 29, 0, 0},
+ {9662, 1044, 324, 0, 0}, {18985, 6030, 2329, 0, 0},
+ {24916, 13300, 6961, 0, 0}, {28908, 21644, 15915, 0, 0},
+ {1754, 44, 20, 0, 0}, {9139, 659, 140, 0, 0},
+ {18021, 4653, 1365, 0, 0}, {24223, 11526, 5290, 0, 0},
+ {28194, 19987, 13701, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{23583, 13074, 8080, 0, 0}, {6687, 783, 147, 0, 0},
+ {16753, 3768, 981, 0, 0}, {22226, 9078, 3562, 0, 0},
+ {26036, 14823, 8091, 0, 0}, {28852, 21729, 16046, 0, 0},
+ {4544, 202, 24, 0, 0}, {13668, 1630, 283, 0, 0},
+ {20240, 6148, 1889, 0, 0}, {25027, 12491, 5883, 0, 0},
+ {28202, 19923, 13778, 0, 0}, {2835, 175, 50, 0, 0},
+ {15098, 2435, 613, 0, 0}, {22383, 9168, 3859, 0, 0},
+ {26525, 16532, 10361, 0, 0}, {28792, 22379, 16751, 0, 0},
+ {4391, 207, 30, 0, 0}, {13402, 1593, 286, 0, 0},
+ {19441, 5593, 1674, 0, 0}, {24510, 11999, 5625, 0, 0},
+ {28065, 19570, 13241, 0, 0}, {1682, 62, 20, 0, 0},
+ {9915, 866, 185, 0, 0}, {18009, 4582, 1349, 0, 0},
+ {23484, 10386, 4420, 0, 0}, {27183, 17576, 10900, 0, 0},
+ {4477, 116, 22, 0, 0}, {12919, 661, 197, 0, 0},
+ {17934, 5950, 3554, 0, 0}, {22462, 10174, 4096, 0, 0},
+ {26153, 15384, 9384, 0, 0}, {3821, 164, 23, 0, 0},
+ {7143, 479, 122, 0, 0}, {14010, 4096, 1365, 0, 0},
+ {22751, 9338, 4245, 0, 0}, {25906, 17499, 10637, 0, 0},
+ {8835, 259, 29, 0, 0}, {12841, 1273, 137, 0, 0},
+ {20865, 6745, 2147, 0, 0}, {25742, 12674, 5516, 0, 0},
+ {26770, 14662, 8331, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{28312, 21494, 17235, 0, 0}, {11549, 3689, 1152, 0, 0},
+ {21595, 8994, 4201, 0, 0}, {25486, 14475, 8505, 0, 0},
+ {27878, 19482, 13653, 0, 0}, {30878, 27260, 24109, 0, 0},
+ {6117, 632, 121, 0, 0}, {18138, 4514, 1313, 0, 0},
+ {24052, 11481, 5373, 0, 0}, {27153, 17437, 10760, 0, 0},
+ {30093, 25068, 20618, 0, 0}, {2814, 242, 78, 0, 0},
+ {16642, 3786, 1135, 0, 0}, {23738, 11407, 5416, 0, 0},
+ {27357, 17975, 11497, 0, 0}, {29825, 24346, 19605, 0, 0},
+ {3229, 167, 38, 0, 0}, {14643, 2383, 567, 0, 0},
+ {22346, 8678, 3300, 0, 0}, {26300, 15281, 8330, 0, 0},
+ {29798, 24115, 19237, 0, 0}, {1856, 53, 20, 0, 0},
+ {12102, 1395, 271, 0, 0}, {20259, 6128, 1851, 0, 0},
+ {24710, 12139, 5478, 0, 0}, {28537, 20762, 14716, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{22566, 12135, 7284, 0, 0}, {5432, 1323, 416, 0, 0},
+ {20348, 8384, 4216, 0, 0}, {25120, 14653, 8912, 0, 0},
+ {27106, 18427, 12866, 0, 0}, {29157, 22440, 17378, 0, 0},
+ {1823, 152, 32, 0, 0}, {14086, 2263, 515, 0, 0},
+ {21255, 7432, 2565, 0, 0}, {25319, 13316, 6620, 0, 0},
+ {28286, 19717, 13882, 0, 0}, {746, 78, 21, 0, 0},
+ {14190, 2267, 622, 0, 0}, {21519, 9400, 4137, 0, 0},
+ {27123, 15810, 10610, 0, 0}, {27759, 21324, 16131, 0, 0},
+ {1411, 58, 20, 0, 0}, {11216, 1274, 264, 0, 0},
+ {18877, 5091, 1428, 0, 0}, {23717, 10670, 4596, 0, 0},
+ {27578, 19391, 13282, 0, 0}, {404, 28, 20, 0, 0},
+ {7929, 861, 217, 0, 0}, {15608, 3989, 1072, 0, 0},
+ {20316, 8631, 3166, 0, 0}, {26603, 17379, 10291, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{30193, 25487, 21691, 0, 0}, {18766, 11902, 7366, 0, 0},
+ {26425, 17712, 13110, 0, 0}, {28294, 20910, 15727, 0, 0},
+ {29903, 24469, 20234, 0, 0}, {31424, 28819, 26377, 0, 0},
+ {8048, 1529, 309, 0, 0}, {20183, 7412, 2800, 0, 0},
+ {25587, 14522, 8324, 0, 0}, {27743, 19101, 12883, 0, 0},
+ {30247, 25464, 21163, 0, 0}, {2860, 516, 184, 0, 0},
+ {15347, 3612, 1193, 0, 0}, {22879, 10580, 4986, 0, 0},
+ {26890, 17121, 10645, 0, 0}, {29954, 24103, 19445, 0, 0},
+ {2585, 200, 55, 0, 0}, {14240, 2573, 719, 0, 0},
+ {21786, 8162, 3111, 0, 0}, {25811, 14603, 7537, 0, 0},
+ {29260, 22650, 17300, 0, 0}, {1007, 32, 20, 0, 0},
+ {11727, 1440, 222, 0, 0}, {20200, 6036, 1602, 0, 0},
+ {24716, 12048, 5035, 0, 0}, {28432, 20576, 14372, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}},
+ {{{{25706, 16296, 10449, 0, 0}, {8230, 507, 94, 0, 0},
+ {19093, 4727, 989, 0, 0}, {24178, 12094, 5137, 0, 0},
+ {27083, 18093, 10755, 0, 0}, {29113, 22870, 17037, 0, 0},
+ {6275, 350, 110, 0, 0}, {16392, 3426, 678, 0, 0},
+ {22174, 10119, 3798, 0, 0}, {24592, 15598, 8465, 0, 0},
+ {27163, 20074, 13629, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {8880, 866, 226, 0, 0},
+ {14156, 3081, 781, 0, 0}, {16523, 7916, 3519, 0, 0},
+ {17003, 10160, 5209, 0, 0}, {12873, 8069, 5258, 0, 0},
+ {4367, 556, 311, 0, 0}, {17494, 4943, 1788, 0, 0},
+ {23404, 14640, 8436, 0, 0}, {30485, 24575, 17686, 0, 0},
+ {31540, 28796, 24887, 0, 0}, {3313, 299, 148, 0, 0},
+ {14787, 4523, 1380, 0, 0}, {21847, 12670, 6528, 0, 0},
+ {29025, 20939, 14111, 0, 0}, {30394, 23175, 17053, 0, 0},
+ {1700, 302, 133, 0, 0}, {12447, 3196, 797, 0, 0},
+ {21997, 12513, 5649, 0, 0}, {29973, 22358, 15407, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{23448, 10666, 4928, 0, 0}, {5711, 304, 44, 0, 0},
+ {16437, 2500, 459, 0, 0}, {22449, 8833, 3048, 0, 0},
+ {26579, 16320, 8662, 0, 0}, {29179, 21884, 13960, 0, 0},
+ {3742, 144, 20, 0, 0}, {13542, 1261, 181, 0, 0},
+ {20076, 5847, 1565, 0, 0}, {25719, 13236, 5133, 0, 0},
+ {25041, 17099, 9516, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {4712, 143, 20, 0, 0},
+ {10385, 693, 99, 0, 0}, {17351, 5670, 1019, 0, 0},
+ {14641, 6275, 5578, 0, 0}, {27307, 16384, 10923, 0, 0},
+ {4786, 677, 184, 0, 0}, {13723, 2900, 796, 0, 0},
+ {22371, 10502, 4836, 0, 0}, {26778, 19071, 11268, 0, 0},
+ {30976, 25856, 17664, 0, 0}, {4570, 267, 50, 0, 0},
+ {11234, 1247, 199, 0, 0}, {21659, 7551, 2751, 0, 0},
+ {27097, 17644, 6617, 0, 0}, {28087, 18725, 14043, 0, 0},
+ {4080, 188, 27, 0, 0}, {10192, 689, 107, 0, 0},
+ {22141, 10627, 4428, 0, 0}, {23406, 18725, 4681, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{25014, 15820, 10626, 0, 0}, {7098, 438, 77, 0, 0},
+ {17105, 3543, 774, 0, 0}, {22890, 9480, 3610, 0, 0},
+ {26349, 15680, 8432, 0, 0}, {28909, 21765, 15729, 0, 0},
+ {5206, 173, 43, 0, 0}, {15193, 2180, 369, 0, 0},
+ {21949, 7930, 2459, 0, 0}, {25644, 14082, 6852, 0, 0},
+ {28289, 20080, 13428, 0, 0}, {4383, 292, 95, 0, 0},
+ {17462, 3763, 830, 0, 0}, {23831, 11153, 4446, 0, 0},
+ {26786, 17165, 9982, 0, 0}, {29148, 22501, 16632, 0, 0},
+ {5488, 304, 101, 0, 0}, {17161, 3608, 764, 0, 0},
+ {23677, 10633, 4028, 0, 0}, {26536, 16136, 8748, 0, 0},
+ {28721, 21391, 15096, 0, 0}, {3548, 138, 50, 0, 0},
+ {13118, 1548, 306, 0, 0}, {19718, 6456, 1941, 0, 0},
+ {23540, 11898, 5300, 0, 0}, {26622, 17619, 10797, 0, 0},
+ {2599, 287, 145, 0, 0}, {15556, 3457, 1214, 0, 0},
+ {22857, 11457, 5886, 0, 0}, {28281, 19454, 12396, 0, 0},
+ {30198, 24996, 19879, 0, 0}, {1844, 155, 60, 0, 0},
+ {13278, 2562, 661, 0, 0}, {21536, 8770, 3492, 0, 0},
+ {25999, 14813, 7733, 0, 0}, {28370, 20145, 13554, 0, 0},
+ {2159, 141, 46, 0, 0}, {13398, 2186, 481, 0, 0},
+ {22311, 9149, 3359, 0, 0}, {26325, 15131, 7934, 0, 0},
+ {28123, 19532, 12662, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24142, 12497, 6552, 0, 0}, {6061, 362, 57, 0, 0},
+ {15769, 2439, 482, 0, 0}, {21323, 7645, 2482, 0, 0},
+ {26357, 13940, 7167, 0, 0}, {25967, 20310, 12520, 0, 0},
+ {2850, 86, 20, 0, 0}, {12119, 1029, 150, 0, 0},
+ {19889, 4995, 1187, 0, 0}, {24872, 11017, 4524, 0, 0},
+ {27508, 17898, 9070, 0, 0}, {3516, 175, 37, 0, 0},
+ {15696, 2308, 474, 0, 0}, {22115, 8625, 3403, 0, 0},
+ {26232, 15278, 8785, 0, 0}, {27839, 19598, 12683, 0, 0},
+ {4631, 250, 53, 0, 0}, {14597, 1984, 361, 0, 0},
+ {21331, 7332, 2309, 0, 0}, {25516, 14234, 6592, 0, 0},
+ {28642, 19415, 11790, 0, 0}, {1606, 42, 20, 0, 0},
+ {9751, 546, 67, 0, 0}, {17139, 3535, 722, 0, 0},
+ {23381, 10147, 3288, 0, 0}, {25846, 15152, 7758, 0, 0},
+ {3930, 503, 154, 0, 0}, {13067, 2562, 848, 0, 0},
+ {21554, 10358, 4835, 0, 0}, {27448, 18591, 9734, 0, 0},
+ {27719, 19887, 14941, 0, 0}, {5284, 297, 34, 0, 0},
+ {11692, 1242, 207, 0, 0}, {20061, 6465, 1557, 0, 0},
+ {24599, 11046, 4549, 0, 0}, {26723, 13362, 5726, 0, 0},
+ {5015, 196, 23, 0, 0}, {11936, 890, 115, 0, 0},
+ {19518, 5412, 1094, 0, 0}, {25050, 11260, 2910, 0, 0},
+ {25559, 14418, 7209, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{24892, 15867, 11027, 0, 0}, {8767, 870, 143, 0, 0},
+ {18239, 4809, 1317, 0, 0}, {24495, 11950, 5510, 0, 0},
+ {27490, 18095, 11258, 0, 0}, {29785, 23925, 18729, 0, 0},
+ {4752, 194, 36, 0, 0}, {15297, 2462, 467, 0, 0},
+ {22544, 8705, 3040, 0, 0}, {26166, 14814, 7716, 0, 0},
+ {28766, 21183, 15009, 0, 0}, {2578, 134, 29, 0, 0},
+ {15271, 2486, 498, 0, 0}, {22539, 9039, 3230, 0, 0},
+ {26424, 15557, 8328, 0, 0}, {28919, 21579, 15660, 0, 0},
+ {4198, 185, 42, 0, 0}, {15247, 2607, 530, 0, 0},
+ {22615, 9203, 3390, 0, 0}, {26313, 15427, 8325, 0, 0},
+ {28861, 21726, 15744, 0, 0}, {2079, 53, 20, 0, 0},
+ {11222, 928, 158, 0, 0}, {19221, 5187, 1309, 0, 0},
+ {23856, 11011, 4459, 0, 0}, {27220, 17688, 10722, 0, 0},
+ {1985, 228, 83, 0, 0}, {15228, 3240, 1100, 0, 0},
+ {22608, 11300, 5985, 0, 0}, {28044, 19375, 12714, 0, 0},
+ {30066, 24594, 19666, 0, 0}, {1120, 82, 26, 0, 0},
+ {11814, 1674, 431, 0, 0}, {20348, 7070, 2589, 0, 0},
+ {25464, 13448, 6520, 0, 0}, {28402, 20507, 13904, 0, 0},
+ {1187, 45, 20, 0, 0}, {11395, 1182, 243, 0, 0},
+ {20024, 6143, 1883, 0, 0}, {25337, 12446, 5818, 0, 0},
+ {28076, 19445, 12657, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24935, 14399, 8673, 0, 0}, {6118, 495, 66, 0, 0},
+ {16397, 2807, 577, 0, 0}, {21713, 8686, 3139, 0, 0},
+ {25876, 14124, 7368, 0, 0}, {27762, 19711, 13528, 0, 0},
+ {2934, 102, 20, 0, 0}, {13191, 1433, 198, 0, 0},
+ {20515, 6259, 1646, 0, 0}, {24777, 11996, 5057, 0, 0},
+ {27091, 16858, 9709, 0, 0}, {2659, 236, 48, 0, 0},
+ {16021, 2602, 516, 0, 0}, {22634, 9226, 3584, 0, 0},
+ {26977, 16592, 9212, 0, 0}, {28406, 22354, 15484, 0, 0},
+ {3276, 142, 20, 0, 0}, {12874, 1366, 243, 0, 0},
+ {19826, 5697, 1899, 0, 0}, {24422, 11552, 5363, 0, 0},
+ {26196, 15681, 8909, 0, 0}, {733, 33, 20, 0, 0},
+ {9811, 930, 150, 0, 0}, {18044, 4196, 996, 0, 0},
+ {22404, 8769, 3215, 0, 0}, {25764, 14335, 7113, 0, 0},
+ {5240, 491, 87, 0, 0}, {15809, 1597, 672, 0, 0},
+ {22282, 9175, 4806, 0, 0}, {24576, 16384, 9557, 0, 0},
+ {23831, 14895, 11916, 0, 0}, {5053, 766, 153, 0, 0},
+ {17695, 3277, 1092, 0, 0}, {21504, 8192, 4096, 0, 0},
+ {30427, 14043, 9362, 0, 0}, {25486, 14564, 7282, 0, 0},
+ {4221, 555, 111, 0, 0}, {11980, 2995, 529, 0, 0},
+ {25988, 11299, 2260, 0, 0}, {26810, 17873, 8937, 0, 0},
+ {16384, 10923, 5461, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{26776, 18464, 13003, 0, 0}, {10156, 1530, 312, 0, 0},
+ {19312, 5606, 1681, 0, 0}, {24767, 12706, 6264, 0, 0},
+ {27600, 18663, 12004, 0, 0}, {30136, 24997, 20383, 0, 0},
+ {5734, 424, 59, 0, 0}, {16918, 3353, 771, 0, 0},
+ {23274, 9992, 3927, 0, 0}, {26617, 15938, 8799, 0, 0},
+ {29307, 22729, 17046, 0, 0}, {2634, 199, 37, 0, 0},
+ {17130, 3346, 823, 0, 0}, {23618, 10903, 4550, 0, 0},
+ {27121, 17049, 10092, 0, 0}, {29366, 22996, 17291, 0, 0},
+ {4238, 182, 33, 0, 0}, {15629, 2470, 476, 0, 0},
+ {22568, 8729, 3083, 0, 0}, {26349, 15094, 7982, 0, 0},
+ {29224, 22543, 16944, 0, 0}, {1435, 42, 20, 0, 0},
+ {12150, 1281, 224, 0, 0}, {19867, 5551, 1536, 0, 0},
+ {24144, 11034, 4597, 0, 0}, {27664, 18577, 12020, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{21562, 11678, 6207, 0, 0}, {4009, 489, 97, 0, 0},
+ {18597, 4816, 1199, 0, 0}, {23025, 9861, 3627, 0, 0},
+ {25897, 14882, 7900, 0, 0}, {27808, 19616, 13453, 0, 0},
+ {1691, 107, 20, 0, 0}, {13368, 1573, 253, 0, 0},
+ {20016, 5910, 1728, 0, 0}, {24398, 10670, 4177, 0, 0},
+ {27311, 17395, 10470, 0, 0}, {1071, 62, 20, 0, 0},
+ {14908, 2111, 435, 0, 0}, {20258, 7956, 3507, 0, 0},
+ {26588, 13644, 8046, 0, 0}, {27727, 19220, 14809, 0, 0},
+ {1216, 52, 20, 0, 0}, {10860, 999, 145, 0, 0},
+ {18298, 4567, 1203, 0, 0}, {23275, 9786, 4160, 0, 0},
+ {25910, 15528, 8631, 0, 0}, {225, 16, 12, 0, 0},
+ {8482, 671, 102, 0, 0}, {16810, 3551, 744, 0, 0},
+ {22561, 8534, 2810, 0, 0}, {25839, 14463, 7116, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{28631, 21921, 17086, 0, 0}, {14944, 5767, 2710, 0, 0},
+ {22564, 9972, 4477, 0, 0}, {26692, 16833, 10643, 0, 0},
+ {28916, 21831, 15952, 0, 0}, {30516, 26444, 22637, 0, 0},
+ {6928, 752, 106, 0, 0}, {17659, 4500, 1237, 0, 0},
+ {23383, 10537, 4428, 0, 0}, {26686, 16096, 9289, 0, 0},
+ {29450, 23341, 18087, 0, 0}, {2174, 194, 50, 0, 0},
+ {15932, 3216, 909, 0, 0}, {23212, 10226, 4412, 0, 0},
+ {26463, 16043, 9228, 0, 0}, {29392, 22873, 17584, 0, 0},
+ {3385, 151, 23, 0, 0}, {13877, 1959, 367, 0, 0},
+ {21080, 6826, 2081, 0, 0}, {25300, 13299, 6117, 0, 0},
+ {28859, 21410, 15756, 0, 0}, {1204, 32, 20, 0, 0},
+ {11862, 1157, 168, 0, 0}, {19577, 5147, 1231, 0, 0},
+ {24000, 10739, 4092, 0, 0}, {27689, 18659, 11862, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}}};
+
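+// Reader's note (not part of the upstream source): every CDF entry in these
+// tables is stored as 32768 minus the cumulative probability, so the entry for
+// the last symbol is always 0, and one extra trailing slot holds the
+// adaptation counter. For example, {24576, 16384, 8192, 0, 0} describes a
+// uniform 4-symbol distribution (each symbol has probability 8192/32768 =
+// 0.25) with a counter of 0; it appears throughout these tables wherever a
+// uniform default is used.
+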
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseRangeCdf
+ [kCoefficientQuantizerContexts][kNumSquareTransformSizes][kNumPlaneTypes]
+ [kCoeffBaseRangeContexts][kCoeffBaseRangeSymbolCount + 1] = {
+ {{{{18470, 12050, 8594, 0, 0}, {20232, 13167, 8979, 0, 0},
+ {24056, 17717, 13265, 0, 0}, {26598, 21441, 17334, 0, 0},
+ {28026, 23842, 20230, 0, 0}, {28965, 25451, 22222, 0, 0},
+ {31072, 29451, 27897, 0, 0}, {18376, 12817, 10012, 0, 0},
+ {16790, 9550, 5950, 0, 0}, {20581, 13294, 8879, 0, 0},
+ {23592, 17128, 12509, 0, 0}, {25700, 20113, 15740, 0, 0},
+ {27112, 22326, 18296, 0, 0}, {30188, 27776, 25524, 0, 0},
+ {20632, 14719, 11342, 0, 0}, {18984, 12047, 8287, 0, 0},
+ {21932, 15147, 10868, 0, 0}, {24396, 18324, 13921, 0, 0},
+ {26245, 20989, 16768, 0, 0}, {27431, 22870, 19008, 0, 0},
+ {29734, 26908, 24306, 0, 0}},
+ {{16801, 9863, 6482, 0, 0}, {19234, 12114, 8189, 0, 0},
+ {23264, 16676, 12233, 0, 0}, {25793, 20200, 15865, 0, 0},
+ {27404, 22677, 18748, 0, 0}, {28411, 24398, 20911, 0, 0},
+ {30262, 27834, 25550, 0, 0}, {9736, 3953, 1832, 0, 0},
+ {13228, 6064, 3049, 0, 0}, {17610, 9799, 5671, 0, 0},
+ {21360, 13903, 9118, 0, 0}, {23883, 17320, 12518, 0, 0},
+ {25660, 19915, 15352, 0, 0}, {28537, 24727, 21288, 0, 0},
+ {12945, 6278, 3612, 0, 0}, {13878, 6839, 3836, 0, 0},
+ {17108, 9277, 5335, 0, 0}, {20621, 12992, 8280, 0, 0},
+ {23040, 15994, 11119, 0, 0}, {24849, 18491, 13702, 0, 0},
+ {27328, 22598, 18583, 0, 0}}},
+ {{{18362, 11906, 8354, 0, 0}, {20944, 13861, 9659, 0, 0},
+ {24511, 18375, 13965, 0, 0}, {26908, 22021, 17990, 0, 0},
+ {28293, 24282, 20784, 0, 0}, {29162, 25814, 22725, 0, 0},
+ {31032, 29358, 27720, 0, 0}, {18338, 12722, 9886, 0, 0},
+ {17175, 9869, 6059, 0, 0}, {20666, 13400, 8957, 0, 0},
+ {23709, 17184, 12506, 0, 0}, {25769, 20165, 15720, 0, 0},
+ {27084, 22271, 18215, 0, 0}, {29946, 27330, 24906, 0, 0},
+ {16983, 11183, 8409, 0, 0}, {14421, 7539, 4502, 0, 0},
+ {17794, 10281, 6379, 0, 0}, {21345, 14087, 9497, 0, 0},
+ {23905, 17418, 12760, 0, 0}, {25615, 19916, 15490, 0, 0},
+ {29061, 25732, 22786, 0, 0}},
+ {{17308, 11072, 7299, 0, 0}, {20598, 13519, 9577, 0, 0},
+ {24045, 17741, 13436, 0, 0}, {26340, 21064, 16894, 0, 0},
+ {27846, 23476, 19716, 0, 0}, {28629, 25073, 21758, 0, 0},
+ {30477, 28260, 26170, 0, 0}, {12912, 5848, 2940, 0, 0},
+ {14845, 7479, 3976, 0, 0}, {18490, 10800, 6471, 0, 0},
+ {21858, 14632, 9818, 0, 0}, {24345, 17953, 13141, 0, 0},
+ {25997, 20485, 15994, 0, 0}, {28694, 25018, 21687, 0, 0},
+ {12916, 6694, 4096, 0, 0}, {13397, 6658, 3779, 0, 0},
+ {16503, 8895, 5105, 0, 0}, {20010, 12390, 7816, 0, 0},
+ {22673, 15670, 10807, 0, 0}, {24518, 18140, 13317, 0, 0},
+ {27563, 23023, 19146, 0, 0}}},
+ {{{22205, 16535, 13005, 0, 0}, {22974, 16746, 12964, 0, 0},
+ {26018, 20823, 17009, 0, 0}, {27805, 23582, 20016, 0, 0},
+ {28923, 25333, 22141, 0, 0}, {29717, 26683, 23934, 0, 0},
+ {31457, 30172, 28938, 0, 0}, {21522, 16364, 13079, 0, 0},
+ {20453, 13857, 10037, 0, 0}, {22211, 15673, 11479, 0, 0},
+ {24632, 18762, 14519, 0, 0}, {26420, 21294, 17203, 0, 0},
+ {27572, 23113, 19368, 0, 0}, {30419, 28242, 26181, 0, 0},
+ {19431, 14038, 11199, 0, 0}, {13462, 6697, 3886, 0, 0},
+ {16816, 9228, 5514, 0, 0}, {20359, 12834, 8338, 0, 0},
+ {23008, 16062, 11379, 0, 0}, {24764, 18548, 13950, 0, 0},
+ {28630, 24974, 21807, 0, 0}},
+ {{21898, 16084, 11819, 0, 0}, {23104, 17538, 14088, 0, 0},
+ {25882, 20659, 17360, 0, 0}, {27943, 23868, 20463, 0, 0},
+ {29138, 25606, 22454, 0, 0}, {29732, 26339, 23381, 0, 0},
+ {31097, 29472, 27828, 0, 0}, {18949, 13609, 9742, 0, 0},
+ {20784, 13660, 9648, 0, 0}, {22078, 15558, 11105, 0, 0},
+ {24784, 18614, 14435, 0, 0}, {25900, 20474, 16644, 0, 0},
+ {27494, 23774, 19900, 0, 0}, {29780, 26997, 24344, 0, 0},
+ {13032, 6121, 3627, 0, 0}, {13835, 6698, 3784, 0, 0},
+ {16989, 9720, 5568, 0, 0}, {20130, 12707, 8236, 0, 0},
+ {22076, 15223, 10548, 0, 0}, {23551, 17517, 12714, 0, 0},
+ {27690, 23484, 20174, 0, 0}}},
+ {{{30437, 29106, 27524, 0, 0}, {29877, 27997, 26623, 0, 0},
+ {28170, 25145, 23039, 0, 0}, {29248, 25923, 23569, 0, 0},
+ {29351, 26649, 23444, 0, 0}, {30167, 27356, 25383, 0, 0},
+ {32168, 31595, 31024, 0, 0}, {25096, 19482, 15299, 0, 0},
+ {28536, 24976, 21975, 0, 0}, {29853, 27451, 25371, 0, 0},
+ {30450, 28412, 26616, 0, 0}, {30641, 28768, 27214, 0, 0},
+ {30918, 29290, 27493, 0, 0}, {31791, 30835, 29925, 0, 0},
+ {14488, 8381, 4779, 0, 0}, {16916, 10097, 6583, 0, 0},
+ {18923, 11817, 7979, 0, 0}, {21713, 14802, 10639, 0, 0},
+ {23630, 17346, 12967, 0, 0}, {25314, 19623, 15312, 0, 0},
+ {29398, 26375, 23755, 0, 0}},
+ {{26926, 23539, 21930, 0, 0}, {30455, 29277, 28492, 0, 0},
+ {29770, 26664, 25272, 0, 0}, {30348, 25321, 22900, 0, 0},
+ {29734, 24273, 21845, 0, 0}, {28692, 23831, 21793, 0, 0},
+ {31682, 30398, 29469, 0, 0}, {23054, 15514, 12324, 0, 0},
+ {24225, 19070, 15645, 0, 0}, {27850, 23761, 20858, 0, 0},
+ {28639, 25236, 22215, 0, 0}, {30404, 27235, 24710, 0, 0},
+ {30934, 29222, 27205, 0, 0}, {31295, 29860, 28635, 0, 0},
+ {17363, 11575, 7149, 0, 0}, {17077, 10816, 6207, 0, 0},
+ {19806, 13574, 8603, 0, 0}, {22496, 14913, 10639, 0, 0},
+ {24180, 17498, 12050, 0, 0}, {24086, 18099, 13268, 0, 0},
+ {27898, 23132, 19563, 0, 0}}},
+ {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}}}},
+ {{{{17773, 11427, 8019, 0, 0}, {19610, 12479, 8167, 0, 0},
+ {23827, 17442, 12892, 0, 0}, {26471, 21227, 16961, 0, 0},
+ {27951, 23739, 19992, 0, 0}, {29037, 25495, 22141, 0, 0},
+ {30921, 29151, 27414, 0, 0}, {18296, 13109, 10425, 0, 0},
+ {15962, 8606, 5235, 0, 0}, {19868, 12364, 8055, 0, 0},
+ {23357, 16656, 11971, 0, 0}, {25712, 20071, 15620, 0, 0},
+ {27224, 22429, 18308, 0, 0}, {29814, 27064, 24449, 0, 0},
+ {20304, 14697, 11414, 0, 0}, {17286, 10240, 6734, 0, 0},
+ {20698, 13499, 9144, 0, 0}, {23815, 17362, 12662, 0, 0},
+ {25741, 20038, 15548, 0, 0}, {26881, 21855, 17628, 0, 0},
+ {28975, 25490, 22321, 0, 0}},
+ {{17197, 10536, 7019, 0, 0}, {18262, 11193, 7394, 0, 0},
+ {22579, 15679, 11199, 0, 0}, {25452, 19467, 14853, 0, 0},
+ {26985, 21856, 17578, 0, 0}, {28008, 23613, 19680, 0, 0},
+ {29775, 26802, 23994, 0, 0}, {9344, 3865, 1990, 0, 0},
+ {11993, 5102, 2478, 0, 0}, {16294, 8358, 4469, 0, 0},
+ {20297, 12588, 7781, 0, 0}, {23358, 16281, 11329, 0, 0},
+ {25232, 19154, 14239, 0, 0}, {27720, 23182, 19219, 0, 0},
+ {11678, 5478, 3012, 0, 0}, {11972, 5366, 2742, 0, 0},
+ {14949, 7283, 3799, 0, 0}, {18908, 10859, 6306, 0, 0},
+ {21766, 14274, 9239, 0, 0}, {23815, 16839, 11871, 0, 0},
+ {26320, 20850, 16314, 0, 0}}},
+ {{{16769, 10560, 7319, 0, 0}, {19718, 12780, 8646, 0, 0},
+ {24174, 17904, 13390, 0, 0}, {26735, 21689, 17530, 0, 0},
+ {28214, 24085, 20421, 0, 0}, {29096, 25629, 22431, 0, 0},
+ {30868, 28997, 27192, 0, 0}, {16980, 11428, 8819, 0, 0},
+ {15943, 8533, 5010, 0, 0}, {19895, 12366, 7958, 0, 0},
+ {23178, 16405, 11674, 0, 0}, {25416, 19559, 15035, 0, 0},
+ {26808, 21779, 17584, 0, 0}, {29536, 26534, 23761, 0, 0},
+ {17007, 12052, 9544, 0, 0}, {13450, 6779, 4009, 0, 0},
+ {17239, 9674, 5839, 0, 0}, {21106, 13779, 9127, 0, 0},
+ {23813, 17200, 12402, 0, 0}, {25487, 19662, 15060, 0, 0},
+ {28520, 24709, 21328, 0, 0}},
+ {{17869, 11551, 8265, 0, 0}, {19249, 12485, 8721, 0, 0},
+ {23339, 16802, 12403, 0, 0}, {26068, 20413, 16116, 0, 0},
+ {27680, 23064, 19052, 0, 0}, {28525, 24614, 21037, 0, 0},
+ {30066, 27404, 24907, 0, 0}, {10023, 4380, 2314, 0, 0},
+ {12533, 5622, 2846, 0, 0}, {16872, 9053, 5131, 0, 0},
+ {20928, 13418, 8637, 0, 0}, {23646, 16836, 11888, 0, 0},
+ {25280, 19187, 14406, 0, 0}, {27654, 23200, 19398, 0, 0},
+ {11923, 6215, 3836, 0, 0}, {11787, 5396, 2884, 0, 0},
+ {14987, 7433, 3983, 0, 0}, {19008, 11060, 6471, 0, 0},
+ {21793, 14353, 9403, 0, 0}, {23723, 16979, 12082, 0, 0},
+ {26638, 21569, 17345, 0, 0}}},
+ {{{19219, 13044, 9610, 0, 0}, {20924, 14386, 10522, 0, 0},
+ {24849, 19149, 14995, 0, 0}, {27282, 22625, 18822, 0, 0},
+ {28602, 24785, 21444, 0, 0}, {29404, 26262, 23341, 0, 0},
+ {31170, 29608, 28094, 0, 0}, {17487, 11789, 8987, 0, 0},
+ {17829, 10649, 6816, 0, 0}, {21405, 14361, 9956, 0, 0},
+ {24159, 17911, 13398, 0, 0}, {26031, 20584, 16288, 0, 0},
+ {27262, 22505, 18506, 0, 0}, {29778, 26982, 24388, 0, 0},
+ {12519, 7515, 5351, 0, 0}, {11698, 5250, 2767, 0, 0},
+ {15914, 8299, 4694, 0, 0}, {19904, 12282, 7768, 0, 0},
+ {22806, 15790, 10990, 0, 0}, {24694, 18430, 13720, 0, 0},
+ {28274, 24289, 20862, 0, 0}},
+ {{18808, 13151, 9939, 0, 0}, {21618, 15427, 11540, 0, 0},
+ {25618, 19804, 15578, 0, 0}, {27437, 22766, 18901, 0, 0},
+ {28601, 25024, 21711, 0, 0}, {29288, 26139, 23122, 0, 0},
+ {30885, 28984, 27082, 0, 0}, {14016, 7108, 3856, 0, 0},
+ {15800, 8182, 4738, 0, 0}, {19248, 11713, 7455, 0, 0},
+ {22315, 15142, 10488, 0, 0}, {24382, 18263, 13652, 0, 0},
+ {26026, 20173, 15760, 0, 0}, {28495, 24628, 21269, 0, 0},
+ {10648, 4941, 2535, 0, 0}, {12205, 5410, 2873, 0, 0},
+ {15692, 8124, 4615, 0, 0}, {19406, 11826, 7459, 0, 0},
+ {21974, 14803, 10073, 0, 0}, {23754, 17116, 12449, 0, 0},
+ {27060, 22256, 18271, 0, 0}}},
+ {{{27063, 21838, 17043, 0, 0}, {24822, 20003, 16653, 0, 0},
+ {25967, 20645, 16542, 0, 0}, {27306, 22633, 18568, 0, 0},
+ {28579, 24757, 21261, 0, 0}, {29577, 26539, 23360, 0, 0},
+ {31711, 30631, 29556, 0, 0}, {22750, 15701, 11277, 0, 0},
+ {25388, 20186, 16315, 0, 0}, {26700, 21923, 18429, 0, 0},
+ {27670, 23570, 20213, 0, 0}, {28456, 24758, 21649, 0, 0},
+ {29068, 25802, 22987, 0, 0}, {31075, 29442, 27881, 0, 0},
+ {14011, 7838, 4994, 0, 0}, {15120, 8172, 4951, 0, 0},
+ {18061, 10716, 6742, 0, 0}, {21048, 13916, 9476, 0, 0},
+ {23411, 16816, 12243, 0, 0}, {24958, 19015, 14558, 0, 0},
+ {28889, 25435, 22440, 0, 0}},
+ {{24490, 19526, 16846, 0, 0}, {22221, 16901, 13849, 0, 0},
+ {23662, 16926, 12159, 0, 0}, {25935, 19761, 15550, 0, 0},
+ {27957, 23056, 18845, 0, 0}, {28783, 25416, 21640, 0, 0},
+ {31080, 29310, 27506, 0, 0}, {19817, 10907, 6258, 0, 0},
+ {22980, 16724, 12492, 0, 0}, {26459, 21524, 17898, 0, 0},
+ {27585, 23419, 20202, 0, 0}, {28379, 24539, 21276, 0, 0},
+ {29135, 25823, 22148, 0, 0}, {29168, 25921, 22861, 0, 0},
+ {11020, 4631, 2513, 0, 0}, {13332, 6187, 3208, 0, 0},
+ {16409, 8567, 4815, 0, 0}, {18807, 11075, 6897, 0, 0},
+ {21224, 14082, 9446, 0, 0}, {23396, 16306, 11816, 0, 0},
+ {26630, 21558, 17378, 0, 0}}},
+ {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}}}},
+ {{{{16630, 10545, 7259, 0, 0}, {17421, 10338, 6436, 0, 0},
+ {23154, 16032, 11436, 0, 0}, {26168, 20493, 15861, 0, 0},
+ {27957, 23344, 19221, 0, 0}, {29020, 24959, 21348, 0, 0},
+ {30514, 28181, 25878, 0, 0}, {17572, 12484, 9591, 0, 0},
+ {14451, 7299, 4317, 0, 0}, {18850, 11117, 6926, 0, 0},
+ {22716, 15618, 10773, 0, 0}, {25269, 19138, 14181, 0, 0},
+ {26610, 21351, 16765, 0, 0}, {28754, 24983, 21516, 0, 0},
+ {17720, 11701, 8384, 0, 0}, {14566, 7422, 4215, 0, 0},
+ {18466, 10749, 6412, 0, 0}, {21929, 14629, 9602, 0, 0},
+ {24053, 17024, 11962, 0, 0}, {25232, 19192, 14224, 0, 0},
+ {27355, 22433, 18270, 0, 0}},
+ {{15374, 8267, 4873, 0, 0}, {16879, 9348, 5583, 0, 0},
+ {21207, 13635, 8898, 0, 0}, {24483, 17956, 12924, 0, 0},
+ {26272, 20725, 16218, 0, 0}, {27997, 23194, 19091, 0, 0},
+ {29165, 25938, 22624, 0, 0}, {11112, 5064, 2568, 0, 0},
+ {11444, 4853, 2257, 0, 0}, {15441, 7432, 3771, 0, 0},
+ {19351, 11387, 6735, 0, 0}, {22636, 15343, 10430, 0, 0},
+ {24188, 17752, 13135, 0, 0}, {27074, 21291, 16357, 0, 0},
+ {8652, 2988, 1318, 0, 0}, {8915, 3073, 1177, 0, 0},
+ {12683, 5154, 2340, 0, 0}, {17442, 8433, 4193, 0, 0},
+ {20954, 13296, 7958, 0, 0}, {22547, 14157, 8001, 0, 0},
+ {25079, 18210, 12447, 0, 0}}},
+ {{{16554, 10388, 6998, 0, 0}, {18555, 11464, 7473, 0, 0},
+ {23555, 16945, 12313, 0, 0}, {26373, 21010, 16629, 0, 0},
+ {27989, 23581, 19702, 0, 0}, {28947, 25267, 21815, 0, 0},
+ {30475, 28201, 25973, 0, 0}, {16909, 11485, 8948, 0, 0},
+ {14364, 7166, 4042, 0, 0}, {18443, 10788, 6562, 0, 0},
+ {22099, 14831, 10048, 0, 0}, {24471, 18126, 13321, 0, 0},
+ {26022, 20379, 15875, 0, 0}, {28444, 24517, 20998, 0, 0},
+ {16236, 11137, 8293, 0, 0}, {12101, 5618, 3100, 0, 0},
+ {16040, 8258, 4593, 0, 0}, {19907, 12123, 7436, 0, 0},
+ {22692, 15407, 10351, 0, 0}, {24373, 17828, 12805, 0, 0},
+ {27037, 22085, 17856, 0, 0}},
+ {{18335, 11613, 7830, 0, 0}, {18110, 11052, 7223, 0, 0},
+ {22845, 15944, 11211, 0, 0}, {25786, 19716, 15047, 0, 0},
+ {27349, 22265, 17718, 0, 0}, {27916, 23606, 19754, 0, 0},
+ {29497, 26373, 23138, 0, 0}, {10558, 4935, 2659, 0, 0},
+ {12018, 5400, 2947, 0, 0}, {15874, 7940, 4195, 0, 0},
+ {19521, 11492, 7011, 0, 0}, {22730, 15503, 10205, 0, 0},
+ {24181, 17821, 12441, 0, 0}, {27123, 21397, 17516, 0, 0},
+ {10741, 5242, 3054, 0, 0}, {9670, 3622, 1547, 0, 0},
+ {12882, 5427, 2496, 0, 0}, {17159, 9021, 4722, 0, 0},
+ {20775, 12703, 7829, 0, 0}, {23131, 14501, 9097, 0, 0},
+ {25143, 18967, 13624, 0, 0}}},
+ {{{18330, 11970, 8679, 0, 0}, {20147, 13565, 9671, 0, 0},
+ {24591, 18643, 14366, 0, 0}, {27094, 22267, 18312, 0, 0},
+ {28532, 24529, 21035, 0, 0}, {29321, 26018, 22962, 0, 0},
+ {30782, 28818, 26904, 0, 0}, {16560, 10669, 7838, 0, 0},
+ {16231, 8743, 5183, 0, 0}, {19988, 12387, 7901, 0, 0},
+ {23001, 16156, 11352, 0, 0}, {25082, 19030, 14370, 0, 0},
+ {26435, 21154, 16804, 0, 0}, {28827, 25197, 21932, 0, 0},
+ {9949, 5346, 3566, 0, 0}, {10544, 4254, 2047, 0, 0},
+ {15108, 7335, 3855, 0, 0}, {19194, 11286, 6766, 0, 0},
+ {22139, 14791, 9830, 0, 0}, {24156, 17470, 12503, 0, 0},
+ {27161, 22277, 18172, 0, 0}},
+ {{19199, 12968, 9562, 0, 0}, {19640, 12844, 8899, 0, 0},
+ {24439, 17927, 13365, 0, 0}, {26638, 21792, 17711, 0, 0},
+ {28086, 23929, 20250, 0, 0}, {29112, 25359, 22180, 0, 0},
+ {30191, 27669, 25356, 0, 0}, {10341, 4084, 2183, 0, 0},
+ {11855, 5018, 2629, 0, 0}, {16928, 8659, 4934, 0, 0},
+ {20460, 12739, 8199, 0, 0}, {22552, 15983, 11310, 0, 0},
+ {24459, 18565, 13655, 0, 0}, {26725, 21600, 17461, 0, 0},
+ {9602, 3867, 1770, 0, 0}, {10869, 4363, 2017, 0, 0},
+ {14355, 6677, 3325, 0, 0}, {17535, 9654, 5416, 0, 0},
+ {20085, 12296, 7480, 0, 0}, {22066, 14509, 9359, 0, 0},
+ {24643, 18304, 13542, 0, 0}}},
+ {{{23728, 17982, 14408, 0, 0}, {22789, 17050, 13353, 0, 0},
+ {24855, 18850, 14457, 0, 0}, {26909, 21879, 17584, 0, 0},
+ {28175, 24091, 20258, 0, 0}, {28948, 25372, 21977, 0, 0},
+ {31038, 29297, 27576, 0, 0}, {20965, 14403, 10059, 0, 0},
+ {21349, 14710, 10543, 0, 0}, {23350, 16994, 12525, 0, 0},
+ {25229, 19443, 15111, 0, 0}, {26535, 21451, 17384, 0, 0},
+ {27631, 23112, 19223, 0, 0}, {29791, 26994, 24419, 0, 0},
+ {11561, 5522, 3128, 0, 0}, {13221, 6190, 3271, 0, 0},
+ {16599, 8897, 5078, 0, 0}, {19948, 12310, 7750, 0, 0},
+ {22544, 15436, 10554, 0, 0}, {24242, 17720, 12884, 0, 0},
+ {27731, 23358, 19650, 0, 0}},
+ {{20429, 15439, 12628, 0, 0}, {19263, 12873, 9543, 0, 0},
+ {22921, 15824, 11204, 0, 0}, {25488, 19512, 14420, 0, 0},
+ {28056, 22759, 18314, 0, 0}, {28407, 24854, 20291, 0, 0},
+ {29898, 27140, 24773, 0, 0}, {12707, 7264, 4242, 0, 0},
+ {17533, 9890, 6623, 0, 0}, {19783, 12810, 8613, 0, 0},
+ {22986, 16127, 11365, 0, 0}, {23312, 16408, 12008, 0, 0},
+ {25913, 19828, 14211, 0, 0}, {27107, 22204, 17766, 0, 0},
+ {7112, 2166, 874, 0, 0}, {10198, 3661, 1676, 0, 0},
+ {13851, 6345, 3227, 0, 0}, {16828, 9119, 5014, 0, 0},
+ {19965, 12187, 7549, 0, 0}, {21686, 14073, 9392, 0, 0},
+ {24829, 18395, 13763, 0, 0}}},
+ {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}}}},
+ {{{{14453, 8479, 5217, 0, 0}, {15914, 8700, 4933, 0, 0},
+ {22628, 14841, 9595, 0, 0}, {26046, 19786, 14501, 0, 0},
+ {28107, 22942, 18062, 0, 0}, {28936, 24603, 20474, 0, 0},
+ {29973, 26670, 23523, 0, 0}, {15623, 9442, 6096, 0, 0},
+ {12035, 5088, 2460, 0, 0}, {16736, 8307, 4222, 0, 0},
+ {21115, 12675, 7687, 0, 0}, {23478, 16339, 10682, 0, 0},
+ {24972, 18170, 12786, 0, 0}, {26266, 20390, 15327, 0, 0},
+ {11087, 5036, 2448, 0, 0}, {10379, 3724, 1507, 0, 0},
+ {13741, 6037, 2681, 0, 0}, {18029, 9013, 4144, 0, 0},
+ {21410, 11990, 7257, 0, 0}, {21773, 14695, 8578, 0, 0},
+ {23606, 17778, 12151, 0, 0}},
+ {{11343, 4816, 2380, 0, 0}, {14706, 6930, 3734, 0, 0},
+ {20812, 12887, 7960, 0, 0}, {25050, 17768, 11788, 0, 0},
+ {27066, 21514, 16625, 0, 0}, {27870, 23680, 15904, 0, 0},
+ {29089, 25992, 20861, 0, 0}, {9474, 2608, 1105, 0, 0},
+ {8371, 2872, 932, 0, 0}, {13523, 5640, 2175, 0, 0},
+ {19566, 12943, 6364, 0, 0}, {21190, 13471, 8811, 0, 0},
+ {24695, 19471, 11398, 0, 0}, {27307, 21845, 13023, 0, 0},
+ {5401, 2247, 834, 0, 0}, {7864, 2097, 828, 0, 0},
+ {9693, 4308, 1469, 0, 0}, {18368, 9110, 2351, 0, 0},
+ {18883, 8886, 4443, 0, 0}, {18022, 9830, 4915, 0, 0},
+ {27307, 16384, 5461, 0, 0}}},
+ {{{14494, 7955, 4878, 0, 0}, {17231, 9619, 5765, 0, 0},
+ {23319, 16028, 10941, 0, 0}, {26068, 20270, 15507, 0, 0},
+ {27780, 22902, 18570, 0, 0}, {28532, 24621, 20866, 0, 0},
+ {29901, 26908, 24114, 0, 0}, {15644, 9597, 6667, 0, 0},
+ {12372, 5291, 2620, 0, 0}, {16195, 8139, 4276, 0, 0},
+ {20019, 11922, 7094, 0, 0}, {22535, 14890, 9950, 0, 0},
+ {24243, 17436, 12405, 0, 0}, {26485, 21136, 16513, 0, 0},
+ {12302, 6257, 3482, 0, 0}, {9709, 3594, 1577, 0, 0},
+ {13287, 5505, 2527, 0, 0}, {17310, 9137, 4631, 0, 0},
+ {20352, 12160, 7075, 0, 0}, {22507, 14757, 9507, 0, 0},
+ {24752, 18113, 13102, 0, 0}},
+ {{15152, 8182, 4656, 0, 0}, {16959, 9469, 5613, 0, 0},
+ {22001, 13878, 8975, 0, 0}, {25041, 18513, 13903, 0, 0},
+ {26639, 20842, 15886, 0, 0}, {28286, 23064, 17907, 0, 0},
+ {29491, 25316, 21246, 0, 0}, {9812, 4217, 2038, 0, 0},
+ {10044, 3831, 1807, 0, 0}, {14301, 6444, 3188, 0, 0},
+ {19534, 12055, 7119, 0, 0}, {21587, 15176, 10287, 0, 0},
+ {24477, 14410, 8192, 0, 0}, {25200, 20887, 17784, 0, 0},
+ {7820, 3767, 1621, 0, 0}, {7094, 2149, 617, 0, 0},
+ {11927, 5975, 3165, 0, 0}, {18099, 8412, 4102, 0, 0},
+ {21434, 9175, 4549, 0, 0}, {23846, 18006, 9895, 0, 0},
+ {24467, 19224, 12233, 0, 0}}},
+ {{{15655, 9035, 5687, 0, 0}, {18629, 11362, 7316, 0, 0},
+ {24216, 17766, 12992, 0, 0}, {26897, 21648, 17390, 0, 0},
+ {28313, 24152, 20515, 0, 0}, {29299, 25858, 22382, 0, 0},
+ {30513, 28215, 25986, 0, 0}, {14544, 8392, 5715, 0, 0},
+ {13478, 6058, 3154, 0, 0}, {17832, 9777, 5584, 0, 0},
+ {21530, 13817, 9006, 0, 0}, {23982, 17151, 12180, 0, 0},
+ {25451, 19540, 14765, 0, 0}, {27667, 23256, 19275, 0, 0},
+ {10129, 4546, 2558, 0, 0}, {9552, 3437, 1461, 0, 0},
+ {13693, 6006, 2873, 0, 0}, {17754, 9655, 5311, 0, 0},
+ {20830, 12911, 8016, 0, 0}, {22826, 15488, 10486, 0, 0},
+ {25601, 19624, 15016, 0, 0}},
+ {{16948, 10030, 6280, 0, 0}, {19238, 11883, 7552, 0, 0},
+ {24373, 17238, 12316, 0, 0}, {26194, 20447, 16388, 0, 0},
+ {27415, 22349, 18200, 0, 0}, {28155, 24322, 20387, 0, 0},
+ {29328, 25610, 22865, 0, 0}, {8521, 3717, 1544, 0, 0},
+ {10650, 4710, 2399, 0, 0}, {16270, 8000, 4379, 0, 0},
+ {19848, 11593, 6631, 0, 0}, {22038, 14149, 7416, 0, 0},
+ {22581, 16489, 9977, 0, 0}, {23458, 18137, 10641, 0, 0},
+ {7798, 2210, 711, 0, 0}, {7967, 2826, 1070, 0, 0},
+ {10336, 4315, 1913, 0, 0}, {13714, 7088, 3188, 0, 0},
+ {18376, 9732, 4659, 0, 0}, {20273, 11821, 6118, 0, 0},
+ {20326, 12442, 6554, 0, 0}}},
+ {{{20606, 13983, 10120, 0, 0}, {20019, 13071, 8962, 0, 0},
+ {24188, 17471, 12422, 0, 0}, {26599, 21019, 16225, 0, 0},
+ {27932, 23377, 19320, 0, 0}, {28947, 25057, 21155, 0, 0},
+ {30540, 28167, 25698, 0, 0}, {16449, 8043, 4488, 0, 0},
+ {17070, 9491, 5600, 0, 0}, {20042, 12400, 7721, 0, 0},
+ {22856, 15753, 10792, 0, 0}, {24880, 18548, 13589, 0, 0},
+ {25991, 20484, 15750, 0, 0}, {28276, 24178, 20516, 0, 0},
+ {9519, 3864, 1821, 0, 0}, {11718, 4860, 2256, 0, 0},
+ {15328, 7428, 3819, 0, 0}, {18709, 10750, 6227, 0, 0},
+ {21480, 13865, 8870, 0, 0}, {23357, 16426, 11340, 0, 0},
+ {26490, 21180, 16824, 0, 0}},
+ {{18787, 12701, 9542, 0, 0}, {15846, 9188, 5985, 0, 0},
+ {21763, 13729, 8281, 0, 0}, {25379, 18550, 12970, 0, 0},
+ {27170, 21263, 15562, 0, 0}, {26678, 21555, 17109, 0, 0},
+ {28948, 25397, 22649, 0, 0}, {11686, 5843, 3093, 0, 0},
+ {11506, 4141, 1640, 0, 0}, {14376, 6314, 2331, 0, 0},
+ {17898, 9858, 5672, 0, 0}, {20148, 13284, 7860, 0, 0},
+ {23478, 16215, 9966, 0, 0}, {26100, 18480, 12764, 0, 0},
+ {5064, 1713, 819, 0, 0}, {8059, 2790, 980, 0, 0},
+ {11100, 3504, 1111, 0, 0}, {14473, 5800, 2694, 0, 0},
+ {16369, 8346, 3455, 0, 0}, {18421, 9742, 4664, 0, 0},
+ {20398, 12962, 8291, 0, 0}}},
+ {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}}}}};
+
+/* clang-format off */
+alignas(kMaxAlignment) constexpr uint16_t kDefaultDcSignCdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+ [kDcSignContexts][kBooleanFieldCdfSize] = {
+ {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0},
+ {15488, 0, 0}}},
+ {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0},
+ {15488, 0, 0}}},
+ {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0},
+ {15488, 0, 0}}},
+ {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0},
+ {15488, 0, 0}}}
+};
+/* clang-format on */
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultRestorationTypeCdf[kRestorationTypeSymbolCount + 1] = {23355, 10187,
+ 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUseWienerCdf[kBooleanFieldCdfSize] = {21198, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUseSgrProjCdf[kBooleanFieldCdfSize] = {15913, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultHasPaletteYCdf[kPaletteBlockSizeContexts][kPaletteYModeContexts]
+ [kBooleanFieldCdfSize] = {
+ {{1092, 0, 0}, {29349, 0, 0}, {31507, 0, 0}},
+ {{856, 0, 0}, {29909, 0, 0}, {31788, 0, 0}},
+ {{945, 0, 0}, {29368, 0, 0}, {31987, 0, 0}},
+ {{738, 0, 0}, {29207, 0, 0}, {31864, 0, 0}},
+ {{459, 0, 0}, {25431, 0, 0}, {31306, 0, 0}},
+ {{503, 0, 0}, {28753, 0, 0}, {31247, 0, 0}},
+ {{318, 0, 0}, {24822, 0, 0}, {32639, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultPaletteYSizeCdf[kPaletteBlockSizeContexts]
+ [kPaletteSizeSymbolCount + 1] = {
+ {24816, 19768, 14619, 11290, 7241, 3527, 0, 0},
+ {25629, 21347, 16573, 13224, 9102, 4695, 0, 0},
+ {24980, 20027, 15443, 12268, 8453, 4238, 0, 0},
+ {24497, 18704, 14522, 11204, 7697, 4235, 0, 0},
+ {20043, 13588, 10905, 7929, 5233, 2648, 0, 0},
+ {23057, 17880, 15845, 11716, 7107, 4893, 0, 0},
+ {17828, 11971, 11090, 8582, 5735, 3769, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultHasPaletteUVCdf[kPaletteUVModeContexts][kBooleanFieldCdfSize] = {
+ {307, 0, 0}, {11280, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultPaletteUVSizeCdf[kPaletteBlockSizeContexts]
+ [kPaletteSizeSymbolCount + 1] = {
+ {24055, 12789, 5640, 3159, 1437, 496, 0, 0},
+ {26929, 17195, 9187, 5821, 2920, 1068, 0, 0},
+ {28342, 21508, 14769, 11285, 6905, 3338, 0, 0},
+ {29540, 23304, 17775, 14679, 10245, 5348, 0, 0},
+ {29000, 23882, 19677, 14916, 10273, 5561, 0, 0},
+ {30304, 24317, 19907, 11136, 7243, 4213, 0, 0},
+ {31499, 27333, 22335, 13805, 11068, 6903, 0,
+ 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultPaletteColorIndexCdf
+ [kNumPlaneTypes][kPaletteSizeSymbolCount][kPaletteColorIndexContexts]
+ [kPaletteColorIndexSymbolCount + 1] = {
+ {{{4058, 0, 0},
+ {16384, 0, 0},
+ {22215, 0, 0},
+ {5732, 0, 0},
+ {1165, 0, 0}},
+ {{4891, 2278, 0, 0},
+ {21236, 7071, 0, 0},
+ {26224, 2534, 0, 0},
+ {9750, 4696, 0, 0},
+ {853, 383, 0, 0}},
+ {{7196, 4722, 2723, 0, 0},
+ {23290, 11178, 5512, 0, 0},
+ {25520, 5931, 2944, 0, 0},
+ {13601, 8282, 4419, 0, 0},
+ {1368, 943, 518, 0, 0}},
+ {{7989, 5813, 4192, 2486, 0, 0},
+ {24099, 12404, 8695, 4675, 0, 0},
+ {28513, 5203, 3391, 1701, 0, 0},
+ {12904, 9094, 6052, 3238, 0, 0},
+ {1122, 875, 621, 342, 0, 0}},
+ {{9636, 7361, 5798, 4333, 2695, 0, 0},
+ {25325, 15526, 12051, 8006, 4786, 0, 0},
+ {26468, 7906, 5824, 3984, 2097, 0, 0},
+ {13852, 9873, 7501, 5333, 3116, 0, 0},
+ {1498, 1218, 960, 709, 415, 0, 0}},
+ {{9663, 7569, 6304, 5084, 3837, 2450, 0, 0},
+ {25818, 17321, 13816, 10087, 7201, 4205, 0, 0},
+ {25208, 9294, 7278, 5565, 3847, 2060, 0, 0},
+ {14224, 10395, 8311, 6573, 4649, 2723, 0, 0},
+ {1570, 1317, 1098, 886, 645, 377, 0, 0}},
+ {{11079, 8885, 7605, 6416, 5262, 3941, 2573, 0, 0},
+ {25876, 17383, 14928, 11162, 8481, 6015, 3564, 0, 0},
+ {27117, 9586, 7726, 6250, 4786, 3376, 1868, 0, 0},
+ {13419, 10190, 8350, 6774, 5244, 3737, 2320, 0, 0},
+ {1740, 1498, 1264, 1063, 841, 615, 376, 0, 0}}},
+ {{{3679, 0, 0},
+ {16384, 0, 0},
+ {24055, 0, 0},
+ {3511, 0, 0},
+ {1158, 0, 0}},
+ {{7511, 3623, 0, 0},
+ {20481, 5475, 0, 0},
+ {25735, 4808, 0, 0},
+ {12623, 7363, 0, 0},
+ {2160, 1129, 0, 0}},
+ {{8558, 5593, 2865, 0, 0},
+ {22880, 10382, 5554, 0, 0},
+ {26867, 6715, 3475, 0, 0},
+ {14450, 10616, 4435, 0, 0},
+ {2309, 1632, 842, 0, 0}},
+ {{9788, 7289, 4987, 2782, 0, 0},
+ {24355, 11360, 7909, 3894, 0, 0},
+ {30511, 3319, 2174, 1170, 0, 0},
+ {13579, 11566, 6853, 4148, 0, 0},
+ {924, 724, 487, 250, 0, 0}},
+ {{10551, 8201, 6131, 4085, 2220, 0, 0},
+ {25461, 16362, 13132, 8136, 4344, 0, 0},
+ {28327, 7704, 5889, 3826, 1849, 0, 0},
+ {15558, 12240, 9449, 6018, 3186, 0, 0},
+ {2094, 1815, 1372, 1033, 561, 0, 0}},
+ {{11529, 9600, 7724, 5806, 4063, 2262, 0, 0},
+ {26223, 17756, 14764, 10951, 7265, 4067, 0, 0},
+ {29320, 6473, 5331, 4064, 2642, 1326, 0, 0},
+ {16879, 14445, 11064, 8070, 5792, 3078, 0, 0},
+ {1780, 1564, 1289, 1034, 785, 443, 0, 0}},
+ {{11326, 9480, 8010, 6522, 5119, 3788, 2205, 0, 0},
+ {26905, 17835, 15216, 12100, 9085, 6357, 3495, 0, 0},
+ {29353, 6958, 5891, 4778, 3545, 2374, 1150, 0, 0},
+ {14803, 12684, 10536, 8794, 6494, 4366, 2378, 0, 0},
+ {1578, 1439, 1252, 1089, 943, 742, 446, 0, 0}}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIsInterCdf[kIsInterContexts][kBooleanFieldCdfSize] = {
+ {31962, 0, 0}, {16106, 0, 0}, {12582, 0, 0}, {6230, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUseCompoundReferenceCdf[kUseCompoundReferenceContexts]
+ [kBooleanFieldCdfSize] = {{5940, 0, 0},
+ {8733, 0, 0},
+ {20737, 0, 0},
+ {22128, 0, 0},
+ {29867, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultCompoundReferenceTypeCdf[kCompoundReferenceTypeContexts]
+ [kBooleanFieldCdfSize] = {{31570, 0, 0},
+ {30698, 0, 0},
+ {23602, 0, 0},
+ {25269, 0, 0},
+ {10293, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCompoundReferenceCdf
+ [kNumCompoundReferenceTypes][kReferenceContexts][3][kBooleanFieldCdfSize] =
+ {{{{27484, 0, 0}, {28903, 0, 0}, {29640, 0, 0}},
+ {{9616, 0, 0}, {18595, 0, 0}, {17498, 0, 0}},
+ {{994, 0, 0}, {7648, 0, 0}, {6058, 0, 0}}},
+ {{{27822, 0, 0}, {23300, 0, 0}, {31265, 0, 0}},
+ {{12877, 0, 0}, {10327, 0, 0}, {17608, 0, 0}},
+ {{2037, 0, 0}, {1709, 0, 0}, {5224, 0, 0}}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultCompoundBackwardReferenceCdf[kReferenceContexts][2]
+ [kBooleanFieldCdfSize] = {
+ {{30533, 0, 0}, {31345, 0, 0}},
+ {{15586, 0, 0}, {17593, 0, 0}},
+ {{2162, 0, 0}, {2279, 0, 0}}};
+
+/* clang-format off */
+alignas(kMaxAlignment) constexpr uint16_t kDefaultSingleReferenceCdf[kReferenceContexts][6]
+ [kBooleanFieldCdfSize] = {
+ {{27871, 0, 0}, {31213, 0, 0}, {28532, 0, 0}, {24118, 0, 0}, {31864, 0, 0},
+ {31324, 0, 0}},
+ {{15795, 0, 0}, {16017, 0, 0}, {13121, 0, 0}, {7995, 0, 0}, {21754, 0, 0},
+ {17681, 0, 0}},
+ {{3024, 0, 0}, {2489, 0, 0}, {1574, 0, 0}, {873, 0, 0}, {5893, 0, 0},
+ {2464, 0, 0}}};
+/* clang-format on */
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCompoundPredictionModeCdf
+ [kCompoundPredictionModeContexts][kNumCompoundInterPredictionModes + 1] = {
+ {25008, 18945, 16960, 15127, 13612, 12102, 5877, 0, 0},
+ {22038, 13316, 11623, 10019, 8729, 7637, 4044, 0, 0},
+ {22104, 12547, 11180, 9862, 8473, 7381, 4332, 0, 0},
+ {19470, 15784, 12297, 8586, 7701, 7032, 6346, 0, 0},
+ {13864, 9443, 7526, 5336, 4870, 4510, 2010, 0, 0},
+ {22043, 15314, 12644, 9948, 8573, 7600, 6722, 0, 0},
+ {15643, 8495, 6954, 5276, 4554, 4064, 2176, 0, 0},
+ {19722, 9554, 8263, 6826, 5333, 4326, 3438, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultNewMvCdf[kNewMvContexts][kBooleanFieldCdfSize] = {
+ {8733, 0, 0}, {16138, 0, 0}, {17429, 0, 0},
+ {24382, 0, 0}, {20546, 0, 0}, {28092, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultZeroMvCdf[kZeroMvContexts][kBooleanFieldCdfSize] = {{30593, 0, 0},
+ {31714, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultReferenceMvCdf[kReferenceMvContexts][kBooleanFieldCdfSize] = {
+ {8794, 0, 0}, {8580, 0, 0}, {14920, 0, 0},
+ {4146, 0, 0}, {8456, 0, 0}, {12845, 0, 0}};
+
+// This is called drl_mode in the spec where DRL stands for Dynamic Reference
+// List.
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultRefMvIndexCdf[kRefMvIndexContexts][kBooleanFieldCdfSize] = {
+ {19664, 0, 0}, {8208, 0, 0}, {13823, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIsInterIntraCdf[kInterIntraContexts][kBooleanFieldCdfSize] = {
+ {5881, 0, 0}, {5171, 0, 0}, {2531, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultInterIntraModeCdf[kInterIntraContexts][kNumInterIntraModes + 1] = {
+ {30893, 21686, 5436, 0, 0},
+ {30295, 22772, 6380, 0, 0},
+ {28530, 21231, 6842, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIsWedgeInterIntraCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = {
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {12732, 0, 0}, {7811, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {6064, 0, 0}, {5238, 0, 0}, {3204, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {3324, 0, 0}, {5896, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultWedgeIndexCdf[kMaxBlockSizes][kWedgeIndexSymbolCount + 1] = {
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30330, 28328, 26169, 24105, 21763, 19894, 17017, 14674, 12409, 10406,
+ 8641, 7066, 5016, 3318, 1597, 0, 0},
+ {31962, 29502, 26763, 26030, 25550, 25401, 24997, 18180, 16445, 15401,
+ 14316, 13346, 9929, 6641, 3139, 0, 0},
+ {32614, 31781, 30843, 30717, 30680, 30657, 30617, 9735, 9065, 8484,
+ 7783, 7084, 5509, 3885, 1857, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {29989, 29030, 28085, 25555, 24993, 24751, 24113, 18411, 14829, 11436,
+ 8248, 5298, 3312, 2239, 1112, 0, 0},
+ {31084, 29143, 27093, 25660, 23466, 21494, 18339, 15624, 13605, 11807,
+ 9884, 8297, 6049, 4054, 1891, 0, 0},
+ {31626, 29277, 26491, 25454, 24679, 24413, 23745, 19144, 17399, 16038,
+ 14654, 13455, 10247, 6756, 3218, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {31633, 31446, 31275, 30133, 30072, 30031, 29998, 11752, 9833, 7711,
+ 5517, 3595, 2679, 1808, 835, 0, 0},
+ {30026, 28573, 27041, 24733, 23788, 23432, 22622, 18644, 15498, 12235,
+ 9334, 6796, 4824, 3198, 1352, 0, 0},
+ {31041, 28820, 26667, 24972, 22927, 20424, 17002, 13824, 12130, 10730,
+ 8805, 7457, 5780, 4002, 1756, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUseObmcCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = {
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {22331, 0, 0}, {23397, 0, 0}, {9104, 0, 0}, {16384, 0, 0},
+ {23467, 0, 0}, {15336, 0, 0}, {18345, 0, 0}, {8760, 0, 0},
+ {11867, 0, 0}, {17626, 0, 0}, {6951, 0, 0}, {9945, 0, 0},
+ {5889, 0, 0}, {10685, 0, 0}, {2640, 0, 0}, {1754, 0, 0},
+ {1208, 0, 0}, {130, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMotionModeCdf[kMaxBlockSizes][kNumMotionModes + 1] = {
+ {21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+ {21845, 10923, 0, 0}, {25117, 8008, 0, 0}, {28030, 8003, 0, 0},
+ {3969, 1378, 0, 0}, {21845, 10923, 0, 0}, {27377, 7240, 0, 0},
+ {13349, 5958, 0, 0}, {27645, 9162, 0, 0}, {3795, 1174, 0, 0},
+ {6337, 1994, 0, 0}, {21162, 8460, 0, 0}, {6508, 3652, 0, 0},
+ {12408, 4706, 0, 0}, {3026, 1565, 0, 0}, {11089, 5938, 0, 0},
+ {3252, 2067, 0, 0}, {3870, 2371, 0, 0}, {1890, 1433, 0, 0},
+ {261, 210, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIsExplicitCompoundTypeCdf[kIsExplicitCompoundTypeContexts]
+ [kBooleanFieldCdfSize] = {
+ {6161, 0, 0}, {9877, 0, 0},
+ {13928, 0, 0}, {8174, 0, 0},
+ {12834, 0, 0}, {10094, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIsCompoundTypeAverageCdf[kIsCompoundTypeAverageContexts]
+ [kBooleanFieldCdfSize] = {
+ {14524, 0, 0}, {19903, 0, 0},
+ {25715, 0, 0}, {19509, 0, 0},
+ {23434, 0, 0}, {28124, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultCompoundTypeCdf[kMaxBlockSizes]
+ [kNumExplicitCompoundPredictionTypes + 1] = {
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {9337, 0, 0}, {19597, 0, 0},
+ {20948, 0, 0}, {16384, 0, 0}, {21298, 0, 0},
+ {22998, 0, 0}, {23668, 0, 0}, {16384, 0, 0},
+ {25067, 0, 0}, {24535, 0, 0}, {26596, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultInterpolationFilterCdf
+ [kInterpolationFilterContexts][kNumExplicitInterpolationFilters + 1] = {
+ {833, 48, 0, 0}, {27200, 49, 0, 0}, {32346, 29830, 0, 0},
+ {4524, 160, 0, 0}, {1562, 815, 0, 0}, {27906, 647, 0, 0},
+ {31998, 31616, 0, 0}, {11879, 7131, 0, 0}, {858, 44, 0, 0},
+ {28648, 56, 0, 0}, {32463, 30521, 0, 0}, {5365, 132, 0, 0},
+ {1746, 759, 0, 0}, {29805, 675, 0, 0}, {32167, 31825, 0, 0},
+ {17799, 11370, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvJointCdf[kNumMvJointTypes + 1] = {28672, 21504, 13440, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvSignCdf[kBooleanFieldCdfSize] = {16384, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvClassCdf[kMvClassSymbolCount + 1] = {
+ 4096, 1792, 910, 448, 217, 112, 28, 11, 6, 1, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvClass0BitCdf[kBooleanFieldCdfSize] = {5120, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvClass0FractionCdf[kBooleanSymbolCount][kMvFractionSymbolCount +
+ 1] = {
+ {16384, 8192, 6144, 0, 0}, {20480, 11520, 8640, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvClass0HighPrecisionCdf[kBooleanFieldCdfSize] = {12288, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvBitCdf[kMvBitSymbolCount][kBooleanFieldCdfSize] = {
+ {15360, 0, 0}, {14848, 0, 0}, {13824, 0, 0}, {12288, 0, 0},
+ {10240, 0, 0}, {8192, 0, 0}, {4096, 0, 0}, {2816, 0, 0},
+ {2816, 0, 0}, {2048, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvFractionCdf[kMvFractionSymbolCount + 1] = {24576, 15360, 11520, 0,
+ 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvHighPrecisionCdf[kBooleanFieldCdfSize] = {16384, 0, 0};
diff --git a/src/threading_strategy.cc b/src/threading_strategy.cc
new file mode 100644
index 0000000..cd4d576
--- /dev/null
+++ b/src/threading_strategy.cc
@@ -0,0 +1,222 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/threading_strategy.h"
+
+#include <algorithm>
+#include <cassert>
+#include <memory>
+
+#include "src/frame_scratch_buffer.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/vector.h"
+
+namespace libgav1 {
+namespace {
+
+#if !defined(LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER)
+constexpr int kFrameParallelThresholdMultiplier = 3;
+#else
+constexpr int kFrameParallelThresholdMultiplier =
+ LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER;
+#endif
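+
+// (Reader's note, not upstream text: the default of 3 above can be overridden
+// at build time by defining LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER, e.g.
+// by passing -DLIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER=4 to the compiler.)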
+
+// Computes the number of frame threads to be used based on the following
+// heuristic:
+// * If |thread_count| == 1, return 0.
+// * If |thread_count| <= |tile_count| * kFrameParallelThresholdMultiplier, return 0.
+// * Otherwise, return the largest value of i which satisfies the following
+// condition: i + i * tile_columns <= thread_count. This ensures that there
+// are at least |tile_columns| worker threads for each frame thread.
+// * This function will never return 1 or a value > |thread_count|.
+//
+// This heuristic is based on empirical performance data. The in-frame threading
+// model (combination of tile multithreading, superblock row multithreading and
+// post filter multithreading) performs better than the frame parallel model
+// until we reach the threshold of |thread_count| > |tile_count| *
+// kFrameParallelThresholdMultiplier.
+//
+// It is a function of |tile_count| since tile threading and superblock row
+// multithreading will scale only as a factor of |tile_count|. The default
+// value of kFrameParallelThresholdMultiplier was arrived at empirically. The
+// general idea is that superblock row multithreading plateaus at a small
+// multiple of |tile_count| because in most practical cases there aren't more
+// than that many superblock rows and columns available
+// to work on in parallel.
+int ComputeFrameThreadCount(int thread_count, int tile_count,
+ int tile_columns) {
+ assert(thread_count > 0);
+ if (thread_count == 1) return 0;
+ return (thread_count <= tile_count * kFrameParallelThresholdMultiplier)
+ ? 0
+ : std::max(2, thread_count / (1 + tile_columns));
+}
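+
+// Worked example of the heuristic above (reader's note with assumed numbers,
+// not upstream text): with thread_count = 16, tile_count = 2 and
+// tile_columns = 2, the threshold is 2 * kFrameParallelThresholdMultiplier = 6,
+// so 16 > 6 and the function returns std::max(2, 16 / (1 + 2)) = 5 frame
+// threads (5 + 5 * 2 = 15 <= 16). With thread_count = 6 and the same tile
+// layout, 6 <= 6 and the function returns 0, i.e. the in-frame threading model
+// is used instead.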
+
+} // namespace
+
+bool ThreadingStrategy::Reset(const ObuFrameHeader& frame_header,
+ int thread_count) {
+ assert(thread_count > 0);
+ frame_parallel_ = false;
+
+ if (thread_count == 1) {
+ thread_pool_.reset(nullptr);
+ tile_thread_count_ = 0;
+ max_tile_index_for_row_threads_ = 0;
+ return true;
+ }
+
+ // We do work in the current thread, so it is sufficient to create
+ // |thread_count|-1 threads in the threadpool.
+ thread_count = std::min(thread_count, static_cast<int>(kMaxThreads)) - 1;
+
+ if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) {
+ thread_pool_ = ThreadPool::Create("libgav1", thread_count);
+ if (thread_pool_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.",
+ thread_count);
+ tile_thread_count_ = 0;
+ max_tile_index_for_row_threads_ = 0;
+ return false;
+ }
+ }
+
+ // Prefer tile threads first (but only if there is more than one tile).
+ const int tile_count = frame_header.tile_info.tile_count;
+ if (tile_count > 1) {
+ // We want 1 + tile_thread_count_ <= tile_count because the current thread
+ // is also used to decode tiles. This is equivalent to
+ // tile_thread_count_ <= tile_count - 1.
+ tile_thread_count_ = std::min(thread_count, tile_count - 1);
+ thread_count -= tile_thread_count_;
+ if (thread_count == 0) {
+ max_tile_index_for_row_threads_ = 0;
+ return true;
+ }
+ } else {
+ tile_thread_count_ = 0;
+ }
+
+#if defined(__ANDROID__)
+ // Assign the remaining threads to each Tile. The heuristic used here is to
+ // assign two threads to each Tile. So, for example, if |thread_count| is 2,
+ // for a stream with 2 tiles the first tile would get both threads and the
+ // second tile would have row multi-threading turned off. This
+ // heuristic is based on the fact that row multi-threading is fast enough only
+ // when there are at least two threads to do the decoding (since one thread
+ // always does the parsing).
+ //
+ // This heuristic might stop working when SIMD optimizations make the decoding
+ // much faster and the parsing thread is only as fast as the decoding threads.
+ // So we will have to revisit this later to make sure that this is still
+ // optimal.
+ //
+ // Note that while this heuristic significantly improves performance on high
+ // end devices (like the Pixel 3), there are some performance regressions on
+ // some lower end devices in some cases, and those need to be revisited as we
+ // bring in more optimizations. Overall, the gains from this heuristic seem
+ // to be much larger than the regressions.
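+ //
+ // For example (illustrative): if 4 threads remain at this point for a stream
+ // with 3 tiles, the loop below budgets two threads each for tiles 0 and 1
+ // (so |max_tile_index_for_row_threads_| becomes 2) and leaves row
+ // multi-threading off for tile 2.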
+ for (int i = 0; i < tile_count; ++i) {
+ max_tile_index_for_row_threads_ = i + 1;
+ thread_count -= 2;
+ if (thread_count <= 0) break;
+ }
+#else // !defined(__ANDROID__)
+ // Assign the remaining threads to each Tile.
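+ // For example (illustrative): if 2 threads remain here for a stream with 3
+ // tiles, the round-robin split below assigns one thread each to tiles 0 and
+ // 1 and none to tile 2, so |max_tile_index_for_row_threads_| becomes 2.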
+ for (int i = 0; i < tile_count; ++i) {
+ const int count = thread_count / tile_count +
+ static_cast<int>(i < thread_count % tile_count);
+ if (count == 0) {
+ // Once we see a 0 value, all subsequent values will also be 0 since the
+ // threads are assigned in a round-robin fashion.
+ break;
+ }
+ max_tile_index_for_row_threads_ = i + 1;
+ }
+#endif // defined(__ANDROID__)
+ return true;
+}
+
+bool ThreadingStrategy::Reset(int thread_count) {
+ assert(thread_count > 0);
+ frame_parallel_ = true;
+
+ // In frame parallel mode, we simply access the underlying |thread_pool_|
+ // directly. So ensure all the other threadpool getter functions return
+ // nullptr. Also, superblock row multithreading is always disabled in frame
+ // parallel mode.
+ tile_thread_count_ = 0;
+ max_tile_index_for_row_threads_ = 0;
+
+ if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) {
+ thread_pool_ = ThreadPool::Create("libgav1-fp", thread_count);
+ if (thread_pool_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.",
+ thread_count);
+ return false;
+ }
+ }
+ return true;
+}
+
+bool InitializeThreadPoolsForFrameParallel(
+ int thread_count, int tile_count, int tile_columns,
+ std::unique_ptr<ThreadPool>* const frame_thread_pool,
+ FrameScratchBufferPool* const frame_scratch_buffer_pool) {
+ assert(*frame_thread_pool == nullptr);
+ thread_count = std::min(thread_count, static_cast<int>(kMaxThreads));
+ const int frame_threads =
+ ComputeFrameThreadCount(thread_count, tile_count, tile_columns);
+ if (frame_threads == 0) return true;
+ *frame_thread_pool = ThreadPool::Create(frame_threads);
+ if (*frame_thread_pool == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to create frame thread pool with %d threads.",
+ frame_threads);
+ return false;
+ }
+ int remaining_threads = thread_count - frame_threads;
+ if (remaining_threads == 0) return true;
+ int threads_per_frame = remaining_threads / frame_threads;
+ const int extra_threads = remaining_threads % frame_threads;
+ Vector<std::unique_ptr<FrameScratchBuffer>> frame_scratch_buffers;
+ if (!frame_scratch_buffers.reserve(frame_threads)) return false;
+ // Create the tile thread pools.
+ for (int i = 0; i < frame_threads && remaining_threads > 0; ++i) {
+ std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+ frame_scratch_buffer_pool->Get();
+ if (frame_scratch_buffer == nullptr) {
+ return false;
+ }
+ // If the number of tile threads cannot be divided equally amongst all the
+ // frame threads, assign one extra thread to the first |extra_threads| frame
+ // threads.
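+ // For example (illustrative): with 11 remaining threads and 5 frame threads,
+ // |threads_per_frame| is 2 and |extra_threads| is 1, so the frame threads
+ // are assigned 3, 2, 2, 2 and 2 threads respectively.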
+ const int current_frame_thread_count =
+ threads_per_frame + static_cast<int>(i < extra_threads);
+ if (!frame_scratch_buffer->threading_strategy.Reset(
+ current_frame_thread_count)) {
+ return false;
+ }
+ remaining_threads -= current_frame_thread_count;
+ frame_scratch_buffers.push_back_unchecked(std::move(frame_scratch_buffer));
+ }
+ // We release the frame scratch buffers in reverse order so that the extra
+ // threads are allocated to the buffers at the top of the stack.
+ for (int i = static_cast<int>(frame_scratch_buffers.size()) - 1; i >= 0;
+ --i) {
+ frame_scratch_buffer_pool->Release(std::move(frame_scratch_buffers[i]));
+ }
+ return true;
+}
+
+} // namespace libgav1
diff --git a/src/threading_strategy.h b/src/threading_strategy.h
new file mode 100644
index 0000000..84b3589
--- /dev/null
+++ b/src/threading_strategy.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_THREADING_STRATEGY_H_
+#define LIBGAV1_SRC_THREADING_STRATEGY_H_
+
+#include <memory>
+
+#include "src/obu_parser.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/threadpool.h"
+
+namespace libgav1 {
+
+class FrameScratchBufferPool;
+
+// This class allocates and manages the worker threads among thread pools used
+// for multi-threaded decoding.
+class ThreadingStrategy {
+ public:
+ ThreadingStrategy() = default;
+
+ // Not copyable or movable.
+ ThreadingStrategy(const ThreadingStrategy&) = delete;
+ ThreadingStrategy& operator=(const ThreadingStrategy&) = delete;
+
+ // Creates or re-allocates the thread pools based on the |frame_header| and
+ // |thread_count|. This function is used only in non frame-parallel mode. This
+ // function is idempotent if the |frame_header| and |thread_count| don't
+ // change between calls (it will only create new threads on the first call and
+ // do nothing on the subsequent calls). This function also starts the worker
+ // threads whenever it creates new thread pools.
+ // The following strategy is used to allocate threads:
+ // * One thread is allocated for decoding each Tile.
+ // * Any remaining threads are allocated for superblock row multi-threading
+ // within each of the tiles in a round-robin fashion.
+ // Note: During the lifetime of a ThreadingStrategy object, only one of the
+ // Reset() variants will be used.
+ LIBGAV1_MUST_USE_RESULT bool Reset(const ObuFrameHeader& frame_header,
+ int thread_count);
+
+ // Creates or re-allocates a thread pool with |thread_count| threads. This
+ // function is used only in frame parallel mode. This function is idempotent
+ // if the |thread_count| doesn't change between calls (it will only create new
+ // threads on the first call and do nothing on the subsequent calls).
+ // Note: During the lifetime of a ThreadingStrategy object, only one of the
+ // Reset() variants will be used.
+ LIBGAV1_MUST_USE_RESULT bool Reset(int thread_count);
+
+ // Returns a pointer to the ThreadPool that is to be used for Tile
+ // multi-threading.
+ ThreadPool* tile_thread_pool() const {
+ return (tile_thread_count_ != 0) ? thread_pool_.get() : nullptr;
+ }
+
+ int tile_thread_count() const { return tile_thread_count_; }
+
+ // Returns a pointer to the underlying ThreadPool.
+ // Note: Valid only when |frame_parallel_| is true. This is used for
+ // facilitating in-frame multi-threading in that case.
+ ThreadPool* thread_pool() const { return thread_pool_.get(); }
+
+ // Returns a pointer to the ThreadPool that is to be used within the Tile at
+ // index |tile_index| for superblock row multi-threading.
+ // Note: Valid only when |frame_parallel_| is false.
+ ThreadPool* row_thread_pool(int tile_index) const {
+ return tile_index < max_tile_index_for_row_threads_ ? thread_pool_.get()
+ : nullptr;
+ }
+
+ // Returns a pointer to the ThreadPool that is to be used for post filter
+ // multi-threading.
+ // Note: Valid only when |frame_parallel_| is false.
+ ThreadPool* post_filter_thread_pool() const {
+ return frame_parallel_ ? nullptr : thread_pool_.get();
+ }
+
+ // Returns a pointer to the ThreadPool that is to be used for film grain
+ // synthesis and blending.
+ // Note: Valid only when |frame_parallel_| is false.
+ ThreadPool* film_grain_thread_pool() const { return thread_pool_.get(); }
+
+ private:
+ std::unique_ptr<ThreadPool> thread_pool_;
+ int tile_thread_count_ = 0;
+ int max_tile_index_for_row_threads_ = 0;
+ bool frame_parallel_ = false;
+};
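+
+// A minimal usage sketch for non frame parallel mode (the surrounding
+// variable names are illustrative, not part of this API):
+//   ThreadingStrategy strategy;
+//   if (!strategy.Reset(frame_header, /*thread_count=*/4)) return false;
+//   ThreadPool* const tile_pool = strategy.tile_thread_pool();
+//   ThreadPool* const row_pool = strategy.row_thread_pool(/*tile_index=*/0);
+//   ThreadPool* const post_filter_pool = strategy.post_filter_thread_pool();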
+
+// Initializes the |frame_thread_pool| and the necessary worker threadpools (the
+// threading_strategy objects in each of the frame scratch buffers in
+// |frame_scratch_buffer_pool|) as follows:
+// * frame_threads = ComputeFrameThreadCount();
+// * For more details on how frame_threads is computed, see the function
+// comment in ComputeFrameThreadCount().
+// * |frame_thread_pool| is created with |frame_threads| threads.
+// * divide the remaining number of threads into each frame thread and
+// initialize a frame_scratch_buffer.threading_strategy for each frame
+// thread.
+// When this function is called, |frame_scratch_buffer_pool| must be empty. If
+// this function returns true, it means the initialization was successful and
+// one of the following is true:
+// * |frame_thread_pool| has been successfully initialized and
+// |frame_scratch_buffer_pool| has been successfully populated with
+// |frame_threads| buffers to be used by each frame thread. The total
+// number of threads that this function creates will always be equal to
+// |thread_count|.
+// * |frame_thread_pool| is nullptr. |frame_scratch_buffer_pool| is not
+// modified. This means that frame threading will not be used and the
+// decoder will continue to operate normally in non frame parallel mode.
+LIBGAV1_MUST_USE_RESULT bool InitializeThreadPoolsForFrameParallel(
+ int thread_count, int tile_count, int tile_columns,
+ std::unique_ptr<ThreadPool>* frame_thread_pool,
+ FrameScratchBufferPool* frame_scratch_buffer_pool);
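+
+// A minimal usage sketch (the caller-side names below are illustrative and
+// assume a FrameScratchBufferPool named frame_scratch_buffer_pool already
+// exists; they are not part of this API):
+//   std::unique_ptr<ThreadPool> frame_thread_pool;
+//   if (!InitializeThreadPoolsForFrameParallel(
+//           /*thread_count=*/16, /*tile_count=*/2, /*tile_columns=*/2,
+//           &frame_thread_pool, &frame_scratch_buffer_pool)) {
+//     // Handle the allocation failure.
+//   }
+//   const bool use_frame_parallel = (frame_thread_pool != nullptr);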
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_THREADING_STRATEGY_H_
diff --git a/src/tile.h b/src/tile.h
new file mode 100644
index 0000000..73bb5fd
--- /dev/null
+++ b/src/tile.h
@@ -0,0 +1,914 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_TILE_H_
+#define LIBGAV1_SRC_TILE_H_
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <condition_variable> // NOLINT (unapproved c++11 header)
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex> // NOLINT (unapproved c++11 header)
+#include <vector>
+
+#include "src/buffer_pool.h"
+#include "src/decoder_state.h"
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/loop_restoration_info.h"
+#include "src/obu_parser.h"
+#include "src/post_filter.h"
+#include "src/quantizer.h"
+#include "src/residual_buffer_pool.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile_scratch_buffer.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/memory.h"
+#include "src/utils/parameter_tree.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/threadpool.h"
+#include "src/utils/types.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+// Indicates what the ProcessSuperBlock() and TransformBlock() functions should
+// do. "Parse" refers to consuming the bitstream, reading the transform
+// coefficients and performing the dequantization. "Decode" refers to computing
+// the prediction, applying the inverse transforms and adding the residual.
+enum ProcessingMode {
+ kProcessingModeParseOnly,
+ kProcessingModeDecodeOnly,
+ kProcessingModeParseAndDecode,
+};
+
+class Tile : public Allocable {
+ public:
+ static std::unique_ptr<Tile> Create(
+ int tile_number, const uint8_t* const data, size_t size,
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header, RefCountedBuffer* const current_frame,
+ const DecoderState& state, FrameScratchBuffer* const frame_scratch_buffer,
+ const WedgeMaskArray& wedge_masks,
+ const QuantizerMatrix& quantizer_matrix,
+ SymbolDecoderContext* const saved_symbol_decoder_context,
+ const SegmentationMap* prev_segment_ids, PostFilter* const post_filter,
+ const dsp::Dsp* const dsp, ThreadPool* const thread_pool,
+ BlockingCounterWithStatus* const pending_tiles, bool frame_parallel,
+ bool use_intra_prediction_buffer) {
+ std::unique_ptr<Tile> tile(new (std::nothrow) Tile(
+ tile_number, data, size, sequence_header, frame_header, current_frame,
+ state, frame_scratch_buffer, wedge_masks, quantizer_matrix,
+ saved_symbol_decoder_context, prev_segment_ids, post_filter, dsp,
+ thread_pool, pending_tiles, frame_parallel,
+ use_intra_prediction_buffer));
+ return (tile != nullptr && tile->Init()) ? std::move(tile) : nullptr;
+ }
+
+ // Move only.
+ Tile(Tile&& tile) noexcept;
+ Tile& operator=(Tile&& tile) noexcept;
+ Tile(const Tile&) = delete;
+ Tile& operator=(const Tile&) = delete;
+
+ struct Block; // Defined after this class.
+
+ // Parses the entire tile.
+ bool Parse();
+ // Decodes the entire tile. |superblock_row_progress| and
+ // |superblock_row_progress_condvar| are arrays of size equal to the number of
+ // superblock rows in the frame. Increments |superblock_row_progress[i]| after
+ // each superblock row at index |i| is decoded. If the count reaches the
+ // number of tile columns, then it notifies
+ // |superblock_row_progress_condvar[i]|.
+ bool Decode(std::mutex* mutex, int* superblock_row_progress,
+ std::condition_variable* superblock_row_progress_condvar);
+ // Parses and decodes the entire tile. Depending on the configuration of this
+ // Tile, this function may do multithreaded decoding.
+ bool ParseAndDecode(); // 5.11.2.
+ // Processes all the columns of the superblock row at |row4x4| that are within
+ // this Tile. If |save_symbol_decoder_context| is true, then
+ // SaveSymbolDecoderContext() is invoked for the last superblock row.
+ template <ProcessingMode processing_mode, bool save_symbol_decoder_context>
+ bool ProcessSuperBlockRow(int row4x4, TileScratchBuffer* scratch_buffer);
+
+ const ObuSequenceHeader& sequence_header() const { return sequence_header_; }
+ const ObuFrameHeader& frame_header() const { return frame_header_; }
+ const RefCountedBuffer& current_frame() const { return current_frame_; }
+ const TemporalMotionField& motion_field() const { return motion_field_; }
+ const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias()
+ const {
+ return reference_frame_sign_bias_;
+ }
+
+ bool IsRow4x4Inside(int row4x4) const {
+ return row4x4 >= row4x4_start_ && row4x4 < row4x4_end_;
+ }
+
+ // 5.11.51.
+ bool IsInside(int row4x4, int column4x4) const {
+ return IsRow4x4Inside(row4x4) && column4x4 >= column4x4_start_ &&
+ column4x4 < column4x4_end_;
+ }
+
+ bool IsLeftInside(int column4x4) const {
+ // We use "larger than" as the condition. Don't pass in the left column
+ // offset column4x4 - 1.
+ assert(column4x4 <= column4x4_end_);
+ return column4x4 > column4x4_start_;
+ }
+
+ bool IsTopInside(int row4x4) const {
+ // We use "larger than" as the condition. Don't pass in the top row offset
+ // row4x4 - 1.
+ assert(row4x4 <= row4x4_end_);
+ return row4x4 > row4x4_start_;
+ }
+
+ bool IsTopLeftInside(int row4x4, int column4x4) const {
+ // We use "larger than" as the condition. Don't pass in the top row offset
+ // row4x4 - 1 or the left column offset column4x4 - 1.
+ assert(row4x4 <= row4x4_end_);
+ assert(column4x4 <= column4x4_end_);
+ return row4x4 > row4x4_start_ && column4x4 > column4x4_start_;
+ }
+
+ bool IsBottomRightInside(int row4x4, int column4x4) const {
+ assert(row4x4 >= row4x4_start_);
+ assert(column4x4 >= column4x4_start_);
+ return row4x4 < row4x4_end_ && column4x4 < column4x4_end_;
+ }
+
+ BlockParameters** BlockParametersAddress(int row4x4, int column4x4) const {
+ return block_parameters_holder_.Address(row4x4, column4x4);
+ }
+
+ int BlockParametersStride() const {
+ return block_parameters_holder_.columns4x4();
+ }
+
+ // Returns true if Parameters() can be called with |row| and |column| as
+ // inputs, false otherwise.
+ bool HasParameters(int row, int column) const {
+ return block_parameters_holder_.Find(row, column) != nullptr;
+ }
+ const BlockParameters& Parameters(int row, int column) const {
+ return *block_parameters_holder_.Find(row, column);
+ }
+
+ int number() const { return number_; }
+ int superblock_rows() const { return superblock_rows_; }
+ int superblock_columns() const { return superblock_columns_; }
+ int row4x4_start() const { return row4x4_start_; }
+ int column4x4_start() const { return column4x4_start_; }
+ int column4x4_end() const { return column4x4_end_; }
+
+ private:
+ // Stores the transform tree state when reading variable size transform trees
+ // and when applying the transform tree. When applying the transform tree,
+ // |depth| is not used.
+ struct TransformTreeNode {
+ // The default constructor is invoked by the Stack<TransformTreeNode, n>
+ // constructor. Stack<> does not use the default-constructed elements, so it
+ // is safe for the default constructor to not initialize the members.
+ TransformTreeNode() = default;
+ TransformTreeNode(int x, int y, TransformSize tx_size, int depth = -1)
+ : x(x), y(y), tx_size(tx_size), depth(depth) {}
+
+ int x;
+ int y;
+ TransformSize tx_size;
+ int depth;
+ };
+
+ // Enum to track the processing state of a superblock.
+ enum SuperBlockState : uint8_t {
+ kSuperBlockStateNone, // Not yet parsed or decoded.
+ kSuperBlockStateParsed, // Parsed but not yet decoded.
+ kSuperBlockStateScheduled, // Scheduled for decoding.
+ kSuperBlockStateDecoded // Parsed and decoded.
+ };
+
+ // Parameters used to facilitate multi-threading within the Tile.
+ struct ThreadingParameters {
+ std::mutex mutex;
+ // 2d array of size |superblock_rows_| by |superblock_columns_| containing
+ // the processing state of each superblock.
+ Array2D<SuperBlockState> sb_state LIBGAV1_GUARDED_BY(mutex);
+ // Variable used to indicate either parse or decode failure.
+ bool abort LIBGAV1_GUARDED_BY(mutex) = false;
+ int pending_jobs LIBGAV1_GUARDED_BY(mutex) = 0;
+ std::condition_variable pending_jobs_zero_condvar;
+ };
+
+ // The residual pointer is used to traverse the |residual_buffer_|. It is
+ // used in two different ways.
+ // If |split_parse_and_decode_| is true:
+ // The pointer points to the beginning of the |residual_buffer_| when the
+ // "parse" and "decode" steps begin. It is then moved forward tx_size in
+ // each iteration of the "parse" and the "decode" steps. In this case, the
+ // ResidualPtr variable passed into various functions starting from
+ // ProcessSuperBlock is used as an in/out parameter to keep track of the
+ // residual pointer.
+ // If |split_parse_and_decode_| is false:
+ // The pointer is reset to the beginning of the |residual_buffer_| for
+ // every transform block.
+ using ResidualPtr = uint8_t*;
+
+ Tile(int tile_number, const uint8_t* data, size_t size,
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header, RefCountedBuffer* current_frame,
+ const DecoderState& state, FrameScratchBuffer* frame_scratch_buffer,
+ const WedgeMaskArray& wedge_masks,
+ const QuantizerMatrix& quantizer_matrix,
+ SymbolDecoderContext* saved_symbol_decoder_context,
+ const SegmentationMap* prev_segment_ids, PostFilter* post_filter,
+ const dsp::Dsp* dsp, ThreadPool* thread_pool,
+ BlockingCounterWithStatus* pending_tiles, bool frame_parallel,
+ bool use_intra_prediction_buffer);
+
+ // Performs member initializations that may fail. Helper function used by
+ // Create().
+ LIBGAV1_MUST_USE_RESULT bool Init();
+
+ // Saves the symbol decoder context of this tile into
+ // |saved_symbol_decoder_context_| if necessary.
+ void SaveSymbolDecoderContext();
+
+ // Entry point for multi-threaded decoding. This function performs the same
+ // functionality as ParseAndDecode(). The current thread does the "parse" step
+ // while the worker threads do the "decode" step.
+ bool ThreadedParseAndDecode();
+
+ // Returns whether or not the prerequisites for decoding the superblock at
+ // |row_index| and |column_index| are satisfied. |threading_.mutex| must be
+ // held when calling this function.
+ bool CanDecode(int row_index, int column_index) const;
+
+ // This function is run by the worker threads when multi-threaded decoding is
+ // enabled. Once a superblock is decoded, this function will set the
+ // corresponding |threading_.sb_state| entry to kSuperBlockStateDecoded. On
+ // failure, |threading_.abort| will be set to true. If at any point
+ // |threading_.abort| becomes true, this function will return as early as it
+ // can. If the decoding succeeds, this function will also schedule the
+ // decoding jobs for the superblock to the bottom-left and the superblock to
+ // the right of this superblock (if it is allowed).
+ void DecodeSuperBlock(int row_index, int column_index, int block_width4x4);
+
+ // If |use_intra_prediction_buffer_| is true, then this function copies the
+ // last row of the superblock row starting at |row4x4| into the
+ // |intra_prediction_buffer_| (which may be used by the intra prediction
+ // process for the next superblock row).
+ void PopulateIntraPredictionBuffer(int row4x4);
+
+ uint16_t* GetPartitionCdf(int row4x4, int column4x4, BlockSize block_size);
+ bool ReadPartition(int row4x4, int column4x4, BlockSize block_size,
+ bool has_rows, bool has_columns, Partition* partition);
+ // Processes the Partition starting at |row4x4_start|, |column4x4_start|
+ // iteratively. It performs a DFS traversal over the partition tree to process
+ // the blocks in the right order.
+ bool ProcessPartition(
+ int row4x4_start, int column4x4_start, ParameterTree* root,
+ TileScratchBuffer* scratch_buffer,
+ ResidualPtr* residual); // Iterative implementation of 5.11.4.
+ bool ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
+ ParameterTree* tree, TileScratchBuffer* scratch_buffer,
+ ResidualPtr* residual); // 5.11.5.
+ void ResetCdef(int row4x4, int column4x4); // 5.11.55.
+
+ // This function is used to decode a superblock when the parsing has already
+ // been done for that superblock.
+ bool DecodeSuperBlock(ParameterTree* tree, TileScratchBuffer* scratch_buffer,
+ ResidualPtr* residual);
+ // Helper function used by DecodeSuperBlock(). Note that the decode_block()
+ // function in the spec is equivalent to ProcessBlock() in the code.
+ bool DecodeBlock(ParameterTree* tree, TileScratchBuffer* scratch_buffer,
+ ResidualPtr* residual);
+
+ void ClearBlockDecoded(TileScratchBuffer* scratch_buffer, int row4x4,
+ int column4x4); // 5.11.3.
+ bool ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4,
+ TileScratchBuffer* scratch_buffer,
+ ProcessingMode mode);
+ void ResetLoopRestorationParams();
+ void ReadLoopRestorationCoefficients(int row4x4, int column4x4,
+ BlockSize block_size); // 5.11.57.
+
+ // Helper functions for DecodeBlock.
+ bool ReadSegmentId(const Block& block); // 5.11.9.
+ bool ReadIntraSegmentId(const Block& block); // 5.11.8.
+ void ReadSkip(const Block& block); // 5.11.11.
+ void ReadSkipMode(const Block& block); // 5.11.10.
+ void ReadCdef(const Block& block); // 5.11.56.
+ // Returns the new value. |cdf| is an array of size kDeltaSymbolCount + 1.
+ int ReadAndClipDelta(uint16_t* cdf, int delta_small, int scale, int min_value,
+ int max_value, int value);
+ void ReadQuantizerIndexDelta(const Block& block); // 5.11.12.
+ void ReadLoopFilterDelta(const Block& block); // 5.11.13.
+ // Populates |BlockParameters::deblock_filter_level| for the given |block|
+ // using |deblock_filter_levels_|.
+ void PopulateDeblockFilterLevel(const Block& block);
+ void ReadPredictionModeY(const Block& block, bool intra_y_mode);
+ void ReadIntraAngleInfo(const Block& block,
+ PlaneType plane_type); // 5.11.42 and 5.11.43.
+ void ReadPredictionModeUV(const Block& block);
+ void ReadCflAlpha(const Block& block); // 5.11.45.
+ int GetPaletteCache(const Block& block, PlaneType plane_type,
+ uint16_t* cache);
+ void ReadPaletteColors(const Block& block, Plane plane);
+ void ReadPaletteModeInfo(const Block& block); // 5.11.46.
+ void ReadFilterIntraModeInfo(const Block& block); // 5.11.24.
+ int ReadMotionVectorComponent(const Block& block,
+ int component); // 5.11.32.
+ void ReadMotionVector(const Block& block, int index); // 5.11.31.
+ bool DecodeIntraModeInfo(const Block& block); // 5.11.7.
+ int8_t ComputePredictedSegmentId(const Block& block) const; // 5.11.21.
+ bool ReadInterSegmentId(const Block& block, bool pre_skip); // 5.11.19.
+ void ReadIsInter(const Block& block); // 5.11.20.
+ bool ReadIntraBlockModeInfo(const Block& block,
+ bool intra_y_mode); // 5.11.22.
+ CompoundReferenceType ReadCompoundReferenceType(const Block& block);
+ template <bool is_single, bool is_backward, int index>
+ uint16_t* GetReferenceCdf(const Block& block, CompoundReferenceType type =
+ kNumCompoundReferenceTypes);
+ void ReadReferenceFrames(const Block& block); // 5.11.25.
+ void ReadInterPredictionModeY(const Block& block,
+ const MvContexts& mode_contexts);
+ void ReadRefMvIndex(const Block& block);
+ void ReadInterIntraMode(const Block& block, bool is_compound); // 5.11.28.
+ bool IsScaled(ReferenceFrameType type) const { // Part of 5.11.27.
+ const int index =
+ frame_header_.reference_frame_index[type - kReferenceFrameLast];
+ return reference_frames_[index]->upscaled_width() != frame_header_.width ||
+ reference_frames_[index]->frame_height() != frame_header_.height;
+ }
+ void ReadMotionMode(const Block& block, bool is_compound); // 5.11.27.
+ uint16_t* GetIsExplicitCompoundTypeCdf(const Block& block);
+ uint16_t* GetIsCompoundTypeAverageCdf(const Block& block);
+ void ReadCompoundType(const Block& block, bool is_compound); // 5.11.29.
+ uint16_t* GetInterpolationFilterCdf(const Block& block, int direction);
+ void ReadInterpolationFilter(const Block& block);
+ bool ReadInterBlockModeInfo(const Block& block); // 5.11.23.
+ bool DecodeInterModeInfo(const Block& block); // 5.11.18.
+ bool DecodeModeInfo(const Block& block); // 5.11.6.
+ bool IsMvValid(const Block& block, bool is_compound) const; // 6.10.25.
+ bool AssignInterMv(const Block& block, bool is_compound); // 5.11.26.
+ bool AssignIntraMv(const Block& block); // 5.11.26.
+ int GetTopTransformWidth(const Block& block, int row4x4, int column4x4,
+ bool ignore_skip);
+ int GetLeftTransformHeight(const Block& block, int row4x4, int column4x4,
+ bool ignore_skip);
+ TransformSize ReadFixedTransformSize(const Block& block); // 5.11.15.
+ // Iterative implementation of 5.11.17.
+ void ReadVariableTransformTree(const Block& block, int row4x4, int column4x4,
+ TransformSize tx_size);
+ void DecodeTransformSize(const Block& block); // 5.11.16.
+ bool ComputePrediction(const Block& block); // 5.11.33.
+ // |x4| and |y4| are the column and row positions of the 4x4 block. |w4| and
+ // |h4| are the width and height in 4x4 units of |tx_size|.
+ int GetTransformAllZeroContext(const Block& block, Plane plane,
+ TransformSize tx_size, int x4, int y4, int w4,
+ int h4);
+ TransformSet GetTransformSet(TransformSize tx_size,
+ bool is_inter) const; // 5.11.48.
+ TransformType ComputeTransformType(const Block& block, Plane plane,
+ TransformSize tx_size, int block_x,
+ int block_y); // 5.11.40.
+ void ReadTransformType(const Block& block, int x4, int y4,
+ TransformSize tx_size); // 5.11.47.
+ template <typename ResidualType>
+ void ReadCoeffBase2D(
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* quantized_buffer, uint8_t* level_buffer);
+ template <typename ResidualType>
+ void ReadCoeffBaseHorizontal(
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* quantized_buffer, uint8_t* level_buffer);
+ template <typename ResidualType>
+ void ReadCoeffBaseVertical(
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* quantized_buffer, uint8_t* level_buffer);
+ int GetDcSignContext(int x4, int y4, int w4, int h4, Plane plane);
+ void SetEntropyContexts(int x4, int y4, int w4, int h4, Plane plane,
+ uint8_t coefficient_level, int8_t dc_category);
+ void InterIntraPrediction(
+ uint16_t* prediction_0, const uint8_t* prediction_mask,
+ ptrdiff_t prediction_mask_stride,
+ const PredictionParameters& prediction_parameters, int prediction_width,
+ int prediction_height, int subsampling_x, int subsampling_y,
+ uint8_t* dest,
+ ptrdiff_t dest_stride); // Part of section 7.11.3.1 in the spec.
+ void CompoundInterPrediction(
+ const Block& block, const uint8_t* prediction_mask,
+ ptrdiff_t prediction_mask_stride, int prediction_width,
+ int prediction_height, int subsampling_x, int subsampling_y,
+ int candidate_row, int candidate_column, uint8_t* dest,
+ ptrdiff_t dest_stride); // Part of section 7.11.3.1 in the spec.
+ GlobalMotion* GetWarpParams(const Block& block, Plane plane,
+ int prediction_width, int prediction_height,
+ const PredictionParameters& prediction_parameters,
+ ReferenceFrameType reference_type,
+ bool* is_local_valid,
+ GlobalMotion* global_motion_params,
+ GlobalMotion* local_warp_params)
+ const; // Part of section 7.11.3.1 in the spec.
+ bool InterPrediction(const Block& block, Plane plane, int x, int y,
+ int prediction_width, int prediction_height,
+ int candidate_row, int candidate_column,
+ bool* is_local_valid,
+ GlobalMotion* local_warp_params); // 7.11.3.1.
+ void ScaleMotionVector(const MotionVector& mv, Plane plane,
+ int reference_frame_index, int x, int y, int* start_x,
+ int* start_y, int* step_x, int* step_y); // 7.11.3.3.
+ // If the method returns false, the caller only uses the output parameters
+ // *ref_block_start_x and *ref_block_start_y. If the method returns true, the
+ // caller uses all three output parameters.
+ static bool GetReferenceBlockPosition(
+ int reference_frame_index, bool is_scaled, int width, int height,
+ int ref_start_x, int ref_last_x, int ref_start_y, int ref_last_y,
+ int start_x, int start_y, int step_x, int step_y, int left_border,
+ int right_border, int top_border, int bottom_border,
+ int* ref_block_start_x, int* ref_block_start_y, int* ref_block_end_x);
+
+ template <typename Pixel>
+ void BuildConvolveBlock(Plane plane, int reference_frame_index,
+ bool is_scaled, int height, int ref_start_x,
+ int ref_last_x, int ref_start_y, int ref_last_y,
+ int step_y, int ref_block_start_x,
+ int ref_block_end_x, int ref_block_start_y,
+ uint8_t* block_buffer,
+ ptrdiff_t convolve_buffer_stride,
+ ptrdiff_t block_extended_width);
+ bool BlockInterPrediction(const Block& block, Plane plane,
+ int reference_frame_index, const MotionVector& mv,
+ int x, int y, int width, int height,
+ int candidate_row, int candidate_column,
+ uint16_t* prediction, bool is_compound,
+ bool is_inter_intra, uint8_t* dest,
+ ptrdiff_t dest_stride); // 7.11.3.4.
+ bool BlockWarpProcess(const Block& block, Plane plane, int index,
+ int block_start_x, int block_start_y, int width,
+ int height, GlobalMotion* warp_params, bool is_compound,
+ bool is_inter_intra, uint8_t* dest,
+ ptrdiff_t dest_stride); // 7.11.3.5.
+ bool ObmcBlockPrediction(const Block& block, const MotionVector& mv,
+ Plane plane, int reference_frame_index, int width,
+ int height, int x, int y, int candidate_row,
+ int candidate_column,
+ ObmcDirection blending_direction);
+ bool ObmcPrediction(const Block& block, Plane plane, int width,
+ int height); // 7.11.3.9.
+ void DistanceWeightedPrediction(void* prediction_0, void* prediction_1,
+ int width, int height, int candidate_row,
+ int candidate_column, uint8_t* dest,
+ ptrdiff_t dest_stride); // 7.11.3.15.
+ // This function specializes the parsing of the DC coefficient by removing some of
+ // the branches when i == 0 (since scan[0] is always 0 and scan[i] is always
+ // non-zero for all other possible values of i). |dc_category| is an output
+ // parameter that is populated when |is_dc_coefficient| is true.
+ // |coefficient_level| is an output parameter which accumulates the
+ // coefficient level.
+ template <typename ResidualType, bool is_dc_coefficient>
+ LIBGAV1_ALWAYS_INLINE bool ReadSignAndApplyDequantization(
+ const uint16_t* scan, int i, int q_value, const uint8_t* quantizer_matrix,
+ int shift, int max_value, uint16_t* dc_sign_cdf, int8_t* dc_category,
+ int* coefficient_level,
+ ResidualType* residual_buffer); // Part of 5.11.39.
+ int ReadCoeffBaseRange(uint16_t* cdf); // Part of 5.11.39.
+ // Returns the number of non-zero coefficients that were read. |tx_type| is an
+ // output parameter that stores the computed transform type for the plane
+ // whose coefficients were read. Returns -1 on failure.
+ template <typename ResidualType>
+ int ReadTransformCoefficients(const Block& block, Plane plane, int start_x,
+ int start_y, TransformSize tx_size,
+ TransformType* tx_type); // 5.11.39.
+ bool TransformBlock(const Block& block, Plane plane, int base_x, int base_y,
+ TransformSize tx_size, int x, int y,
+ ProcessingMode mode); // 5.11.35.
+ // Iterative implementation of 5.11.36.
+ bool TransformTree(const Block& block, int start_x, int start_y,
+ BlockSize plane_size, ProcessingMode mode);
+ void ReconstructBlock(const Block& block, Plane plane, int start_x,
+ int start_y, TransformSize tx_size,
+ TransformType tx_type,
+ int non_zero_coeff_count); // Part of 7.12.3.
+ bool Residual(const Block& block, ProcessingMode mode); // 5.11.34.
+ // part of 5.11.5 (reset_block_context() in the spec).
+ void ResetEntropyContext(const Block& block);
+ // Populates the |color_context| and |color_order| for the |i|th iteration
+ // with entries counting down from |start| to |end| (|start| > |end|).
+ void PopulatePaletteColorContexts(
+ const Block& block, PlaneType plane_type, int i, int start, int end,
+ uint8_t color_order[kMaxPaletteSquare][kMaxPaletteSize],
+ uint8_t color_context[kMaxPaletteSquare]); // 5.11.50.
+ bool ReadPaletteTokens(const Block& block); // 5.11.49.
+ template <typename Pixel>
+ void IntraPrediction(const Block& block, Plane plane, int x, int y,
+ bool has_left, bool has_top, bool has_top_right,
+ bool has_bottom_left, PredictionMode mode,
+ TransformSize tx_size);
+ bool IsSmoothPrediction(int row, int column, Plane plane) const;
+ int GetIntraEdgeFilterType(const Block& block,
+ Plane plane) const; // 7.11.2.8.
+ template <typename Pixel>
+ void DirectionalPrediction(const Block& block, Plane plane, int x, int y,
+ bool has_left, bool has_top, bool needs_left,
+ bool needs_top, int prediction_angle, int width,
+ int height, int max_x, int max_y,
+ TransformSize tx_size, Pixel* top_row,
+ Pixel* left_column); // 7.11.2.4.
+ template <typename Pixel>
+ void PalettePrediction(const Block& block, Plane plane, int start_x,
+ int start_y, int x, int y,
+ TransformSize tx_size); // 7.11.4.
+ template <typename Pixel>
+ void ChromaFromLumaPrediction(const Block& block, Plane plane, int start_x,
+ int start_y,
+ TransformSize tx_size); // 7.11.5.
+ // Section 7.19. Applies some filtering and reordering to the motion vectors
+ // for the given |block| and stores them into |current_frame_|.
+ void StoreMotionFieldMvsIntoCurrentFrame(const Block& block);
+
+ // Returns the zero-based index of the super block that contains |row4x4|
+ // relative to the start of this tile.
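+ // A 128x128 superblock spans 32 rows of 4x4 blocks and a 64x64 superblock
+ // spans 16, hence the shift by 5 or 4. For example (illustrative), with
+ // 64x64 superblocks and |row4x4_start_| equal to 0, |row4x4| = 70 maps to
+ // superblock row index 70 >> 4 = 4.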
+ int SuperBlockRowIndex(int row4x4) const {
+ return (row4x4 - row4x4_start_) >>
+ (sequence_header_.use_128x128_superblock ? 5 : 4);
+ }
+
+ // Returns the zero-based index of the super block that contains |column4x4|
+ // relative to the start of this tile.
+ int SuperBlockColumnIndex(int column4x4) const {
+ return (column4x4 - column4x4_start_) >>
+ (sequence_header_.use_128x128_superblock ? 5 : 4);
+ }
+
+ BlockSize SuperBlockSize() const {
+ return sequence_header_.use_128x128_superblock ? kBlock128x128
+ : kBlock64x64;
+ }
+ int PlaneCount() const {
+ return sequence_header_.color_config.is_monochrome ? kMaxPlanesMonochrome
+ : kMaxPlanes;
+ }
+
+ const int number_;
+ const int row_;
+ const int column_;
+ const uint8_t* const data_;
+ size_t size_;
+ int row4x4_start_;
+ int row4x4_end_;
+ int column4x4_start_;
+ int column4x4_end_;
+ int superblock_rows_;
+ int superblock_columns_;
+ bool read_deltas_;
+ const int8_t subsampling_x_[kMaxPlanes];
+ const int8_t subsampling_y_[kMaxPlanes];
+ int deblock_row_limit_[kMaxPlanes];
+ int deblock_column_limit_[kMaxPlanes];
+
+ // The dimensions (in order) are: segment_id, level_index (based on plane and
+ // direction), reference_frame and mode_id.
+ uint8_t deblock_filter_levels_[kMaxSegments][kFrameLfCount]
+ [kNumReferenceFrameTypes][2];
+
+ // current_quantizer_index_ is in the range [0, 255].
+ uint8_t current_quantizer_index_;
+ // These two arrays (|coefficient_levels_| and |dc_categories_|) are used to
+ // store the entropy context. Their dimensions are as follows: First -
+ // left/top; Second - plane; Third - row4x4 (if first dimension is
+ // left)/column4x4 (if first dimension is top).
+ //
+ // This is equivalent to the LeftLevelContext and AboveLevelContext arrays in
+ // the spec. In the spec, it stores values from 0 through 63 (inclusive). The
+ // stored values are used to compute the left and top contexts in
+ // GetTransformAllZeroContext. In that function, we only care about the
+ // following values: 0, 1, 2, 3 and >= 4. So instead of clamping to 63, we
+ // clamp to 4 (i.e., all values greater than 4 are stored as 4).
+ std::array<Array2D<uint8_t>, 2> coefficient_levels_;
+ // This is equivalent to the LeftDcContext and AboveDcContext arrays in the
+ // spec. In the spec, it can store 3 possible values: 0, 1 and 2 (where 1
+ // means the value is < 0, 2 means the value is > 0 and 0 means the value is
+ // equal to 0).
+ //
+ // The stored values are used in two places:
+ // * GetTransformAllZeroContext: Here, we only care about whether the
+ // value is 0 or not (whether it is 1 or 2 is irrelevant).
+ // * GetDcSignContext: Here, we do the following computation: if the
+ // stored value is 1, we decrement a counter. If the stored value is 2
+ // we increment a counter.
+ //
+ // Based on this usage, we can simply replace 1 with -1 and 2 with 1 and
+ // use that value to compute the counter.
+ //
+ // The usage in GetTransformAllZeroContext is unaffected since there we
+ // only care about whether the value is 0 or not.
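+ //
+ // For example (illustrative): in GetDcSignContext, summing the stored
+ // -1/0/+1 values over the relevant top and left entries directly yields the
+ // count of positive DC neighbors minus the count of negative ones, which is
+ // the quantity the spec's counter tracks.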
+ std::array<Array2D<int8_t>, 2> dc_categories_;
+ const ObuSequenceHeader& sequence_header_;
+ const ObuFrameHeader& frame_header_;
+ const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias_;
+ const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+ reference_frames_;
+ TemporalMotionField& motion_field_;
+ const std::array<uint8_t, kNumReferenceFrameTypes>& reference_order_hint_;
+ const WedgeMaskArray& wedge_masks_;
+ const QuantizerMatrix& quantizer_matrix_;
+ DaalaBitReader reader_;
+ SymbolDecoderContext symbol_decoder_context_;
+ SymbolDecoderContext* const saved_symbol_decoder_context_;
+ const SegmentationMap* prev_segment_ids_;
+ const dsp::Dsp& dsp_;
+ PostFilter& post_filter_;
+ BlockParametersHolder& block_parameters_holder_;
+ Quantizer quantizer_;
+ // When there is no multi-threading within the Tile, |residual_buffer_| is
+ // used. When there is multi-threading within the Tile,
+ // |residual_buffer_threaded_| is used. In the following comment,
+ // |residual_buffer| refers to either |residual_buffer_| or
+ // |residual_buffer_threaded_| depending on whether multi-threading is enabled
+ // within the Tile or not.
+ // The |residual_buffer| is used to help with the dequantization and the
+ // inverse transform processes. It is declared as a uint8_t, but is always
+ // accessed either as an int16_t or int32_t depending on |bitdepth|. Here is
+ // what it stores at various stages of the decoding process (in the order
+ // which they happen):
+ // 1) In ReadTransformCoefficients(), this buffer is used to store the
+ // dequantized values.
+ // 2) In Reconstruct(), this buffer is used as the input to the row
+ // transform process.
+ // The size of this buffer would be:
+ // For |residual_buffer_|: (4096 + 32 * |kResidualPaddingVertical|) *
+ // |residual_size_|, where 4096 = 64x64 is the maximum transform
+ // size and 32 * |kResidualPaddingVertical| is the padding to avoid
+ // bottom boundary checks when parsing quantized coefficients. This
+ // memory is allocated and owned by the Tile class.
+ // For |residual_buffer_threaded_|: See the comment below. This memory is
+ // not allocated or owned by the Tile class.
+ AlignedUniquePtr<uint8_t> residual_buffer_;
+ // This is a 2d array of pointers of size |superblock_rows_| by
+ // |superblock_columns_| where each pointer points to a ResidualBuffer for a
+ // single super block. The array is populated when the parsing process begins
+ // by calling |residual_buffer_pool_->Get()| and the memory is released back
+ // to the pool by calling |residual_buffer_pool_->Release()| when the decoding
+ // process is complete.
+ Array2D<std::unique_ptr<ResidualBuffer>> residual_buffer_threaded_;
+ // sizeof(int16_t or int32_t) depending on |bitdepth|.
+ const size_t residual_size_;
+ // Number of superblocks on the top-right that will have to be decoded before
+ // the current superblock can be decoded. This will be 1 if allow_intrabc is
+ // false. If allow_intrabc is true, then this value will be
+ // use_128x128_superblock ? 3 : 5. This is the allowed range of reference for
+ // the top rows for intrabc.
+ const int intra_block_copy_lag_;
+
+ // In the Tile class, we use the "current_frame" in two ways:
+ // 1) To write the decoded output into (using the |buffer_| view).
+ // 2) To read the pixels for intra block copy (using the |current_frame_|
+ // reference).
+ //
+ // When intra block copy is off, |buffer_| and |current_frame_| may or may not
+ // point to the same plane pointers. But it is okay since |current_frame_| is
+ // never used in this case.
+ //
+ // When intra block copy is on, |buffer_| and |current_frame_| always point to
+ // the same plane pointers (since post filtering is disabled). So the usage in
+ // both case 1 and case 2 remain valid.
+ Array2DView<uint8_t> buffer_[kMaxPlanes];
+ RefCountedBuffer& current_frame_;
+
+ Array2D<int16_t>& cdef_index_;
+ Array2D<TransformSize>& inter_transform_sizes_;
+ std::array<RestorationUnitInfo, kMaxPlanes> reference_unit_info_;
+ // If |thread_pool_| is nullptr, the calling thread will do the parsing and
+ // the decoding in one pass. If |thread_pool_| is not nullptr, then the main
+ // thread will do the parsing while the thread pool workers will do the
+ // decoding.
+ ThreadPool* const thread_pool_;
+ ThreadingParameters threading_;
+ ResidualBufferPool* const residual_buffer_pool_;
+ TileScratchBufferPool* const tile_scratch_buffer_pool_;
+ BlockingCounterWithStatus* const pending_tiles_;
+ bool split_parse_and_decode_;
+ // This is used only when |split_parse_and_decode_| is false.
+ std::unique_ptr<PredictionParameters> prediction_parameters_ = nullptr;
+ // Stores the |transform_type| for the super block being decoded at a 4x4
+ // granularity. The spec uses absolute indices for this array but it is
+ // sufficient to use indices relative to the super block being decoded.
+ TransformType transform_types_[32][32];
+ // delta_lf_[i] is in the range [-63, 63].
+ int8_t delta_lf_[kFrameLfCount];
+ // True if all the values in |delta_lf_| are zero. False otherwise.
+ bool delta_lf_all_zero_;
+ const bool frame_parallel_;
+ const bool use_intra_prediction_buffer_;
+ // Buffer used to store the unfiltered pixels that are necessary for decoding
+ // the next superblock row (for the intra prediction process). Used only if
+ // |use_intra_prediction_buffer_| is true. The |frame_scratch_buffer| contains
+ // one row buffer for each tile row. This tile will have to use the buffer
+ // corresponding to this tile's row.
+ IntraPredictionBuffer* const intra_prediction_buffer_;
+ // Stores the progress of the reference frames. This will be used to avoid
+ // unnecessary calls into RefCountedBuffer::WaitUntil().
+ std::array<int, kNumReferenceFrameTypes> reference_frame_progress_cache_;
+};
+
+struct Tile::Block {
+ Block(const Tile& tile, BlockSize size, int row4x4, int column4x4,
+ TileScratchBuffer* const scratch_buffer, ResidualPtr* residual)
+ : tile(tile),
+ size(size),
+ row4x4(row4x4),
+ column4x4(column4x4),
+ width(kBlockWidthPixels[size]),
+ height(kBlockHeightPixels[size]),
+ width4x4(width >> 2),
+ height4x4(height >> 2),
+ scratch_buffer(scratch_buffer),
+ residual(residual) {
+ assert(size != kBlockInvalid);
+ residual_size[kPlaneY] = kPlaneResidualSize[size][0][0];
+ residual_size[kPlaneU] = residual_size[kPlaneV] =
+ kPlaneResidualSize[size][tile.subsampling_x_[kPlaneU]]
+ [tile.subsampling_y_[kPlaneU]];
+ assert(residual_size[kPlaneY] != kBlockInvalid);
+ if (tile.PlaneCount() > 1) {
+ assert(residual_size[kPlaneU] != kBlockInvalid);
+ }
+ if ((row4x4 & 1) == 0 &&
+ (tile.sequence_header_.color_config.subsampling_y & height4x4) == 1) {
+ has_chroma = false;
+ } else if ((column4x4 & 1) == 0 &&
+ (tile.sequence_header_.color_config.subsampling_x & width4x4) ==
+ 1) {
+ has_chroma = false;
+ } else {
+ has_chroma = !tile.sequence_header_.color_config.is_monochrome;
+ }
+ top_available[kPlaneY] = tile.IsTopInside(row4x4);
+ left_available[kPlaneY] = tile.IsLeftInside(column4x4);
+ if (has_chroma) {
+ // top_available[kPlaneU] and top_available[kPlaneV] are valid only if
+ // has_chroma is true.
+ // The next 3 lines are equivalent to:
+ // top_available[kPlaneU] = top_available[kPlaneV] =
+ // top_available[kPlaneY] &&
+ // ((tile.sequence_header_.color_config.subsampling_y & height4x4) ==
+ // 0 || tile.IsTopInside(row4x4 - 1));
+ top_available[kPlaneU] = top_available[kPlaneV] = tile.IsTopInside(
+ row4x4 -
+ (tile.sequence_header_.color_config.subsampling_y & height4x4));
+ // left_available[kPlaneU] and left_available[kPlaneV] are valid only if
+ // has_chroma is true.
+ // The next 3 lines are equivalent to:
+ // left_available[kPlaneU] = left_available[kPlaneV] =
+ // left_available[kPlaneY] &&
+ // ((tile.sequence_header_.color_config.subsampling_x & width4x4) == 0
+ // || tile.IsLeftInside(column4x4 - 1));
+ left_available[kPlaneU] = left_available[kPlaneV] = tile.IsLeftInside(
+ column4x4 -
+ (tile.sequence_header_.color_config.subsampling_x & width4x4));
+ }
+ const ptrdiff_t stride = tile.BlockParametersStride();
+ BlockParameters** const bps =
+ tile.BlockParametersAddress(row4x4, column4x4);
+ bp = *bps;
+ // bp_top is valid only if top_available[kPlaneY] is true.
+ if (top_available[kPlaneY]) {
+ bp_top = *(bps - stride);
+ }
+ // bp_left is valid only if left_available[kPlaneY] is true.
+ if (left_available[kPlaneY]) {
+ bp_left = *(bps - 1);
+ }
+ }
+
+ bool HasChroma() const { return has_chroma; }
+
+ // The return values of this group of functions are valid only if the
+ // corresponding top_available or left_available is true.
+ ReferenceFrameType TopReference(int index) const {
+ return bp_top->reference_frame[index];
+ }
+
+ ReferenceFrameType LeftReference(int index) const {
+ return bp_left->reference_frame[index];
+ }
+
+ bool IsTopIntra() const { return TopReference(0) <= kReferenceFrameIntra; }
+ bool IsLeftIntra() const { return LeftReference(0) <= kReferenceFrameIntra; }
+
+ bool IsTopSingle() const { return TopReference(1) <= kReferenceFrameIntra; }
+ bool IsLeftSingle() const { return LeftReference(1) <= kReferenceFrameIntra; }
+
+ int CountReferences(ReferenceFrameType type) const {
+ return static_cast<int>(top_available[kPlaneY] &&
+ bp_top->reference_frame[0] == type) +
+ static_cast<int>(top_available[kPlaneY] &&
+ bp_top->reference_frame[1] == type) +
+ static_cast<int>(left_available[kPlaneY] &&
+ bp_left->reference_frame[0] == type) +
+ static_cast<int>(left_available[kPlaneY] &&
+ bp_left->reference_frame[1] == type);
+ }
+
+ // 7.10.3.
+ // Checks if there are any inter blocks to the left or above. If so, it
+ // returns true indicating that the block has neighbors that are suitable for
+ // use by overlapped motion compensation.
+ bool HasOverlappableCandidates() const {
+ const ptrdiff_t stride = tile.BlockParametersStride();
+ BlockParameters** const bps = tile.BlockParametersAddress(0, 0);
+ if (top_available[kPlaneY]) {
+ BlockParameters** bps_top = bps + (row4x4 - 1) * stride + (column4x4 | 1);
+ const int columns = std::min(tile.frame_header_.columns4x4 - column4x4,
+ static_cast<int>(width4x4));
+ BlockParameters** const bps_top_end = bps_top + columns;
+ do {
+ if ((*bps_top)->reference_frame[0] > kReferenceFrameIntra) {
+ return true;
+ }
+ bps_top += 2;
+ } while (bps_top < bps_top_end);
+ }
+ if (left_available[kPlaneY]) {
+ BlockParameters** bps_left = bps + (row4x4 | 1) * stride + column4x4 - 1;
+ const int rows = std::min(tile.frame_header_.rows4x4 - row4x4,
+ static_cast<int>(height4x4));
+ BlockParameters** const bps_left_end = bps_left + rows * stride;
+ do {
+ if ((*bps_left)->reference_frame[0] > kReferenceFrameIntra) {
+ return true;
+ }
+ bps_left += 2 * stride;
+ } while (bps_left < bps_left_end);
+ }
+ return false;
+ }
+
+ const Tile& tile;
+ bool has_chroma;
+ const BlockSize size;
+ bool top_available[kMaxPlanes];
+ bool left_available[kMaxPlanes];
+ BlockSize residual_size[kMaxPlanes];
+ const int row4x4;
+ const int column4x4;
+ const int width;
+ const int height;
+ const int width4x4;
+ const int height4x4;
+ const BlockParameters* bp_top;
+ const BlockParameters* bp_left;
+ BlockParameters* bp;
+ TileScratchBuffer* const scratch_buffer;
+ ResidualPtr* const residual;
+};
+
+extern template bool
+Tile::ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+ int row4x4, TileScratchBuffer* scratch_buffer);
+extern template bool
+Tile::ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+ int row4x4, TileScratchBuffer* scratch_buffer);
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_TILE_H_
diff --git a/src/tile/bitstream/mode_info.cc b/src/tile/bitstream/mode_info.cc
new file mode 100644
index 0000000..0b22eb0
--- /dev/null
+++ b/src/tile/bitstream/mode_info.cc
@@ -0,0 +1,1303 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#include "src/buffer_pool.h"
+#include "src/dsp/constants.h"
+#include "src/motion_vector.h"
+#include "src/obu_parser.h"
+#include "src/prediction_mask.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/logging.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kDeltaQSmall = 3;
+constexpr int kDeltaLfSmall = 3;
+
+constexpr uint8_t kIntraYModeContext[kIntraPredictionModesY] = {
+ 0, 1, 2, 3, 4, 4, 4, 4, 3, 0, 1, 2, 0};
+
+constexpr uint8_t kSizeGroup[kMaxBlockSizes] = {
+ 0, 0, 0, 0, 1, 1, 1, 0, 1, 2, 2, 2, 1, 2, 3, 3, 2, 3, 3, 3, 3, 3};
+
+constexpr int kCompoundModeNewMvContexts = 5;
+constexpr uint8_t kCompoundModeContextMap[3][kCompoundModeNewMvContexts] = {
+ {0, 1, 1, 1, 1}, {1, 2, 3, 4, 4}, {4, 4, 5, 6, 7}};
+
+enum CflSign : uint8_t {
+ kCflSignZero = 0,
+ kCflSignNegative = 1,
+ kCflSignPositive = 2
+};
+
+// For each possible value of the combined signs (which is read from the
+// bitstream), this array stores the following: sign_u, sign_v, alpha_u_context,
+// alpha_v_context. Only the non-negative context entries are used. Entry at
+// index i is computed
+// as follows:
+// sign_u = (i + 1) / 3
+// sign_v = (i + 1) % 3
+// alpha_u_context = i - 2
+// alpha_v_context = (sign_v - 1) * 3 + sign_u
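+//
+// For example (illustrative): index i = 4 gives sign_u = 5 / 3 = 1,
+// sign_v = 5 % 3 = 2, alpha_u_context = 4 - 2 = 2 and
+// alpha_v_context = (2 - 1) * 3 + 1 = 4, which matches the entry
+// {1, 2, 2, 4} below.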
+constexpr int8_t kCflAlphaLookup[kCflAlphaSignsSymbolCount][4] = {
+ {0, 1, -2, 0}, {0, 2, -1, 3}, {1, 0, 0, -2}, {1, 1, 1, 1},
+ {1, 2, 2, 4}, {2, 0, 3, -1}, {2, 1, 4, 2}, {2, 2, 5, 5},
+};
+
+constexpr BitMaskSet kPredictionModeHasNearMvMask(kPredictionModeNearMv,
+ kPredictionModeNearNearMv,
+ kPredictionModeNearNewMv,
+ kPredictionModeNewNearMv);
+
+constexpr BitMaskSet kIsInterIntraModeAllowedMask(kBlock8x8, kBlock8x16,
+ kBlock16x8, kBlock16x16,
+ kBlock16x32, kBlock32x16,
+ kBlock32x32);
+
+bool IsBackwardReference(ReferenceFrameType type) {
+ return type >= kReferenceFrameBackward && type <= kReferenceFrameAlternate;
+}
+
+bool IsSameDirectionReferencePair(ReferenceFrameType type1,
+ ReferenceFrameType type2) {
+ return (type1 >= kReferenceFrameBackward) ==
+ (type2 >= kReferenceFrameBackward);
+}
+
+// This is called neg_deinterleave() in the spec.
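+// It maps the coded |diff| to a segment id that zig-zags around |reference|.
+// For example (illustrative): with reference = 2 and max = 8, diff values
+// 0, 1, 2, 3 and 4 decode to 2, 3, 1, 4 and 0 respectively, and any larger
+// diff decodes to itself.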
+int DecodeSegmentId(int diff, int reference, int max) {
+ if (reference == 0) return diff;
+ if (reference >= max - 1) return max - diff - 1;
+ const int value = ((diff & 1) != 0) ? reference + ((diff + 1) >> 1)
+ : reference - (diff >> 1);
+ const int reference2 = (reference << 1);
+ if (reference2 < max) {
+ return (diff <= reference2) ? value : diff;
+ }
+ return (diff <= ((max - reference - 1) << 1)) ? value : max - (diff + 1);
+}
+
+// This is called DrlCtxStack in section 7.10.2.14 of the spec.
+// In the spec, the weights of all the nearest mvs are incremented by a bonus
+// weight which is larger than any natural weight, and the weights of the mvs
+// are compared with this bonus weight to determine their contexts. We replace
+// this procedure by introducing |nearest_mv_count| in PredictionParameters,
+// which records the count of the nearest mvs. Since all the nearest mvs are in
+// the beginning of the mv stack, the |index| of a mv in the mv stack can be
+// compared with |nearest_mv_count| to get that mv's context.
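+//
+// For example (illustrative): with |nearest_mv_count| = 2, the stack indices
+// 0, 1, 2 and 3 map to contexts 0, 1, 2 and 2 respectively.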
+int GetRefMvIndexContext(int nearest_mv_count, int index) {
+ if (index + 1 < nearest_mv_count) {
+ return 0;
+ }
+ if (index + 1 == nearest_mv_count) {
+ return 1;
+ }
+ return 2;
+}
+
+// Returns true if both the width and height of the block are less than 64.
+bool IsBlockDimensionLessThan64(BlockSize size) {
+ return size <= kBlock32x32 && size != kBlock16x64;
+}
+
+int GetUseCompoundReferenceContext(const Tile::Block& block) {
+ if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) {
+ if (block.IsTopSingle() && block.IsLeftSingle()) {
+ return static_cast<int>(IsBackwardReference(block.TopReference(0))) ^
+ static_cast<int>(IsBackwardReference(block.LeftReference(0)));
+ }
+ if (block.IsTopSingle()) {
+ return 2 + static_cast<int>(IsBackwardReference(block.TopReference(0)) ||
+ block.IsTopIntra());
+ }
+ if (block.IsLeftSingle()) {
+ return 2 + static_cast<int>(IsBackwardReference(block.LeftReference(0)) ||
+ block.IsLeftIntra());
+ }
+ return 4;
+ }
+ if (block.top_available[kPlaneY]) {
+ return block.IsTopSingle()
+ ? static_cast<int>(IsBackwardReference(block.TopReference(0)))
+ : 3;
+ }
+ if (block.left_available[kPlaneY]) {
+ return block.IsLeftSingle()
+ ? static_cast<int>(IsBackwardReference(block.LeftReference(0)))
+ : 3;
+ }
+ return 1;
+}
+
+// Calculates count0 by calling block.CountReferences() on the frame types from
+// type0_start to type0_end, inclusive, and summing the results.
+// Calculates count1 by calling block.CountReferences() on the frame types from
+// type1_start to type1_end, inclusive, and summing the results.
+// Compares count0 with count1 and returns 0, 1 or 2.
+//
+// See count_refs and ref_count_ctx in 8.3.2.
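+// For example, for the single_ref_p1 context (forward references
+// kReferenceFrameLast..kReferenceFrameGolden versus backward references
+// kReferenceFrameBackward..kReferenceFrameAlternate), if the neighboring
+// blocks contribute three references in the forward range and one in the
+// backward range, then count0 = 3 > count1 = 1 and the returned context is 2.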
+int GetReferenceContext(const Tile::Block& block,
+ ReferenceFrameType type0_start,
+ ReferenceFrameType type0_end,
+ ReferenceFrameType type1_start,
+ ReferenceFrameType type1_end) {
+ int count0 = 0;
+ int count1 = 0;
+ for (int type = type0_start; type <= type0_end; ++type) {
+ count0 += block.CountReferences(static_cast<ReferenceFrameType>(type));
+ }
+ for (int type = type1_start; type <= type1_end; ++type) {
+ count1 += block.CountReferences(static_cast<ReferenceFrameType>(type));
+ }
+ return (count0 < count1) ? 0 : (count0 == count1 ? 1 : 2);
+}
+
+} // namespace
+
+bool Tile::ReadSegmentId(const Block& block) {
+ int top_left = -1;
+ if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) {
+ top_left =
+ block_parameters_holder_.Find(block.row4x4 - 1, block.column4x4 - 1)
+ ->segment_id;
+ }
+ int top = -1;
+ if (block.top_available[kPlaneY]) {
+ top = block.bp_top->segment_id;
+ }
+ int left = -1;
+ if (block.left_available[kPlaneY]) {
+ left = block.bp_left->segment_id;
+ }
+ int pred;
+ if (top == -1) {
+ pred = (left == -1) ? 0 : left;
+ } else if (left == -1) {
+ pred = top;
+ } else {
+ pred = (top_left == top) ? top : left;
+ }
+ BlockParameters& bp = *block.bp;
+ if (bp.skip) {
+ bp.segment_id = pred;
+ return true;
+ }
+ int context = 0;
+ if (top_left < 0) {
+ context = 0;
+ } else if (top_left == top && top_left == left) {
+ context = 2;
+ } else if (top_left == top || top_left == left || top == left) {
+ context = 1;
+ }
+ uint16_t* const segment_id_cdf =
+ symbol_decoder_context_.segment_id_cdf[context];
+ const int encoded_segment_id =
+ reader_.ReadSymbol<kMaxSegments>(segment_id_cdf);
+ bp.segment_id =
+ DecodeSegmentId(encoded_segment_id, pred,
+ frame_header_.segmentation.last_active_segment_id + 1);
+ // Check the bitstream conformance requirement in Section 6.10.8 of the spec.
+ if (bp.segment_id < 0 ||
+ bp.segment_id > frame_header_.segmentation.last_active_segment_id) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Corrupted segment_ids: encoded %d, last active %d, postprocessed %d",
+ encoded_segment_id, frame_header_.segmentation.last_active_segment_id,
+ bp.segment_id);
+ return false;
+ }
+ return true;
+}
+
+bool Tile::ReadIntraSegmentId(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (!frame_header_.segmentation.enabled) {
+ bp.segment_id = 0;
+ return true;
+ }
+ return ReadSegmentId(block);
+}
+
+void Tile::ReadSkip(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (frame_header_.segmentation.segment_id_pre_skip &&
+ frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureSkip)) {
+ bp.skip = true;
+ return;
+ }
+ int context = 0;
+ if (block.top_available[kPlaneY] && block.bp_top->skip) {
+ ++context;
+ }
+ if (block.left_available[kPlaneY] && block.bp_left->skip) {
+ ++context;
+ }
+ uint16_t* const skip_cdf = symbol_decoder_context_.skip_cdf[context];
+ bp.skip = reader_.ReadSymbol(skip_cdf);
+}
+
+void Tile::ReadSkipMode(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (!frame_header_.skip_mode_present ||
+ frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureSkip) ||
+ frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureReferenceFrame) ||
+ frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureGlobalMv) ||
+ IsBlockDimension4(block.size)) {
+ bp.skip_mode = false;
+ return;
+ }
+ const int context =
+ (block.left_available[kPlaneY]
+ ? static_cast<int>(block.bp_left->skip_mode)
+ : 0) +
+ (block.top_available[kPlaneY] ? static_cast<int>(block.bp_top->skip_mode)
+ : 0);
+ bp.skip_mode =
+ reader_.ReadSymbol(symbol_decoder_context_.skip_mode_cdf[context]);
+}
+
+void Tile::ReadCdef(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (bp.skip || frame_header_.coded_lossless ||
+ !sequence_header_.enable_cdef || frame_header_.allow_intrabc) {
+ return;
+ }
+ const int cdef_size4x4 = kNum4x4BlocksWide[kBlock64x64];
+ const int cdef_mask4x4 = ~(cdef_size4x4 - 1);
+ const int row4x4 = block.row4x4 & cdef_mask4x4;
+ const int column4x4 = block.column4x4 & cdef_mask4x4;
+ const int row = DivideBy16(row4x4);
+ const int column = DivideBy16(column4x4);
+ if (cdef_index_[row][column] == -1) {
+ cdef_index_[row][column] =
+ frame_header_.cdef.bits > 0
+ ? static_cast<int16_t>(reader_.ReadLiteral(frame_header_.cdef.bits))
+ : 0;
+ for (int i = row4x4; i < row4x4 + block.height4x4; i += cdef_size4x4) {
+ for (int j = column4x4; j < column4x4 + block.width4x4;
+ j += cdef_size4x4) {
+ cdef_index_[DivideBy16(i)][DivideBy16(j)] = cdef_index_[row][column];
+ }
+ }
+ }
+}
+
+int Tile::ReadAndClipDelta(uint16_t* const cdf, int delta_small, int scale,
+ int min_value, int max_value, int value) {
+ int abs = reader_.ReadSymbol<kDeltaSymbolCount>(cdf);
+ if (abs == delta_small) {
+ const int remaining_bit_count =
+ static_cast<int>(reader_.ReadLiteral(3)) + 1;
+ const int abs_remaining_bits =
+ static_cast<int>(reader_.ReadLiteral(remaining_bit_count));
+ abs = abs_remaining_bits + (1 << remaining_bit_count) + 1;
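+    // With remaining_bit_count == 1 this encodes |abs| values 3..4, with 2 it
+    // encodes 5..8, and so on.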
+ }
+ if (abs != 0) {
+ const bool sign = static_cast<bool>(reader_.ReadBit());
+ const int scaled_abs = abs << scale;
+ const int reduced_delta = sign ? -scaled_abs : scaled_abs;
+ value += reduced_delta;
+ value = Clip3(value, min_value, max_value);
+ }
+ return value;
+}
+
+void Tile::ReadQuantizerIndexDelta(const Block& block) {
+ assert(read_deltas_);
+ BlockParameters& bp = *block.bp;
+  if (block.size == SuperBlockSize() && bp.skip) {
+ return;
+ }
+ current_quantizer_index_ =
+ ReadAndClipDelta(symbol_decoder_context_.delta_q_cdf, kDeltaQSmall,
+ frame_header_.delta_q.scale, kMinLossyQuantizer,
+ kMaxQuantizer, current_quantizer_index_);
+}
+
+void Tile::ReadLoopFilterDelta(const Block& block) {
+ assert(read_deltas_);
+ BlockParameters& bp = *block.bp;
+ if (!frame_header_.delta_lf.present ||
+ (block.size == SuperBlockSize() && bp.skip)) {
+ return;
+ }
+ int frame_lf_count = 1;
+ if (frame_header_.delta_lf.multi) {
+ frame_lf_count = kFrameLfCount - (PlaneCount() > 1 ? 0 : 2);
+ }
+ bool recompute_deblock_filter_levels = false;
+ for (int i = 0; i < frame_lf_count; ++i) {
+ uint16_t* const delta_lf_abs_cdf =
+ frame_header_.delta_lf.multi
+ ? symbol_decoder_context_.delta_lf_multi_cdf[i]
+ : symbol_decoder_context_.delta_lf_cdf;
+ const int8_t old_delta_lf = delta_lf_[i];
+ delta_lf_[i] = ReadAndClipDelta(
+ delta_lf_abs_cdf, kDeltaLfSmall, frame_header_.delta_lf.scale,
+ -kMaxLoopFilterValue, kMaxLoopFilterValue, delta_lf_[i]);
+ recompute_deblock_filter_levels =
+ recompute_deblock_filter_levels || (old_delta_lf != delta_lf_[i]);
+ }
+ delta_lf_all_zero_ =
+ (delta_lf_[0] | delta_lf_[1] | delta_lf_[2] | delta_lf_[3]) == 0;
+ if (!delta_lf_all_zero_ && recompute_deblock_filter_levels) {
+ post_filter_.ComputeDeblockFilterLevels(delta_lf_, deblock_filter_levels_);
+ }
+}
+
+void Tile::ReadPredictionModeY(const Block& block, bool intra_y_mode) {
+ uint16_t* cdf;
+ if (intra_y_mode) {
+ const PredictionMode top_mode =
+ block.top_available[kPlaneY] ? block.bp_top->y_mode : kPredictionModeDc;
+ const PredictionMode left_mode = block.left_available[kPlaneY]
+ ? block.bp_left->y_mode
+ : kPredictionModeDc;
+ const int top_context = kIntraYModeContext[top_mode];
+ const int left_context = kIntraYModeContext[left_mode];
+ cdf = symbol_decoder_context_
+ .intra_frame_y_mode_cdf[top_context][left_context];
+ } else {
+ cdf = symbol_decoder_context_.y_mode_cdf[kSizeGroup[block.size]];
+ }
+ block.bp->y_mode = static_cast<PredictionMode>(
+ reader_.ReadSymbol<kIntraPredictionModesY>(cdf));
+}
+
+void Tile::ReadIntraAngleInfo(const Block& block, PlaneType plane_type) {
+ BlockParameters& bp = *block.bp;
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ prediction_parameters.angle_delta[plane_type] = 0;
+ const PredictionMode mode =
+ (plane_type == kPlaneTypeY) ? bp.y_mode : bp.uv_mode;
+ if (IsBlockSmallerThan8x8(block.size) || !IsDirectionalMode(mode)) return;
+ uint16_t* const cdf =
+ symbol_decoder_context_.angle_delta_cdf[mode - kPredictionModeVertical];
+ prediction_parameters.angle_delta[plane_type] =
+ reader_.ReadSymbol<kAngleDeltaSymbolCount>(cdf);
+ prediction_parameters.angle_delta[plane_type] -= kMaxAngleDelta;
+}
+
+void Tile::ReadCflAlpha(const Block& block) {
+ const int signs = reader_.ReadSymbol<kCflAlphaSignsSymbolCount>(
+ symbol_decoder_context_.cfl_alpha_signs_cdf);
+ const int8_t* const cfl_lookup = kCflAlphaLookup[signs];
+ const auto sign_u = static_cast<CflSign>(cfl_lookup[0]);
+ const auto sign_v = static_cast<CflSign>(cfl_lookup[1]);
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ prediction_parameters.cfl_alpha_u = 0;
+ if (sign_u != kCflSignZero) {
+ assert(cfl_lookup[2] >= 0);
+ prediction_parameters.cfl_alpha_u =
+ reader_.ReadSymbol<kCflAlphaSymbolCount>(
+ symbol_decoder_context_.cfl_alpha_cdf[cfl_lookup[2]]) +
+ 1;
+ if (sign_u == kCflSignNegative) prediction_parameters.cfl_alpha_u *= -1;
+ }
+ prediction_parameters.cfl_alpha_v = 0;
+ if (sign_v != kCflSignZero) {
+ assert(cfl_lookup[3] >= 0);
+ prediction_parameters.cfl_alpha_v =
+ reader_.ReadSymbol<kCflAlphaSymbolCount>(
+ symbol_decoder_context_.cfl_alpha_cdf[cfl_lookup[3]]) +
+ 1;
+ if (sign_v == kCflSignNegative) prediction_parameters.cfl_alpha_v *= -1;
+ }
+}
+
+void Tile::ReadPredictionModeUV(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ bool chroma_from_luma_allowed;
+ if (frame_header_.segmentation.lossless[bp.segment_id]) {
+ chroma_from_luma_allowed = block.residual_size[kPlaneU] == kBlock4x4;
+ } else {
+ chroma_from_luma_allowed = IsBlockDimensionLessThan64(block.size);
+ }
+ uint16_t* const cdf =
+ symbol_decoder_context_
+ .uv_mode_cdf[static_cast<int>(chroma_from_luma_allowed)][bp.y_mode];
+ if (chroma_from_luma_allowed) {
+ bp.uv_mode = static_cast<PredictionMode>(
+ reader_.ReadSymbol<kIntraPredictionModesUV>(cdf));
+ } else {
+ bp.uv_mode = static_cast<PredictionMode>(
+ reader_.ReadSymbol<kIntraPredictionModesUV - 1>(cdf));
+ }
+}
+
+int Tile::ReadMotionVectorComponent(const Block& block, const int component) {
+ const int context =
+ static_cast<int>(block.bp->prediction_parameters->use_intra_block_copy);
+ const bool sign = reader_.ReadSymbol(
+ symbol_decoder_context_.mv_sign_cdf[component][context]);
+ const int mv_class = reader_.ReadSymbol<kMvClassSymbolCount>(
+ symbol_decoder_context_.mv_class_cdf[component][context]);
+ int magnitude = 1;
+ int value;
+ uint16_t* fraction_cdf;
+ uint16_t* precision_cdf;
+ if (mv_class == 0) {
+ value = static_cast<int>(reader_.ReadSymbol(
+ symbol_decoder_context_.mv_class0_bit_cdf[component][context]));
+ fraction_cdf = symbol_decoder_context_
+ .mv_class0_fraction_cdf[component][context][value];
+ precision_cdf = symbol_decoder_context_
+ .mv_class0_high_precision_cdf[component][context];
+ } else {
+ assert(mv_class <= kMvBitSymbolCount);
+ value = 0;
+ for (int i = 0; i < mv_class; ++i) {
+ const int bit = static_cast<int>(reader_.ReadSymbol(
+ symbol_decoder_context_.mv_bit_cdf[component][context][i]));
+ value |= bit << i;
+ }
+ magnitude += 2 << (mv_class + 2);
+ fraction_cdf = symbol_decoder_context_.mv_fraction_cdf[component][context];
+ precision_cdf =
+ symbol_decoder_context_.mv_high_precision_cdf[component][context];
+ }
+ const int fraction =
+ (frame_header_.force_integer_mv == 0)
+ ? reader_.ReadSymbol<kMvFractionSymbolCount>(fraction_cdf)
+ : 3;
+ const int precision =
+ frame_header_.allow_high_precision_mv
+ ? static_cast<int>(reader_.ReadSymbol(precision_cdf))
+ : 1;
+ magnitude += (value << 3) | (fraction << 1) | precision;
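+  // For example, mv_class = 1 with a bit value of 1, fraction = 2 and
+  // precision = 1 gives 1 + (2 << 3) + ((1 << 3) | (2 << 1) | 1) = 30 before
+  // the sign is applied.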
+ return sign ? -magnitude : magnitude;
+}
+
+void Tile::ReadMotionVector(const Block& block, int index) {
+ BlockParameters& bp = *block.bp;
+ const int context =
+ static_cast<int>(block.bp->prediction_parameters->use_intra_block_copy);
+ const auto mv_joint =
+ static_cast<MvJointType>(reader_.ReadSymbol<kNumMvJointTypes>(
+ symbol_decoder_context_.mv_joint_cdf[context]));
+ if (mv_joint == kMvJointTypeHorizontalZeroVerticalNonZero ||
+ mv_joint == kMvJointTypeNonZero) {
+ bp.mv.mv[index].mv[0] = ReadMotionVectorComponent(block, 0);
+ }
+ if (mv_joint == kMvJointTypeHorizontalNonZeroVerticalZero ||
+ mv_joint == kMvJointTypeNonZero) {
+ bp.mv.mv[index].mv[1] = ReadMotionVectorComponent(block, 1);
+ }
+}
+
+void Tile::ReadFilterIntraModeInfo(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ prediction_parameters.use_filter_intra = false;
+ if (!sequence_header_.enable_filter_intra || bp.y_mode != kPredictionModeDc ||
+ bp.palette_mode_info.size[kPlaneTypeY] != 0 ||
+ !IsBlockDimensionLessThan64(block.size)) {
+ return;
+ }
+ prediction_parameters.use_filter_intra = reader_.ReadSymbol(
+ symbol_decoder_context_.use_filter_intra_cdf[block.size]);
+ if (prediction_parameters.use_filter_intra) {
+ prediction_parameters.filter_intra_mode = static_cast<FilterIntraPredictor>(
+ reader_.ReadSymbol<kNumFilterIntraPredictors>(
+ symbol_decoder_context_.filter_intra_mode_cdf));
+ }
+}
+
+bool Tile::DecodeIntraModeInfo(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ bp.skip = false;
+ if (frame_header_.segmentation.segment_id_pre_skip &&
+ !ReadIntraSegmentId(block)) {
+ return false;
+ }
+ bp.skip_mode = false;
+ ReadSkip(block);
+ if (!frame_header_.segmentation.segment_id_pre_skip &&
+ !ReadIntraSegmentId(block)) {
+ return false;
+ }
+ ReadCdef(block);
+ if (read_deltas_) {
+ ReadQuantizerIndexDelta(block);
+ ReadLoopFilterDelta(block);
+ read_deltas_ = false;
+ }
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ prediction_parameters.use_intra_block_copy = false;
+ if (frame_header_.allow_intrabc) {
+ prediction_parameters.use_intra_block_copy =
+ reader_.ReadSymbol(symbol_decoder_context_.intra_block_copy_cdf);
+ }
+ if (prediction_parameters.use_intra_block_copy) {
+ bp.is_inter = true;
+ bp.reference_frame[0] = kReferenceFrameIntra;
+ bp.reference_frame[1] = kReferenceFrameNone;
+ bp.y_mode = kPredictionModeDc;
+ bp.uv_mode = kPredictionModeDc;
+ prediction_parameters.motion_mode = kMotionModeSimple;
+ prediction_parameters.compound_prediction_type =
+ kCompoundPredictionTypeAverage;
+ bp.palette_mode_info.size[kPlaneTypeY] = 0;
+ bp.palette_mode_info.size[kPlaneTypeUV] = 0;
+ bp.interpolation_filter[0] = kInterpolationFilterBilinear;
+ bp.interpolation_filter[1] = kInterpolationFilterBilinear;
+ MvContexts dummy_mode_contexts;
+ FindMvStack(block, /*is_compound=*/false, &dummy_mode_contexts);
+ return AssignIntraMv(block);
+ }
+ bp.is_inter = false;
+ return ReadIntraBlockModeInfo(block, /*intra_y_mode=*/true);
+}
+
+int8_t Tile::ComputePredictedSegmentId(const Block& block) const {
+ // If prev_segment_ids_ is null, treat it as if it pointed to a segmentation
+ // map containing all 0s.
+ if (prev_segment_ids_ == nullptr) return 0;
+
+ const int x_limit = std::min(frame_header_.columns4x4 - block.column4x4,
+ static_cast<int>(block.width4x4));
+ const int y_limit = std::min(frame_header_.rows4x4 - block.row4x4,
+ static_cast<int>(block.height4x4));
+ int8_t id = 7;
+ for (int y = 0; y < y_limit; ++y) {
+ for (int x = 0; x < x_limit; ++x) {
+ const int8_t prev_segment_id =
+ prev_segment_ids_->segment_id(block.row4x4 + y, block.column4x4 + x);
+ id = std::min(id, prev_segment_id);
+ }
+ }
+ return id;
+}
+
+bool Tile::ReadInterSegmentId(const Block& block, bool pre_skip) {
+ BlockParameters& bp = *block.bp;
+ if (!frame_header_.segmentation.enabled) {
+ bp.segment_id = 0;
+ return true;
+ }
+ if (!frame_header_.segmentation.update_map) {
+ bp.segment_id = ComputePredictedSegmentId(block);
+ return true;
+ }
+ if (pre_skip) {
+ if (!frame_header_.segmentation.segment_id_pre_skip) {
+ bp.segment_id = 0;
+ return true;
+ }
+ } else if (bp.skip) {
+ bp.use_predicted_segment_id = false;
+ return ReadSegmentId(block);
+ }
+ if (frame_header_.segmentation.temporal_update) {
+ const int context =
+ (block.left_available[kPlaneY]
+ ? static_cast<int>(block.bp_left->use_predicted_segment_id)
+ : 0) +
+ (block.top_available[kPlaneY]
+ ? static_cast<int>(block.bp_top->use_predicted_segment_id)
+ : 0);
+ bp.use_predicted_segment_id = reader_.ReadSymbol(
+ symbol_decoder_context_.use_predicted_segment_id_cdf[context]);
+ if (bp.use_predicted_segment_id) {
+ bp.segment_id = ComputePredictedSegmentId(block);
+ return true;
+ }
+ }
+ return ReadSegmentId(block);
+}
+
+void Tile::ReadIsInter(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (bp.skip_mode) {
+ bp.is_inter = true;
+ return;
+ }
+ if (frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureReferenceFrame)) {
+ bp.is_inter =
+ frame_header_.segmentation
+ .feature_data[bp.segment_id][kSegmentFeatureReferenceFrame] !=
+ kReferenceFrameIntra;
+ return;
+ }
+ if (frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureGlobalMv)) {
+ bp.is_inter = true;
+ return;
+ }
+ int context = 0;
+ if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) {
+ context = (block.IsTopIntra() && block.IsLeftIntra())
+ ? 3
+ : static_cast<int>(block.IsTopIntra() || block.IsLeftIntra());
+ } else if (block.top_available[kPlaneY] || block.left_available[kPlaneY]) {
+ context = 2 * static_cast<int>(block.top_available[kPlaneY]
+ ? block.IsTopIntra()
+ : block.IsLeftIntra());
+ }
+ bp.is_inter =
+ reader_.ReadSymbol(symbol_decoder_context_.is_inter_cdf[context]);
+}
+
+bool Tile::ReadIntraBlockModeInfo(const Block& block, bool intra_y_mode) {
+ BlockParameters& bp = *block.bp;
+ bp.reference_frame[0] = kReferenceFrameIntra;
+ bp.reference_frame[1] = kReferenceFrameNone;
+ ReadPredictionModeY(block, intra_y_mode);
+ ReadIntraAngleInfo(block, kPlaneTypeY);
+ if (block.HasChroma()) {
+ ReadPredictionModeUV(block);
+ if (bp.uv_mode == kPredictionModeChromaFromLuma) {
+ ReadCflAlpha(block);
+ }
+ ReadIntraAngleInfo(block, kPlaneTypeUV);
+ }
+ ReadPaletteModeInfo(block);
+ ReadFilterIntraModeInfo(block);
+ return true;
+}
+
+CompoundReferenceType Tile::ReadCompoundReferenceType(const Block& block) {
+ // compound and inter.
+ const bool top_comp_inter = block.top_available[kPlaneY] &&
+ !block.IsTopIntra() && !block.IsTopSingle();
+ const bool left_comp_inter = block.left_available[kPlaneY] &&
+ !block.IsLeftIntra() && !block.IsLeftSingle();
+ // unidirectional compound.
+ const bool top_uni_comp =
+ top_comp_inter && IsSameDirectionReferencePair(block.TopReference(0),
+ block.TopReference(1));
+ const bool left_uni_comp =
+ left_comp_inter && IsSameDirectionReferencePair(block.LeftReference(0),
+ block.LeftReference(1));
+ int context;
+ if (block.top_available[kPlaneY] && !block.IsTopIntra() &&
+ block.left_available[kPlaneY] && !block.IsLeftIntra()) {
+ const int same_direction = static_cast<int>(IsSameDirectionReferencePair(
+ block.TopReference(0), block.LeftReference(0)));
+ if (!top_comp_inter && !left_comp_inter) {
+ context = 1 + MultiplyBy2(same_direction);
+ } else if (!top_comp_inter) {
+ context = left_uni_comp ? 3 + same_direction : 1;
+ } else if (!left_comp_inter) {
+ context = top_uni_comp ? 3 + same_direction : 1;
+ } else {
+ if (!top_uni_comp && !left_uni_comp) {
+ context = 0;
+ } else if (!top_uni_comp || !left_uni_comp) {
+ context = 2;
+ } else {
+ context = 3 + static_cast<int>(
+ (block.TopReference(0) == kReferenceFrameBackward) ==
+ (block.LeftReference(0) == kReferenceFrameBackward));
+ }
+ }
+ } else if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) {
+ if (top_comp_inter) {
+ context = 1 + MultiplyBy2(static_cast<int>(top_uni_comp));
+ } else if (left_comp_inter) {
+ context = 1 + MultiplyBy2(static_cast<int>(left_uni_comp));
+ } else {
+ context = 2;
+ }
+ } else if (top_comp_inter) {
+ context = MultiplyBy4(static_cast<int>(top_uni_comp));
+ } else if (left_comp_inter) {
+ context = MultiplyBy4(static_cast<int>(left_uni_comp));
+ } else {
+ context = 2;
+ }
+ return static_cast<CompoundReferenceType>(reader_.ReadSymbol(
+ symbol_decoder_context_.compound_reference_type_cdf[context]));
+}
+
+template <bool is_single, bool is_backward, int index>
+uint16_t* Tile::GetReferenceCdf(
+ const Block& block,
+ CompoundReferenceType type /*= kNumCompoundReferenceTypes*/) {
+ int context = 0;
+ if ((type == kCompoundReferenceUnidirectional && index == 0) ||
+ (is_single && index == 1)) {
+ // uni_comp_ref and single_ref_p1.
+ context =
+ GetReferenceContext(block, kReferenceFrameLast, kReferenceFrameGolden,
+ kReferenceFrameBackward, kReferenceFrameAlternate);
+ } else if (type == kCompoundReferenceUnidirectional && index == 1) {
+ // uni_comp_ref_p1.
+ context =
+ GetReferenceContext(block, kReferenceFrameLast2, kReferenceFrameLast2,
+ kReferenceFrameLast3, kReferenceFrameGolden);
+ } else if ((type == kCompoundReferenceUnidirectional && index == 2) ||
+ (type == kCompoundReferenceBidirectional && index == 2) ||
+ (is_single && index == 5)) {
+ // uni_comp_ref_p2, comp_ref_p2 and single_ref_p5.
+ context =
+ GetReferenceContext(block, kReferenceFrameLast3, kReferenceFrameLast3,
+ kReferenceFrameGolden, kReferenceFrameGolden);
+ } else if ((type == kCompoundReferenceBidirectional && index == 0) ||
+ (is_single && index == 3)) {
+ // comp_ref and single_ref_p3.
+ context =
+ GetReferenceContext(block, kReferenceFrameLast, kReferenceFrameLast2,
+ kReferenceFrameLast3, kReferenceFrameGolden);
+ } else if ((type == kCompoundReferenceBidirectional && index == 1) ||
+ (is_single && index == 4)) {
+ // comp_ref_p1 and single_ref_p4.
+ context =
+ GetReferenceContext(block, kReferenceFrameLast, kReferenceFrameLast,
+ kReferenceFrameLast2, kReferenceFrameLast2);
+ } else if ((is_single && index == 2) || (is_backward && index == 0)) {
+ // single_ref_p2 and comp_bwdref.
+ context = GetReferenceContext(
+ block, kReferenceFrameBackward, kReferenceFrameAlternate2,
+ kReferenceFrameAlternate, kReferenceFrameAlternate);
+ } else if ((is_single && index == 6) || (is_backward && index == 1)) {
+ // single_ref_p6 and comp_bwdref_p1.
+ context = GetReferenceContext(
+ block, kReferenceFrameBackward, kReferenceFrameBackward,
+ kReferenceFrameAlternate2, kReferenceFrameAlternate2);
+ }
+ if (is_single) {
+    // The index parameter for single references is offset by one since the
+    // spec uses a 1-based index for these elements.
+ return symbol_decoder_context_.single_reference_cdf[context][index - 1];
+ }
+ if (is_backward) {
+ return symbol_decoder_context_
+ .compound_backward_reference_cdf[context][index];
+ }
+ return symbol_decoder_context_.compound_reference_cdf[type][context][index];
+}
+
+void Tile::ReadReferenceFrames(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (bp.skip_mode) {
+ bp.reference_frame[0] = frame_header_.skip_mode_frame[0];
+ bp.reference_frame[1] = frame_header_.skip_mode_frame[1];
+ return;
+ }
+ if (frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureReferenceFrame)) {
+ bp.reference_frame[0] = static_cast<ReferenceFrameType>(
+ frame_header_.segmentation
+ .feature_data[bp.segment_id][kSegmentFeatureReferenceFrame]);
+ bp.reference_frame[1] = kReferenceFrameNone;
+ return;
+ }
+ if (frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureSkip) ||
+ frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureGlobalMv)) {
+ bp.reference_frame[0] = kReferenceFrameLast;
+ bp.reference_frame[1] = kReferenceFrameNone;
+ return;
+ }
+ const bool use_compound_reference =
+ frame_header_.reference_mode_select &&
+ std::min(block.width4x4, block.height4x4) >= 2 &&
+ reader_.ReadSymbol(symbol_decoder_context_.use_compound_reference_cdf
+ [GetUseCompoundReferenceContext(block)]);
+ if (use_compound_reference) {
+ CompoundReferenceType reference_type = ReadCompoundReferenceType(block);
+ if (reference_type == kCompoundReferenceUnidirectional) {
+ // uni_comp_ref.
+ if (reader_.ReadSymbol(
+ GetReferenceCdf<false, false, 0>(block, reference_type))) {
+ bp.reference_frame[0] = kReferenceFrameBackward;
+ bp.reference_frame[1] = kReferenceFrameAlternate;
+ return;
+ }
+ // uni_comp_ref_p1.
+ if (!reader_.ReadSymbol(
+ GetReferenceCdf<false, false, 1>(block, reference_type))) {
+ bp.reference_frame[0] = kReferenceFrameLast;
+ bp.reference_frame[1] = kReferenceFrameLast2;
+ return;
+ }
+ // uni_comp_ref_p2.
+ if (reader_.ReadSymbol(
+ GetReferenceCdf<false, false, 2>(block, reference_type))) {
+ bp.reference_frame[0] = kReferenceFrameLast;
+ bp.reference_frame[1] = kReferenceFrameGolden;
+ return;
+ }
+ bp.reference_frame[0] = kReferenceFrameLast;
+ bp.reference_frame[1] = kReferenceFrameLast3;
+ return;
+ }
+ assert(reference_type == kCompoundReferenceBidirectional);
+ // comp_ref.
+ if (reader_.ReadSymbol(
+ GetReferenceCdf<false, false, 0>(block, reference_type))) {
+ // comp_ref_p2.
+ bp.reference_frame[0] =
+ reader_.ReadSymbol(
+ GetReferenceCdf<false, false, 2>(block, reference_type))
+ ? kReferenceFrameGolden
+ : kReferenceFrameLast3;
+ } else {
+ // comp_ref_p1.
+ bp.reference_frame[0] =
+ reader_.ReadSymbol(
+ GetReferenceCdf<false, false, 1>(block, reference_type))
+ ? kReferenceFrameLast2
+ : kReferenceFrameLast;
+ }
+ // comp_bwdref.
+ if (reader_.ReadSymbol(GetReferenceCdf<false, true, 0>(block))) {
+ bp.reference_frame[1] = kReferenceFrameAlternate;
+ } else {
+ // comp_bwdref_p1.
+ bp.reference_frame[1] =
+ reader_.ReadSymbol(GetReferenceCdf<false, true, 1>(block))
+ ? kReferenceFrameAlternate2
+ : kReferenceFrameBackward;
+ }
+ return;
+ }
+ assert(!use_compound_reference);
+ bp.reference_frame[1] = kReferenceFrameNone;
+ // single_ref_p1.
+ if (reader_.ReadSymbol(GetReferenceCdf<true, false, 1>(block))) {
+ // single_ref_p2.
+ if (reader_.ReadSymbol(GetReferenceCdf<true, false, 2>(block))) {
+ bp.reference_frame[0] = kReferenceFrameAlternate;
+ return;
+ }
+ // single_ref_p6.
+ bp.reference_frame[0] =
+ reader_.ReadSymbol(GetReferenceCdf<true, false, 6>(block))
+ ? kReferenceFrameAlternate2
+ : kReferenceFrameBackward;
+ return;
+ }
+ // single_ref_p3.
+ if (reader_.ReadSymbol(GetReferenceCdf<true, false, 3>(block))) {
+ // single_ref_p5.
+ bp.reference_frame[0] =
+ reader_.ReadSymbol(GetReferenceCdf<true, false, 5>(block))
+ ? kReferenceFrameGolden
+ : kReferenceFrameLast3;
+ return;
+ }
+ // single_ref_p4.
+ bp.reference_frame[0] =
+ reader_.ReadSymbol(GetReferenceCdf<true, false, 4>(block))
+ ? kReferenceFrameLast2
+ : kReferenceFrameLast;
+}
+
+void Tile::ReadInterPredictionModeY(const Block& block,
+ const MvContexts& mode_contexts) {
+ BlockParameters& bp = *block.bp;
+ if (bp.skip_mode) {
+ bp.y_mode = kPredictionModeNearestNearestMv;
+ return;
+ }
+ if (frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureSkip) ||
+ frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureGlobalMv)) {
+ bp.y_mode = kPredictionModeGlobalMv;
+ return;
+ }
+ if (bp.reference_frame[1] > kReferenceFrameIntra) {
+ const int idx0 = mode_contexts.reference_mv >> 1;
+ const int idx1 =
+ std::min(mode_contexts.new_mv, kCompoundModeNewMvContexts - 1);
+ const int context = kCompoundModeContextMap[idx0][idx1];
+ const int offset = reader_.ReadSymbol<kNumCompoundInterPredictionModes>(
+ symbol_decoder_context_.compound_prediction_mode_cdf[context]);
+ bp.y_mode =
+ static_cast<PredictionMode>(kPredictionModeNearestNearestMv + offset);
+ return;
+ }
+ // new_mv.
+ if (!reader_.ReadSymbol(
+ symbol_decoder_context_.new_mv_cdf[mode_contexts.new_mv])) {
+ bp.y_mode = kPredictionModeNewMv;
+ return;
+ }
+ // zero_mv.
+ if (!reader_.ReadSymbol(
+ symbol_decoder_context_.zero_mv_cdf[mode_contexts.zero_mv])) {
+ bp.y_mode = kPredictionModeGlobalMv;
+ return;
+ }
+ // ref_mv.
+ bp.y_mode =
+ reader_.ReadSymbol(
+ symbol_decoder_context_.reference_mv_cdf[mode_contexts.reference_mv])
+ ? kPredictionModeNearMv
+ : kPredictionModeNearestMv;
+}
+
+void Tile::ReadRefMvIndex(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ prediction_parameters.ref_mv_index = 0;
+ if (bp.y_mode != kPredictionModeNewMv &&
+ bp.y_mode != kPredictionModeNewNewMv &&
+ !kPredictionModeHasNearMvMask.Contains(bp.y_mode)) {
+ return;
+ }
+ const int start =
+ static_cast<int>(kPredictionModeHasNearMvMask.Contains(bp.y_mode));
+ prediction_parameters.ref_mv_index = start;
+ for (int i = start; i < start + 2; ++i) {
+ if (prediction_parameters.ref_mv_count <= i + 1) break;
+ // drl_mode in the spec.
+ const bool ref_mv_index_bit = reader_.ReadSymbol(
+ symbol_decoder_context_.ref_mv_index_cdf[GetRefMvIndexContext(
+ prediction_parameters.nearest_mv_count, i)]);
+ prediction_parameters.ref_mv_index = i + static_cast<int>(ref_mv_index_bit);
+ if (!ref_mv_index_bit) return;
+ }
+}
+
+void Tile::ReadInterIntraMode(const Block& block, bool is_compound) {
+ BlockParameters& bp = *block.bp;
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ prediction_parameters.inter_intra_mode = kNumInterIntraModes;
+ prediction_parameters.is_wedge_inter_intra = false;
+ if (bp.skip_mode || !sequence_header_.enable_interintra_compound ||
+ is_compound || !kIsInterIntraModeAllowedMask.Contains(block.size)) {
+ return;
+ }
+ // kSizeGroup[block.size] is guaranteed to be non-zero because of the block
+ // size constraint enforced in the above condition.
+ assert(kSizeGroup[block.size] - 1 >= 0);
+ if (!reader_.ReadSymbol(
+ symbol_decoder_context_
+ .is_inter_intra_cdf[kSizeGroup[block.size] - 1])) {
+ prediction_parameters.inter_intra_mode = kNumInterIntraModes;
+ return;
+ }
+ prediction_parameters.inter_intra_mode =
+ static_cast<InterIntraMode>(reader_.ReadSymbol<kNumInterIntraModes>(
+ symbol_decoder_context_
+ .inter_intra_mode_cdf[kSizeGroup[block.size] - 1]));
+ bp.reference_frame[1] = kReferenceFrameIntra;
+ prediction_parameters.angle_delta[kPlaneTypeY] = 0;
+ prediction_parameters.angle_delta[kPlaneTypeUV] = 0;
+ prediction_parameters.use_filter_intra = false;
+ prediction_parameters.is_wedge_inter_intra = reader_.ReadSymbol(
+ symbol_decoder_context_.is_wedge_inter_intra_cdf[block.size]);
+ if (!prediction_parameters.is_wedge_inter_intra) return;
+ prediction_parameters.wedge_index =
+ reader_.ReadSymbol<kWedgeIndexSymbolCount>(
+ symbol_decoder_context_.wedge_index_cdf[block.size]);
+ prediction_parameters.wedge_sign = 0;
+}
+
+void Tile::ReadMotionMode(const Block& block, bool is_compound) {
+ BlockParameters& bp = *block.bp;
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ const auto global_motion_type =
+ frame_header_.global_motion[bp.reference_frame[0]].type;
+ if (bp.skip_mode || !frame_header_.is_motion_mode_switchable ||
+ IsBlockDimension4(block.size) ||
+ (frame_header_.force_integer_mv == 0 &&
+ (bp.y_mode == kPredictionModeGlobalMv ||
+ bp.y_mode == kPredictionModeGlobalGlobalMv) &&
+ global_motion_type > kGlobalMotionTransformationTypeTranslation) ||
+ is_compound || bp.reference_frame[1] == kReferenceFrameIntra ||
+ !block.HasOverlappableCandidates()) {
+ prediction_parameters.motion_mode = kMotionModeSimple;
+ return;
+ }
+ prediction_parameters.num_warp_samples = 0;
+ int num_samples_scanned = 0;
+ memset(prediction_parameters.warp_estimate_candidates, 0,
+ sizeof(prediction_parameters.warp_estimate_candidates));
+ FindWarpSamples(block, &prediction_parameters.num_warp_samples,
+ &num_samples_scanned,
+ prediction_parameters.warp_estimate_candidates);
+ if (frame_header_.force_integer_mv != 0 ||
+ prediction_parameters.num_warp_samples == 0 ||
+ !frame_header_.allow_warped_motion || IsScaled(bp.reference_frame[0])) {
+ prediction_parameters.motion_mode =
+ reader_.ReadSymbol(symbol_decoder_context_.use_obmc_cdf[block.size])
+ ? kMotionModeObmc
+ : kMotionModeSimple;
+ return;
+ }
+ prediction_parameters.motion_mode =
+ static_cast<MotionMode>(reader_.ReadSymbol<kNumMotionModes>(
+ symbol_decoder_context_.motion_mode_cdf[block.size]));
+}
+
+uint16_t* Tile::GetIsExplicitCompoundTypeCdf(const Block& block) {
+ int context = 0;
+ if (block.top_available[kPlaneY]) {
+ if (!block.IsTopSingle()) {
+ context += static_cast<int>(block.bp_top->is_explicit_compound_type);
+ } else if (block.TopReference(0) == kReferenceFrameAlternate) {
+ context += 3;
+ }
+ }
+ if (block.left_available[kPlaneY]) {
+ if (!block.IsLeftSingle()) {
+ context += static_cast<int>(block.bp_left->is_explicit_compound_type);
+ } else if (block.LeftReference(0) == kReferenceFrameAlternate) {
+ context += 3;
+ }
+ }
+ return symbol_decoder_context_.is_explicit_compound_type_cdf[std::min(
+ context, kIsExplicitCompoundTypeContexts - 1)];
+}
+
+uint16_t* Tile::GetIsCompoundTypeAverageCdf(const Block& block) {
+ const BlockParameters& bp = *block.bp;
+ const ReferenceInfo& reference_info = *current_frame_.reference_info();
+ const int forward =
+ std::abs(reference_info.relative_distance_from[bp.reference_frame[0]]);
+ const int backward =
+ std::abs(reference_info.relative_distance_from[bp.reference_frame[1]]);
+ int context = (forward == backward) ? 3 : 0;
+ if (block.top_available[kPlaneY]) {
+ if (!block.IsTopSingle()) {
+ context += static_cast<int>(block.bp_top->is_compound_type_average);
+ } else if (block.TopReference(0) == kReferenceFrameAlternate) {
+ ++context;
+ }
+ }
+ if (block.left_available[kPlaneY]) {
+ if (!block.IsLeftSingle()) {
+ context += static_cast<int>(block.bp_left->is_compound_type_average);
+ } else if (block.LeftReference(0) == kReferenceFrameAlternate) {
+ ++context;
+ }
+ }
+ return symbol_decoder_context_.is_compound_type_average_cdf[context];
+}
+
+void Tile::ReadCompoundType(const Block& block, bool is_compound) {
+ BlockParameters& bp = *block.bp;
+ bp.is_explicit_compound_type = false;
+ bp.is_compound_type_average = true;
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ if (bp.skip_mode) {
+ prediction_parameters.compound_prediction_type =
+ kCompoundPredictionTypeAverage;
+ return;
+ }
+ if (is_compound) {
+ if (sequence_header_.enable_masked_compound) {
+ bp.is_explicit_compound_type =
+ reader_.ReadSymbol(GetIsExplicitCompoundTypeCdf(block));
+ }
+ if (bp.is_explicit_compound_type) {
+ if (kIsWedgeCompoundModeAllowed.Contains(block.size)) {
+ // Only kCompoundPredictionTypeWedge and
+ // kCompoundPredictionTypeDiffWeighted are signaled explicitly.
+ prediction_parameters.compound_prediction_type =
+ static_cast<CompoundPredictionType>(reader_.ReadSymbol(
+ symbol_decoder_context_.compound_type_cdf[block.size]));
+ } else {
+ prediction_parameters.compound_prediction_type =
+ kCompoundPredictionTypeDiffWeighted;
+ }
+ } else {
+ if (sequence_header_.enable_jnt_comp) {
+ bp.is_compound_type_average =
+ reader_.ReadSymbol(GetIsCompoundTypeAverageCdf(block));
+ prediction_parameters.compound_prediction_type =
+ bp.is_compound_type_average ? kCompoundPredictionTypeAverage
+ : kCompoundPredictionTypeDistance;
+ } else {
+ prediction_parameters.compound_prediction_type =
+ kCompoundPredictionTypeAverage;
+ return;
+ }
+ }
+ if (prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeWedge) {
+ prediction_parameters.wedge_index =
+ reader_.ReadSymbol<kWedgeIndexSymbolCount>(
+ symbol_decoder_context_.wedge_index_cdf[block.size]);
+ prediction_parameters.wedge_sign = static_cast<int>(reader_.ReadBit());
+ } else if (prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeDiffWeighted) {
+ prediction_parameters.mask_is_inverse =
+ static_cast<bool>(reader_.ReadBit());
+ }
+ return;
+ }
+ if (prediction_parameters.inter_intra_mode != kNumInterIntraModes) {
+ prediction_parameters.compound_prediction_type =
+ prediction_parameters.is_wedge_inter_intra
+ ? kCompoundPredictionTypeWedge
+ : kCompoundPredictionTypeIntra;
+ return;
+ }
+ prediction_parameters.compound_prediction_type =
+ kCompoundPredictionTypeAverage;
+}
+
+uint16_t* Tile::GetInterpolationFilterCdf(const Block& block, int direction) {
+ const BlockParameters& bp = *block.bp;
+ int context = MultiplyBy8(direction) +
+ MultiplyBy4(static_cast<int>(bp.reference_frame[1] >
+ kReferenceFrameIntra));
+ int top_type = kNumExplicitInterpolationFilters;
+ if (block.top_available[kPlaneY]) {
+ if (block.bp_top->reference_frame[0] == bp.reference_frame[0] ||
+ block.bp_top->reference_frame[1] == bp.reference_frame[0]) {
+ top_type = block.bp_top->interpolation_filter[direction];
+ }
+ }
+ int left_type = kNumExplicitInterpolationFilters;
+ if (block.left_available[kPlaneY]) {
+ if (block.bp_left->reference_frame[0] == bp.reference_frame[0] ||
+ block.bp_left->reference_frame[1] == bp.reference_frame[0]) {
+ left_type = block.bp_left->interpolation_filter[direction];
+ }
+ }
+ if (left_type == top_type) {
+ context += left_type;
+ } else if (left_type == kNumExplicitInterpolationFilters) {
+ context += top_type;
+ } else if (top_type == kNumExplicitInterpolationFilters) {
+ context += left_type;
+ } else {
+ context += kNumExplicitInterpolationFilters;
+ }
+ return symbol_decoder_context_.interpolation_filter_cdf[context];
+}
+
+void Tile::ReadInterpolationFilter(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (frame_header_.interpolation_filter != kInterpolationFilterSwitchable) {
+ static_assert(
+ sizeof(bp.interpolation_filter) / sizeof(bp.interpolation_filter[0]) ==
+ 2,
+ "Interpolation filter array size is not 2");
+ for (auto& interpolation_filter : bp.interpolation_filter) {
+ interpolation_filter = frame_header_.interpolation_filter;
+ }
+ return;
+ }
+ bool interpolation_filter_present = true;
+ if (bp.skip_mode ||
+ block.bp->prediction_parameters->motion_mode == kMotionModeLocalWarp) {
+ interpolation_filter_present = false;
+ } else if (!IsBlockDimension4(block.size) &&
+ bp.y_mode == kPredictionModeGlobalMv) {
+ interpolation_filter_present =
+ frame_header_.global_motion[bp.reference_frame[0]].type ==
+ kGlobalMotionTransformationTypeTranslation;
+ } else if (!IsBlockDimension4(block.size) &&
+ bp.y_mode == kPredictionModeGlobalGlobalMv) {
+ interpolation_filter_present =
+ frame_header_.global_motion[bp.reference_frame[0]].type ==
+ kGlobalMotionTransformationTypeTranslation ||
+ frame_header_.global_motion[bp.reference_frame[1]].type ==
+ kGlobalMotionTransformationTypeTranslation;
+ }
+ for (int i = 0; i < (sequence_header_.enable_dual_filter ? 2 : 1); ++i) {
+ bp.interpolation_filter[i] =
+ interpolation_filter_present
+ ? static_cast<InterpolationFilter>(
+ reader_.ReadSymbol<kNumExplicitInterpolationFilters>(
+ GetInterpolationFilterCdf(block, i)))
+ : kInterpolationFilterEightTap;
+ }
+ if (!sequence_header_.enable_dual_filter) {
+ bp.interpolation_filter[1] = bp.interpolation_filter[0];
+ }
+}
+
+bool Tile::ReadInterBlockModeInfo(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ bp.palette_mode_info.size[kPlaneTypeY] = 0;
+ bp.palette_mode_info.size[kPlaneTypeUV] = 0;
+ ReadReferenceFrames(block);
+ const bool is_compound = bp.reference_frame[1] > kReferenceFrameIntra;
+ MvContexts mode_contexts;
+ FindMvStack(block, is_compound, &mode_contexts);
+ ReadInterPredictionModeY(block, mode_contexts);
+ ReadRefMvIndex(block);
+ if (!AssignInterMv(block, is_compound)) return false;
+ ReadInterIntraMode(block, is_compound);
+ ReadMotionMode(block, is_compound);
+ ReadCompoundType(block, is_compound);
+ ReadInterpolationFilter(block);
+ return true;
+}
+
+bool Tile::DecodeInterModeInfo(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ block.bp->prediction_parameters->use_intra_block_copy = false;
+ bp.skip = false;
+ if (!ReadInterSegmentId(block, /*pre_skip=*/true)) return false;
+ ReadSkipMode(block);
+ if (bp.skip_mode) {
+ bp.skip = true;
+ } else {
+ ReadSkip(block);
+ }
+ if (!frame_header_.segmentation.segment_id_pre_skip &&
+ !ReadInterSegmentId(block, /*pre_skip=*/false)) {
+ return false;
+ }
+ ReadCdef(block);
+ if (read_deltas_) {
+ ReadQuantizerIndexDelta(block);
+ ReadLoopFilterDelta(block);
+ read_deltas_ = false;
+ }
+ ReadIsInter(block);
+ return bp.is_inter ? ReadInterBlockModeInfo(block)
+ : ReadIntraBlockModeInfo(block, /*intra_y_mode=*/false);
+}
+
+bool Tile::DecodeModeInfo(const Block& block) {
+ return IsIntraFrame(frame_header_.frame_type) ? DecodeIntraModeInfo(block)
+ : DecodeInterModeInfo(block);
+}
+
+} // namespace libgav1
diff --git a/src/tile/bitstream/palette.cc b/src/tile/bitstream/palette.cc
new file mode 100644
index 0000000..674d210
--- /dev/null
+++ b/src/tile/bitstream/palette.cc
@@ -0,0 +1,319 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <memory>
+
+#include "src/obu_parser.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+int Tile::GetPaletteCache(const Block& block, PlaneType plane_type,
+ uint16_t* const cache) {
+ const int top_size =
+ (block.top_available[kPlaneY] && Mod64(MultiplyBy4(block.row4x4)) != 0)
+ ? block.bp_top->palette_mode_info.size[plane_type]
+ : 0;
+ const int left_size = block.left_available[kPlaneY]
+ ? block.bp_left->palette_mode_info.size[plane_type]
+ : 0;
+ if (left_size == 0 && top_size == 0) return 0;
+ // Merge the left and top colors in sorted order and store them in |cache|.
+ uint16_t dummy[1];
+ const uint16_t* top = (top_size > 0)
+ ? block.bp_top->palette_mode_info.color[plane_type]
+ : dummy;
+ const uint16_t* left =
+ (left_size > 0) ? block.bp_left->palette_mode_info.color[plane_type]
+ : dummy;
+ std::merge(top, top + top_size, left, left + left_size, cache);
+ // Deduplicate the entries in |cache| and return the number of unique
+ // entries.
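+  // For example, top = {10, 20} and left = {15, 20} merge to {10, 15, 20, 20}
+  // and deduplicate to {10, 15, 20}, so 3 is returned.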
+ return static_cast<int>(
+ std::distance(cache, std::unique(cache, cache + left_size + top_size)));
+}
+
+void Tile::ReadPaletteColors(const Block& block, Plane plane) {
+ const PlaneType plane_type = GetPlaneType(plane);
+ uint16_t cache[2 * kMaxPaletteSize];
+ const int n = GetPaletteCache(block, plane_type, cache);
+ BlockParameters& bp = *block.bp;
+ const uint8_t palette_size = bp.palette_mode_info.size[plane_type];
+ uint16_t* const palette_color = bp.palette_mode_info.color[plane];
+ const int8_t bitdepth = sequence_header_.color_config.bitdepth;
+ int index = 0;
+ for (int i = 0; i < n && index < palette_size; ++i) {
+ if (reader_.ReadBit() != 0) { // use_palette_color_cache.
+ palette_color[index++] = cache[i];
+ }
+ }
+ const int merge_pivot = index;
+ if (index < palette_size) {
+ palette_color[index++] =
+ static_cast<uint16_t>(reader_.ReadLiteral(bitdepth));
+ }
+ const int max_value = (1 << bitdepth) - 1;
+ if (index < palette_size) {
+ int bits = bitdepth - 3 + static_cast<int>(reader_.ReadLiteral(2));
+ do {
+ const int delta = static_cast<int>(reader_.ReadLiteral(bits)) +
+ (plane_type == kPlaneTypeY ? 1 : 0);
+ palette_color[index] =
+ std::min(palette_color[index - 1] + delta, max_value);
+ if (palette_color[index] + (plane_type == kPlaneTypeY ? 1 : 0) >=
+ max_value) {
+ // Once the color exceeds max_value, all others can be set to max_value
+ // (since they are computed as a delta on top of the current color and
+ // then clipped).
+ Memset(&palette_color[index + 1], max_value, palette_size - index - 1);
+ break;
+ }
+ const int range = (1 << bitdepth) - palette_color[index] -
+ (plane_type == kPlaneTypeY ? 1 : 0);
+ bits = std::min(bits, CeilLog2(range));
+ } while (++index < palette_size);
+ }
+ // Palette colors are generated using two ascending arrays. So sorting them is
+ // simply a matter of merging the two sorted portions of the array.
+ std::inplace_merge(palette_color, palette_color + merge_pivot,
+ palette_color + palette_size);
+ if (plane_type == kPlaneTypeUV) {
+ uint16_t* const palette_color_v = bp.palette_mode_info.color[kPlaneV];
+ if (reader_.ReadBit() != 0) { // delta_encode_palette_colors_v.
+ const int bits = bitdepth - 4 + static_cast<int>(reader_.ReadLiteral(2));
+ palette_color_v[0] = reader_.ReadLiteral(bitdepth);
+ for (int i = 1; i < palette_size; ++i) {
+ int delta = static_cast<int>(reader_.ReadLiteral(bits));
+ if (delta != 0 && reader_.ReadBit() != 0) delta = -delta;
+ // This line is equivalent to the following lines in the spec:
+ // val = palette_colors_v[ idx - 1 ] + palette_delta_v
+ // if ( val < 0 ) val += maxVal
+ // if ( val >= maxVal ) val -= maxVal
+ // palette_colors_v[ idx ] = Clip1( val )
+ //
+ // The difference is that in the code, max_value is (1 << bitdepth) - 1.
+ // So "& max_value" has the desired effect of computing both the "if"
+ // conditions and the Clip.
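+        // For example, with bitdepth 8: 250 + 10 = 260 maps to 260 & 255 = 4
+        // (spec: 260 - 256 = 4), and 3 - 10 = -7 maps to -7 & 255 = 249
+        // (spec: -7 + 256 = 249).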
+ palette_color_v[i] = (palette_color_v[i - 1] + delta) & max_value;
+ }
+ } else {
+ for (int i = 0; i < palette_size; ++i) {
+ palette_color_v[i] =
+ static_cast<uint16_t>(reader_.ReadLiteral(bitdepth));
+ }
+ }
+ }
+}
+
+void Tile::ReadPaletteModeInfo(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (IsBlockSmallerThan8x8(block.size) || block.size > kBlock64x64 ||
+ !frame_header_.allow_screen_content_tools) {
+ bp.palette_mode_info.size[kPlaneTypeY] = 0;
+ bp.palette_mode_info.size[kPlaneTypeUV] = 0;
+ return;
+ }
+ const int block_size_context =
+ k4x4WidthLog2[block.size] + k4x4HeightLog2[block.size] - 2;
+ if (bp.y_mode == kPredictionModeDc) {
+ const int context =
+ static_cast<int>(block.top_available[kPlaneY] &&
+ block.bp_top->palette_mode_info.size[kPlaneTypeY] >
+ 0) +
+ static_cast<int>(block.left_available[kPlaneY] &&
+ block.bp_left->palette_mode_info.size[kPlaneTypeY] >
+ 0);
+ const bool has_palette_y = reader_.ReadSymbol(
+ symbol_decoder_context_.has_palette_y_cdf[block_size_context][context]);
+ if (has_palette_y) {
+ bp.palette_mode_info.size[kPlaneTypeY] =
+ kMinPaletteSize +
+ reader_.ReadSymbol<kPaletteSizeSymbolCount>(
+ symbol_decoder_context_.palette_y_size_cdf[block_size_context]);
+ ReadPaletteColors(block, kPlaneY);
+ }
+ }
+ if (bp.uv_mode == kPredictionModeDc && block.HasChroma()) {
+ const int context =
+ static_cast<int>(bp.palette_mode_info.size[kPlaneTypeY] > 0);
+ const bool has_palette_uv =
+ reader_.ReadSymbol(symbol_decoder_context_.has_palette_uv_cdf[context]);
+ if (has_palette_uv) {
+ bp.palette_mode_info.size[kPlaneTypeUV] =
+ kMinPaletteSize +
+ reader_.ReadSymbol<kPaletteSizeSymbolCount>(
+ symbol_decoder_context_.palette_uv_size_cdf[block_size_context]);
+ ReadPaletteColors(block, kPlaneU);
+ }
+ }
+}
+
+void Tile::PopulatePaletteColorContexts(
+ const Block& block, PlaneType plane_type, int i, int start, int end,
+ uint8_t color_order[kMaxPaletteSquare][kMaxPaletteSize],
+ uint8_t color_context[kMaxPaletteSquare]) {
+ const PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ for (int column = start, counter = 0; column >= end; --column, ++counter) {
+ const int row = i - column;
+ assert(row > 0 || column > 0);
+ const uint8_t top =
+ (row > 0)
+ ? prediction_parameters.color_index_map[plane_type][row - 1][column]
+ : 0;
+ const uint8_t left =
+ (column > 0)
+ ? prediction_parameters.color_index_map[plane_type][row][column - 1]
+ : 0;
+ uint8_t index_mask;
+ static_assert(kMaxPaletteSize <= 8, "");
+ int index;
+ if (column <= 0) {
+ color_context[counter] = 0;
+ color_order[counter][0] = top;
+ index_mask = 1 << top;
+ index = 1;
+ } else if (row <= 0) {
+ color_context[counter] = 0;
+ color_order[counter][0] = left;
+ index_mask = 1 << left;
+ index = 1;
+ } else {
+ const uint8_t top_left =
+ prediction_parameters
+ .color_index_map[plane_type][row - 1][column - 1];
+ index_mask = (1 << top) | (1 << left) | (1 << top_left);
+ if (top == left && top == top_left) {
+ color_context[counter] = 4;
+ color_order[counter][0] = top;
+ index = 1;
+ } else if (top == left) {
+ color_context[counter] = 3;
+ color_order[counter][0] = top;
+ color_order[counter][1] = top_left;
+ index = 2;
+ } else if (top == top_left) {
+ color_context[counter] = 2;
+ color_order[counter][0] = top_left;
+ color_order[counter][1] = left;
+ index = 2;
+ } else if (left == top_left) {
+ color_context[counter] = 2;
+ color_order[counter][0] = top_left;
+ color_order[counter][1] = top;
+ index = 2;
+ } else {
+ color_context[counter] = 1;
+ color_order[counter][0] = std::min(top, left);
+ color_order[counter][1] = std::max(top, left);
+ color_order[counter][2] = top_left;
+ index = 3;
+ }
+ }
+ // Even though only the first |palette_size| entries of this array are ever
+ // used, it is faster to populate all 8 because of the vectorization of the
+ // constant sized loop.
+ for (uint8_t j = 0; j < kMaxPaletteSize; ++j) {
+ if (BitMaskSet::MaskContainsValue(index_mask, j)) continue;
+ color_order[counter][index++] = j;
+ }
+ }
+}
+
+bool Tile::ReadPaletteTokens(const Block& block) {
+ const PaletteModeInfo& palette_mode_info = block.bp->palette_mode_info;
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ for (int plane_type = kPlaneTypeY;
+ plane_type < (block.HasChroma() ? kNumPlaneTypes : kPlaneTypeUV);
+ ++plane_type) {
+ const int palette_size = palette_mode_info.size[plane_type];
+ if (palette_size == 0) continue;
+ int block_height = block.height;
+ int block_width = block.width;
+ int screen_height = std::min(
+ block_height, MultiplyBy4(frame_header_.rows4x4 - block.row4x4));
+ int screen_width = std::min(
+ block_width, MultiplyBy4(frame_header_.columns4x4 - block.column4x4));
+ if (plane_type == kPlaneTypeUV) {
+ block_height >>= sequence_header_.color_config.subsampling_y;
+ block_width >>= sequence_header_.color_config.subsampling_x;
+ screen_height >>= sequence_header_.color_config.subsampling_y;
+ screen_width >>= sequence_header_.color_config.subsampling_x;
+ if (block_height < 4) {
+ block_height += 2;
+ screen_height += 2;
+ }
+ if (block_width < 4) {
+ block_width += 2;
+ screen_width += 2;
+ }
+ }
+ if (!prediction_parameters.color_index_map[plane_type].Reset(
+ block_height, block_width, /*zero_initialize=*/false)) {
+ return false;
+ }
+ int first_value = 0;
+ reader_.DecodeUniform(palette_size, &first_value);
+ prediction_parameters.color_index_map[plane_type][0][0] = first_value;
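+    // The remaining indices are decoded one anti-diagonal at a time: for a
+    // 4x4 map, i == 1 visits (0, 1) and (1, 0), i == 2 visits (0, 2), (1, 1)
+    // and (2, 0), and so on.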
+ for (int i = 1; i < screen_height + screen_width - 1; ++i) {
+ const int start = std::min(i, screen_width - 1);
+ const int end = std::max(0, i - screen_height + 1);
+ uint8_t color_order[kMaxPaletteSquare][kMaxPaletteSize];
+ uint8_t color_context[kMaxPaletteSquare];
+ PopulatePaletteColorContexts(block, static_cast<PlaneType>(plane_type), i,
+ start, end, color_order, color_context);
+ for (int j = start, counter = 0; j >= end; --j, ++counter) {
+ uint16_t* const cdf =
+ symbol_decoder_context_
+ .palette_color_index_cdf[plane_type]
+ [palette_size - kMinPaletteSize]
+ [color_context[counter]];
+ const int color_order_index = reader_.ReadSymbol(cdf, palette_size);
+ prediction_parameters.color_index_map[plane_type][i - j][j] =
+ color_order[counter][color_order_index];
+ }
+ }
+ if (screen_width < block_width) {
+ for (int i = 0; i < screen_height; ++i) {
+ memset(
+ &prediction_parameters.color_index_map[plane_type][i][screen_width],
+ prediction_parameters
+ .color_index_map[plane_type][i][screen_width - 1],
+ block_width - screen_width);
+ }
+ }
+ for (int i = screen_height; i < block_height; ++i) {
+ memcpy(
+ prediction_parameters.color_index_map[plane_type][i],
+ prediction_parameters.color_index_map[plane_type][screen_height - 1],
+ block_width);
+ }
+ }
+ return true;
+}
+
+} // namespace libgav1
diff --git a/src/tile/bitstream/partition.cc b/src/tile/bitstream/partition.cc
new file mode 100644
index 0000000..f3dbbb0
--- /dev/null
+++ b/src/tile/bitstream/partition.cc
@@ -0,0 +1,148 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+uint16_t PartitionCdfGatherHorizontalAlike(const uint16_t* const partition_cdf,
+ BlockSize block_size) {
+ // The spec computes the cdf value using the following formula (not writing
+ // partition_cdf[] and using short forms for partition names for clarity):
+ // cdf = None - H + V - S + S - HTS + HTS - HBS + HBS - VLS;
+ // if (block_size != 128x128) {
+ // cdf += VRS - H4;
+ // }
+ // After canceling out the repeated terms with opposite signs, we have:
+ // cdf = None - H + V - VLS;
+ // if (block_size != 128x128) {
+ // cdf += VRS - H4;
+ // }
+ uint16_t cdf = partition_cdf[kPartitionNone] -
+ partition_cdf[kPartitionHorizontal] +
+ partition_cdf[kPartitionVertical] -
+ partition_cdf[kPartitionVerticalWithLeftSplit];
+ if (block_size != kBlock128x128) {
+ cdf += partition_cdf[kPartitionVerticalWithRightSplit] -
+ partition_cdf[kPartitionHorizontal4];
+ }
+ return cdf;
+}
+
+uint16_t PartitionCdfGatherVerticalAlike(const uint16_t* const partition_cdf,
+ BlockSize block_size) {
+ // The spec computes the cdf value using the following formula (not writing
+ // partition_cdf[] and using short forms for partition names for clarity):
+ // cdf = H - V + V - S + HBS - VLS + VLS - VRS + S - HTS;
+ // if (block_size != 128x128) {
+ // cdf += H4 - V4;
+ // }
+ // V4 is always zero. So, after canceling out the repeated terms with opposite
+ // signs, we have:
+ // cdf = H + HBS - VRS - HTS;
+ // if (block_size != 128x128) {
+ // cdf += H4;
+ // }
+ // VRS is zero for 128x128 blocks. So, further simplifying we have:
+ // cdf = H + HBS - HTS;
+ // if (block_size != 128x128) {
+ // cdf += H4 - VRS;
+ // }
+ uint16_t cdf = partition_cdf[kPartitionHorizontal] +
+ partition_cdf[kPartitionHorizontalWithBottomSplit] -
+ partition_cdf[kPartitionHorizontalWithTopSplit];
+ if (block_size != kBlock128x128) {
+ cdf += partition_cdf[kPartitionHorizontal4] -
+ partition_cdf[kPartitionVerticalWithRightSplit];
+ }
+ return cdf;
+}
+
+} // namespace
+
+uint16_t* Tile::GetPartitionCdf(int row4x4, int column4x4,
+ BlockSize block_size) {
+ const int block_size_log2 = k4x4WidthLog2[block_size];
+ int top = 0;
+ if (IsTopInside(row4x4)) {
+ top = static_cast<int>(
+ k4x4WidthLog2[block_parameters_holder_.Find(row4x4 - 1, column4x4)
+ ->size] < block_size_log2);
+ }
+ int left = 0;
+ if (IsLeftInside(column4x4)) {
+ left = static_cast<int>(
+ k4x4HeightLog2[block_parameters_holder_.Find(row4x4, column4x4 - 1)
+ ->size] < block_size_log2);
+ }
+ const int context = left * 2 + top;
+ return symbol_decoder_context_.partition_cdf[block_size_log2 - 1][context];
+}
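// A hand-worked note on the context above (derived from the code): |top| is 1
// when the block above is narrower than the current block and |left| is 1 when
// the block to the left is shorter, so context is in [0, 3], with 3 meaning
// both neighbors are smaller.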
+
+bool Tile::ReadPartition(int row4x4, int column4x4, BlockSize block_size,
+ bool has_rows, bool has_columns,
+ Partition* const partition) {
+ if (IsBlockSmallerThan8x8(block_size)) {
+ *partition = kPartitionNone;
+ return true;
+ }
+ if (!has_rows && !has_columns) {
+ *partition = kPartitionSplit;
+ return true;
+ }
+ uint16_t* const partition_cdf =
+ GetPartitionCdf(row4x4, column4x4, block_size);
+ if (partition_cdf == nullptr) {
+ return false;
+ }
+ if (has_rows && has_columns) {
+ const int bsize_log2 = k4x4WidthLog2[block_size];
+ // The partition block size should be 8x8 or above.
+ assert(bsize_log2 > 0);
+ if (bsize_log2 == 1) {
+ *partition = static_cast<Partition>(
+ reader_.ReadSymbol<kPartitionSplit + 1>(partition_cdf));
+ } else if (bsize_log2 == 5) {
+ *partition = static_cast<Partition>(
+ reader_.ReadSymbol<kPartitionVerticalWithRightSplit + 1>(
+ partition_cdf));
+ } else {
+ *partition = static_cast<Partition>(
+ reader_.ReadSymbol<kMaxPartitionTypes>(partition_cdf));
+ }
+ } else if (has_columns) {
+ const uint16_t cdf =
+ PartitionCdfGatherVerticalAlike(partition_cdf, block_size);
+ *partition = reader_.ReadSymbolWithoutCdfUpdate(cdf) ? kPartitionSplit
+ : kPartitionHorizontal;
+ } else {
+ const uint16_t cdf =
+ PartitionCdfGatherHorizontalAlike(partition_cdf, block_size);
+ *partition = reader_.ReadSymbolWithoutCdfUpdate(cdf) ? kPartitionSplit
+ : kPartitionVertical;
+ }
+ return true;
+}
+
+} // namespace libgav1
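The telescoping simplification described in the comments of
PartitionCdfGatherHorizontalAlike above can be checked in isolation. Below is a
minimal standalone sketch (not part of libgav1; the values are arbitrary
placeholders for the CDF entries) verifying that the long-form spec sum
collapses to None - H + V - VLS:

#include <cassert>

int main() {
  // Arbitrary stand-ins for the partition CDF entries named in the comment.
  const int None = 100, H = 90, V = 80, S = 70, HTS = 60, HBS = 50, VLS = 40;
  // Long form from the spec: adjacent terms cancel in pairs.
  const int long_form = None - H + V - S + S - HTS + HTS - HBS + HBS - VLS;
  // Simplified form used by PartitionCdfGatherHorizontalAlike().
  const int short_form = None - H + V - VLS;
  assert(long_form == short_form);
  return 0;
}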
diff --git a/src/tile/bitstream/transform_size.cc b/src/tile/bitstream/transform_size.cc
new file mode 100644
index 0000000..b79851d
--- /dev/null
+++ b/src/tile/bitstream/transform_size.cc
@@ -0,0 +1,222 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/obu_parser.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/stack.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr uint8_t kMaxVariableTransformTreeDepth = 2;
+// Max_Tx_Depth array from section 5.11.5 in the spec with the following
+// modification: if an element is not zero, one is subtracted from it. That is
+// the only form in which this array is used.
+constexpr int kTxDepthCdfIndex[kMaxBlockSizes] = {
+ 0, 0, 1, 0, 0, 1, 2, 1, 1, 1, 2, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3};
+
+constexpr TransformSize kMaxTransformSizeRectangle[kMaxBlockSizes] = {
+ kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+ kTransformSize16x16, kTransformSize16x32, kTransformSize16x64,
+ kTransformSize32x8, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x64, kTransformSize64x16, kTransformSize64x32,
+ kTransformSize64x64, kTransformSize64x64, kTransformSize64x64,
+ kTransformSize64x64};
+
+TransformSize GetSquareTransformSize(uint8_t pixels) {
+ switch (pixels) {
+ case 128:
+ case 64:
+ return kTransformSize64x64;
+ case 32:
+ return kTransformSize32x32;
+ case 16:
+ return kTransformSize16x16;
+ case 8:
+ return kTransformSize8x8;
+ default:
+ return kTransformSize4x4;
+ }
+}
+
+} // namespace
+
+int Tile::GetTopTransformWidth(const Block& block, int row4x4, int column4x4,
+ bool ignore_skip) {
+ if (row4x4 == block.row4x4) {
+ if (!block.top_available[kPlaneY]) return 64;
+ const BlockParameters& bp_top =
+ *block_parameters_holder_.Find(row4x4 - 1, column4x4);
+ if ((ignore_skip || bp_top.skip) && bp_top.is_inter) {
+ return kBlockWidthPixels[bp_top.size];
+ }
+ }
+ return kTransformWidth[inter_transform_sizes_[row4x4 - 1][column4x4]];
+}
+
+int Tile::GetLeftTransformHeight(const Block& block, int row4x4, int column4x4,
+ bool ignore_skip) {
+ if (column4x4 == block.column4x4) {
+ if (!block.left_available[kPlaneY]) return 64;
+ const BlockParameters& bp_left =
+ *block_parameters_holder_.Find(row4x4, column4x4 - 1);
+ if ((ignore_skip || bp_left.skip) && bp_left.is_inter) {
+ return kBlockHeightPixels[bp_left.size];
+ }
+ }
+ return kTransformHeight[inter_transform_sizes_[row4x4][column4x4 - 1]];
+}
+
+TransformSize Tile::ReadFixedTransformSize(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (frame_header_.segmentation.lossless[bp.segment_id]) {
+ return kTransformSize4x4;
+ }
+ const TransformSize max_rect_tx_size = kMaxTransformSizeRectangle[block.size];
+ const bool allow_select = !bp.skip || !bp.is_inter;
+ if (block.size == kBlock4x4 || !allow_select ||
+ frame_header_.tx_mode != kTxModeSelect) {
+ return max_rect_tx_size;
+ }
+ const int max_tx_width = kTransformWidth[max_rect_tx_size];
+ const int max_tx_height = kTransformHeight[max_rect_tx_size];
+ const int top_width =
+ block.top_available[kPlaneY]
+ ? GetTopTransformWidth(block, block.row4x4, block.column4x4, true)
+ : 0;
+ const int left_height =
+ block.left_available[kPlaneY]
+ ? GetLeftTransformHeight(block, block.row4x4, block.column4x4, true)
+ : 0;
+ const auto context = static_cast<int>(top_width >= max_tx_width) +
+ static_cast<int>(left_height >= max_tx_height);
+ const int cdf_index = kTxDepthCdfIndex[block.size];
+ uint16_t* const cdf =
+ symbol_decoder_context_.tx_depth_cdf[cdf_index][context];
+ const int tx_depth = (cdf_index == 0)
+ ? static_cast<int>(reader_.ReadSymbol(cdf))
+ : reader_.ReadSymbol<3>(cdf);
+ assert(tx_depth < 3);
+ TransformSize tx_size = max_rect_tx_size;
+ if (tx_depth == 0) return tx_size;
+ tx_size = kSplitTransformSize[tx_size];
+ if (tx_depth == 1) return tx_size;
+ return kSplitTransformSize[tx_size];
+}
+
+void Tile::ReadVariableTransformTree(const Block& block, int row4x4,
+ int column4x4, TransformSize tx_size) {
+ const uint8_t pixels = std::max(block.width, block.height);
+ const TransformSize max_tx_size = GetSquareTransformSize(pixels);
+ const int context_delta = (kNumSquareTransformSizes - 1 -
+ TransformSizeToSquareTransformIndex(max_tx_size)) *
+ 6;
+
+ // Branching factor is 4 and maximum depth is 2. So the maximum stack size
+ // necessary is (4 - 1) + 4 = 7.
+ Stack<TransformTreeNode, 7> stack;
+ stack.Push(TransformTreeNode(column4x4, row4x4, tx_size, 0));
+
+ do {
+ TransformTreeNode node = stack.Pop();
+ const int tx_width4x4 = kTransformWidth4x4[node.tx_size];
+ const int tx_height4x4 = kTransformHeight4x4[node.tx_size];
+ if (node.tx_size != kTransformSize4x4 &&
+ node.depth != kMaxVariableTransformTreeDepth) {
+ const auto top =
+ static_cast<int>(GetTopTransformWidth(block, node.y, node.x, false) <
+ kTransformWidth[node.tx_size]);
+ const auto left = static_cast<int>(
+ GetLeftTransformHeight(block, node.y, node.x, false) <
+ kTransformHeight[node.tx_size]);
+ const int context =
+ static_cast<int>(max_tx_size > kTransformSize8x8 &&
+ kTransformSizeSquareMax[node.tx_size] !=
+ max_tx_size) *
+ 3 +
+ context_delta + top + left;
+ // tx_split.
+ if (reader_.ReadSymbol(symbol_decoder_context_.tx_split_cdf[context])) {
+ const TransformSize sub_tx_size = kSplitTransformSize[node.tx_size];
+ const int step_width4x4 = kTransformWidth4x4[sub_tx_size];
+ const int step_height4x4 = kTransformHeight4x4[sub_tx_size];
+ // The loops have to run in reverse order because we use a stack for
+ // DFS.
+ for (int i = tx_height4x4 - step_height4x4; i >= 0;
+ i -= step_height4x4) {
+ for (int j = tx_width4x4 - step_width4x4; j >= 0;
+ j -= step_width4x4) {
+ if (node.y + i >= frame_header_.rows4x4 ||
+ node.x + j >= frame_header_.columns4x4) {
+ continue;
+ }
+ stack.Push(TransformTreeNode(node.x + j, node.y + i, sub_tx_size,
+ node.depth + 1));
+ }
+ }
+ continue;
+ }
+ }
+ // tx_split is false.
+ for (int i = 0; i < tx_height4x4; ++i) {
+ static_assert(sizeof(TransformSize) == 1, "");
+ memset(&inter_transform_sizes_[node.y + i][node.x], node.tx_size,
+ tx_width4x4);
+ }
+ block_parameters_holder_.Find(node.y, node.x)->transform_size =
+ node.tx_size;
+ } while (!stack.Empty());
+}
+
+void Tile::DecodeTransformSize(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (frame_header_.tx_mode == kTxModeSelect && block.size > kBlock4x4 &&
+ bp.is_inter && !bp.skip &&
+ !frame_header_.segmentation.lossless[bp.segment_id]) {
+ const TransformSize max_tx_size = kMaxTransformSizeRectangle[block.size];
+ const int tx_width4x4 = kTransformWidth4x4[max_tx_size];
+ const int tx_height4x4 = kTransformHeight4x4[max_tx_size];
+ for (int row = block.row4x4; row < block.row4x4 + block.height4x4;
+ row += tx_height4x4) {
+ for (int column = block.column4x4;
+ column < block.column4x4 + block.width4x4; column += tx_width4x4) {
+ ReadVariableTransformTree(block, row, column, max_tx_size);
+ }
+ }
+ } else {
+ bp.transform_size = ReadFixedTransformSize(block);
+ for (int row = block.row4x4; row < block.row4x4 + block.height4x4; ++row) {
+ static_assert(sizeof(TransformSize) == 1, "");
+ memset(&inter_transform_sizes_[row][block.column4x4], bp.transform_size,
+ block.width4x4);
+ }
+ }
+}
+
+} // namespace libgav1
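The stack bound stated in the ReadVariableTransformTree comment above ("the
maximum stack size necessary is (4 - 1) + 4 = 7") can be reproduced with a
small standalone simulation (not libgav1 code): a DFS over a tree with
branching factor 4 whose nodes stop splitting at depth 2 never holds more than
7 entries on an explicit stack.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  std::vector<int> stack;  // Holds node depths; transform sizes are irrelevant.
  std::size_t peak = 0;
  stack.push_back(0);  // Root transform node at depth 0.
  while (!stack.empty()) {
    peak = std::max(peak, stack.size());
    const int depth = stack.back();
    stack.pop_back();
    if (depth < 2) {
      // tx_split: push the four children, as ReadVariableTransformTree does.
      for (int i = 0; i < 4; ++i) stack.push_back(depth + 1);
    }
  }
  assert(peak == 7);  // (4 - 1) + 4, matching the comment.
  return 0;
}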
diff --git a/src/tile/prediction.cc b/src/tile/prediction.cc
new file mode 100644
index 0000000..c5560a6
--- /dev/null
+++ b/src/tile/prediction.cc
@@ -0,0 +1,1361 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+
+#include "src/buffer_pool.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/motion_vector.h"
+#include "src/obu_parser.h"
+#include "src/prediction_mask.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+#include "src/warp_prediction.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/inter_intra_masks.inc"
+
+// Precision bits when scaling reference frames.
+constexpr int kReferenceScaleShift = 14;
+constexpr int kAngleStep = 3;
+constexpr int kPredictionModeToAngle[kIntraPredictionModesUV] = {
+ 0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0};
+
+// The following modes need both the left_column and top_row for intra
+// prediction. For directional modes, the left/top requirement is inferred from
+// the prediction angle. For Dc modes, it is inferred from whether or not
+// left/top is available.
+constexpr BitMaskSet kNeedsLeftAndTop(kPredictionModeSmooth,
+ kPredictionModeSmoothHorizontal,
+ kPredictionModeSmoothVertical,
+ kPredictionModePaeth);
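// A hand-worked illustration of the angle rule above, derived from the
// needs_top/needs_left expressions in Tile::IntraPrediction() below (ignoring
// the angle_delta adjustment):
//   base angle  45 (less than 90):       only the top row is needed.
//   base angle 135 (between 90 and 180): both top row and left column are needed.
//   base angle 203 (greater than 180):   only the left column is needed.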
+
+int16_t GetDirectionalIntraPredictorDerivative(const int angle) {
+ assert(angle >= 3);
+ assert(angle <= 87);
+ return kDirectionalIntraPredictorDerivative[DivideBy2(angle) - 1];
+}
+
+// Maps the block_size to an index as follows:
+// kBlock8x8 => 0.
+// kBlock8x16 => 1.
+// kBlock8x32 => 2.
+// kBlock16x8 => 3.
+// kBlock16x16 => 4.
+// kBlock16x32 => 5.
+// kBlock32x8 => 6.
+// kBlock32x16 => 7.
+// kBlock32x32 => 8.
+int GetWedgeBlockSizeIndex(BlockSize block_size) {
+ assert(block_size >= kBlock8x8);
+ return block_size - kBlock8x8 - static_cast<int>(block_size >= kBlock16x8) -
+ static_cast<int>(block_size >= kBlock32x8);
+}
+
+// Maps a dimension of 4, 8, 16 and 32 to indices 0, 1, 2 and 3 respectively.
+int GetInterIntraMaskLookupIndex(int dimension) {
+ assert(dimension == 4 || dimension == 8 || dimension == 16 ||
+ dimension == 32);
+ return FloorLog2(dimension) - 2;
+}
+
+// 7.11.2.9.
+int GetIntraEdgeFilterStrength(int width, int height, int filter_type,
+ int delta) {
+ const int sum = width + height;
+ delta = std::abs(delta);
+ if (filter_type == 0) {
+ if (sum <= 8) {
+ if (delta >= 56) return 1;
+ } else if (sum <= 16) {
+ if (delta >= 40) return 1;
+ } else if (sum <= 24) {
+ if (delta >= 32) return 3;
+ if (delta >= 16) return 2;
+ if (delta >= 8) return 1;
+ } else if (sum <= 32) {
+ if (delta >= 32) return 3;
+ if (delta >= 4) return 2;
+ return 1;
+ } else {
+ return 3;
+ }
+ } else {
+ if (sum <= 8) {
+ if (delta >= 64) return 2;
+ if (delta >= 40) return 1;
+ } else if (sum <= 16) {
+ if (delta >= 48) return 2;
+ if (delta >= 20) return 1;
+ } else if (sum <= 24) {
+ if (delta >= 4) return 3;
+ } else {
+ return 3;
+ }
+ }
+ return 0;
+}
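// A hand-worked trace of the thresholds above: for an 8x8 block (sum == 16)
// with filter_type == 0 and |delta| == 45, the sum <= 16 branch is taken and
// 45 >= 40, so the returned filter strength is 1.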
+
+// 7.11.2.10.
+bool DoIntraEdgeUpsampling(int width, int height, int filter_type, int delta) {
+ const int sum = width + height;
+ delta = std::abs(delta);
+ // This function should not be called when the prediction angle is 90 or 180.
+ assert(delta != 0);
+ if (delta >= 40) return false;
+ return (filter_type == 1) ? sum <= 8 : sum <= 16;
+}
+
+constexpr uint8_t kQuantizedDistanceWeight[4][2] = {
+ {2, 3}, {2, 5}, {2, 7}, {1, kMaxFrameDistance}};
+
+constexpr uint8_t kQuantizedDistanceLookup[4][2] = {
+ {9, 7}, {11, 5}, {12, 4}, {13, 3}};
+
+void GetDistanceWeights(const int distance[2], int weight[2]) {
+ // Note: distance[0] and distance[1] correspond to relative distance
+ // between current frame and reference frame [1] and [0], respectively.
+ const int order = static_cast<int>(distance[0] <= distance[1]);
+ if (distance[0] == 0 || distance[1] == 0) {
+ weight[0] = kQuantizedDistanceLookup[3][order];
+ weight[1] = kQuantizedDistanceLookup[3][1 - order];
+ } else {
+ int i;
+ for (i = 0; i < 3; ++i) {
+ const int weight_0 = kQuantizedDistanceWeight[i][order];
+ const int weight_1 = kQuantizedDistanceWeight[i][1 - order];
+ if (order == 0) {
+ if (distance[0] * weight_0 < distance[1] * weight_1) break;
+ } else {
+ if (distance[0] * weight_0 > distance[1] * weight_1) break;
+ }
+ }
+ weight[0] = kQuantizedDistanceLookup[i][order];
+ weight[1] = kQuantizedDistanceLookup[i][1 - order];
+ }
+}
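// A hand-worked trace of the selection loop above: with distance == {4, 2}
// (neither zero), order is 0; at i == 0, 4 * 2 < 2 * 3 is false; at i == 1,
// 4 * 2 < 2 * 5 is true, so the loop breaks with i == 1 and the weights become
// {11, 5} from kQuantizedDistanceLookup[1].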
+
+dsp::IntraPredictor GetIntraPredictor(PredictionMode mode, bool has_left,
+ bool has_top) {
+ if (mode == kPredictionModeDc) {
+ if (has_left && has_top) {
+ return dsp::kIntraPredictorDc;
+ }
+ if (has_left) {
+ return dsp::kIntraPredictorDcLeft;
+ }
+ if (has_top) {
+ return dsp::kIntraPredictorDcTop;
+ }
+ return dsp::kIntraPredictorDcFill;
+ }
+ switch (mode) {
+ case kPredictionModePaeth:
+ return dsp::kIntraPredictorPaeth;
+ case kPredictionModeSmooth:
+ return dsp::kIntraPredictorSmooth;
+ case kPredictionModeSmoothVertical:
+ return dsp::kIntraPredictorSmoothVertical;
+ case kPredictionModeSmoothHorizontal:
+ return dsp::kIntraPredictorSmoothHorizontal;
+ default:
+ return dsp::kNumIntraPredictors;
+ }
+}
+
+uint8_t* GetStartPoint(Array2DView<uint8_t>* const buffer, const int plane,
+ const int x, const int y, const int bitdepth) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) {
+ Array2DView<uint16_t> buffer16(
+ buffer[plane].rows(), buffer[plane].columns() / sizeof(uint16_t),
+ reinterpret_cast<uint16_t*>(&buffer[plane][0][0]));
+ return reinterpret_cast<uint8_t*>(&buffer16[y][x]);
+ }
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ static_cast<void>(bitdepth);
+ return &buffer[plane][y][x];
+}
+
+int GetPixelPositionFromHighScale(int start, int step, int offset) {
+ return (start + step * offset) >> kScaleSubPixelBits;
+}
+
+dsp::MaskBlendFunc GetMaskBlendFunc(const dsp::Dsp& dsp, bool is_inter_intra,
+ bool is_wedge_inter_intra,
+ int subsampling_x, int subsampling_y) {
+ return (is_inter_intra && !is_wedge_inter_intra)
+ ? dsp.mask_blend[0][/*is_inter_intra=*/true]
+ : dsp.mask_blend[subsampling_x + subsampling_y][is_inter_intra];
+}
+
+} // namespace
+
+template <typename Pixel>
+void Tile::IntraPrediction(const Block& block, Plane plane, int x, int y,
+ bool has_left, bool has_top, bool has_top_right,
+ bool has_bottom_left, PredictionMode mode,
+ TransformSize tx_size) {
+ const int width = 1 << kTransformWidthLog2[tx_size];
+ const int height = 1 << kTransformHeightLog2[tx_size];
+ const int x_shift = subsampling_x_[plane];
+ const int y_shift = subsampling_y_[plane];
+ const int max_x = (MultiplyBy4(frame_header_.columns4x4) >> x_shift) - 1;
+ const int max_y = (MultiplyBy4(frame_header_.rows4x4) >> y_shift) - 1;
+ // For performance reasons, do not initialize the following two buffers.
+ alignas(kMaxAlignment) Pixel top_row_data[160];
+ alignas(kMaxAlignment) Pixel left_column_data[160];
+#if LIBGAV1_MSAN
+ if (IsDirectionalMode(mode)) {
+ memset(top_row_data, 0, sizeof(top_row_data));
+ memset(left_column_data, 0, sizeof(left_column_data));
+ }
+#endif
+ // Some predictors use |top_row_data| and |left_column_data| with a negative
+ // offset to access pixels to the top-left of the current block. So leave
+ // some space at the start of the arrays so that those pixels can be
+ // populated without having to move the rest of the array.
+ Pixel* const top_row = top_row_data + 16;
+ Pixel* const left_column = left_column_data + 16;
+ const int bitdepth = sequence_header_.color_config.bitdepth;
+ const int top_and_left_size = width + height;
+ const bool is_directional_mode = IsDirectionalMode(mode);
+ const PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ const bool use_filter_intra =
+ (plane == kPlaneY && prediction_parameters.use_filter_intra);
+ const int prediction_angle =
+ is_directional_mode
+ ? kPredictionModeToAngle[mode] +
+ prediction_parameters.angle_delta[GetPlaneType(plane)] *
+ kAngleStep
+ : 0;
+ // Directional prediction requires buffers larger than the width or height.
+ const int top_size = is_directional_mode ? top_and_left_size : width;
+ const int left_size = is_directional_mode ? top_and_left_size : height;
+ const int top_right_size =
+ is_directional_mode ? (has_top_right ? 2 : 1) * width : width;
+ const int bottom_left_size =
+ is_directional_mode ? (has_bottom_left ? 2 : 1) * height : height;
+
+ Array2DView<Pixel> buffer(buffer_[plane].rows(),
+ buffer_[plane].columns() / sizeof(Pixel),
+ reinterpret_cast<Pixel*>(&buffer_[plane][0][0]));
+ const bool needs_top = use_filter_intra || kNeedsLeftAndTop.Contains(mode) ||
+ (is_directional_mode && prediction_angle < 180) ||
+ (mode == kPredictionModeDc && has_top);
+ const bool needs_left = use_filter_intra || kNeedsLeftAndTop.Contains(mode) ||
+ (is_directional_mode && prediction_angle > 90) ||
+ (mode == kPredictionModeDc && has_left);
+
+ const Pixel* top_row_src = buffer[y - 1];
+
+ // Determine if we need to retrieve the top row from
+ // |intra_prediction_buffer_|.
+ if ((needs_top || needs_left) && use_intra_prediction_buffer_) {
+ // Superblock index of block.row4x4. block.row4x4 is always in luma
+ // dimension (no subsampling).
+ const int current_superblock_index =
+ block.row4x4 >> (sequence_header_.use_128x128_superblock ? 5 : 4);
+ // Superblock index of y - 1. y is in the plane dimension (chroma planes
+ // could be subsampled).
+ const int plane_shift = (sequence_header_.use_128x128_superblock ? 7 : 6) -
+ subsampling_y_[plane];
+ const int top_row_superblock_index = (y - 1) >> plane_shift;
+ // If the superblock index of y - 1 is not that of the current superblock,
+ // then we will have to retrieve the top row from the
+ // |intra_prediction_buffer_|.
+ if (current_superblock_index != top_row_superblock_index) {
+ top_row_src = reinterpret_cast<const Pixel*>(
+ (*intra_prediction_buffer_)[plane].get());
+ }
+ }
+
+ if (needs_top) {
+ // Compute top_row.
+ if (has_top || has_left) {
+ const int left_index = has_left ? x - 1 : x;
+ top_row[-1] = has_top ? top_row_src[left_index] : buffer[y][left_index];
+ } else {
+ top_row[-1] = 1 << (bitdepth - 1);
+ }
+ if (!has_top && has_left) {
+ Memset(top_row, buffer[y][x - 1], top_size);
+ } else if (!has_top && !has_left) {
+ Memset(top_row, (1 << (bitdepth - 1)) - 1, top_size);
+ } else {
+ const int top_limit = std::min(max_x - x + 1, top_right_size);
+ memcpy(top_row, &top_row_src[x], top_limit * sizeof(Pixel));
+ // Even though it is safe to call Memset with a size of 0, accessing
+ // top_row_src[top_limit + x - 1] is not allowed when this condition is
+ // false.
+ if (top_size - top_limit > 0) {
+ Memset(top_row + top_limit, top_row_src[top_limit + x - 1],
+ top_size - top_limit);
+ }
+ }
+ }
+ if (needs_left) {
+ // Compute left_column.
+ if (has_top || has_left) {
+ const int left_index = has_left ? x - 1 : x;
+ left_column[-1] =
+ has_top ? top_row_src[left_index] : buffer[y][left_index];
+ } else {
+ left_column[-1] = 1 << (bitdepth - 1);
+ }
+ if (!has_left && has_top) {
+ Memset(left_column, top_row_src[x], left_size);
+ } else if (!has_left && !has_top) {
+ Memset(left_column, (1 << (bitdepth - 1)) + 1, left_size);
+ } else {
+ const int left_limit = std::min(max_y - y + 1, bottom_left_size);
+ for (int i = 0; i < left_limit; ++i) {
+ left_column[i] = buffer[y + i][x - 1];
+ }
+ // Even though it is safe to call Memset with a size of 0, accessing
+ // buffer[left_limit + y - 1][x - 1] is not allowed when this condition is
+ // false.
+ if (left_size - left_limit > 0) {
+ Memset(left_column + left_limit, buffer[left_limit + y - 1][x - 1],
+ left_size - left_limit);
+ }
+ }
+ }
+ Pixel* const dest = &buffer[y][x];
+ const ptrdiff_t dest_stride = buffer_[plane].columns();
+ if (use_filter_intra) {
+ dsp_.filter_intra_predictor(dest, dest_stride, top_row, left_column,
+ prediction_parameters.filter_intra_mode, width,
+ height);
+ } else if (is_directional_mode) {
+ DirectionalPrediction(block, plane, x, y, has_left, has_top, needs_left,
+ needs_top, prediction_angle, width, height, max_x,
+ max_y, tx_size, top_row, left_column);
+ } else {
+ const dsp::IntraPredictor predictor =
+ GetIntraPredictor(mode, has_left, has_top);
+ assert(predictor != dsp::kNumIntraPredictors);
+ dsp_.intra_predictors[tx_size][predictor](dest, dest_stride, top_row,
+ left_column);
+ }
+}
+
+template void Tile::IntraPrediction<uint8_t>(const Block& block, Plane plane,
+ int x, int y, bool has_left,
+ bool has_top, bool has_top_right,
+ bool has_bottom_left,
+ PredictionMode mode,
+ TransformSize tx_size);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void Tile::IntraPrediction<uint16_t>(const Block& block, Plane plane,
+ int x, int y, bool has_left,
+ bool has_top, bool has_top_right,
+ bool has_bottom_left,
+ PredictionMode mode,
+ TransformSize tx_size);
+#endif
+
+constexpr BitMaskSet kPredictionModeSmoothMask(kPredictionModeSmooth,
+ kPredictionModeSmoothHorizontal,
+ kPredictionModeSmoothVertical);
+
+bool Tile::IsSmoothPrediction(int row, int column, Plane plane) const {
+ const BlockParameters& bp = *block_parameters_holder_.Find(row, column);
+ PredictionMode mode;
+ if (plane == kPlaneY) {
+ mode = bp.y_mode;
+ } else {
+ if (bp.reference_frame[0] > kReferenceFrameIntra) return false;
+ mode = bp.uv_mode;
+ }
+ return kPredictionModeSmoothMask.Contains(mode);
+}
+
+int Tile::GetIntraEdgeFilterType(const Block& block, Plane plane) const {
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ if (block.top_available[plane]) {
+ const int row = block.row4x4 - 1 - (block.row4x4 & subsampling_y);
+ const int column = block.column4x4 + (~block.column4x4 & subsampling_x);
+ if (IsSmoothPrediction(row, column, plane)) return 1;
+ }
+ if (block.left_available[plane]) {
+ const int row = block.row4x4 + (~block.row4x4 & subsampling_y);
+ const int column = block.column4x4 - 1 - (block.column4x4 & subsampling_x);
+ if (IsSmoothPrediction(row, column, plane)) return 1;
+ }
+ return 0;
+}
+
+template <typename Pixel>
+void Tile::DirectionalPrediction(const Block& block, Plane plane, int x, int y,
+ bool has_left, bool has_top, bool needs_left,
+ bool needs_top, int prediction_angle,
+ int width, int height, int max_x, int max_y,
+ TransformSize tx_size, Pixel* const top_row,
+ Pixel* const left_column) {
+ Array2DView<Pixel> buffer(buffer_[plane].rows(),
+ buffer_[plane].columns() / sizeof(Pixel),
+ reinterpret_cast<Pixel*>(&buffer_[plane][0][0]));
+ Pixel* const dest = &buffer[y][x];
+ const ptrdiff_t stride = buffer_[plane].columns();
+ if (prediction_angle == 90) {
+ dsp_.intra_predictors[tx_size][dsp::kIntraPredictorVertical](
+ dest, stride, top_row, left_column);
+ return;
+ }
+ if (prediction_angle == 180) {
+ dsp_.intra_predictors[tx_size][dsp::kIntraPredictorHorizontal](
+ dest, stride, top_row, left_column);
+ return;
+ }
+
+ bool upsampled_top = false;
+ bool upsampled_left = false;
+ if (sequence_header_.enable_intra_edge_filter) {
+ const int filter_type = GetIntraEdgeFilterType(block, plane);
+ if (prediction_angle > 90 && prediction_angle < 180 &&
+ (width + height) >= 24) {
+ // 7.11.2.7.
+ left_column[-1] = top_row[-1] = RightShiftWithRounding(
+ left_column[0] * 5 + top_row[-1] * 6 + top_row[0] * 5, 4);
+ }
+ if (has_top && needs_top) {
+ const int strength = GetIntraEdgeFilterStrength(
+ width, height, filter_type, prediction_angle - 90);
+ if (strength > 0) {
+ const int num_pixels = std::min(width, max_x - x + 1) +
+ ((prediction_angle < 90) ? height : 0) + 1;
+ dsp_.intra_edge_filter(top_row - 1, num_pixels, strength);
+ }
+ }
+ if (has_left && needs_left) {
+ const int strength = GetIntraEdgeFilterStrength(
+ width, height, filter_type, prediction_angle - 180);
+ if (strength > 0) {
+ const int num_pixels = std::min(height, max_y - y + 1) +
+ ((prediction_angle > 180) ? width : 0) + 1;
+ dsp_.intra_edge_filter(left_column - 1, num_pixels, strength);
+ }
+ }
+ upsampled_top = DoIntraEdgeUpsampling(width, height, filter_type,
+ prediction_angle - 90);
+ if (upsampled_top && needs_top) {
+ const int num_pixels = width + ((prediction_angle < 90) ? height : 0);
+ dsp_.intra_edge_upsampler(top_row, num_pixels);
+ }
+ upsampled_left = DoIntraEdgeUpsampling(width, height, filter_type,
+ prediction_angle - 180);
+ if (upsampled_left && needs_left) {
+ const int num_pixels = height + ((prediction_angle > 180) ? width : 0);
+ dsp_.intra_edge_upsampler(left_column, num_pixels);
+ }
+ }
+
+ if (prediction_angle < 90) {
+ const int dx = GetDirectionalIntraPredictorDerivative(prediction_angle);
+ dsp_.directional_intra_predictor_zone1(dest, stride, top_row, width, height,
+ dx, upsampled_top);
+ } else if (prediction_angle < 180) {
+ const int dx =
+ GetDirectionalIntraPredictorDerivative(180 - prediction_angle);
+ const int dy =
+ GetDirectionalIntraPredictorDerivative(prediction_angle - 90);
+ dsp_.directional_intra_predictor_zone2(dest, stride, top_row, left_column,
+ width, height, dx, dy, upsampled_top,
+ upsampled_left);
+ } else {
+ assert(prediction_angle < 270);
+ const int dy =
+ GetDirectionalIntraPredictorDerivative(270 - prediction_angle);
+ dsp_.directional_intra_predictor_zone3(dest, stride, left_column, width,
+ height, dy, upsampled_left);
+ }
+}
+
+template <typename Pixel>
+void Tile::PalettePrediction(const Block& block, const Plane plane,
+ const int start_x, const int start_y, const int x,
+ const int y, const TransformSize tx_size) {
+ const int tx_width = kTransformWidth[tx_size];
+ const int tx_height = kTransformHeight[tx_size];
+ const uint16_t* const palette = block.bp->palette_mode_info.color[plane];
+ const PlaneType plane_type = GetPlaneType(plane);
+ const int x4 = MultiplyBy4(x);
+ const int y4 = MultiplyBy4(y);
+ Array2DView<Pixel> buffer(buffer_[plane].rows(),
+ buffer_[plane].columns() / sizeof(Pixel),
+ reinterpret_cast<Pixel*>(&buffer_[plane][0][0]));
+ for (int row = 0; row < tx_height; ++row) {
+ assert(block.bp->prediction_parameters
+ ->color_index_map[plane_type][y4 + row] != nullptr);
+ for (int column = 0; column < tx_width; ++column) {
+ buffer[start_y + row][start_x + column] =
+ palette[block.bp->prediction_parameters
+ ->color_index_map[plane_type][y4 + row][x4 + column]];
+ }
+ }
+}
+
+template void Tile::PalettePrediction<uint8_t>(
+ const Block& block, const Plane plane, const int start_x, const int start_y,
+ const int x, const int y, const TransformSize tx_size);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void Tile::PalettePrediction<uint16_t>(
+ const Block& block, const Plane plane, const int start_x, const int start_y,
+ const int x, const int y, const TransformSize tx_size);
+#endif
+
+template <typename Pixel>
+void Tile::ChromaFromLumaPrediction(const Block& block, const Plane plane,
+ const int start_x, const int start_y,
+ const TransformSize tx_size) {
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ const PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ Array2DView<Pixel> y_buffer(
+ buffer_[kPlaneY].rows(), buffer_[kPlaneY].columns() / sizeof(Pixel),
+ reinterpret_cast<Pixel*>(&buffer_[kPlaneY][0][0]));
+ if (!block.scratch_buffer->cfl_luma_buffer_valid) {
+ const int luma_x = start_x << subsampling_x;
+ const int luma_y = start_y << subsampling_y;
+ dsp_.cfl_subsamplers[tx_size][subsampling_x + subsampling_y](
+ block.scratch_buffer->cfl_luma_buffer,
+ prediction_parameters.max_luma_width - luma_x,
+ prediction_parameters.max_luma_height - luma_y,
+ reinterpret_cast<uint8_t*>(&y_buffer[luma_y][luma_x]),
+ buffer_[kPlaneY].columns());
+ block.scratch_buffer->cfl_luma_buffer_valid = true;
+ }
+ Array2DView<Pixel> buffer(buffer_[plane].rows(),
+ buffer_[plane].columns() / sizeof(Pixel),
+ reinterpret_cast<Pixel*>(&buffer_[plane][0][0]));
+ dsp_.cfl_intra_predictors[tx_size](
+ reinterpret_cast<uint8_t*>(&buffer[start_y][start_x]),
+ buffer_[plane].columns(), block.scratch_buffer->cfl_luma_buffer,
+ (plane == kPlaneU) ? prediction_parameters.cfl_alpha_u
+ : prediction_parameters.cfl_alpha_v);
+}
+
+template void Tile::ChromaFromLumaPrediction<uint8_t>(
+ const Block& block, const Plane plane, const int start_x, const int start_y,
+ const TransformSize tx_size);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void Tile::ChromaFromLumaPrediction<uint16_t>(
+ const Block& block, const Plane plane, const int start_x, const int start_y,
+ const TransformSize tx_size);
+#endif
+
+void Tile::InterIntraPrediction(
+ uint16_t* const prediction_0, const uint8_t* const prediction_mask,
+ const ptrdiff_t prediction_mask_stride,
+ const PredictionParameters& prediction_parameters,
+ const int prediction_width, const int prediction_height,
+ const int subsampling_x, const int subsampling_y, uint8_t* const dest,
+ const ptrdiff_t dest_stride) {
+ assert(prediction_mask != nullptr);
+ assert(prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeIntra ||
+ prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeWedge);
+ // The first buffer of InterIntra is from inter prediction.
+ // The second buffer is from intra prediction.
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (sequence_header_.color_config.bitdepth > 8) {
+ GetMaskBlendFunc(dsp_, /*is_inter_intra=*/true,
+ prediction_parameters.is_wedge_inter_intra, subsampling_x,
+ subsampling_y)(
+ prediction_0, reinterpret_cast<uint16_t*>(dest),
+ dest_stride / sizeof(uint16_t), prediction_mask, prediction_mask_stride,
+ prediction_width, prediction_height, dest, dest_stride);
+ return;
+ }
+#endif
+ const int function_index = prediction_parameters.is_wedge_inter_intra
+ ? subsampling_x + subsampling_y
+ : 0;
+ // |is_inter_intra| prediction values are stored in a Pixel buffer but it is
+ // currently declared as a uint16_t buffer.
+ // TODO(johannkoenig): convert the prediction buffer to a uint8_t buffer and
+ // remove the reinterpret_cast.
+ dsp_.inter_intra_mask_blend_8bpp[function_index](
+ reinterpret_cast<uint8_t*>(prediction_0), dest, dest_stride,
+ prediction_mask, prediction_mask_stride, prediction_width,
+ prediction_height);
+}
+
+void Tile::CompoundInterPrediction(
+ const Block& block, const uint8_t* const prediction_mask,
+ const ptrdiff_t prediction_mask_stride, const int prediction_width,
+ const int prediction_height, const int subsampling_x,
+ const int subsampling_y, const int candidate_row,
+ const int candidate_column, uint8_t* dest, const ptrdiff_t dest_stride) {
+ const PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+
+ void* prediction[2];
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ const int bitdepth = sequence_header_.color_config.bitdepth;
+ if (bitdepth > 8) {
+ prediction[0] = block.scratch_buffer->prediction_buffer[0];
+ prediction[1] = block.scratch_buffer->prediction_buffer[1];
+ } else {
+#endif
+ prediction[0] = block.scratch_buffer->compound_prediction_buffer_8bpp[0];
+ prediction[1] = block.scratch_buffer->compound_prediction_buffer_8bpp[1];
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ }
+#endif
+
+ switch (prediction_parameters.compound_prediction_type) {
+ case kCompoundPredictionTypeWedge:
+ case kCompoundPredictionTypeDiffWeighted:
+ GetMaskBlendFunc(dsp_, /*is_inter_intra=*/false,
+ prediction_parameters.is_wedge_inter_intra,
+ subsampling_x, subsampling_y)(
+ prediction[0], prediction[1],
+ /*prediction_stride=*/prediction_width, prediction_mask,
+ prediction_mask_stride, prediction_width, prediction_height, dest,
+ dest_stride);
+ break;
+ case kCompoundPredictionTypeDistance:
+ DistanceWeightedPrediction(prediction[0], prediction[1], prediction_width,
+ prediction_height, candidate_row,
+ candidate_column, dest, dest_stride);
+ break;
+ default:
+ assert(prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeAverage);
+ dsp_.average_blend(prediction[0], prediction[1], prediction_width,
+ prediction_height, dest, dest_stride);
+ break;
+ }
+}
+
+GlobalMotion* Tile::GetWarpParams(
+ const Block& block, const Plane plane, const int prediction_width,
+ const int prediction_height,
+ const PredictionParameters& prediction_parameters,
+ const ReferenceFrameType reference_type, bool* const is_local_valid,
+ GlobalMotion* const global_motion_params,
+ GlobalMotion* const local_warp_params) const {
+ if (prediction_width < 8 || prediction_height < 8 ||
+ frame_header_.force_integer_mv == 1) {
+ return nullptr;
+ }
+ if (plane == kPlaneY) {
+ *is_local_valid =
+ prediction_parameters.motion_mode == kMotionModeLocalWarp &&
+ WarpEstimation(
+ prediction_parameters.num_warp_samples, DivideBy4(prediction_width),
+ DivideBy4(prediction_height), block.row4x4, block.column4x4,
+ block.bp->mv.mv[0], prediction_parameters.warp_estimate_candidates,
+ local_warp_params) &&
+ SetupShear(local_warp_params);
+ }
+ if (prediction_parameters.motion_mode == kMotionModeLocalWarp &&
+ *is_local_valid) {
+ return local_warp_params;
+ }
+ if (!IsScaled(reference_type)) {
+ GlobalMotionTransformationType global_motion_type =
+ (reference_type != kReferenceFrameIntra)
+ ? global_motion_params->type
+ : kNumGlobalMotionTransformationTypes;
+ const bool is_global_valid =
+ IsGlobalMvBlock(block.bp->is_global_mv_block, global_motion_type) &&
+ SetupShear(global_motion_params);
+ // Valid global motion type implies reference type can't be intra.
+ assert(!is_global_valid || reference_type != kReferenceFrameIntra);
+ if (is_global_valid) return global_motion_params;
+ }
+ return nullptr;
+}
+
+bool Tile::InterPrediction(const Block& block, const Plane plane, const int x,
+ const int y, const int prediction_width,
+ const int prediction_height, int candidate_row,
+ int candidate_column, bool* const is_local_valid,
+ GlobalMotion* const local_warp_params) {
+ const int bitdepth = sequence_header_.color_config.bitdepth;
+ const BlockParameters& bp = *block.bp;
+ const BlockParameters& bp_reference =
+ *block_parameters_holder_.Find(candidate_row, candidate_column);
+ const bool is_compound =
+ bp_reference.reference_frame[1] > kReferenceFrameIntra;
+ assert(bp.is_inter);
+ const bool is_inter_intra = bp.reference_frame[1] == kReferenceFrameIntra;
+
+ const PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ uint8_t* const dest = GetStartPoint(buffer_, plane, x, y, bitdepth);
+ const ptrdiff_t dest_stride = buffer_[plane].columns(); // In bytes.
+ for (int index = 0; index < 1 + static_cast<int>(is_compound); ++index) {
+ const ReferenceFrameType reference_type =
+ bp_reference.reference_frame[index];
+ GlobalMotion global_motion_params =
+ frame_header_.global_motion[reference_type];
+ GlobalMotion* warp_params =
+ GetWarpParams(block, plane, prediction_width, prediction_height,
+ prediction_parameters, reference_type, is_local_valid,
+ &global_motion_params, local_warp_params);
+ if (warp_params != nullptr) {
+ if (!BlockWarpProcess(block, plane, index, x, y, prediction_width,
+ prediction_height, warp_params, is_compound,
+ is_inter_intra, dest, dest_stride)) {
+ return false;
+ }
+ } else {
+ const int reference_index =
+ prediction_parameters.use_intra_block_copy
+ ? -1
+ : frame_header_.reference_frame_index[reference_type -
+ kReferenceFrameLast];
+ if (!BlockInterPrediction(
+ block, plane, reference_index, bp_reference.mv.mv[index], x, y,
+ prediction_width, prediction_height, candidate_row,
+ candidate_column, block.scratch_buffer->prediction_buffer[index],
+ is_compound, is_inter_intra, dest, dest_stride)) {
+ return false;
+ }
+ }
+ }
+
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ ptrdiff_t prediction_mask_stride = 0;
+ const uint8_t* prediction_mask = nullptr;
+ if (prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeWedge) {
+ const Array2D<uint8_t>& wedge_mask =
+ wedge_masks_[GetWedgeBlockSizeIndex(block.size)]
+ [prediction_parameters.wedge_sign]
+ [prediction_parameters.wedge_index];
+ prediction_mask = wedge_mask[0];
+ prediction_mask_stride = wedge_mask.columns();
+ } else if (prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeIntra) {
+ // 7.11.3.13. The inter intra masks are precomputed and stored as a set of
+ // lookup tables.
+ assert(prediction_parameters.inter_intra_mode < kNumInterIntraModes);
+ prediction_mask =
+ kInterIntraMasks[prediction_parameters.inter_intra_mode]
+ [GetInterIntraMaskLookupIndex(prediction_width)]
+ [GetInterIntraMaskLookupIndex(prediction_height)];
+ prediction_mask_stride = prediction_width;
+ } else if (prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeDiffWeighted) {
+ if (plane == kPlaneY) {
+ assert(prediction_width >= 8);
+ assert(prediction_height >= 8);
+ dsp_.weight_mask[FloorLog2(prediction_width) - 3]
+ [FloorLog2(prediction_height) - 3]
+ [static_cast<int>(prediction_parameters.mask_is_inverse)](
+ block.scratch_buffer->prediction_buffer[0],
+ block.scratch_buffer->prediction_buffer[1],
+ block.scratch_buffer->weight_mask,
+ kMaxSuperBlockSizeInPixels);
+ }
+ prediction_mask = block.scratch_buffer->weight_mask;
+ prediction_mask_stride = kMaxSuperBlockSizeInPixels;
+ }
+
+ if (is_compound) {
+ CompoundInterPrediction(block, prediction_mask, prediction_mask_stride,
+ prediction_width, prediction_height, subsampling_x,
+ subsampling_y, candidate_row, candidate_column,
+ dest, dest_stride);
+ } else if (prediction_parameters.motion_mode == kMotionModeObmc) {
+ // Obmc mode is allowed only for single reference (!is_compound).
+ return ObmcPrediction(block, plane, prediction_width, prediction_height);
+ } else if (is_inter_intra) {
+ // InterIntra and obmc must be mutually exclusive.
+ InterIntraPrediction(
+ block.scratch_buffer->prediction_buffer[0], prediction_mask,
+ prediction_mask_stride, prediction_parameters, prediction_width,
+ prediction_height, subsampling_x, subsampling_y, dest, dest_stride);
+ }
+ return true;
+}
+
+bool Tile::ObmcBlockPrediction(const Block& block, const MotionVector& mv,
+ const Plane plane,
+ const int reference_frame_index, const int width,
+ const int height, const int x, const int y,
+ const int candidate_row,
+ const int candidate_column,
+ const ObmcDirection blending_direction) {
+ const int bitdepth = sequence_header_.color_config.bitdepth;
+ // Obmc's prediction needs to be clipped before blending with above/left
+ // prediction blocks.
+ // Obmc prediction is used only when is_compound is false. So it is safe to
+ // use prediction_buffer[1] as a temporary buffer for the Obmc prediction.
+ static_assert(sizeof(block.scratch_buffer->prediction_buffer[1]) >=
+ 64 * 64 * sizeof(uint16_t),
+ "");
+ auto* const obmc_buffer =
+ reinterpret_cast<uint8_t*>(block.scratch_buffer->prediction_buffer[1]);
+ const ptrdiff_t obmc_buffer_stride =
+ (bitdepth == 8) ? width : width * sizeof(uint16_t);
+ if (!BlockInterPrediction(block, plane, reference_frame_index, mv, x, y,
+ width, height, candidate_row, candidate_column,
+ nullptr, false, false, obmc_buffer,
+ obmc_buffer_stride)) {
+ return false;
+ }
+
+ uint8_t* const prediction = GetStartPoint(buffer_, plane, x, y, bitdepth);
+ const ptrdiff_t prediction_stride = buffer_[plane].columns();
+ dsp_.obmc_blend[blending_direction](prediction, prediction_stride, width,
+ height, obmc_buffer, obmc_buffer_stride);
+ return true;
+}
+
+bool Tile::ObmcPrediction(const Block& block, const Plane plane,
+ const int width, const int height) {
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ if (block.top_available[kPlaneY] &&
+ !IsBlockSmallerThan8x8(block.residual_size[plane])) {
+ const int num_limit = std::min(uint8_t{4}, k4x4WidthLog2[block.size]);
+ const int column4x4_max =
+ std::min(block.column4x4 + block.width4x4, frame_header_.columns4x4);
+ const int candidate_row = block.row4x4 - 1;
+ const int block_start_y = MultiplyBy4(block.row4x4) >> subsampling_y;
+ int column4x4 = block.column4x4;
+ const int prediction_height = std::min(height >> 1, 32 >> subsampling_y);
+ for (int i = 0, step; i < num_limit && column4x4 < column4x4_max;
+ column4x4 += step) {
+ const int candidate_column = column4x4 | 1;
+ const BlockParameters& bp_top =
+ *block_parameters_holder_.Find(candidate_row, candidate_column);
+ const int candidate_block_size = bp_top.size;
+ step = Clip3(kNum4x4BlocksWide[candidate_block_size], 2, 16);
+ if (bp_top.reference_frame[0] > kReferenceFrameIntra) {
+ i++;
+ const int candidate_reference_frame_index =
+ frame_header_.reference_frame_index[bp_top.reference_frame[0] -
+ kReferenceFrameLast];
+ const int prediction_width =
+ std::min(width, MultiplyBy4(step) >> subsampling_x);
+ if (!ObmcBlockPrediction(
+ block, bp_top.mv.mv[0], plane, candidate_reference_frame_index,
+ prediction_width, prediction_height,
+ MultiplyBy4(column4x4) >> subsampling_x, block_start_y,
+ candidate_row, candidate_column, kObmcDirectionVertical)) {
+ return false;
+ }
+ }
+ }
+ }
+
+ if (block.left_available[kPlaneY]) {
+ const int num_limit = std::min(uint8_t{4}, k4x4HeightLog2[block.size]);
+ const int row4x4_max =
+ std::min(block.row4x4 + block.height4x4, frame_header_.rows4x4);
+ const int candidate_column = block.column4x4 - 1;
+ int row4x4 = block.row4x4;
+ const int block_start_x = MultiplyBy4(block.column4x4) >> subsampling_x;
+ const int prediction_width = std::min(width >> 1, 32 >> subsampling_x);
+ for (int i = 0, step; i < num_limit && row4x4 < row4x4_max;
+ row4x4 += step) {
+ const int candidate_row = row4x4 | 1;
+ const BlockParameters& bp_left =
+ *block_parameters_holder_.Find(candidate_row, candidate_column);
+ const int candidate_block_size = bp_left.size;
+ step = Clip3(kNum4x4BlocksHigh[candidate_block_size], 2, 16);
+ if (bp_left.reference_frame[0] > kReferenceFrameIntra) {
+ i++;
+ const int candidate_reference_frame_index =
+ frame_header_.reference_frame_index[bp_left.reference_frame[0] -
+ kReferenceFrameLast];
+ const int prediction_height =
+ std::min(height, MultiplyBy4(step) >> subsampling_y);
+ if (!ObmcBlockPrediction(
+ block, bp_left.mv.mv[0], plane, candidate_reference_frame_index,
+ prediction_width, prediction_height, block_start_x,
+ MultiplyBy4(row4x4) >> subsampling_y, candidate_row,
+ candidate_column, kObmcDirectionHorizontal)) {
+ return false;
+ }
+ }
+ }
+ }
+ return true;
+}
+
+void Tile::DistanceWeightedPrediction(void* prediction_0, void* prediction_1,
+ const int width, const int height,
+ const int candidate_row,
+ const int candidate_column, uint8_t* dest,
+ ptrdiff_t dest_stride) {
+ int distance[2];
+ int weight[2];
+ for (int reference = 0; reference < 2; ++reference) {
+ const BlockParameters& bp =
+ *block_parameters_holder_.Find(candidate_row, candidate_column);
+ // Note: distance[0] and distance[1] correspond to relative distance
+ // between current frame and reference frame [1] and [0], respectively.
+ distance[1 - reference] = std::min(
+ std::abs(static_cast<int>(
+ current_frame_.reference_info()
+ ->relative_distance_from[bp.reference_frame[reference]])),
+ static_cast<int>(kMaxFrameDistance));
+ }
+ GetDistanceWeights(distance, weight);
+
+ dsp_.distance_weighted_blend(prediction_0, prediction_1, weight[0], weight[1],
+ width, height, dest, dest_stride);
+}
+
+void Tile::ScaleMotionVector(const MotionVector& mv, const Plane plane,
+ const int reference_frame_index, const int x,
+ const int y, int* const start_x,
+ int* const start_y, int* const step_x,
+ int* const step_y) {
+ const int reference_upscaled_width =
+ (reference_frame_index == -1)
+ ? frame_header_.upscaled_width
+ : reference_frames_[reference_frame_index]->upscaled_width();
+ const int reference_height =
+ (reference_frame_index == -1)
+ ? frame_header_.height
+ : reference_frames_[reference_frame_index]->frame_height();
+ assert(2 * frame_header_.width >= reference_upscaled_width &&
+ 2 * frame_header_.height >= reference_height &&
+ frame_header_.width <= 16 * reference_upscaled_width &&
+ frame_header_.height <= 16 * reference_height);
+ const bool is_scaled_x = reference_upscaled_width != frame_header_.width;
+ const bool is_scaled_y = reference_height != frame_header_.height;
+ const int half_sample = 1 << (kSubPixelBits - 1);
+ int orig_x = (x << kSubPixelBits) + ((2 * mv.mv[1]) >> subsampling_x_[plane]);
+ int orig_y = (y << kSubPixelBits) + ((2 * mv.mv[0]) >> subsampling_y_[plane]);
+ const int rounding_offset =
+ DivideBy2(1 << (kScaleSubPixelBits - kSubPixelBits));
+ if (is_scaled_x) {
+ const int scale_x = ((reference_upscaled_width << kReferenceScaleShift) +
+ DivideBy2(frame_header_.width)) /
+ frame_header_.width;
+ *step_x = RightShiftWithRoundingSigned(
+ scale_x, kReferenceScaleShift - kScaleSubPixelBits);
+ orig_x += half_sample;
+ // When the frame size is 4k or above, orig_x can exceed 16 bits and scale_x
+ // can be up to 15 bits, so we use int64_t to hold base_x.
+ const int64_t base_x = static_cast<int64_t>(orig_x) * scale_x -
+ (half_sample << kReferenceScaleShift);
+ *start_x =
+ RightShiftWithRoundingSigned(
+ base_x, kReferenceScaleShift + kSubPixelBits - kScaleSubPixelBits) +
+ rounding_offset;
+ } else {
+ *step_x = 1 << kScaleSubPixelBits;
+ *start_x = LeftShift(orig_x, 6) + rounding_offset;
+ }
+ if (is_scaled_y) {
+ const int scale_y = ((reference_height << kReferenceScaleShift) +
+ DivideBy2(frame_header_.height)) /
+ frame_header_.height;
+ *step_y = RightShiftWithRoundingSigned(
+ scale_y, kReferenceScaleShift - kScaleSubPixelBits);
+ orig_y += half_sample;
+ const int64_t base_y = static_cast<int64_t>(orig_y) * scale_y -
+ (half_sample << kReferenceScaleShift);
+ *start_y =
+ RightShiftWithRoundingSigned(
+ base_y, kReferenceScaleShift + kSubPixelBits - kScaleSubPixelBits) +
+ rounding_offset;
+ } else {
+ *step_y = 1 << kScaleSubPixelBits;
+ *start_y = LeftShift(orig_y, 6) + rounding_offset;
+ }
+}
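// A hand-worked example of the unscaled path above, assuming kSubPixelBits ==
// 4 and kScaleSubPixelBits == 10 (values consistent with LeftShift(orig_x, 6)
// and the rounding_offset computation): with x == 16, mv.mv[1] == 8 (one full
// pel in 1/8-pel units) and no subsampling, orig_x == (16 << 4) + 16 == 272,
// start_x == (272 << 6) + 32 == 17440, and
// GetPixelPositionFromHighScale(start_x, 0, 0) == 17440 >> 10 == 17, i.e. the
// prediction block starts one full pel to the right of x, as expected.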
+
+// static.
+bool Tile::GetReferenceBlockPosition(
+ const int reference_frame_index, const bool is_scaled, const int width,
+ const int height, const int ref_start_x, const int ref_last_x,
+ const int ref_start_y, const int ref_last_y, const int start_x,
+ const int start_y, const int step_x, const int step_y,
+ const int left_border, const int right_border, const int top_border,
+ const int bottom_border, int* ref_block_start_x, int* ref_block_start_y,
+ int* ref_block_end_x) {
+ *ref_block_start_x = GetPixelPositionFromHighScale(start_x, 0, 0);
+ *ref_block_start_y = GetPixelPositionFromHighScale(start_y, 0, 0);
+ if (reference_frame_index == -1) {
+ return false;
+ }
+ *ref_block_start_x -= kConvolveBorderLeftTop;
+ *ref_block_start_y -= kConvolveBorderLeftTop;
+ *ref_block_end_x = GetPixelPositionFromHighScale(start_x, step_x, width - 1) +
+ kConvolveBorderRight;
+ int ref_block_end_y =
+ GetPixelPositionFromHighScale(start_y, step_y, height - 1) +
+ kConvolveBorderBottom;
+ if (is_scaled) {
+ const int block_height =
+ (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+ kScaleSubPixelBits) +
+ kSubPixelTaps;
+ ref_block_end_y = *ref_block_start_y + block_height - 1;
+ }
+ // Determines if we need to extend beyond the left/right/top/bottom border.
+ return *ref_block_start_x < (ref_start_x - left_border) ||
+ *ref_block_end_x > (ref_last_x + right_border) ||
+ *ref_block_start_y < (ref_start_y - top_border) ||
+ ref_block_end_y > (ref_last_y + bottom_border);
+}
+
+// Builds a block as the input for convolve by copying the content of the
+// reference frame (either a decoded reference frame or the current frame).
+// |block_extended_width| is the combined width of the block and its borders.
+template <typename Pixel>
+void Tile::BuildConvolveBlock(
+ const Plane plane, const int reference_frame_index, const bool is_scaled,
+ const int height, const int ref_start_x, const int ref_last_x,
+ const int ref_start_y, const int ref_last_y, const int step_y,
+ const int ref_block_start_x, const int ref_block_end_x,
+ const int ref_block_start_y, uint8_t* block_buffer,
+ ptrdiff_t convolve_buffer_stride, ptrdiff_t block_extended_width) {
+ const YuvBuffer* const reference_buffer =
+ (reference_frame_index == -1)
+ ? current_frame_.buffer()
+ : reference_frames_[reference_frame_index]->buffer();
+ Array2DView<const Pixel> reference_block(
+ reference_buffer->height(plane),
+ reference_buffer->stride(plane) / sizeof(Pixel),
+ reinterpret_cast<const Pixel*>(reference_buffer->data(plane)));
+ auto* const block_head = reinterpret_cast<Pixel*>(block_buffer);
+ convolve_buffer_stride /= sizeof(Pixel);
+ int block_height = height + kConvolveBorderLeftTop + kConvolveBorderBottom;
+ if (is_scaled) {
+ block_height = (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+ kScaleSubPixelBits) +
+ kSubPixelTaps;
+ }
+ const int copy_start_x = Clip3(ref_block_start_x, ref_start_x, ref_last_x);
+ const int copy_start_y = Clip3(ref_block_start_y, ref_start_y, ref_last_y);
+ const int copy_end_x = Clip3(ref_block_end_x, copy_start_x, ref_last_x);
+ const int block_width = copy_end_x - copy_start_x + 1;
+ const bool extend_left = ref_block_start_x < ref_start_x;
+ const bool extend_right = ref_block_end_x > ref_last_x;
+ const bool out_of_left = copy_start_x > ref_block_end_x;
+ const bool out_of_right = copy_end_x < ref_block_start_x;
+ if (out_of_left || out_of_right) {
+ const int ref_x = out_of_left ? copy_start_x : copy_end_x;
+ Pixel* buf_ptr = block_head;
+ for (int y = 0, ref_y = copy_start_y; y < block_height; ++y) {
+ Memset(buf_ptr, reference_block[ref_y][ref_x], block_extended_width);
+ if (ref_block_start_y + y >= ref_start_y &&
+ ref_block_start_y + y < ref_last_y) {
+ ++ref_y;
+ }
+ buf_ptr += convolve_buffer_stride;
+ }
+ } else {
+ Pixel* buf_ptr = block_head;
+ const int left_width = copy_start_x - ref_block_start_x;
+ for (int y = 0, ref_y = copy_start_y; y < block_height; ++y) {
+ if (extend_left) {
+ Memset(buf_ptr, reference_block[ref_y][copy_start_x], left_width);
+ }
+ memcpy(buf_ptr + left_width, &reference_block[ref_y][copy_start_x],
+ block_width * sizeof(Pixel));
+ if (extend_right) {
+ Memset(buf_ptr + left_width + block_width,
+ reference_block[ref_y][copy_end_x],
+ block_extended_width - left_width - block_width);
+ }
+ if (ref_block_start_y + y >= ref_start_y &&
+ ref_block_start_y + y < ref_last_y) {
+ ++ref_y;
+ }
+ buf_ptr += convolve_buffer_stride;
+ }
+ }
+}
+
+bool Tile::BlockInterPrediction(
+ const Block& block, const Plane plane, const int reference_frame_index,
+ const MotionVector& mv, const int x, const int y, const int width,
+ const int height, const int candidate_row, const int candidate_column,
+ uint16_t* const prediction, const bool is_compound,
+ const bool is_inter_intra, uint8_t* const dest,
+ const ptrdiff_t dest_stride) {
+ const BlockParameters& bp =
+ *block_parameters_holder_.Find(candidate_row, candidate_column);
+ int start_x;
+ int start_y;
+ int step_x;
+ int step_y;
+ ScaleMotionVector(mv, plane, reference_frame_index, x, y, &start_x, &start_y,
+ &step_x, &step_y);
+ const int horizontal_filter_index = bp.interpolation_filter[1];
+ const int vertical_filter_index = bp.interpolation_filter[0];
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ // A reference_frame_index equal to -1 indicates using the current frame as
+ // the reference.
+ const YuvBuffer* const reference_buffer =
+ (reference_frame_index == -1)
+ ? current_frame_.buffer()
+ : reference_frames_[reference_frame_index]->buffer();
+ const int reference_upscaled_width =
+ (reference_frame_index == -1)
+ ? MultiplyBy4(frame_header_.columns4x4)
+ : reference_frames_[reference_frame_index]->upscaled_width();
+ const int reference_height =
+ (reference_frame_index == -1)
+ ? MultiplyBy4(frame_header_.rows4x4)
+ : reference_frames_[reference_frame_index]->frame_height();
+ const int ref_start_x = 0;
+ const int ref_last_x =
+ SubsampledValue(reference_upscaled_width, subsampling_x) - 1;
+ const int ref_start_y = 0;
+ const int ref_last_y = SubsampledValue(reference_height, subsampling_y) - 1;
+
+ const bool is_scaled = (reference_frame_index != -1) &&
+ (frame_header_.width != reference_upscaled_width ||
+ frame_header_.height != reference_height);
+ const int bitdepth = sequence_header_.color_config.bitdepth;
+ const int pixel_size = (bitdepth == 8) ? sizeof(uint8_t) : sizeof(uint16_t);
+ int ref_block_start_x;
+ int ref_block_start_y;
+ int ref_block_end_x;
+ const bool extend_block = GetReferenceBlockPosition(
+ reference_frame_index, is_scaled, width, height, ref_start_x, ref_last_x,
+ ref_start_y, ref_last_y, start_x, start_y, step_x, step_y,
+ reference_buffer->left_border(plane),
+ reference_buffer->right_border(plane),
+ reference_buffer->top_border(plane),
+ reference_buffer->bottom_border(plane), &ref_block_start_x,
+ &ref_block_start_y, &ref_block_end_x);
+
+ // In frame parallel mode, ensure that the reference block has been decoded
+ // and is available for referencing.
+ if (reference_frame_index != -1 && frame_parallel_) {
+ int reference_y_max;
+ if (is_scaled) {
+ // TODO(vigneshv): For now, we wait for the entire reference frame to be
+ // decoded if we are using scaled references. This will eventually be
+ // fixed.
+ reference_y_max = reference_height;
+ } else {
+ reference_y_max =
+ std::min(ref_block_start_y + height + kSubPixelTaps, ref_last_y);
+ // For U and V planes with subsampling, we need to multiply
+ // reference_y_max by 2 since we only track the progress of Y planes.
+ reference_y_max = LeftShift(reference_y_max, subsampling_y);
+ }
+ if (reference_frame_progress_cache_[reference_frame_index] <
+ reference_y_max &&
+ !reference_frames_[reference_frame_index]->WaitUntil(
+ reference_y_max,
+ &reference_frame_progress_cache_[reference_frame_index])) {
+ return false;
+ }
+ }
+
+ const uint8_t* block_start = nullptr;
+ ptrdiff_t convolve_buffer_stride;
+ if (!extend_block) {
+ const YuvBuffer* const reference_buffer =
+ (reference_frame_index == -1)
+ ? current_frame_.buffer()
+ : reference_frames_[reference_frame_index]->buffer();
+ convolve_buffer_stride = reference_buffer->stride(plane);
+ if (reference_frame_index == -1 || is_scaled) {
+ block_start = reference_buffer->data(plane) +
+ ref_block_start_y * reference_buffer->stride(plane) +
+ ref_block_start_x * pixel_size;
+ } else {
+ block_start = reference_buffer->data(plane) +
+ (ref_block_start_y + kConvolveBorderLeftTop) *
+ reference_buffer->stride(plane) +
+ (ref_block_start_x + kConvolveBorderLeftTop) * pixel_size;
+ }
+ } else {
+    // Because of scaling, the block width can be at most twice the current
+    // block's width.
+ auto block_extended_width = Align<ptrdiff_t>(
+ (2 * width + kConvolveBorderLeftTop + kConvolveBorderRight) *
+ pixel_size,
+ kMaxAlignment);
+ convolve_buffer_stride = block.scratch_buffer->convolve_block_buffer_stride;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) {
+ BuildConvolveBlock<uint16_t>(
+ plane, reference_frame_index, is_scaled, height, ref_start_x,
+ ref_last_x, ref_start_y, ref_last_y, step_y, ref_block_start_x,
+ ref_block_end_x, ref_block_start_y,
+ block.scratch_buffer->convolve_block_buffer.get(),
+ convolve_buffer_stride, block_extended_width);
+ } else {
+#endif
+ BuildConvolveBlock<uint8_t>(
+ plane, reference_frame_index, is_scaled, height, ref_start_x,
+ ref_last_x, ref_start_y, ref_last_y, step_y, ref_block_start_x,
+ ref_block_end_x, ref_block_start_y,
+ block.scratch_buffer->convolve_block_buffer.get(),
+ convolve_buffer_stride, block_extended_width);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ }
+#endif
+ block_start = block.scratch_buffer->convolve_block_buffer.get() +
+ (is_scaled ? 0
+ : kConvolveBorderLeftTop * convolve_buffer_stride +
+ kConvolveBorderLeftTop * pixel_size);
+ }
+
+ void* const output =
+ (is_compound || is_inter_intra) ? prediction : static_cast<void*>(dest);
+ ptrdiff_t output_stride = (is_compound || is_inter_intra)
+ ? /*prediction_stride=*/width
+ : dest_stride;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  // |is_inter_intra| calculations are written to the |prediction| buffer.
+  // Unlike the |is_compound| calculations, the output is Pixel and not
+  // uint16_t. convolve_func() expects |output_stride| to be in bytes and not
+  // Pixels, but |prediction_stride| is in units of uint16_t. Adjust
+  // |output_stride| to account for this.
+ if (is_inter_intra && sequence_header_.color_config.bitdepth > 8) {
+ output_stride *= 2;
+ }
+#endif
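+  // For example, with |is_inter_intra|, 10-bit content and width == 8, the
+  // prediction rows are 8 uint16_t values (16 bytes) apart, so |output_stride|
+  // becomes 16 after the adjustment above.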
+ assert(output != nullptr);
+ if (is_scaled) {
+ dsp::ConvolveScaleFunc convolve_func = dsp_.convolve_scale[is_compound];
+ assert(convolve_func != nullptr);
+
+ convolve_func(block_start, convolve_buffer_stride, horizontal_filter_index,
+ vertical_filter_index, start_x, start_y, step_x, step_y,
+ width, height, output, output_stride);
+ } else {
+ const int horizontal_filter_id = (start_x >> 6) & kSubPixelMask;
+ const int vertical_filter_id = (start_y >> 6) & kSubPixelMask;
+
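+    // The convolve function is selected by (1) whether the current frame is
+    // used as the reference (intra block copy), (2) whether the prediction is
+    // compound, and (3)/(4) whether there is a vertical/horizontal sub-pixel
+    // offset.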
+ dsp::ConvolveFunc convolve_func =
+ dsp_.convolve[reference_frame_index == -1][is_compound]
+ [vertical_filter_id != 0][horizontal_filter_id != 0];
+ assert(convolve_func != nullptr);
+
+ convolve_func(block_start, convolve_buffer_stride, horizontal_filter_index,
+ vertical_filter_index, horizontal_filter_id,
+ vertical_filter_id, width, height, output, output_stride);
+ }
+ return true;
+}
+
+bool Tile::BlockWarpProcess(const Block& block, const Plane plane,
+ const int index, const int block_start_x,
+ const int block_start_y, const int width,
+ const int height, GlobalMotion* const warp_params,
+ const bool is_compound, const bool is_inter_intra,
+ uint8_t* const dest, const ptrdiff_t dest_stride) {
+ assert(width >= 8 && height >= 8);
+ const BlockParameters& bp = *block.bp;
+ const int reference_frame_index =
+ frame_header_.reference_frame_index[bp.reference_frame[index] -
+ kReferenceFrameLast];
+ const uint8_t* const source =
+ reference_frames_[reference_frame_index]->buffer()->data(plane);
+ ptrdiff_t source_stride =
+ reference_frames_[reference_frame_index]->buffer()->stride(plane);
+ const int source_width =
+ reference_frames_[reference_frame_index]->buffer()->width(plane);
+ const int source_height =
+ reference_frames_[reference_frame_index]->buffer()->height(plane);
+ uint16_t* const prediction = block.scratch_buffer->prediction_buffer[index];
+
+  // In frame parallel mode, ensure that the reference block has been decoded
+  // and is available for referencing.
+ if (frame_parallel_) {
+ int reference_y_max = -1;
+ // Find out the maximum y-coordinate for warping.
+ for (int start_y = block_start_y; start_y < block_start_y + height;
+ start_y += 8) {
+ for (int start_x = block_start_x; start_x < block_start_x + width;
+ start_x += 8) {
+ const int src_x = (start_x + 4) << subsampling_x_[plane];
+ const int src_y = (start_y + 4) << subsampling_y_[plane];
+ const int dst_y = src_x * warp_params->params[4] +
+ src_y * warp_params->params[5] +
+ warp_params->params[1];
+ const int y4 = dst_y >> subsampling_y_[plane];
+ const int iy4 = y4 >> kWarpedModelPrecisionBits;
+ reference_y_max = std::max(iy4 + 8, reference_y_max);
+ }
+ }
+    // For subsampled U and V planes, scale reference_y_max back to luma
+    // coordinates since only the progress of the Y plane is tracked.
+ reference_y_max = LeftShift(reference_y_max, subsampling_y_[plane]);
+ if (reference_frame_progress_cache_[reference_frame_index] <
+ reference_y_max &&
+ !reference_frames_[reference_frame_index]->WaitUntil(
+ reference_y_max,
+ &reference_frame_progress_cache_[reference_frame_index])) {
+ return false;
+ }
+ }
+ if (is_compound) {
+ dsp_.warp_compound(source, source_stride, source_width, source_height,
+ warp_params->params, subsampling_x_[plane],
+ subsampling_y_[plane], block_start_x, block_start_y,
+ width, height, warp_params->alpha, warp_params->beta,
+ warp_params->gamma, warp_params->delta, prediction,
+ /*prediction_stride=*/width);
+ } else {
+ void* const output = is_inter_intra ? static_cast<void*>(prediction) : dest;
+ ptrdiff_t output_stride =
+ is_inter_intra ? /*prediction_stride=*/width : dest_stride;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ // |is_inter_intra| calculations are written to the |prediction| buffer.
+ // Unlike the |is_compound| calculations the output is Pixel and not
+ // uint16_t. warp_clip() expects |output_stride| to be in bytes and not
+ // Pixels. |prediction_stride| is in units of uint16_t. Adjust
+ // |output_stride| to account for this.
+ if (is_inter_intra && sequence_header_.color_config.bitdepth > 8) {
+ output_stride *= 2;
+ }
+#endif
+ dsp_.warp(source, source_stride, source_width, source_height,
+ warp_params->params, subsampling_x_[plane], subsampling_y_[plane],
+ block_start_x, block_start_y, width, height, warp_params->alpha,
+ warp_params->beta, warp_params->gamma, warp_params->delta, output,
+ output_stride);
+ }
+ return true;
+}
+
+} // namespace libgav1
diff --git a/src/tile/tile.cc b/src/tile/tile.cc
new file mode 100644
index 0000000..ee48f17
--- /dev/null
+++ b/src/tile/tile.cc
@@ -0,0 +1,2573 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/tile.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <climits>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <numeric>
+#include <type_traits>
+#include <utility>
+
+#include "src/frame_scratch_buffer.h"
+#include "src/motion_vector.h"
+#include "src/reconstruction.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/stack.h"
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/scan_tables.inc"
+
+// Range above kNumQuantizerBaseLevels beyond which the exponential Golomb
+// coding process is activated.
+constexpr int kQuantizerCoefficientBaseRange = 12;
+constexpr int kNumQuantizerBaseLevels = 2;
+constexpr int kCoeffBaseRangeMaxIterations =
+ kQuantizerCoefficientBaseRange / (kCoeffBaseRangeSymbolCount - 1);
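+// Assuming kCoeffBaseRangeSymbolCount is 4 (it is defined elsewhere), this
+// works out to 12 / 3 = 4 coeff_br reads per coefficient at most.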
+constexpr int kEntropyContextLeft = 0;
+constexpr int kEntropyContextTop = 1;
+
+constexpr uint8_t kAllZeroContextsByTopLeft[5][5] = {{1, 2, 2, 2, 3},
+ {2, 4, 4, 4, 5},
+ {2, 4, 4, 4, 5},
+ {2, 4, 4, 4, 5},
+ {3, 5, 5, 5, 6}};
+
+// The space complexity of DFS is O(branching_factor * max_depth). For the
+// parameter tree, branching_factor = 4 (there could be up to 4 children for
+// every node) and max_depth (excluding the root) = 5 (to go from a 128x128
+// block all the way to a 4x4 block). The worst-case stack size is 16, by
+// counting the number of 'o' nodes in the diagram:
+//
+// | 128x128 The highest level (corresponding to the
+// | root of the tree) has no node in the stack.
+// |-----------------+
+// | | | |
+// | o o o 64x64
+// |
+// |-----------------+
+// | | | |
+// | o o o 32x32 Higher levels have three nodes in the stack,
+// | because we pop one node off the stack before
+// |-----------------+ pushing its four children onto the stack.
+// | | | |
+// | o o o 16x16
+// |
+// |-----------------+
+// | | | |
+// | o o o 8x8
+// |
+// |-----------------+
+// | | | |
+// o o o o 4x4 Only the lowest level has four nodes in the
+// stack.
+constexpr int kDfsStackSize = 16;
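+// In the diagram above, the 64x64, 32x32, 16x16 and 8x8 levels contribute 3
+// nodes each and the 4x4 level contributes 4 nodes: 3 * 4 + 4 = 16.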
+
+// Mask indicating whether the transform sets contain a particular transform
+// type. If |tx_type| is present in |tx_set|, then the |tx_type|th LSB is set.
+constexpr BitMaskSet kTransformTypeInSetMask[kNumTransformSets] = {
+ BitMaskSet(0x1), BitMaskSet(0xE0F), BitMaskSet(0x20F),
+ BitMaskSet(0xFFFF), BitMaskSet(0xFFF), BitMaskSet(0x201)};
+
+constexpr PredictionMode
+ kFilterIntraModeToIntraPredictor[kNumFilterIntraPredictors] = {
+ kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal,
+ kPredictionModeD157, kPredictionModeDc};
+
+// Mask used to determine the index for mode_deltas lookup.
+constexpr BitMaskSet kPredictionModeDeltasMask(
+ kPredictionModeNearestMv, kPredictionModeNearMv, kPredictionModeNewMv,
+ kPredictionModeNearestNearestMv, kPredictionModeNearNearMv,
+ kPredictionModeNearestNewMv, kPredictionModeNewNearestMv,
+ kPredictionModeNearNewMv, kPredictionModeNewNearMv,
+ kPredictionModeNewNewMv);
+
+// This is computed as:
+// min(transform_width_log2, 5) + min(transform_height_log2, 5) - 4.
+constexpr uint8_t kEobMultiSizeLookup[kNumTransformSizes] = {
+ 0, 1, 2, 1, 2, 3, 4, 2, 3, 4, 5, 5, 4, 5, 6, 6, 5, 6, 6};
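+// For example, assuming the table follows the usual transform size order with
+// kTransformSize4x4 first and kTransformSize64x64 last: 4x4 gives
+// min(2, 5) + min(2, 5) - 4 = 0 (the first entry) and 64x64 gives
+// min(6, 5) + min(6, 5) - 4 = 6 (the last entry).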
+
+/* clang-format off */
+constexpr uint8_t kCoeffBaseContextOffset[kNumTransformSizes][5][5] = {
+ {{0, 1, 6, 6, 0}, {1, 6, 6, 21, 0}, {6, 6, 21, 21, 0}, {6, 21, 21, 21, 0},
+ {0, 0, 0, 0, 0}},
+ {{0, 11, 11, 11, 0}, {11, 11, 11, 11, 0}, {6, 6, 21, 21, 0},
+ {6, 21, 21, 21, 0}, {21, 21, 21, 21, 0}},
+ {{0, 11, 11, 11, 0}, {11, 11, 11, 11, 0}, {6, 6, 21, 21, 0},
+ {6, 21, 21, 21, 0}, {21, 21, 21, 21, 0}},
+ {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+ {16, 16, 21, 21, 21}, {0, 0, 0, 0, 0}},
+ {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+ {16, 16, 21, 21, 21}, {0, 0, 0, 0, 0}},
+ {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+ {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+ {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+ {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+ {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+ {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+ {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+ {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+ {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+ {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+ {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}}};
+/* clang-format on */
+
+// The table size is extended from 3 to 16 by repeating the last element so
+// that clipping of the row or column indices can be avoided.
+constexpr uint8_t kCoeffBasePositionContextOffset[16] = {
+ 26, 31, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36};
+
+constexpr PredictionMode kInterIntraToIntraMode[kNumInterIntraModes] = {
+ kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal,
+ kPredictionModeSmooth};
+
+// Number of horizontal luma samples before intra block copy can be used.
+constexpr int kIntraBlockCopyDelayPixels = 256;
+// Number of 64 by 64 blocks before intra block copy can be used.
+constexpr int kIntraBlockCopyDelay64x64Blocks = kIntraBlockCopyDelayPixels / 64;
+
+// Index [i][j] corresponds to the transform size of width 1 << (i + 2) and
+// height 1 << (j + 2).
+constexpr TransformSize k4x4SizeToTransformSize[5][5] = {
+ {kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kNumTransformSizes, kNumTransformSizes},
+ {kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kNumTransformSizes},
+ {kTransformSize16x4, kTransformSize16x8, kTransformSize16x16,
+ kTransformSize16x32, kTransformSize16x64},
+ {kNumTransformSizes, kTransformSize32x8, kTransformSize32x16,
+ kTransformSize32x32, kTransformSize32x64},
+ {kNumTransformSizes, kNumTransformSizes, kTransformSize64x16,
+ kTransformSize64x32, kTransformSize64x64}};
+
+// Defined in section 9.3 of the spec.
+constexpr TransformType kModeToTransformType[kIntraPredictionModesUV] = {
+ kTransformTypeDctDct, kTransformTypeDctAdst, kTransformTypeAdstDct,
+ kTransformTypeDctDct, kTransformTypeAdstAdst, kTransformTypeDctAdst,
+ kTransformTypeAdstDct, kTransformTypeAdstDct, kTransformTypeDctAdst,
+ kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct,
+ kTransformTypeAdstAdst, kTransformTypeDctDct};
+
+// Defined in section 5.11.47 of the spec. This array does not contain an entry
+// for kTransformSetDctOnly, so the first dimension needs to be
+// |kNumTransformSets| - 1.
+constexpr TransformType kInverseTransformTypeBySet[kNumTransformSets - 1][16] =
+ {{kTransformTypeIdentityIdentity, kTransformTypeDctDct,
+ kTransformTypeIdentityDct, kTransformTypeDctIdentity,
+ kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct},
+ {kTransformTypeIdentityIdentity, kTransformTypeDctDct,
+ kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct},
+ {kTransformTypeIdentityIdentity, kTransformTypeIdentityDct,
+ kTransformTypeDctIdentity, kTransformTypeIdentityAdst,
+ kTransformTypeAdstIdentity, kTransformTypeIdentityFlipadst,
+ kTransformTypeFlipadstIdentity, kTransformTypeDctDct,
+ kTransformTypeDctAdst, kTransformTypeAdstDct, kTransformTypeDctFlipadst,
+ kTransformTypeFlipadstDct, kTransformTypeAdstAdst,
+ kTransformTypeFlipadstFlipadst, kTransformTypeFlipadstAdst,
+ kTransformTypeAdstFlipadst},
+ {kTransformTypeIdentityIdentity, kTransformTypeIdentityDct,
+ kTransformTypeDctIdentity, kTransformTypeDctDct, kTransformTypeDctAdst,
+ kTransformTypeAdstDct, kTransformTypeDctFlipadst,
+ kTransformTypeFlipadstDct, kTransformTypeAdstAdst,
+ kTransformTypeFlipadstFlipadst, kTransformTypeFlipadstAdst,
+ kTransformTypeAdstFlipadst},
+ {kTransformTypeIdentityIdentity, kTransformTypeDctDct}};
+
+// Replaces all occurrences of 64x* and *x64 with 32x* and *x32 respectively.
+constexpr TransformSize kAdjustedTransformSize[kNumTransformSizes] = {
+ kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+ kTransformSize16x16, kTransformSize16x32, kTransformSize16x32,
+ kTransformSize32x8, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x32};
+
+// This is the same as the Max_Tx_Size_Rect array in the spec but with *x64 and
+// 64x* transforms replaced with *x32 and 32x* respectively.
+constexpr TransformSize kUVTransformSize[kMaxBlockSizes] = {
+ kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+ kTransformSize16x16, kTransformSize16x32, kTransformSize16x32,
+ kTransformSize32x8, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x32, kTransformSize32x32, kTransformSize32x32,
+ kTransformSize32x32};
+
+// The ith entry of this array is computed as:
+// DivideBy2(TransformSizeToSquareTransformIndex(kTransformSizeSquareMin[i]) +
+// TransformSizeToSquareTransformIndex(kTransformSizeSquareMax[i]) +
+// 1)
+constexpr uint8_t kTransformSizeContext[kNumTransformSizes] = {
+ 0, 1, 1, 1, 1, 2, 2, 1, 2, 2, 3, 3, 2, 3, 3, 4, 3, 4, 4};
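+// For example, for kTransformSize16x8 the square min is 8x8 and the square max
+// is 16x16; assuming TransformSizeToSquareTransformIndex() maps 4x4..64x64 to
+// 0..4, this gives DivideBy2(1 + 2 + 1) = 2, which matches the entry above.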
+
+constexpr int8_t kSgrProjDefaultMultiplier[2] = {-32, 31};
+
+constexpr int8_t kWienerDefaultFilter[kNumWienerCoefficients] = {3, -7, 15};
+
+// Maps compound prediction modes into single modes. For example,
+// kPredictionModeNearestNewMv maps to kPredictionModeNearestMv for index 0
+// and kPredictionModeNewMv for index 1. It is used to simplify the logic in
+// AssignMv (and avoid duplicate code). This is section 5.11.30 in the spec.
+constexpr PredictionMode
+ kCompoundToSinglePredictionMode[kNumCompoundInterPredictionModes][2] = {
+ {kPredictionModeNearestMv, kPredictionModeNearestMv},
+ {kPredictionModeNearMv, kPredictionModeNearMv},
+ {kPredictionModeNearestMv, kPredictionModeNewMv},
+ {kPredictionModeNewMv, kPredictionModeNearestMv},
+ {kPredictionModeNearMv, kPredictionModeNewMv},
+ {kPredictionModeNewMv, kPredictionModeNearMv},
+ {kPredictionModeGlobalMv, kPredictionModeGlobalMv},
+ {kPredictionModeNewMv, kPredictionModeNewMv},
+};
+PredictionMode GetSinglePredictionMode(int index, PredictionMode y_mode) {
+ if (y_mode < kPredictionModeNearestNearestMv) {
+ return y_mode;
+ }
+ const int lookup_index = y_mode - kPredictionModeNearestNearestMv;
+ assert(lookup_index >= 0);
+ return kCompoundToSinglePredictionMode[lookup_index][index];
+}
+
+// log2(dqDenom) in section 7.12.3 of the spec. We use the log2 value because
+// dqDenom is always a power of two and hence right shift can be used instead of
+// division.
+constexpr uint8_t kQuantizationShift[kNumTransformSizes] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 2, 1, 2, 2};
+
+// Returns the minimum of |length| and |max| - |start|. This is used to clamp
+// array indices when accessing arrays whose bound is equal to |max|.
+int GetNumElements(int length, int start, int max) {
+ return std::min(length, max - start);
+}
+
+template <typename T>
+void SetBlockValues(int rows, int columns, T value, T* dst, ptrdiff_t stride) {
+ // Specialize all columns cases (values in kTransformWidth4x4[]) for better
+ // performance.
+ switch (columns) {
+ case 1:
+ MemSetBlock<T>(rows, 1, value, dst, stride);
+ break;
+ case 2:
+ MemSetBlock<T>(rows, 2, value, dst, stride);
+ break;
+ case 4:
+ MemSetBlock<T>(rows, 4, value, dst, stride);
+ break;
+ case 8:
+ MemSetBlock<T>(rows, 8, value, dst, stride);
+ break;
+ default:
+ assert(columns == 16);
+ MemSetBlock<T>(rows, 16, value, dst, stride);
+ break;
+ }
+}
+
+void SetTransformType(const Tile::Block& block, int x4, int y4, int w4, int h4,
+ TransformType tx_type,
+ TransformType transform_types[32][32]) {
+ const int y_offset = y4 - block.row4x4;
+ const int x_offset = x4 - block.column4x4;
+ TransformType* const dst = &transform_types[y_offset][x_offset];
+ SetBlockValues<TransformType>(h4, w4, tx_type, dst, 32);
+}
+
+void StoreMotionFieldMvs(ReferenceFrameType reference_frame_to_store,
+ const MotionVector& mv_to_store, ptrdiff_t stride,
+ int rows, int columns,
+ ReferenceFrameType* reference_frame_row_start,
+ MotionVector* mv) {
+ static_assert(sizeof(*reference_frame_row_start) == sizeof(int8_t), "");
+ do {
+    // Don't reorder the following two memory-setting calls; some ARM CPUs are
+    // quite sensitive to the order.
+ memset(reference_frame_row_start, reference_frame_to_store, columns);
+ std::fill(mv, mv + columns, mv_to_store);
+ reference_frame_row_start += stride;
+ mv += stride;
+ } while (--rows != 0);
+}
+
+// The inverse transform process assumes that the quantized coefficients are
+// stored as a virtual 2D array of size |tx_width| x |tx_height|. If the
+// transform width is 64, this assumption is broken because the scan order used
+// for populating the coefficients of such transforms is the same as the one
+// used for the corresponding transform with width 32 (e.g. the scan order used
+// for 64x16 is the same as the one used for 32x16). So we must restore the
+// coefficients to their correct positions and clear the positions they
+// occupied.
+template <typename ResidualType>
+void MoveCoefficientsForTxWidth64(int clamped_tx_height, int tx_width,
+ ResidualType* residual) {
+ if (tx_width != 64) return;
+ const int rows = clamped_tx_height - 2;
+ auto* src = residual + 32 * rows;
+ residual += 64 * rows;
+ // Process 2 rows in each loop in reverse order to avoid overwrite.
+ int x = rows >> 1;
+ do {
+ // The 2 rows can be processed in order.
+ memcpy(residual, src, 32 * sizeof(src[0]));
+ memcpy(residual + 64, src + 32, 32 * sizeof(src[0]));
+ memset(src + 32, 0, 32 * sizeof(src[0]));
+ src -= 64;
+ residual -= 128;
+ } while (--x);
+ // Process the second row. The first row is already correct.
+ memcpy(residual + 64, src + 32, 32 * sizeof(src[0]));
+ memset(src + 32, 0, 32 * sizeof(src[0]));
+}
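+// For example, for a 64x16 transform the parsed coefficients occupy a
+// contiguous 32-wide layout; the routine above spreads row r from offset
+// 32 * r to offset 64 * r and zeroes the vacated half rows, working from the
+// bottom up (two rows per iteration) so that rows are not overwritten before
+// they are moved. Row 0 needs no move since 32 * 0 == 64 * 0.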
+
+void GetClampParameters(const Tile::Block& block, int min[2], int max[2]) {
+ // 7.10.2.14 (part 1). (also contains implementations of 5.11.53
+ // and 5.11.54).
+ constexpr int kMvBorder4x4 = 4;
+ const int row_border = kMvBorder4x4 + block.height4x4;
+ const int column_border = kMvBorder4x4 + block.width4x4;
+ const int macroblocks_to_top_edge = -block.row4x4;
+ const int macroblocks_to_bottom_edge =
+ block.tile.frame_header().rows4x4 - block.height4x4 - block.row4x4;
+ const int macroblocks_to_left_edge = -block.column4x4;
+ const int macroblocks_to_right_edge =
+ block.tile.frame_header().columns4x4 - block.width4x4 - block.column4x4;
+ min[0] = MultiplyBy32(macroblocks_to_top_edge - row_border);
+ min[1] = MultiplyBy32(macroblocks_to_left_edge - column_border);
+ max[0] = MultiplyBy32(macroblocks_to_bottom_edge + row_border);
+ max[1] = MultiplyBy32(macroblocks_to_right_edge + column_border);
+}
+
+// Section 8.3.2 in the spec, under coeff_base_eob.
+int GetCoeffBaseContextEob(TransformSize tx_size, int index) {
+ if (index == 0) return 0;
+ const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
+ const int tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
+ const int tx_height = kTransformHeight[adjusted_tx_size];
+ if (index <= DivideBy8(tx_height << tx_width_log2)) return 1;
+ if (index <= DivideBy4(tx_height << tx_width_log2)) return 2;
+ return 3;
+}
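+// For example, for a 16x16 transform (256 coefficients): index 0 gives context
+// 0, indices 1..32 give context 1, indices 33..64 give context 2, and the
+// remaining indices give context 3.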
+
+// Section 8.3.2 in the spec, under coeff_br. Optimized for end of block based
+// on the fact that {0, 1}, {1, 0}, {1, 1}, {0, 2} and {2, 0} will all be 0 in
+// the end of block case.
+int GetCoeffBaseRangeContextEob(int adjusted_tx_width_log2, int pos,
+ TransformClass tx_class) {
+ if (pos == 0) return 0;
+ const int tx_width = 1 << adjusted_tx_width_log2;
+ const int row = pos >> adjusted_tx_width_log2;
+ const int column = pos & (tx_width - 1);
+ // This return statement is equivalent to:
+ // return ((tx_class == kTransformClass2D && (row | column) < 2) ||
+ // (tx_class == kTransformClassHorizontal && column == 0) ||
+ // (tx_class == kTransformClassVertical && row == 0))
+ // ? 7
+ // : 14;
+ return 14 >> ((static_cast<int>(tx_class == kTransformClass2D) &
+ static_cast<int>((row | column) < 2)) |
+ (tx_class & static_cast<int>(column == 0)) |
+ ((tx_class >> 1) & static_cast<int>(row == 0)));
+}
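+// The bit trick above relies on kTransformClass2D, kTransformClassHorizontal
+// and kTransformClassVertical having the values 0, 1 and 2 respectively (an
+// assumption here); each masked term is then 0 or 1, and the result is either
+// 14 or 14 >> 1 == 7.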
+
+} // namespace
+
+Tile::Tile(int tile_number, const uint8_t* const data, size_t size,
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ RefCountedBuffer* const current_frame, const DecoderState& state,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ const WedgeMaskArray& wedge_masks,
+ const QuantizerMatrix& quantizer_matrix,
+ SymbolDecoderContext* const saved_symbol_decoder_context,
+ const SegmentationMap* prev_segment_ids,
+ PostFilter* const post_filter, const dsp::Dsp* const dsp,
+ ThreadPool* const thread_pool,
+ BlockingCounterWithStatus* const pending_tiles, bool frame_parallel,
+ bool use_intra_prediction_buffer)
+ : number_(tile_number),
+ row_(number_ / frame_header.tile_info.tile_columns),
+ column_(number_ % frame_header.tile_info.tile_columns),
+ data_(data),
+ size_(size),
+ read_deltas_(false),
+ subsampling_x_{0, sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_x},
+ subsampling_y_{0, sequence_header.color_config.subsampling_y,
+ sequence_header.color_config.subsampling_y},
+ current_quantizer_index_(frame_header.quantizer.base_index),
+ sequence_header_(sequence_header),
+ frame_header_(frame_header),
+ reference_frame_sign_bias_(state.reference_frame_sign_bias),
+ reference_frames_(state.reference_frame),
+ motion_field_(frame_scratch_buffer->motion_field),
+ reference_order_hint_(state.reference_order_hint),
+ wedge_masks_(wedge_masks),
+ quantizer_matrix_(quantizer_matrix),
+ reader_(data_, size_, frame_header_.enable_cdf_update),
+ symbol_decoder_context_(frame_scratch_buffer->symbol_decoder_context),
+ saved_symbol_decoder_context_(saved_symbol_decoder_context),
+ prev_segment_ids_(prev_segment_ids),
+ dsp_(*dsp),
+ post_filter_(*post_filter),
+ block_parameters_holder_(frame_scratch_buffer->block_parameters_holder),
+ quantizer_(sequence_header_.color_config.bitdepth,
+ &frame_header_.quantizer),
+ residual_size_((sequence_header_.color_config.bitdepth == 8)
+ ? sizeof(int16_t)
+ : sizeof(int32_t)),
+ intra_block_copy_lag_(
+ frame_header_.allow_intrabc
+ ? (sequence_header_.use_128x128_superblock ? 3 : 5)
+ : 1),
+ current_frame_(*current_frame),
+ cdef_index_(frame_scratch_buffer->cdef_index),
+ inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes),
+ thread_pool_(thread_pool),
+ residual_buffer_pool_(frame_scratch_buffer->residual_buffer_pool.get()),
+ tile_scratch_buffer_pool_(
+ &frame_scratch_buffer->tile_scratch_buffer_pool),
+ pending_tiles_(pending_tiles),
+ frame_parallel_(frame_parallel),
+ use_intra_prediction_buffer_(use_intra_prediction_buffer),
+ intra_prediction_buffer_(
+ use_intra_prediction_buffer_
+ ? &frame_scratch_buffer->intra_prediction_buffers.get()[row_]
+ : nullptr) {
+ row4x4_start_ = frame_header.tile_info.tile_row_start[row_];
+ row4x4_end_ = frame_header.tile_info.tile_row_start[row_ + 1];
+ column4x4_start_ = frame_header.tile_info.tile_column_start[column_];
+ column4x4_end_ = frame_header.tile_info.tile_column_start[column_ + 1];
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+ const int block_width4x4_log2 = k4x4HeightLog2[SuperBlockSize()];
+ superblock_rows_ =
+ (row4x4_end_ - row4x4_start_ + block_width4x4 - 1) >> block_width4x4_log2;
+ superblock_columns_ =
+ (column4x4_end_ - column4x4_start_ + block_width4x4 - 1) >>
+ block_width4x4_log2;
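+  // For example, a tile spanning 70 rows of 4x4 blocks with 128x128
+  // superblocks (block_width4x4 == 32, block_width4x4_log2 == 5) yields
+  // (70 + 31) >> 5 == 3 superblock rows.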
+ // If |split_parse_and_decode_| is true, we do the necessary setup for
+ // splitting the parsing and the decoding steps. This is done in the following
+ // two cases:
+ // 1) If there is multi-threading within a tile (this is done if
+ // |thread_pool_| is not nullptr and if there are at least as many
+ // superblock columns as |intra_block_copy_lag_|).
+ // 2) If |frame_parallel| is true.
+ split_parse_and_decode_ = (thread_pool_ != nullptr &&
+ superblock_columns_ > intra_block_copy_lag_) ||
+ frame_parallel;
+ if (frame_parallel_) {
+ reference_frame_progress_cache_.fill(INT_MIN);
+ }
+ memset(delta_lf_, 0, sizeof(delta_lf_));
+ delta_lf_all_zero_ = true;
+ const YuvBuffer& buffer = post_filter_.frame_buffer();
+ for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+ // Verify that the borders are big enough for Reconstruct(). max_tx_length
+ // is the maximum value of tx_width and tx_height for the plane.
+ const int max_tx_length = (plane == kPlaneY) ? 64 : 32;
+ // Reconstruct() may overwrite on the right. Since the right border of a
+ // row is followed in memory by the left border of the next row, the
+ // number of extra pixels to the right of a row is at least the sum of the
+ // left and right borders.
+ //
+ // Note: This assertion actually checks the sum of the left and right
+ // borders of post_filter_.GetUnfilteredBuffer(), which is a horizontally
+ // and vertically shifted version of |buffer|. Since the sum of the left and
+ // right borders is not changed by the shift, we can just check the sum of
+ // the left and right borders of |buffer|.
+ assert(buffer.left_border(plane) + buffer.right_border(plane) >=
+ max_tx_length - 1);
+ // Reconstruct() may overwrite on the bottom. We need an extra border row
+ // on the bottom because we need the left border of that row.
+ //
+ // Note: This assertion checks the bottom border of
+ // post_filter_.GetUnfilteredBuffer(). So we need to calculate the vertical
+ // shift that the PostFilter constructor applied to |buffer| and reduce the
+ // bottom border by that amount.
+#ifndef NDEBUG
+ const int vertical_shift = static_cast<int>(
+ (post_filter_.GetUnfilteredBuffer(plane) - buffer.data(plane)) /
+ buffer.stride(plane));
+ const int bottom_border = buffer.bottom_border(plane) - vertical_shift;
+ assert(bottom_border >= max_tx_length);
+#endif
+ // In AV1, a transform block of height H starts at a y coordinate that is
+ // a multiple of H. If a transform block at the bottom of the frame has
+ // height H, then Reconstruct() will write up to the row with index
+ // Align(buffer.height(plane), H) - 1. Therefore the maximum number of
+ // rows Reconstruct() may write to is
+ // Align(buffer.height(plane), max_tx_length).
+ buffer_[plane].Reset(Align(buffer.height(plane), max_tx_length),
+ buffer.stride(plane),
+ post_filter_.GetUnfilteredBuffer(plane));
+ const int plane_height =
+ SubsampledValue(frame_header_.height, subsampling_y_[plane]);
+ deblock_row_limit_[plane] =
+ std::min(frame_header_.rows4x4, DivideBy4(plane_height + 3)
+ << subsampling_y_[plane]);
+ const int plane_width =
+ SubsampledValue(frame_header_.width, subsampling_x_[plane]);
+ deblock_column_limit_[plane] =
+ std::min(frame_header_.columns4x4, DivideBy4(plane_width + 3)
+ << subsampling_x_[plane]);
+ }
+}
+
+bool Tile::Init() {
+ assert(coefficient_levels_.size() == dc_categories_.size());
+ for (size_t i = 0; i < coefficient_levels_.size(); ++i) {
+ const int contexts_per_plane = (i == kEntropyContextLeft)
+ ? frame_header_.rows4x4
+ : frame_header_.columns4x4;
+ if (!coefficient_levels_[i].Reset(PlaneCount(), contexts_per_plane)) {
+ LIBGAV1_DLOG(ERROR, "coefficient_levels_[%zu].Reset() failed.", i);
+ return false;
+ }
+ if (!dc_categories_[i].Reset(PlaneCount(), contexts_per_plane)) {
+ LIBGAV1_DLOG(ERROR, "dc_categories_[%zu].Reset() failed.", i);
+ return false;
+ }
+ }
+ if (split_parse_and_decode_) {
+ assert(residual_buffer_pool_ != nullptr);
+ if (!residual_buffer_threaded_.Reset(superblock_rows_, superblock_columns_,
+ /*zero_initialize=*/false)) {
+ LIBGAV1_DLOG(ERROR, "residual_buffer_threaded_.Reset() failed.");
+ return false;
+ }
+ } else {
+ // Add 32 * |kResidualPaddingVertical| padding to avoid bottom boundary
+ // checks when parsing quantized coefficients.
+ residual_buffer_ = MakeAlignedUniquePtr<uint8_t>(
+ 32, (4096 + 32 * kResidualPaddingVertical) * residual_size_);
+ if (residual_buffer_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Allocation of residual_buffer_ failed.");
+ return false;
+ }
+ prediction_parameters_.reset(new (std::nothrow) PredictionParameters());
+ if (prediction_parameters_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Allocation of prediction_parameters_ failed.");
+ return false;
+ }
+ }
+ if (frame_header_.use_ref_frame_mvs) {
+ assert(sequence_header_.enable_order_hint);
+ SetupMotionField(frame_header_, current_frame_, reference_frames_,
+ row4x4_start_, row4x4_end_, column4x4_start_,
+ column4x4_end_, &motion_field_);
+ }
+ ResetLoopRestorationParams();
+ return true;
+}
+
+template <ProcessingMode processing_mode, bool save_symbol_decoder_context>
+bool Tile::ProcessSuperBlockRow(int row4x4,
+ TileScratchBuffer* const scratch_buffer) {
+ if (row4x4 < row4x4_start_ || row4x4 >= row4x4_end_) return true;
+ assert(scratch_buffer != nullptr);
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+ for (int column4x4 = column4x4_start_; column4x4 < column4x4_end_;
+ column4x4 += block_width4x4) {
+ if (!ProcessSuperBlock(row4x4, column4x4, block_width4x4, scratch_buffer,
+ processing_mode)) {
+ LIBGAV1_DLOG(ERROR, "Error decoding super block row: %d column: %d",
+ row4x4, column4x4);
+ return false;
+ }
+ }
+ if (save_symbol_decoder_context && row4x4 + block_width4x4 >= row4x4_end_) {
+ SaveSymbolDecoderContext();
+ }
+ if (processing_mode == kProcessingModeDecodeOnly ||
+ processing_mode == kProcessingModeParseAndDecode) {
+ PopulateIntraPredictionBuffer(row4x4);
+ }
+ return true;
+}
+
+// Used in frame parallel mode. The symbol decoder context need not be saved in
+// this case since it was already saved when parsing completed.
+template bool Tile::ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+ int row4x4, TileScratchBuffer* scratch_buffer);
+// Used in non frame parallel mode.
+template bool Tile::ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+ int row4x4, TileScratchBuffer* scratch_buffer);
+
+void Tile::SaveSymbolDecoderContext() {
+ if (frame_header_.enable_frame_end_update_cdf &&
+ number_ == frame_header_.tile_info.context_update_id) {
+ *saved_symbol_decoder_context_ = symbol_decoder_context_;
+ }
+}
+
+bool Tile::ParseAndDecode() {
+ // If this is the main thread, we build the loop filter bit masks when parsing
+ // so that it happens in the current thread. This ensures that the main thread
+ // does as much work as possible.
+ if (split_parse_and_decode_) {
+ if (!ThreadedParseAndDecode()) return false;
+ SaveSymbolDecoderContext();
+ return true;
+ }
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ tile_scratch_buffer_pool_->Get();
+ if (scratch_buffer == nullptr) {
+ pending_tiles_->Decrement(false);
+ LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+ return false;
+ }
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+ for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_;
+ row4x4 += block_width4x4) {
+ if (!ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+ row4x4, scratch_buffer.get())) {
+ pending_tiles_->Decrement(false);
+ return false;
+ }
+ }
+ tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+ pending_tiles_->Decrement(true);
+ return true;
+}
+
+bool Tile::Parse() {
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ tile_scratch_buffer_pool_->Get();
+ if (scratch_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+ return false;
+ }
+ for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_;
+ row4x4 += block_width4x4) {
+ if (!ProcessSuperBlockRow<kProcessingModeParseOnly, false>(
+ row4x4, scratch_buffer.get())) {
+ return false;
+ }
+ }
+ tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+ SaveSymbolDecoderContext();
+ return true;
+}
+
+bool Tile::Decode(
+ std::mutex* const mutex, int* const superblock_row_progress,
+ std::condition_variable* const superblock_row_progress_condvar) {
+ const int block_width4x4 = sequence_header_.use_128x128_superblock ? 32 : 16;
+ const int block_width4x4_log2 =
+ sequence_header_.use_128x128_superblock ? 5 : 4;
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ tile_scratch_buffer_pool_->Get();
+ if (scratch_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+ return false;
+ }
+ for (int row4x4 = row4x4_start_, index = row4x4_start_ >> block_width4x4_log2;
+ row4x4 < row4x4_end_; row4x4 += block_width4x4, ++index) {
+ if (!ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+ row4x4, scratch_buffer.get())) {
+ return false;
+ }
+ if (post_filter_.DoDeblock()) {
+ // Apply vertical deblock filtering for all the columns in this tile
+ // except for the first 64 columns.
+ post_filter_.ApplyDeblockFilter(
+ kLoopFilterTypeVertical, row4x4,
+ column4x4_start_ + kNum4x4InLoopFilterUnit, column4x4_end_,
+ block_width4x4);
+ // If this is the first superblock row of the tile, then we cannot apply
+ // horizontal deblocking here since we don't know if the top row is
+ // available. So it will be done by the calling thread in that case.
+ if (row4x4 != row4x4_start_) {
+ // Apply horizontal deblock filtering for all the columns in this tile
+ // except for the first and the last 64 columns.
+ // Note about the last tile of each row: For the last tile,
+ // column4x4_end may not be a multiple of 16. In that case it is still
+ // okay to simply subtract 16 since ApplyDeblockFilter() will only do
+ // the filters in increments of 64 columns (or 32 columns for chroma
+ // with subsampling).
+ post_filter_.ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4,
+ column4x4_start_ + kNum4x4InLoopFilterUnit,
+ column4x4_end_ - kNum4x4InLoopFilterUnit, block_width4x4);
+ }
+ }
+ bool notify;
+ {
+ std::unique_lock<std::mutex> lock(*mutex);
+ notify = ++superblock_row_progress[index] ==
+ frame_header_.tile_info.tile_columns;
+ }
+ if (notify) {
+ // We are done decoding this superblock row. Notify the post filtering
+ // thread.
+ superblock_row_progress_condvar[index].notify_one();
+ }
+ }
+ tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+ return true;
+}
+
+bool Tile::ThreadedParseAndDecode() {
+ {
+ std::lock_guard<std::mutex> lock(threading_.mutex);
+ if (!threading_.sb_state.Reset(superblock_rows_, superblock_columns_)) {
+ pending_tiles_->Decrement(false);
+ LIBGAV1_DLOG(ERROR, "threading.sb_state.Reset() failed.");
+ return false;
+ }
+ // Account for the parsing job.
+ ++threading_.pending_jobs;
+ }
+
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+
+ // Begin parsing.
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ tile_scratch_buffer_pool_->Get();
+ if (scratch_buffer == nullptr) {
+ pending_tiles_->Decrement(false);
+ LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+ return false;
+ }
+ for (int row4x4 = row4x4_start_, row_index = 0; row4x4 < row4x4_end_;
+ row4x4 += block_width4x4, ++row_index) {
+ for (int column4x4 = column4x4_start_, column_index = 0;
+ column4x4 < column4x4_end_;
+ column4x4 += block_width4x4, ++column_index) {
+ if (!ProcessSuperBlock(row4x4, column4x4, block_width4x4,
+ scratch_buffer.get(), kProcessingModeParseOnly)) {
+ std::lock_guard<std::mutex> lock(threading_.mutex);
+ threading_.abort = true;
+ break;
+ }
+ std::unique_lock<std::mutex> lock(threading_.mutex);
+ if (threading_.abort) break;
+ threading_.sb_state[row_index][column_index] = kSuperBlockStateParsed;
+ // Schedule the decoding of this superblock if it is allowed.
+ if (CanDecode(row_index, column_index)) {
+ ++threading_.pending_jobs;
+ threading_.sb_state[row_index][column_index] =
+ kSuperBlockStateScheduled;
+ lock.unlock();
+ thread_pool_->Schedule(
+ [this, row_index, column_index, block_width4x4]() {
+ DecodeSuperBlock(row_index, column_index, block_width4x4);
+ });
+ }
+ }
+ std::lock_guard<std::mutex> lock(threading_.mutex);
+ if (threading_.abort) break;
+ }
+ tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+
+ // We are done parsing. We can return here since the calling thread will make
+ // sure that it waits for all the superblocks to be decoded.
+ //
+ // Finish using |threading_| before |pending_tiles_->Decrement()| because the
+ // Tile object could go out of scope as soon as |pending_tiles_->Decrement()|
+ // is called.
+ threading_.mutex.lock();
+ const bool no_pending_jobs = (--threading_.pending_jobs == 0);
+ const bool job_succeeded = !threading_.abort;
+ threading_.mutex.unlock();
+ if (no_pending_jobs) {
+ // We are done parsing and decoding this tile.
+ pending_tiles_->Decrement(job_succeeded);
+ }
+ return job_succeeded;
+}
+
+bool Tile::CanDecode(int row_index, int column_index) const {
+ assert(row_index >= 0);
+ assert(column_index >= 0);
+ // If |threading_.sb_state[row_index][column_index]| is not equal to
+ // kSuperBlockStateParsed, then return false. This is ok because if
+ // |threading_.sb_state[row_index][column_index]| is equal to:
+ // kSuperBlockStateNone - then the superblock is not yet parsed.
+ // kSuperBlockStateScheduled - then the superblock is already scheduled for
+ // decode.
+ // kSuperBlockStateDecoded - then the superblock has already been decoded.
+ if (row_index >= superblock_rows_ || column_index >= superblock_columns_ ||
+ threading_.sb_state[row_index][column_index] != kSuperBlockStateParsed) {
+ return false;
+ }
+ // First superblock has no dependencies.
+ if (row_index == 0 && column_index == 0) {
+ return true;
+ }
+  // Superblocks in the first row depend only on the superblock to their left.
+ if (row_index == 0) {
+ return threading_.sb_state[0][column_index - 1] == kSuperBlockStateDecoded;
+ }
+  // All other superblocks depend on the superblock to their left (if one
+  // exists) and the superblock to their top-right with a lag of
+  // |intra_block_copy_lag_| (if one exists).
+ const int top_right_column_index =
+ std::min(column_index + intra_block_copy_lag_, superblock_columns_ - 1);
+ return threading_.sb_state[row_index - 1][top_right_column_index] ==
+ kSuperBlockStateDecoded &&
+ (column_index == 0 ||
+ threading_.sb_state[row_index][column_index - 1] ==
+ kSuperBlockStateDecoded);
+}
+
+void Tile::DecodeSuperBlock(int row_index, int column_index,
+ int block_width4x4) {
+ const int row4x4 = row4x4_start_ + (row_index * block_width4x4);
+ const int column4x4 = column4x4_start_ + (column_index * block_width4x4);
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ tile_scratch_buffer_pool_->Get();
+ bool ok = scratch_buffer != nullptr;
+ if (ok) {
+ ok = ProcessSuperBlock(row4x4, column4x4, block_width4x4,
+ scratch_buffer.get(), kProcessingModeDecodeOnly);
+ tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+ }
+ std::unique_lock<std::mutex> lock(threading_.mutex);
+ if (ok) {
+ threading_.sb_state[row_index][column_index] = kSuperBlockStateDecoded;
+    // Candidate rows and columns where decoding could potentially begin (if it
+    // is allowed to do so). The candidates are:
+    // 1) The superblock to the bottom-left of the current superblock with a
+    // lag of |intra_block_copy_lag_| (or the beginning of the next superblock
+    // row if there are fewer than |intra_block_copy_lag_| superblock columns
+    // in the Tile).
+ // 2) The superblock to the right of the current superblock.
+ const int candidate_row_indices[] = {row_index + 1, row_index};
+ const int candidate_column_indices[] = {
+ std::max(0, column_index - intra_block_copy_lag_), column_index + 1};
+ for (size_t i = 0; i < std::extent<decltype(candidate_row_indices)>::value;
+ ++i) {
+ const int candidate_row_index = candidate_row_indices[i];
+ const int candidate_column_index = candidate_column_indices[i];
+ if (!CanDecode(candidate_row_index, candidate_column_index)) {
+ continue;
+ }
+ ++threading_.pending_jobs;
+ threading_.sb_state[candidate_row_index][candidate_column_index] =
+ kSuperBlockStateScheduled;
+ lock.unlock();
+ thread_pool_->Schedule([this, candidate_row_index, candidate_column_index,
+ block_width4x4]() {
+ DecodeSuperBlock(candidate_row_index, candidate_column_index,
+ block_width4x4);
+ });
+ lock.lock();
+ }
+ } else {
+ threading_.abort = true;
+ }
+ // Finish using |threading_| before |pending_tiles_->Decrement()| because the
+ // Tile object could go out of scope as soon as |pending_tiles_->Decrement()|
+ // is called.
+ const bool no_pending_jobs = (--threading_.pending_jobs == 0);
+ const bool job_succeeded = !threading_.abort;
+ lock.unlock();
+ if (no_pending_jobs) {
+ // We are done parsing and decoding this tile.
+ pending_tiles_->Decrement(job_succeeded);
+ }
+}
+
+void Tile::PopulateIntraPredictionBuffer(int row4x4) {
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+ if (!use_intra_prediction_buffer_ || row4x4 + block_width4x4 >= row4x4_end_) {
+ return;
+ }
+ const size_t pixel_size =
+ (sequence_header_.color_config.bitdepth == 8 ? sizeof(uint8_t)
+ : sizeof(uint16_t));
+ for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+ const int row_to_copy =
+ (MultiplyBy4(row4x4 + block_width4x4) >> subsampling_y_[plane]) - 1;
+ const size_t pixels_to_copy =
+ (MultiplyBy4(column4x4_end_ - column4x4_start_) >>
+ subsampling_x_[plane]) *
+ pixel_size;
+ const size_t column_start =
+ MultiplyBy4(column4x4_start_) >> subsampling_x_[plane];
+ void* start;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (sequence_header_.color_config.bitdepth > 8) {
+ Array2DView<uint16_t> buffer(
+ buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t),
+ reinterpret_cast<uint16_t*>(&buffer_[plane][0][0]));
+ start = &buffer[row_to_copy][column_start];
+ } else // NOLINT
+#endif
+ {
+ start = &buffer_[plane][row_to_copy][column_start];
+ }
+ memcpy((*intra_prediction_buffer_)[plane].get() + column_start * pixel_size,
+ start, pixels_to_copy);
+ }
+}
+
+int Tile::GetTransformAllZeroContext(const Block& block, Plane plane,
+ TransformSize tx_size, int x4, int y4,
+ int w4, int h4) {
+ const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
+ const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
+
+ const int tx_width = kTransformWidth[tx_size];
+ const int tx_height = kTransformHeight[tx_size];
+ const BlockSize plane_size = block.residual_size[plane];
+ const int block_width = kBlockWidthPixels[plane_size];
+ const int block_height = kBlockHeightPixels[plane_size];
+
+ int top = 0;
+ int left = 0;
+ const int num_top_elements = GetNumElements(w4, x4, max_x4x4);
+ const int num_left_elements = GetNumElements(h4, y4, max_y4x4);
+ if (plane == kPlaneY) {
+ if (block_width == tx_width && block_height == tx_height) return 0;
+ const uint8_t* coefficient_levels =
+ &coefficient_levels_[kEntropyContextTop][plane][x4];
+ for (int i = 0; i < num_top_elements; ++i) {
+ top = std::max(top, static_cast<int>(coefficient_levels[i]));
+ }
+ coefficient_levels = &coefficient_levels_[kEntropyContextLeft][plane][y4];
+ for (int i = 0; i < num_left_elements; ++i) {
+ left = std::max(left, static_cast<int>(coefficient_levels[i]));
+ }
+ assert(top <= 4);
+ assert(left <= 4);
+ // kAllZeroContextsByTopLeft is pre-computed based on the logic in the spec
+ // for top and left.
+ return kAllZeroContextsByTopLeft[top][left];
+ }
+ const uint8_t* coefficient_levels =
+ &coefficient_levels_[kEntropyContextTop][plane][x4];
+ const int8_t* dc_categories = &dc_categories_[kEntropyContextTop][plane][x4];
+ for (int i = 0; i < num_top_elements; ++i) {
+ top |= coefficient_levels[i];
+ top |= dc_categories[i];
+ }
+ coefficient_levels = &coefficient_levels_[kEntropyContextLeft][plane][y4];
+ dc_categories = &dc_categories_[kEntropyContextLeft][plane][y4];
+ for (int i = 0; i < num_left_elements; ++i) {
+ left |= coefficient_levels[i];
+ left |= dc_categories[i];
+ }
+ return static_cast<int>(top != 0) + static_cast<int>(left != 0) + 7 +
+ 3 * static_cast<int>(block_width * block_height >
+ tx_width * tx_height);
+}
+
+TransformSet Tile::GetTransformSet(TransformSize tx_size, bool is_inter) const {
+ const TransformSize tx_size_square_min = kTransformSizeSquareMin[tx_size];
+ const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
+ if (tx_size_square_max == kTransformSize64x64) return kTransformSetDctOnly;
+ if (is_inter) {
+ if (frame_header_.reduced_tx_set ||
+ tx_size_square_max == kTransformSize32x32) {
+ return kTransformSetInter3;
+ }
+ if (tx_size_square_min == kTransformSize16x16) return kTransformSetInter2;
+ return kTransformSetInter1;
+ }
+ if (tx_size_square_max == kTransformSize32x32) return kTransformSetDctOnly;
+ if (frame_header_.reduced_tx_set ||
+ tx_size_square_min == kTransformSize16x16) {
+ return kTransformSetIntra2;
+ }
+ return kTransformSetIntra1;
+}
+
+TransformType Tile::ComputeTransformType(const Block& block, Plane plane,
+ TransformSize tx_size, int block_x,
+ int block_y) {
+ const BlockParameters& bp = *block.bp;
+ const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
+ if (frame_header_.segmentation.lossless[bp.segment_id] ||
+ tx_size_square_max == kTransformSize64x64) {
+ return kTransformTypeDctDct;
+ }
+ if (plane == kPlaneY) {
+ return transform_types_[block_y - block.row4x4][block_x - block.column4x4];
+ }
+ const TransformSet tx_set = GetTransformSet(tx_size, bp.is_inter);
+ TransformType tx_type;
+ if (bp.is_inter) {
+ const int x4 =
+ std::max(block.column4x4, block_x << subsampling_x_[kPlaneU]);
+ const int y4 = std::max(block.row4x4, block_y << subsampling_y_[kPlaneU]);
+ tx_type = transform_types_[y4 - block.row4x4][x4 - block.column4x4];
+ } else {
+ tx_type = kModeToTransformType[bp.uv_mode];
+ }
+ return kTransformTypeInSetMask[tx_set].Contains(tx_type)
+ ? tx_type
+ : kTransformTypeDctDct;
+}
+
+void Tile::ReadTransformType(const Block& block, int x4, int y4,
+ TransformSize tx_size) {
+ BlockParameters& bp = *block.bp;
+ const TransformSet tx_set = GetTransformSet(tx_size, bp.is_inter);
+
+ TransformType tx_type = kTransformTypeDctDct;
+ if (tx_set != kTransformSetDctOnly &&
+ frame_header_.segmentation.qindex[bp.segment_id] > 0) {
+ const int cdf_index = SymbolDecoderContext::TxTypeIndex(tx_set);
+ const int cdf_tx_size_index =
+ TransformSizeToSquareTransformIndex(kTransformSizeSquareMin[tx_size]);
+ uint16_t* cdf;
+ if (bp.is_inter) {
+ cdf = symbol_decoder_context_
+ .inter_tx_type_cdf[cdf_index][cdf_tx_size_index];
+ switch (tx_set) {
+ case kTransformSetInter1:
+ tx_type = static_cast<TransformType>(reader_.ReadSymbol<16>(cdf));
+ break;
+ case kTransformSetInter2:
+ tx_type = static_cast<TransformType>(reader_.ReadSymbol<12>(cdf));
+ break;
+ default:
+ assert(tx_set == kTransformSetInter3);
+ tx_type = static_cast<TransformType>(reader_.ReadSymbol(cdf));
+ break;
+ }
+ } else {
+ const PredictionMode intra_direction =
+ block.bp->prediction_parameters->use_filter_intra
+ ? kFilterIntraModeToIntraPredictor[block.bp->prediction_parameters
+ ->filter_intra_mode]
+ : bp.y_mode;
+ cdf =
+ symbol_decoder_context_
+ .intra_tx_type_cdf[cdf_index][cdf_tx_size_index][intra_direction];
+ assert(tx_set == kTransformSetIntra1 || tx_set == kTransformSetIntra2);
+ tx_type = static_cast<TransformType>((tx_set == kTransformSetIntra1)
+ ? reader_.ReadSymbol<7>(cdf)
+ : reader_.ReadSymbol<5>(cdf));
+ }
+
+ // This array does not contain an entry for kTransformSetDctOnly, so the
+ // first dimension needs to be offset by 1.
+ tx_type = kInverseTransformTypeBySet[tx_set - 1][tx_type];
+ }
+ SetTransformType(block, x4, y4, kTransformWidth4x4[tx_size],
+ kTransformHeight4x4[tx_size], tx_type, transform_types_);
+}
+
+// Section 8.3.2 in the spec, under coeff_base and coeff_br.
+// Bottom boundary checks are avoided by the padded rows.
+// For a coefficient near the right boundary, the two right neighbors and the
+// one bottom-right neighbor may be out of bounds. We don't check the right
+// boundary for them, because the out-of-bounds neighbors project to positions
+// above the diagonal line that goes through the current coefficient, and those
+// positions are still all 0s according to the diagonal scan order.
+template <typename ResidualType>
+void Tile::ReadCoeffBase2D(
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
+ const int tx_width = 1 << adjusted_tx_width_log2;
+ for (int i = eob - 2; i >= 1; --i) {
+ const uint16_t pos = scan[i];
+ const int row = pos >> adjusted_tx_width_log2;
+ const int column = pos & (tx_width - 1);
+ auto* const quantized = &quantized_buffer[pos];
+ auto* const levels = &level_buffer[pos];
+ const int neighbor_sum = 1 + levels[1] + levels[tx_width] +
+ levels[tx_width + 1] + levels[2] +
+ levels[MultiplyBy2(tx_width)];
+ const int context =
+ ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+ kCoeffBaseContextOffset[tx_size][std::min(row, 4)][std::min(column, 4)];
+ int level =
+ reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+ levels[0] = level;
+ if (level > kNumQuantizerBaseLevels) {
+ // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+ // + 1, because we clip the overall output to 6 and the unclipped
+ // quantized values will always result in an output of greater than 6.
+ int context = std::min(6, DivideBy2(1 + quantized[1] + // {0, 1}
+ quantized[tx_width] + // {1, 0}
+ quantized[tx_width + 1])); // {1, 1}
+ context += 14 >> static_cast<int>((row | column) < 2);
+ level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+ }
+ quantized[0] = level;
+ }
+ // Read position 0.
+ {
+ auto* const quantized = &quantized_buffer[0];
+ int level = reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[0]);
+ level_buffer[0] = level;
+ if (level > kNumQuantizerBaseLevels) {
+ // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+ // + 1, because we clip the overall output to 6 and the unclipped
+ // quantized values will always result in an output of greater than 6.
+ const int context =
+ std::min(6, DivideBy2(1 + quantized[1] + // {0, 1}
+ quantized[tx_width] + // {1, 0}
+ quantized[tx_width + 1])); // {1, 1}
+ level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+ }
+ quantized[0] = level;
+ }
+}
+
+// Section 8.3.2 in the spec, under coeff_base and coeff_br.
+// Bottom boundary checks are avoided by the padded rows.
+// For a coefficient near the right boundary, the four right neighbors may be
+// out of bounds. We don't do the boundary check for the first three right
+// neighbors, because even for transform blocks with the smallest width of 4,
+// the first three out-of-bounds neighbors project to positions left of the
+// current coefficient, and those positions are still all 0s according to the
+// column scan order. However, when the transform block width is 4 and the
+// current coefficient is on the right boundary, its fourth right neighbor
+// projects to the position directly below it in the same column, which could
+// be nonzero. Therefore, we must skip the fourth right neighbor. To keep it
+// simple, for any coefficient, we always do the boundary check for its fourth
+// right neighbor.
+template <typename ResidualType>
+void Tile::ReadCoeffBaseHorizontal(
+ const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
+ const int tx_width = 1 << adjusted_tx_width_log2;
+ int i = eob - 2;
+ do {
+ const uint16_t pos = scan[i];
+ const int column = pos & (tx_width - 1);
+ auto* const quantized = &quantized_buffer[pos];
+ auto* const levels = &level_buffer[pos];
+ const int neighbor_sum =
+ 1 + (levels[1] + // {0, 1}
+ levels[tx_width] + // {1, 0}
+ levels[2] + // {0, 2}
+ levels[3] + // {0, 3}
+ ((column + 4 < tx_width) ? levels[4] : 0)); // {0, 4}
+ const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+ kCoeffBasePositionContextOffset[column];
+ int level =
+ reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+ levels[0] = level;
+ if (level > kNumQuantizerBaseLevels) {
+ // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+ // + 1, because we clip the overall output to 6 and the unclipped
+ // quantized values will always result in an output of greater than 6.
+ int context = std::min(6, DivideBy2(1 + quantized[1] + // {0, 1}
+ quantized[tx_width] + // {1, 0}
+ quantized[2])); // {0, 2}
+ if (pos != 0) {
+ context += 14 >> static_cast<int>(column == 0);
+ }
+ level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+ }
+ quantized[0] = level;
+ } while (--i >= 0);
+}
+
+// Section 8.3.2 in the spec, under coeff_base and coeff_br.
+// Bottom boundary checks are avoided by the padded rows.
+// Right boundary check is performed explicitly.
+template <typename ResidualType>
+void Tile::ReadCoeffBaseVertical(
+ const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
+ const int tx_width = 1 << adjusted_tx_width_log2;
+ int i = eob - 2;
+ do {
+ const uint16_t pos = scan[i];
+ const int row = pos >> adjusted_tx_width_log2;
+ const int column = pos & (tx_width - 1);
+ auto* const quantized = &quantized_buffer[pos];
+ auto* const levels = &level_buffer[pos];
+ const int neighbor_sum =
+ 1 + (((column + 1 < tx_width) ? levels[1] : 0) + // {0, 1}
+ levels[tx_width] + // {1, 0}
+ levels[MultiplyBy2(tx_width)] + // {2, 0}
+ levels[tx_width * 3] + // {3, 0}
+ levels[MultiplyBy4(tx_width)]); // {4, 0}
+ const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+ kCoeffBasePositionContextOffset[row];
+ int level =
+ reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+ levels[0] = level;
+ if (level > kNumQuantizerBaseLevels) {
+ // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+ // + 1, because we clip the overall output to 6 and the unclipped
+ // quantized values will always result in an output of greater than 6.
+ const int quantized_column1 = (column + 1 < tx_width) ? quantized[1] : 0;
+ int context =
+ std::min(6, DivideBy2(1 + quantized_column1 + // {0, 1}
+ quantized[tx_width] + // {1, 0}
+ quantized[MultiplyBy2(tx_width)])); // {2, 0}
+ if (pos != 0) {
+ context += 14 >> static_cast<int>(row == 0);
+ }
+ level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+ }
+ quantized[0] = level;
+ } while (--i >= 0);
+}
+
+int Tile::GetDcSignContext(int x4, int y4, int w4, int h4, Plane plane) {
+ const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
+ const int8_t* dc_categories = &dc_categories_[kEntropyContextTop][plane][x4];
+ // Keep |dc_sign| as an 8-bit value so that std::accumulate() avoids sign
+ // extensions.
+ int8_t dc_sign = std::accumulate(
+ dc_categories, dc_categories + GetNumElements(w4, x4, max_x4x4), 0);
+ const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
+ dc_categories = &dc_categories_[kEntropyContextLeft][plane][y4];
+ dc_sign = std::accumulate(
+ dc_categories, dc_categories + GetNumElements(h4, y4, max_y4x4), dc_sign);
+ // This return statement is equivalent to:
+ // if (dc_sign < 0) return 1;
+ // if (dc_sign > 0) return 2;
+ // return 0;
+ // And it is better than:
+ // return static_cast<int>(dc_sign != 0) + static_cast<int>(dc_sign > 0);
+ return static_cast<int>(dc_sign < 0) +
+ MultiplyBy2(static_cast<int>(dc_sign > 0));
+}
+
+void Tile::SetEntropyContexts(int x4, int y4, int w4, int h4, Plane plane,
+ uint8_t coefficient_level, int8_t dc_category) {
+ const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
+ const int num_top_elements = GetNumElements(w4, x4, max_x4x4);
+ memset(&coefficient_levels_[kEntropyContextTop][plane][x4], coefficient_level,
+ num_top_elements);
+ memset(&dc_categories_[kEntropyContextTop][plane][x4], dc_category,
+ num_top_elements);
+ const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
+ const int num_left_elements = GetNumElements(h4, y4, max_y4x4);
+ memset(&coefficient_levels_[kEntropyContextLeft][plane][y4],
+ coefficient_level, num_left_elements);
+ memset(&dc_categories_[kEntropyContextLeft][plane][y4], dc_category,
+ num_left_elements);
+}
+
+template <typename ResidualType, bool is_dc_coefficient>
+bool Tile::ReadSignAndApplyDequantization(
+ const uint16_t* const scan, int i, int q_value,
+ const uint8_t* const quantizer_matrix, int shift, int max_value,
+ uint16_t* const dc_sign_cdf, int8_t* const dc_category,
+ int* const coefficient_level, ResidualType* residual_buffer) {
+ const int pos = is_dc_coefficient ? 0 : scan[i];
+ // If residual_buffer[pos] is zero, then the rest of the function has no
+ // effect.
+ int level = residual_buffer[pos];
+ if (level == 0) return true;
+ const int sign = is_dc_coefficient
+ ? static_cast<int>(reader_.ReadSymbol(dc_sign_cdf))
+ : reader_.ReadBit();
+ if (level > kNumQuantizerBaseLevels + kQuantizerCoefficientBaseRange) {
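+ // The remainder of the level is Exp-Golomb coded: a unary prefix gives
+ // |length|, then |length| - 1 suffix bits complete the value |x|, and
+ // x - 1 is added to the level. A prefix longer than 20 bits is treated as a
+ // corrupt bitstream.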
+ int length = 0;
+ bool golomb_length_bit = false;
+ do {
+ golomb_length_bit = static_cast<bool>(reader_.ReadBit());
+ ++length;
+ if (length > 20) {
+ LIBGAV1_DLOG(ERROR, "Invalid golomb_length %d", length);
+ return false;
+ }
+ } while (!golomb_length_bit);
+ int x = 1;
+ for (int i = length - 2; i >= 0; --i) {
+ x = (x << 1) | reader_.ReadBit();
+ }
+ level += x - 1;
+ }
+ if (is_dc_coefficient) {
+ *dc_category = (sign != 0) ? -1 : 1;
+ }
+ level &= 0xfffff;
+ *coefficient_level += level;
+ // Apply dequantization. Step 1 of section 7.12.3 in the spec.
+ int q = q_value;
+ if (quantizer_matrix != nullptr) {
+ q = RightShiftWithRounding(q * quantizer_matrix[pos], 5);
+ }
+ // The intermediate multiplication can exceed 32 bits, so it has to be
+ // performed by promoting one of the values to int64_t.
+ int32_t dequantized_value = (static_cast<int64_t>(q) * level) & 0xffffff;
+ dequantized_value >>= shift;
+ // At this point:
+ // * |dequantized_value| is always non-negative.
+ // * |sign| can be either 0 or 1.
+ // * min_value = -(max_value + 1).
+ // We need to apply the following:
+ // dequantized_value = sign ? -dequantized_value : dequantized_value;
+ // dequantized_value = Clip3(dequantized_value, min_value, max_value);
+ //
+ // Note that -x == ~(x - 1).
+ //
+ // Now, the above two lines can be done with a std::min and an xor as follows:
+ dequantized_value = std::min(dequantized_value - sign, max_value) ^ -sign;
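+ // For example, with dequantized_value == 5: sign == 0 gives
+ // std::min(5, max_value) ^ 0 == 5, while sign == 1 gives
+ // std::min(4, max_value) ^ -1 == ~4 == -5.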
+ residual_buffer[pos] = dequantized_value;
+ return true;
+}
+
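+// Reads the "coeff_br" (coefficient base range) symbols for a single
+// coefficient. Each symbol adds up to kCoeffBaseRangeSymbolCount - 1 to the
+// level; a symbol smaller than that maximum terminates the loop, so at most
+// kCoeffBaseRangeMaxIterations symbols are read.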
+int Tile::ReadCoeffBaseRange(uint16_t* cdf) {
+ int level = 0;
+ for (int j = 0; j < kCoeffBaseRangeMaxIterations; ++j) {
+ const int coeff_base_range =
+ reader_.ReadSymbol<kCoeffBaseRangeSymbolCount>(cdf);
+ level += coeff_base_range;
+ if (coeff_base_range < (kCoeffBaseRangeSymbolCount - 1)) break;
+ }
+ return level;
+}
+
+template <typename ResidualType>
+int Tile::ReadTransformCoefficients(const Block& block, Plane plane,
+ int start_x, int start_y,
+ TransformSize tx_size,
+ TransformType* const tx_type) {
+ const int x4 = DivideBy4(start_x);
+ const int y4 = DivideBy4(start_y);
+ const int w4 = kTransformWidth4x4[tx_size];
+ const int h4 = kTransformHeight4x4[tx_size];
+ const int tx_size_context = kTransformSizeContext[tx_size];
+ int context =
+ GetTransformAllZeroContext(block, plane, tx_size, x4, y4, w4, h4);
+ const bool all_zero = reader_.ReadSymbol(
+ symbol_decoder_context_.all_zero_cdf[tx_size_context][context]);
+ if (all_zero) {
+ if (plane == kPlaneY) {
+ SetTransformType(block, x4, y4, w4, h4, kTransformTypeDctDct,
+ transform_types_);
+ }
+ SetEntropyContexts(x4, y4, w4, h4, plane, 0, 0);
+ // This is not used in this case, so it can be set to any value.
+ *tx_type = kNumTransformTypes;
+ return 0;
+ }
+ const int tx_width = kTransformWidth[tx_size];
+ const int tx_height = kTransformHeight[tx_size];
+ const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
+ const int adjusted_tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
+ const int tx_padding =
+ (1 << adjusted_tx_width_log2) * kResidualPaddingVertical;
+ auto* residual = reinterpret_cast<ResidualType*>(*block.residual);
+ // Clear padding to avoid bottom boundary checks when parsing quantized
+ // coefficients.
+ memset(residual, 0, (tx_width * tx_height + tx_padding) * residual_size_);
+ uint8_t level_buffer[(32 + kResidualPaddingVertical) * 32];
+ memset(
+ level_buffer, 0,
+ kTransformWidth[adjusted_tx_size] * kTransformHeight[adjusted_tx_size] +
+ tx_padding);
+ const int clamped_tx_height = std::min(tx_height, 32);
+ if (plane == kPlaneY) {
+ ReadTransformType(block, x4, y4, tx_size);
+ }
+ BlockParameters& bp = *block.bp;
+ *tx_type = ComputeTransformType(block, plane, tx_size, x4, y4);
+ const int eob_multi_size = kEobMultiSizeLookup[tx_size];
+ const PlaneType plane_type = GetPlaneType(plane);
+ const TransformClass tx_class = GetTransformClass(*tx_type);
+ context = static_cast<int>(tx_class != kTransformClass2D);
+ int eob_pt = 1;
+ switch (eob_multi_size) {
+ case 0:
+ eob_pt += reader_.ReadSymbol<kEobPt16SymbolCount>(
+ symbol_decoder_context_.eob_pt_16_cdf[plane_type][context]);
+ break;
+ case 1:
+ eob_pt += reader_.ReadSymbol<kEobPt32SymbolCount>(
+ symbol_decoder_context_.eob_pt_32_cdf[plane_type][context]);
+ break;
+ case 2:
+ eob_pt += reader_.ReadSymbol<kEobPt64SymbolCount>(
+ symbol_decoder_context_.eob_pt_64_cdf[plane_type][context]);
+ break;
+ case 3:
+ eob_pt += reader_.ReadSymbol<kEobPt128SymbolCount>(
+ symbol_decoder_context_.eob_pt_128_cdf[plane_type][context]);
+ break;
+ case 4:
+ eob_pt += reader_.ReadSymbol<kEobPt256SymbolCount>(
+ symbol_decoder_context_.eob_pt_256_cdf[plane_type][context]);
+ break;
+ case 5:
+ eob_pt += reader_.ReadSymbol<kEobPt512SymbolCount>(
+ symbol_decoder_context_.eob_pt_512_cdf[plane_type]);
+ break;
+ case 6:
+ default:
+ eob_pt += reader_.ReadSymbol<kEobPt1024SymbolCount>(
+ symbol_decoder_context_.eob_pt_1024_cdf[plane_type]);
+ break;
+ }
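+ // Map the end-of-block position class to its base value: eob_pt 1 -> 1,
+ // 2 -> 2, 3 -> 3, 4 -> 5, 5 -> 9, ..., i.e. (1 << (eob_pt - 2)) + 1 for
+ // eob_pt >= 2.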
+ int eob = (eob_pt < 2) ? eob_pt : ((1 << (eob_pt - 2)) + 1);
+ if (eob_pt >= 3) {
+ context = eob_pt - 3;
+ const bool eob_extra = reader_.ReadSymbol(
+ symbol_decoder_context_
+ .eob_extra_cdf[tx_size_context][plane_type][context]);
+ if (eob_extra) eob += 1 << (eob_pt - 3);
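+ // The remaining eob_pt - 3 bits are read as literals, each adding a
+ // successively smaller power of two to |eob|.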
+ for (int i = 1; i < eob_pt - 2; ++i) {
+ assert(eob_pt - i >= 3);
+ assert(eob_pt <= kEobPt1024SymbolCount);
+ if (static_cast<bool>(reader_.ReadBit())) {
+ eob += 1 << (eob_pt - i - 3);
+ }
+ }
+ }
+ const uint16_t* scan = kScan[tx_class][tx_size];
+ const int clamped_tx_size_context = std::min(tx_size_context, 3);
+ auto coeff_base_range_cdf =
+ symbol_decoder_context_
+ .coeff_base_range_cdf[clamped_tx_size_context][plane_type];
+ // Read the last coefficient.
+ {
+ context = GetCoeffBaseContextEob(tx_size, eob - 1);
+ const uint16_t pos = scan[eob - 1];
+ int level =
+ 1 + reader_.ReadSymbol<kCoeffBaseEobSymbolCount>(
+ symbol_decoder_context_
+ .coeff_base_eob_cdf[tx_size_context][plane_type][context]);
+ level_buffer[pos] = level;
+ if (level > kNumQuantizerBaseLevels) {
+ level +=
+ ReadCoeffBaseRange(coeff_base_range_cdf[GetCoeffBaseRangeContextEob(
+ adjusted_tx_width_log2, pos, tx_class)]);
+ }
+ residual[pos] = level;
+ }
+ if (eob > 1) {
+ // Read all the other coefficients.
+ // Lookup used to call the right variant of ReadCoeffBase*() based on the
+ // transform class.
+ static constexpr void (Tile::*kGetCoeffBaseFunc[])(
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* quantized_buffer,
+ uint8_t* level_buffer) = {&Tile::ReadCoeffBase2D<ResidualType>,
+ &Tile::ReadCoeffBaseHorizontal<ResidualType>,
+ &Tile::ReadCoeffBaseVertical<ResidualType>};
+ (this->*kGetCoeffBaseFunc[tx_class])(
+ scan, tx_size, adjusted_tx_width_log2, eob,
+ symbol_decoder_context_.coeff_base_cdf[tx_size_context][plane_type],
+ coeff_base_range_cdf, residual, level_buffer);
+ }
+ const int max_value = (1 << (7 + sequence_header_.color_config.bitdepth)) - 1;
+ const int current_quantizer_index = GetQIndex(
+ frame_header_.segmentation, bp.segment_id, current_quantizer_index_);
+ const int dc_q_value = quantizer_.GetDcValue(plane, current_quantizer_index);
+ const int ac_q_value = quantizer_.GetAcValue(plane, current_quantizer_index);
+ const int shift = kQuantizationShift[tx_size];
+ const uint8_t* const quantizer_matrix =
+ (frame_header_.quantizer.use_matrix &&
+ *tx_type < kTransformTypeIdentityIdentity &&
+ !frame_header_.segmentation.lossless[bp.segment_id] &&
+ frame_header_.quantizer.matrix_level[plane] < 15)
+ ? quantizer_matrix_[frame_header_.quantizer.matrix_level[plane]]
+ [plane_type][adjusted_tx_size]
+ .get()
+ : nullptr;
+ int coefficient_level = 0;
+ int8_t dc_category = 0;
+ uint16_t* const dc_sign_cdf =
+ (residual[0] != 0)
+ ? symbol_decoder_context_.dc_sign_cdf[plane_type][GetDcSignContext(
+ x4, y4, w4, h4, plane)]
+ : nullptr;
+ assert(scan[0] == 0);
+ if (!ReadSignAndApplyDequantization<ResidualType, /*is_dc_coefficient=*/true>(
+ scan, 0, dc_q_value, quantizer_matrix, shift, max_value, dc_sign_cdf,
+ &dc_category, &coefficient_level, residual)) {
+ return -1;
+ }
+ if (eob > 1) {
+ int i = 1;
+ do {
+ if (!ReadSignAndApplyDequantization<ResidualType,
+ /*is_dc_coefficient=*/false>(
+ scan, i, ac_q_value, quantizer_matrix, shift, max_value, nullptr,
+ nullptr, &coefficient_level, residual)) {
+ return -1;
+ }
+ } while (++i < eob);
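+ // For transforms with width 64, only the left 32 columns are coded, so the
+ // parsed coefficients are repositioned into the 64-wide residual layout
+ // here (a no-op for other transform widths).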
+ MoveCoefficientsForTxWidth64(clamped_tx_height, tx_width, residual);
+ }
+ SetEntropyContexts(x4, y4, w4, h4, plane, std::min(4, coefficient_level),
+ dc_category);
+ if (split_parse_and_decode_) {
+ *block.residual += tx_width * tx_height * residual_size_;
+ }
+ return eob;
+}
+
+// CALL_BITDEPTH_FUNCTION is a macro that calls the appropriate template
+// |function| depending on the value of |sequence_header_.color_config.bitdepth|
+// with the variadic arguments.
+#if LIBGAV1_MAX_BITDEPTH >= 10
+#define CALL_BITDEPTH_FUNCTION(function, ...) \
+ do { \
+ if (sequence_header_.color_config.bitdepth > 8) { \
+ function<uint16_t>(__VA_ARGS__); \
+ } else { \
+ function<uint8_t>(__VA_ARGS__); \
+ } \
+ } while (false)
+#else
+#define CALL_BITDEPTH_FUNCTION(function, ...) \
+ do { \
+ function<uint8_t>(__VA_ARGS__); \
+ } while (false)
+#endif
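+// For example, CALL_BITDEPTH_FUNCTION(IntraPrediction, block, plane, ...)
+// calls IntraPrediction<uint16_t>(block, plane, ...) when the bitdepth is
+// greater than 8 and IntraPrediction<uint8_t>(block, plane, ...) otherwise.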
+
+bool Tile::TransformBlock(const Block& block, Plane plane, int base_x,
+ int base_y, TransformSize tx_size, int x, int y,
+ ProcessingMode mode) {
+ BlockParameters& bp = *block.bp;
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ const int start_x = base_x + MultiplyBy4(x);
+ const int start_y = base_y + MultiplyBy4(y);
+ const int max_x = MultiplyBy4(frame_header_.columns4x4) >> subsampling_x;
+ const int max_y = MultiplyBy4(frame_header_.rows4x4) >> subsampling_y;
+ if (start_x >= max_x || start_y >= max_y) return true;
+ const int row = DivideBy4(start_y << subsampling_y);
+ const int column = DivideBy4(start_x << subsampling_x);
+ const int mask = sequence_header_.use_128x128_superblock ? 31 : 15;
+ const int sub_block_row4x4 = row & mask;
+ const int sub_block_column4x4 = column & mask;
+ const int step_x = kTransformWidth4x4[tx_size];
+ const int step_y = kTransformHeight4x4[tx_size];
+ const bool do_decode = mode == kProcessingModeDecodeOnly ||
+ mode == kProcessingModeParseAndDecode;
+ if (do_decode && !bp.is_inter) {
+ if (bp.palette_mode_info.size[GetPlaneType(plane)] > 0) {
+ CALL_BITDEPTH_FUNCTION(PalettePrediction, block, plane, start_x, start_y,
+ x, y, tx_size);
+ } else {
+ const PredictionMode mode =
+ (plane == kPlaneY)
+ ? bp.y_mode
+ : (bp.uv_mode == kPredictionModeChromaFromLuma ? kPredictionModeDc
+ : bp.uv_mode);
+ const int tr_row4x4 = (sub_block_row4x4 >> subsampling_y);
+ const int tr_column4x4 =
+ (sub_block_column4x4 >> subsampling_x) + step_x + 1;
+ const int bl_row4x4 = (sub_block_row4x4 >> subsampling_y) + step_y + 1;
+ const int bl_column4x4 = (sub_block_column4x4 >> subsampling_x);
+ const bool has_left = x > 0 || block.left_available[plane];
+ const bool has_top = y > 0 || block.top_available[plane];
+
+ CALL_BITDEPTH_FUNCTION(
+ IntraPrediction, block, plane, start_x, start_y, has_left, has_top,
+ block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
+ block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
+ mode, tx_size);
+ if (plane != kPlaneY && bp.uv_mode == kPredictionModeChromaFromLuma) {
+ CALL_BITDEPTH_FUNCTION(ChromaFromLumaPrediction, block, plane, start_x,
+ start_y, tx_size);
+ }
+ }
+ if (plane == kPlaneY) {
+ block.bp->prediction_parameters->max_luma_width =
+ start_x + MultiplyBy4(step_x);
+ block.bp->prediction_parameters->max_luma_height =
+ start_y + MultiplyBy4(step_y);
+ block.scratch_buffer->cfl_luma_buffer_valid = false;
+ }
+ }
+ if (!bp.skip) {
+ const int sb_row_index = SuperBlockRowIndex(block.row4x4);
+ const int sb_column_index = SuperBlockColumnIndex(block.column4x4);
+ if (mode == kProcessingModeDecodeOnly) {
+ TransformParameterQueue& tx_params =
+ *residual_buffer_threaded_[sb_row_index][sb_column_index]
+ ->transform_parameters();
+ ReconstructBlock(block, plane, start_x, start_y, tx_size,
+ tx_params.Type(), tx_params.NonZeroCoeffCount());
+ tx_params.Pop();
+ } else {
+ TransformType tx_type;
+ int non_zero_coeff_count;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (sequence_header_.color_config.bitdepth > 8) {
+ non_zero_coeff_count = ReadTransformCoefficients<int32_t>(
+ block, plane, start_x, start_y, tx_size, &tx_type);
+ } else // NOLINT
+#endif
+ {
+ non_zero_coeff_count = ReadTransformCoefficients<int16_t>(
+ block, plane, start_x, start_y, tx_size, &tx_type);
+ }
+ if (non_zero_coeff_count < 0) return false;
+ if (mode == kProcessingModeParseAndDecode) {
+ ReconstructBlock(block, plane, start_x, start_y, tx_size, tx_type,
+ non_zero_coeff_count);
+ } else {
+ assert(mode == kProcessingModeParseOnly);
+ residual_buffer_threaded_[sb_row_index][sb_column_index]
+ ->transform_parameters()
+ ->Push(non_zero_coeff_count, tx_type);
+ }
+ }
+ }
+ if (do_decode) {
+ bool* block_decoded =
+ &block.scratch_buffer
+ ->block_decoded[plane][(sub_block_row4x4 >> subsampling_y) + 1]
+ [(sub_block_column4x4 >> subsampling_x) + 1];
+ SetBlockValues<bool>(step_y, step_x, true, block_decoded,
+ TileScratchBuffer::kBlockDecodedStride);
+ }
+ return true;
+}
+
+bool Tile::TransformTree(const Block& block, int start_x, int start_y,
+ BlockSize plane_size, ProcessingMode mode) {
+ assert(plane_size <= kBlock64x64);
+ // The branching factor is 4 and the maximum depth is 4, so the maximum
+ // stack size required is (4 - 1) * 4 + 1 = 13.
+ Stack<TransformTreeNode, 13> stack;
+ // It is okay to cast BlockSize to TransformSize here since the enums are
+ // equivalent for all BlockSize values <= kBlock64x64.
+ stack.Push(TransformTreeNode(start_x, start_y,
+ static_cast<TransformSize>(plane_size)));
+
+ do {
+ TransformTreeNode node = stack.Pop();
+ const int row = DivideBy4(node.y);
+ const int column = DivideBy4(node.x);
+ if (row >= frame_header_.rows4x4 || column >= frame_header_.columns4x4) {
+ continue;
+ }
+ const TransformSize inter_tx_size = inter_transform_sizes_[row][column];
+ const int width = kTransformWidth[node.tx_size];
+ const int height = kTransformHeight[node.tx_size];
+ if (width <= kTransformWidth[inter_tx_size] &&
+ height <= kTransformHeight[inter_tx_size]) {
+ if (!TransformBlock(block, kPlaneY, node.x, node.y, node.tx_size, 0, 0,
+ mode)) {
+ return false;
+ }
+ continue;
+ }
+ // The split transform size lookup gives the transform size that we should
+ // push onto the stack.
+ // if (width > height) => transform size whose width is half.
+ // if (width < height) => transform size whose height is half.
+ // if (width == height) => transform size whose width and height are half.
+ const TransformSize split_tx_size = kSplitTransformSize[node.tx_size];
+ const int half_width = DivideBy2(width);
+ if (width > height) {
+ stack.Push(TransformTreeNode(node.x + half_width, node.y, split_tx_size));
+ stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
+ continue;
+ }
+ const int half_height = DivideBy2(height);
+ if (width < height) {
+ stack.Push(
+ TransformTreeNode(node.x, node.y + half_height, split_tx_size));
+ stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
+ continue;
+ }
+ stack.Push(TransformTreeNode(node.x + half_width, node.y + half_height,
+ split_tx_size));
+ stack.Push(TransformTreeNode(node.x, node.y + half_height, split_tx_size));
+ stack.Push(TransformTreeNode(node.x + half_width, node.y, split_tx_size));
+ stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
+ } while (!stack.Empty());
+ return true;
+}
+
+void Tile::ReconstructBlock(const Block& block, Plane plane, int start_x,
+ int start_y, TransformSize tx_size,
+ TransformType tx_type, int non_zero_coeff_count) {
+ // Reconstruction process. Steps 2 and 3 of Section 7.12.3 in the spec.
+ assert(non_zero_coeff_count >= 0);
+ if (non_zero_coeff_count == 0) return;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (sequence_header_.color_config.bitdepth > 8) {
+ Array2DView<uint16_t> buffer(
+ buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t),
+ reinterpret_cast<uint16_t*>(&buffer_[plane][0][0]));
+ Reconstruct(dsp_, tx_type, tx_size,
+ frame_header_.segmentation.lossless[block.bp->segment_id],
+ reinterpret_cast<int32_t*>(*block.residual), start_x, start_y,
+ &buffer, non_zero_coeff_count);
+ } else // NOLINT
+#endif
+ {
+ Reconstruct(dsp_, tx_type, tx_size,
+ frame_header_.segmentation.lossless[block.bp->segment_id],
+ reinterpret_cast<int16_t*>(*block.residual), start_x, start_y,
+ &buffer_[plane], non_zero_coeff_count);
+ }
+ if (split_parse_and_decode_) {
+ *block.residual +=
+ kTransformWidth[tx_size] * kTransformHeight[tx_size] * residual_size_;
+ }
+}
+
+bool Tile::Residual(const Block& block, ProcessingMode mode) {
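+ // Blocks larger than 64x64 are processed in 64x64 chunks; the residual of
+ // each chunk is handled independently.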
+ const int width_chunks = std::max(1, block.width >> 6);
+ const int height_chunks = std::max(1, block.height >> 6);
+ const BlockSize size_chunk4x4 =
+ (width_chunks > 1 || height_chunks > 1) ? kBlock64x64 : block.size;
+ const BlockParameters& bp = *block.bp;
+ for (int chunk_y = 0; chunk_y < height_chunks; ++chunk_y) {
+ for (int chunk_x = 0; chunk_x < width_chunks; ++chunk_x) {
+ const int num_planes = block.HasChroma() ? PlaneCount() : 1;
+ int plane = kPlaneY;
+ do {
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ // For the Y plane, when lossless is true, |bp.transform_size| is always
+ // kTransformSize4x4. So we can simply use |bp.transform_size| here as
+ // the Y plane's transform size (part of Section 5.11.37 in the spec).
+ const TransformSize tx_size =
+ (plane == kPlaneY) ? bp.transform_size : bp.uv_transform_size;
+ const BlockSize plane_size =
+ kPlaneResidualSize[size_chunk4x4][subsampling_x][subsampling_y];
+ assert(plane_size != kBlockInvalid);
+ if (bp.is_inter &&
+ !frame_header_.segmentation.lossless[bp.segment_id] &&
+ plane == kPlaneY) {
+ const int row_chunk4x4 = block.row4x4 + MultiplyBy16(chunk_y);
+ const int column_chunk4x4 = block.column4x4 + MultiplyBy16(chunk_x);
+ const int base_x = MultiplyBy4(column_chunk4x4 >> subsampling_x);
+ const int base_y = MultiplyBy4(row_chunk4x4 >> subsampling_y);
+ if (!TransformTree(block, base_x, base_y, plane_size, mode)) {
+ return false;
+ }
+ } else {
+ const int base_x = MultiplyBy4(block.column4x4 >> subsampling_x);
+ const int base_y = MultiplyBy4(block.row4x4 >> subsampling_y);
+ const int step_x = kTransformWidth4x4[tx_size];
+ const int step_y = kTransformHeight4x4[tx_size];
+ const int num4x4_wide = kNum4x4BlocksWide[plane_size];
+ const int num4x4_high = kNum4x4BlocksHigh[plane_size];
+ for (int y = 0; y < num4x4_high; y += step_y) {
+ for (int x = 0; x < num4x4_wide; x += step_x) {
+ if (!TransformBlock(
+ block, static_cast<Plane>(plane), base_x, base_y, tx_size,
+ x + (MultiplyBy16(chunk_x) >> subsampling_x),
+ y + (MultiplyBy16(chunk_y) >> subsampling_y), mode)) {
+ return false;
+ }
+ }
+ }
+ }
+ } while (++plane < num_planes);
+ }
+ }
+ return true;
+}
+
+// The purpose of this function is to limit the maximum size of motion vectors
+// and also, if use_intra_block_copy is true, to additionally constrain the
+// motion vector so that the data is fetched from parts of the tile that have
+// already been decoded and are not too close to the current block (in order to
+// make a pipelined decoder implementation feasible).
+bool Tile::IsMvValid(const Block& block, bool is_compound) const {
+ const BlockParameters& bp = *block.bp;
+ for (int i = 0; i < 1 + static_cast<int>(is_compound); ++i) {
+ for (int mv_component : bp.mv.mv[i].mv) {
+ if (std::abs(mv_component) >= (1 << 14)) {
+ return false;
+ }
+ }
+ }
+ if (!block.bp->prediction_parameters->use_intra_block_copy) {
+ return true;
+ }
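+ // Intra block copy motion vectors must be whole-pel. The components are in
+ // 1/8-pel units, so the low 3 bits of both 16-bit halves of |mv32| must be
+ // zero.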
+ if ((bp.mv.mv[0].mv32 & 0x00070007) != 0) {
+ return false;
+ }
+ const int delta_row = bp.mv.mv[0].mv[0] >> 3;
+ const int delta_column = bp.mv.mv[0].mv[1] >> 3;
+ int src_top_edge = MultiplyBy4(block.row4x4) + delta_row;
+ int src_left_edge = MultiplyBy4(block.column4x4) + delta_column;
+ const int src_bottom_edge = src_top_edge + block.height;
+ const int src_right_edge = src_left_edge + block.width;
+ if (block.HasChroma()) {
+ if (block.width < 8 && subsampling_x_[kPlaneU] != 0) {
+ src_left_edge -= 4;
+ }
+ if (block.height < 8 && subsampling_y_[kPlaneU] != 0) {
+ src_top_edge -= 4;
+ }
+ }
+ if (src_top_edge < MultiplyBy4(row4x4_start_) ||
+ src_left_edge < MultiplyBy4(column4x4_start_) ||
+ src_bottom_edge > MultiplyBy4(row4x4_end_) ||
+ src_right_edge > MultiplyBy4(column4x4_end_)) {
+ return false;
+ }
+ // sb_height_log2 = use_128x128_superblock ? log2(128) : log2(64)
+ const int sb_height_log2 =
+ 6 + static_cast<int>(sequence_header_.use_128x128_superblock);
+ const int active_sb_row = MultiplyBy4(block.row4x4) >> sb_height_log2;
+ const int active_64x64_block_column = MultiplyBy4(block.column4x4) >> 6;
+ const int src_sb_row = (src_bottom_edge - 1) >> sb_height_log2;
+ const int src_64x64_block_column = (src_right_edge - 1) >> 6;
+ const int total_64x64_blocks_per_row =
+ ((column4x4_end_ - column4x4_start_ - 1) >> 4) + 1;
+ const int active_64x64_block =
+ active_sb_row * total_64x64_blocks_per_row + active_64x64_block_column;
+ const int src_64x64_block =
+ src_sb_row * total_64x64_blocks_per_row + src_64x64_block_column;
+ if (src_64x64_block >= active_64x64_block - kIntraBlockCopyDelay64x64Blocks) {
+ return false;
+ }
+
+ // Wavefront constraint: use only top left area of frame for reference.
+ if (src_sb_row > active_sb_row) return false;
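+ // Each superblock row that the source block sits above the active row
+ // widens the allowed reference area to the right by |gradient| 64x64 block
+ // columns.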
+ const int gradient =
+ 1 + kIntraBlockCopyDelay64x64Blocks +
+ static_cast<int>(sequence_header_.use_128x128_superblock);
+ const int wavefront_offset = gradient * (active_sb_row - src_sb_row);
+ return src_64x64_block_column < active_64x64_block_column -
+ kIntraBlockCopyDelay64x64Blocks +
+ wavefront_offset;
+}
+
+bool Tile::AssignInterMv(const Block& block, bool is_compound) {
+ int min[2];
+ int max[2];
+ GetClampParameters(block, min, max);
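+ // Index 0 of |min| and |max| holds the row clamping bounds and index 1 the
+ // column clamping bounds, matching the mv[] component order.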
+ BlockParameters& bp = *block.bp;
+ const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+ if (is_compound) {
+ for (int i = 0; i < 2; ++i) {
+ const PredictionMode mode = GetSinglePredictionMode(i, bp.y_mode);
+ MotionVector predicted_mv;
+ if (mode == kPredictionModeGlobalMv) {
+ predicted_mv = prediction_parameters.global_mv[i];
+ } else {
+ const int ref_mv_index = (mode == kPredictionModeNearestMv ||
+ (mode == kPredictionModeNewMv &&
+ prediction_parameters.ref_mv_count <= 1))
+ ? 0
+ : prediction_parameters.ref_mv_index;
+ predicted_mv = prediction_parameters.reference_mv(ref_mv_index, i);
+ if (ref_mv_index < prediction_parameters.ref_mv_count) {
+ predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]);
+ predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]);
+ }
+ }
+ if (mode == kPredictionModeNewMv) {
+ ReadMotionVector(block, i);
+ bp.mv.mv[i].mv[0] += predicted_mv.mv[0];
+ bp.mv.mv[i].mv[1] += predicted_mv.mv[1];
+ } else {
+ bp.mv.mv[i] = predicted_mv;
+ }
+ }
+ } else {
+ const PredictionMode mode = GetSinglePredictionMode(0, bp.y_mode);
+ MotionVector predicted_mv;
+ if (mode == kPredictionModeGlobalMv) {
+ predicted_mv = prediction_parameters.global_mv[0];
+ } else {
+ const int ref_mv_index = (mode == kPredictionModeNearestMv ||
+ (mode == kPredictionModeNewMv &&
+ prediction_parameters.ref_mv_count <= 1))
+ ? 0
+ : prediction_parameters.ref_mv_index;
+ predicted_mv = prediction_parameters.reference_mv(ref_mv_index);
+ if (ref_mv_index < prediction_parameters.ref_mv_count) {
+ predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]);
+ predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]);
+ }
+ }
+ if (mode == kPredictionModeNewMv) {
+ ReadMotionVector(block, 0);
+ bp.mv.mv[0].mv[0] += predicted_mv.mv[0];
+ bp.mv.mv[0].mv[1] += predicted_mv.mv[1];
+ } else {
+ bp.mv.mv[0] = predicted_mv;
+ }
+ }
+ return IsMvValid(block, is_compound);
+}
+
+bool Tile::AssignIntraMv(const Block& block) {
+ // TODO(linfengz): Check if the clamping process is necessary.
+ int min[2];
+ int max[2];
+ GetClampParameters(block, min, max);
+ BlockParameters& bp = *block.bp;
+ const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+ const MotionVector& ref_mv_0 = prediction_parameters.reference_mv(0);
+ ReadMotionVector(block, 0);
+ if (ref_mv_0.mv32 == 0) {
+ const MotionVector& ref_mv_1 = prediction_parameters.reference_mv(1);
+ if (ref_mv_1.mv32 == 0) {
+ const int super_block_size4x4 = kNum4x4BlocksHigh[SuperBlockSize()];
+ if (block.row4x4 - super_block_size4x4 < row4x4_start_) {
+ bp.mv.mv[0].mv[1] -= MultiplyBy32(super_block_size4x4);
+ bp.mv.mv[0].mv[1] -= MultiplyBy8(kIntraBlockCopyDelayPixels);
+ } else {
+ bp.mv.mv[0].mv[0] -= MultiplyBy32(super_block_size4x4);
+ }
+ } else {
+ bp.mv.mv[0].mv[0] += Clip3(ref_mv_1.mv[0], min[0], max[0]);
+ bp.mv.mv[0].mv[1] += Clip3(ref_mv_1.mv[1], min[1], max[1]);
+ }
+ } else {
+ bp.mv.mv[0].mv[0] += Clip3(ref_mv_0.mv[0], min[0], max[0]);
+ bp.mv.mv[0].mv[1] += Clip3(ref_mv_0.mv[1], min[1], max[1]);
+ }
+ return IsMvValid(block, /*is_compound=*/false);
+}
+
+void Tile::ResetEntropyContext(const Block& block) {
+ const int num_planes = block.HasChroma() ? PlaneCount() : 1;
+ int plane = kPlaneY;
+ do {
+ const int subsampling_x = subsampling_x_[plane];
+ const int start_x = block.column4x4 >> subsampling_x;
+ const int end_x =
+ std::min((block.column4x4 + block.width4x4) >> subsampling_x,
+ frame_header_.columns4x4);
+ memset(&coefficient_levels_[kEntropyContextTop][plane][start_x], 0,
+ end_x - start_x);
+ memset(&dc_categories_[kEntropyContextTop][plane][start_x], 0,
+ end_x - start_x);
+ const int subsampling_y = subsampling_y_[plane];
+ const int start_y = block.row4x4 >> subsampling_y;
+ const int end_y =
+ std::min((block.row4x4 + block.height4x4) >> subsampling_y,
+ frame_header_.rows4x4);
+ memset(&coefficient_levels_[kEntropyContextLeft][plane][start_y], 0,
+ end_y - start_y);
+ memset(&dc_categories_[kEntropyContextLeft][plane][start_y], 0,
+ end_y - start_y);
+ } while (++plane < num_planes);
+}
+
+bool Tile::ComputePrediction(const Block& block) {
+ const BlockParameters& bp = *block.bp;
+ if (!bp.is_inter) return true;
+ const int mask =
+ (1 << (4 + static_cast<int>(sequence_header_.use_128x128_superblock))) -
+ 1;
+ const int sub_block_row4x4 = block.row4x4 & mask;
+ const int sub_block_column4x4 = block.column4x4 & mask;
+ const int plane_count = block.HasChroma() ? PlaneCount() : 1;
+ // Tracks whether this block applies local warping. The state is determined
+ // in the Y plane and carried over for use in the U/V planes.
+ // However, the U/V planes will not apply warping when the block size is
+ // smaller than 8x8, even if this variable is true.
+ bool is_local_valid = false;
+ // Local warping parameters, similar usage as is_local_valid.
+ GlobalMotion local_warp_params;
+ int plane = kPlaneY;
+ do {
+ const int8_t subsampling_x = subsampling_x_[plane];
+ const int8_t subsampling_y = subsampling_y_[plane];
+ const BlockSize plane_size = block.residual_size[plane];
+ const int block_width4x4 = kNum4x4BlocksWide[plane_size];
+ const int block_height4x4 = kNum4x4BlocksHigh[plane_size];
+ const int block_width = MultiplyBy4(block_width4x4);
+ const int block_height = MultiplyBy4(block_height4x4);
+ const int base_x = MultiplyBy4(block.column4x4 >> subsampling_x);
+ const int base_y = MultiplyBy4(block.row4x4 >> subsampling_y);
+ if (bp.reference_frame[1] == kReferenceFrameIntra) {
+ const int tr_row4x4 = sub_block_row4x4 >> subsampling_y;
+ const int tr_column4x4 =
+ (sub_block_column4x4 >> subsampling_x) + block_width4x4 + 1;
+ const int bl_row4x4 =
+ (sub_block_row4x4 >> subsampling_y) + block_height4x4;
+ const int bl_column4x4 = (sub_block_column4x4 >> subsampling_x) + 1;
+ const TransformSize tx_size =
+ k4x4SizeToTransformSize[k4x4WidthLog2[plane_size]]
+ [k4x4HeightLog2[plane_size]];
+ const bool has_left = block.left_available[plane];
+ const bool has_top = block.top_available[plane];
+ CALL_BITDEPTH_FUNCTION(
+ IntraPrediction, block, static_cast<Plane>(plane), base_x, base_y,
+ has_left, has_top,
+ block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
+ block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
+ kInterIntraToIntraMode[block.bp->prediction_parameters
+ ->inter_intra_mode],
+ tx_size);
+ }
+ int candidate_row = block.row4x4;
+ int candidate_column = block.column4x4;
+ bool some_use_intra = bp.reference_frame[0] == kReferenceFrameIntra;
+ if (!some_use_intra && plane != 0) {
+ candidate_row = (candidate_row >> subsampling_y) << subsampling_y;
+ candidate_column = (candidate_column >> subsampling_x) << subsampling_x;
+ if (candidate_row != block.row4x4) {
+ // Top block.
+ const BlockParameters& bp_top =
+ *block_parameters_holder_.Find(candidate_row, block.column4x4);
+ some_use_intra = bp_top.reference_frame[0] == kReferenceFrameIntra;
+ if (!some_use_intra && candidate_column != block.column4x4) {
+ // Top-left block.
+ const BlockParameters& bp_top_left =
+ *block_parameters_holder_.Find(candidate_row, candidate_column);
+ some_use_intra =
+ bp_top_left.reference_frame[0] == kReferenceFrameIntra;
+ }
+ }
+ if (!some_use_intra && candidate_column != block.column4x4) {
+ // Left block.
+ const BlockParameters& bp_left =
+ *block_parameters_holder_.Find(block.row4x4, candidate_column);
+ some_use_intra = bp_left.reference_frame[0] == kReferenceFrameIntra;
+ }
+ }
+ int prediction_width;
+ int prediction_height;
+ if (some_use_intra) {
+ candidate_row = block.row4x4;
+ candidate_column = block.column4x4;
+ prediction_width = block_width;
+ prediction_height = block_height;
+ } else {
+ prediction_width = block.width >> subsampling_x;
+ prediction_height = block.height >> subsampling_y;
+ }
+ int r = 0;
+ int y = 0;
+ do {
+ int c = 0;
+ int x = 0;
+ do {
+ if (!InterPrediction(block, static_cast<Plane>(plane), base_x + x,
+ base_y + y, prediction_width, prediction_height,
+ candidate_row + r, candidate_column + c,
+ &is_local_valid, &local_warp_params)) {
+ return false;
+ }
+ ++c;
+ x += prediction_width;
+ } while (x < block_width);
+ ++r;
+ y += prediction_height;
+ } while (y < block_height);
+ } while (++plane < plane_count);
+ return true;
+}
+
+#undef CALL_BITDEPTH_FUNCTION
+
+void Tile::PopulateDeblockFilterLevel(const Block& block) {
+ if (!post_filter_.DoDeblock()) return;
+ BlockParameters& bp = *block.bp;
+ const int mode_id =
+ static_cast<int>(kPredictionModeDeltasMask.Contains(bp.y_mode));
+ for (int i = 0; i < kFrameLfCount; ++i) {
+ if (delta_lf_all_zero_) {
+ bp.deblock_filter_level[i] = post_filter_.GetZeroDeltaDeblockFilterLevel(
+ bp.segment_id, i, bp.reference_frame[0], mode_id);
+ } else {
+ bp.deblock_filter_level[i] =
+ deblock_filter_levels_[bp.segment_id][i][bp.reference_frame[0]]
+ [mode_id];
+ }
+ }
+}
+
+bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
+ ParameterTree* const tree,
+ TileScratchBuffer* const scratch_buffer,
+ ResidualPtr* residual) {
+ // Do not process the block if the starting point is beyond the visible frame.
+ // This is equivalent to the has_row/has_column check in the
+ // decode_partition() section of the spec when partition equals
+ // kPartitionHorizontal or kPartitionVertical.
+ if (row4x4 >= frame_header_.rows4x4 ||
+ column4x4 >= frame_header_.columns4x4) {
+ return true;
+ }
+ BlockParameters& bp = *tree->parameters();
+ block_parameters_holder_.FillCache(row4x4, column4x4, block_size, &bp);
+ Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual);
+ bp.size = block_size;
+ bp.prediction_parameters =
+ split_parse_and_decode_ ? std::unique_ptr<PredictionParameters>(
+ new (std::nothrow) PredictionParameters())
+ : std::move(prediction_parameters_);
+ if (bp.prediction_parameters == nullptr) return false;
+ if (!DecodeModeInfo(block)) return false;
+ bp.is_global_mv_block = (bp.y_mode == kPredictionModeGlobalMv ||
+ bp.y_mode == kPredictionModeGlobalGlobalMv) &&
+ !IsBlockDimension4(bp.size);
+ PopulateDeblockFilterLevel(block);
+ if (!ReadPaletteTokens(block)) return false;
+ DecodeTransformSize(block);
+ // Part of Section 5.11.37 in the spec (implemented as a simple lookup).
+ bp.uv_transform_size = frame_header_.segmentation.lossless[bp.segment_id]
+ ? kTransformSize4x4
+ : kUVTransformSize[block.residual_size[kPlaneU]];
+ if (bp.skip) ResetEntropyContext(block);
+ if (split_parse_and_decode_) {
+ if (!Residual(block, kProcessingModeParseOnly)) return false;
+ } else {
+ if (!ComputePrediction(block) ||
+ !Residual(block, kProcessingModeParseAndDecode)) {
+ return false;
+ }
+ }
+ // If frame_header_.segmentation.enabled is false, bp.segment_id is 0 for all
+ // blocks. We don't need to save bp.segment_id in the current frame because
+ // the current frame's segmentation map will be cleared to all 0s.
+ //
+ // If frame_header_.segmentation.enabled is true and
+ // frame_header_.segmentation.update_map is false, we will copy the previous
+ // frame's segmentation map to the current frame. So we don't need to save
+ // bp.segment_id in the current frame either.
+ if (frame_header_.segmentation.enabled &&
+ frame_header_.segmentation.update_map) {
+ const int x_limit = std::min(frame_header_.columns4x4 - column4x4,
+ static_cast<int>(block.width4x4));
+ const int y_limit = std::min(frame_header_.rows4x4 - row4x4,
+ static_cast<int>(block.height4x4));
+ current_frame_.segmentation_map()->FillBlock(row4x4, column4x4, x_limit,
+ y_limit, bp.segment_id);
+ }
+ StoreMotionFieldMvsIntoCurrentFrame(block);
+ if (!split_parse_and_decode_) {
+ prediction_parameters_ = std::move(bp.prediction_parameters);
+ }
+ return true;
+}
+
+bool Tile::DecodeBlock(ParameterTree* const tree,
+ TileScratchBuffer* const scratch_buffer,
+ ResidualPtr* residual) {
+ const int row4x4 = tree->row4x4();
+ const int column4x4 = tree->column4x4();
+ if (row4x4 >= frame_header_.rows4x4 ||
+ column4x4 >= frame_header_.columns4x4) {
+ return true;
+ }
+ const BlockSize block_size = tree->block_size();
+ Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual);
+ if (!ComputePrediction(block) ||
+ !Residual(block, kProcessingModeDecodeOnly)) {
+ return false;
+ }
+ block.bp->prediction_parameters.reset(nullptr);
+ return true;
+}
+
+bool Tile::ProcessPartition(int row4x4_start, int column4x4_start,
+ ParameterTree* const root,
+ TileScratchBuffer* const scratch_buffer,
+ ResidualPtr* residual) {
+ Stack<ParameterTree*, kDfsStackSize> stack;
+
+ // Set up the first iteration.
+ ParameterTree* node = root;
+ int row4x4 = row4x4_start;
+ int column4x4 = column4x4_start;
+ BlockSize block_size = SuperBlockSize();
+
+ // DFS loop. If it sees a terminal node (leaf node), ProcessBlock is invoked.
+ // Otherwise, the children are pushed into the stack for future processing.
+ do {
+ if (!stack.Empty()) {
+ // Set up subsequent iterations.
+ node = stack.Pop();
+ row4x4 = node->row4x4();
+ column4x4 = node->column4x4();
+ block_size = node->block_size();
+ }
+ if (row4x4 >= frame_header_.rows4x4 ||
+ column4x4 >= frame_header_.columns4x4) {
+ continue;
+ }
+ const int block_width4x4 = kNum4x4BlocksWide[block_size];
+ assert(block_width4x4 == kNum4x4BlocksHigh[block_size]);
+ const int half_block4x4 = block_width4x4 >> 1;
+ const bool has_rows = (row4x4 + half_block4x4) < frame_header_.rows4x4;
+ const bool has_columns =
+ (column4x4 + half_block4x4) < frame_header_.columns4x4;
+ Partition partition;
+ if (!ReadPartition(row4x4, column4x4, block_size, has_rows, has_columns,
+ &partition)) {
+ LIBGAV1_DLOG(ERROR, "Failed to read partition for row: %d column: %d",
+ row4x4, column4x4);
+ return false;
+ }
+ const BlockSize sub_size = kSubSize[partition][block_size];
+ // Section 6.10.4: It is a requirement of bitstream conformance that
+ // get_plane_residual_size( subSize, 1 ) is not equal to BLOCK_INVALID
+ // every time subSize is computed.
+ if (sub_size == kBlockInvalid ||
+ kPlaneResidualSize[sub_size]
+ [sequence_header_.color_config.subsampling_x]
+ [sequence_header_.color_config.subsampling_y] ==
+ kBlockInvalid) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Invalid sub-block/plane size for row: %d column: %d partition: "
+ "%d block_size: %d sub_size: %d subsampling_x/y: %d, %d",
+ row4x4, column4x4, partition, block_size, sub_size,
+ sequence_header_.color_config.subsampling_x,
+ sequence_header_.color_config.subsampling_y);
+ return false;
+ }
+ if (!node->SetPartitionType(partition)) {
+ LIBGAV1_DLOG(ERROR, "node->SetPartitionType() failed.");
+ return false;
+ }
+ switch (partition) {
+ case kPartitionNone:
+ if (!ProcessBlock(row4x4, column4x4, sub_size, node, scratch_buffer,
+ residual)) {
+ return false;
+ }
+ break;
+ case kPartitionSplit:
+ // The children must be added in reverse order since a stack is being
+ // used.
+ for (int i = 3; i >= 0; --i) {
+ ParameterTree* const child = node->children(i);
+ assert(child != nullptr);
+ stack.Push(child);
+ }
+ break;
+ case kPartitionHorizontal:
+ case kPartitionVertical:
+ case kPartitionHorizontalWithTopSplit:
+ case kPartitionHorizontalWithBottomSplit:
+ case kPartitionVerticalWithLeftSplit:
+ case kPartitionVerticalWithRightSplit:
+ case kPartitionHorizontal4:
+ case kPartitionVertical4:
+ for (int i = 0; i < 4; ++i) {
+ ParameterTree* const child = node->children(i);
+ // Once a null child is seen, all the subsequent children will also be
+ // null.
+ if (child == nullptr) break;
+ if (!ProcessBlock(child->row4x4(), child->column4x4(),
+ child->block_size(), child, scratch_buffer,
+ residual)) {
+ return false;
+ }
+ }
+ break;
+ }
+ } while (!stack.Empty());
+ return true;
+}
+
+void Tile::ResetLoopRestorationParams() {
+ for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+ for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) {
+ reference_unit_info_[plane].sgr_proj_info.multiplier[i] =
+ kSgrProjDefaultMultiplier[i];
+ for (int j = 0; j < kNumWienerCoefficients; ++j) {
+ reference_unit_info_[plane].wiener_info.filter[i][j] =
+ kWienerDefaultFilter[j];
+ }
+ }
+ }
+}
+
+void Tile::ResetCdef(const int row4x4, const int column4x4) {
+ if (!sequence_header_.enable_cdef) return;
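+ // |cdef_index_| is stored at 64x64 granularity (16 4x4 units), hence the
+ // DivideBy16() of the 4x4 coordinates.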
+ const int row = DivideBy16(row4x4);
+ const int column = DivideBy16(column4x4);
+ cdef_index_[row][column] = -1;
+ if (sequence_header_.use_128x128_superblock) {
+ const int cdef_size4x4 = kNum4x4BlocksWide[kBlock64x64];
+ const int border_row = DivideBy16(row4x4 + cdef_size4x4);
+ const int border_column = DivideBy16(column4x4 + cdef_size4x4);
+ cdef_index_[row][border_column] = -1;
+ cdef_index_[border_row][column] = -1;
+ cdef_index_[border_row][border_column] = -1;
+ }
+}
+
+void Tile::ClearBlockDecoded(TileScratchBuffer* const scratch_buffer,
+ int row4x4, int column4x4) {
+ // Set everything to false.
+ memset(scratch_buffer->block_decoded, 0,
+ sizeof(scratch_buffer->block_decoded));
+ // Set specific edge cases to true.
+ const int sb_size4 = sequence_header_.use_128x128_superblock ? 32 : 16;
+ for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ const int sb_width4 = (column4x4_end_ - column4x4) >> subsampling_x;
+ const int sb_height4 = (row4x4_end_ - row4x4) >> subsampling_y;
+ // The memset is equivalent to the following lines in the spec:
+ // for ( x = -1; x <= ( sbSize4 >> subX ); x++ ) {
+ // if ( y < 0 && x < sbWidth4 ) {
+ // BlockDecoded[plane][y][x] = 1
+ // }
+ // }
+ const int num_elements =
+ std::min((sb_size4 >> subsampling_x_[plane]) + 1, sb_width4) + 1;
+ memset(&scratch_buffer->block_decoded[plane][0][0], 1, num_elements);
+ // The for loop is equivalent to the following lines in the spec:
+ // for ( y = -1; y <= ( sbSize4 >> subY ); y++ ) {
+ // if ( x < 0 && y < sbHeight4 ) {
+ // BlockDecoded[plane][y][x] = 1
+ // }
+ // }
+ // BlockDecoded[plane][sbSize4 >> subY][-1] = 0
+ for (int y = -1; y < std::min((sb_size4 >> subsampling_y), sb_height4);
+ ++y) {
+ scratch_buffer->block_decoded[plane][y + 1][0] = true;
+ }
+ }
+}
+
+bool Tile::ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4,
+ TileScratchBuffer* const scratch_buffer,
+ ProcessingMode mode) {
+ const bool parsing =
+ mode == kProcessingModeParseOnly || mode == kProcessingModeParseAndDecode;
+ const bool decoding = mode == kProcessingModeDecodeOnly ||
+ mode == kProcessingModeParseAndDecode;
+ if (parsing) {
+ read_deltas_ = frame_header_.delta_q.present;
+ ResetCdef(row4x4, column4x4);
+ }
+ if (decoding) {
+ ClearBlockDecoded(scratch_buffer, row4x4, column4x4);
+ }
+ const BlockSize block_size = SuperBlockSize();
+ if (parsing) {
+ ReadLoopRestorationCoefficients(row4x4, column4x4, block_size);
+ }
+ const int row = row4x4 / block_width4x4;
+ const int column = column4x4 / block_width4x4;
+ if (parsing && decoding) {
+ uint8_t* residual_buffer = residual_buffer_.get();
+ if (!ProcessPartition(row4x4, column4x4,
+ block_parameters_holder_.Tree(row, column),
+ scratch_buffer, &residual_buffer)) {
+ LIBGAV1_DLOG(ERROR, "Error decoding partition row: %d column: %d", row4x4,
+ column4x4);
+ return false;
+ }
+ return true;
+ }
+ const int sb_row_index = SuperBlockRowIndex(row4x4);
+ const int sb_column_index = SuperBlockColumnIndex(column4x4);
+ if (parsing) {
+ residual_buffer_threaded_[sb_row_index][sb_column_index] =
+ residual_buffer_pool_->Get();
+ if (residual_buffer_threaded_[sb_row_index][sb_column_index] == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to get residual buffer.");
+ return false;
+ }
+ uint8_t* residual_buffer =
+ residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
+ if (!ProcessPartition(row4x4, column4x4,
+ block_parameters_holder_.Tree(row, column),
+ scratch_buffer, &residual_buffer)) {
+ LIBGAV1_DLOG(ERROR, "Error parsing partition row: %d column: %d", row4x4,
+ column4x4);
+ return false;
+ }
+ } else {
+ uint8_t* residual_buffer =
+ residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
+ if (!DecodeSuperBlock(block_parameters_holder_.Tree(row, column),
+ scratch_buffer, &residual_buffer)) {
+ LIBGAV1_DLOG(ERROR, "Error decoding superblock row: %d column: %d",
+ row4x4, column4x4);
+ return false;
+ }
+ residual_buffer_pool_->Release(
+ std::move(residual_buffer_threaded_[sb_row_index][sb_column_index]));
+ }
+ return true;
+}
+
+bool Tile::DecodeSuperBlock(ParameterTree* const tree,
+ TileScratchBuffer* const scratch_buffer,
+ ResidualPtr* residual) {
+ Stack<ParameterTree*, kDfsStackSize> stack;
+ stack.Push(tree);
+ do {
+ ParameterTree* const node = stack.Pop();
+ if (node->partition() != kPartitionNone) {
+ for (int i = 3; i >= 0; --i) {
+ if (node->children(i) == nullptr) continue;
+ stack.Push(node->children(i));
+ }
+ continue;
+ }
+ if (!DecodeBlock(node, scratch_buffer, residual)) {
+ LIBGAV1_DLOG(ERROR, "Error decoding block row: %d column: %d",
+ node->row4x4(), node->column4x4());
+ return false;
+ }
+ } while (!stack.Empty());
+ return true;
+}
+
+void Tile::ReadLoopRestorationCoefficients(int row4x4, int column4x4,
+ BlockSize block_size) {
+ if (frame_header_.allow_intrabc) return;
+ LoopRestorationInfo* const restoration_info = post_filter_.restoration_info();
+ const bool is_superres_scaled =
+ frame_header_.width != frame_header_.upscaled_width;
+ for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+ LoopRestorationUnitInfo unit_info;
+ if (restoration_info->PopulateUnitInfoForSuperBlock(
+ static_cast<Plane>(plane), block_size, is_superres_scaled,
+ frame_header_.superres_scale_denominator, row4x4, column4x4,
+ &unit_info)) {
+ for (int unit_row = unit_info.row_start; unit_row < unit_info.row_end;
+ ++unit_row) {
+ for (int unit_column = unit_info.column_start;
+ unit_column < unit_info.column_end; ++unit_column) {
+ const int unit_id = unit_row * restoration_info->num_horizontal_units(
+ static_cast<Plane>(plane)) +
+ unit_column;
+ restoration_info->ReadUnitCoefficients(
+ &reader_, &symbol_decoder_context_, static_cast<Plane>(plane),
+ unit_id, &reference_unit_info_);
+ }
+ }
+ }
+ }
+}
+
+void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) {
+ if (frame_header_.refresh_frame_flags == 0 ||
+ IsIntraFrame(frame_header_.frame_type)) {
+ return;
+ }
+ // Iterate over odd rows/columns beginning at the first odd row/column for the
+ // block. It is done this way because motion field mvs are only needed at an
+ // 8x8 granularity.
+ const int row_start4x4 = block.row4x4 | 1;
+ const int row_limit4x4 =
+ std::min(block.row4x4 + block.height4x4, frame_header_.rows4x4);
+ if (row_start4x4 >= row_limit4x4) return;
+ const int column_start4x4 = block.column4x4 | 1;
+ const int column_limit4x4 =
+ std::min(block.column4x4 + block.width4x4, frame_header_.columns4x4);
+ if (column_start4x4 >= column_limit4x4) return;
+
+ // The largest reference MV component that can be saved.
+ constexpr int kRefMvsLimit = (1 << 12) - 1;
+ const BlockParameters& bp = *block.bp;
+ ReferenceInfo* reference_info = current_frame_.reference_info();
+ for (int i = 1; i >= 0; --i) {
+ const ReferenceFrameType reference_frame_to_store = bp.reference_frame[i];
+ // Must make a local copy so that StoreMotionFieldMvs() knows there is no
+ // overlap between load and store.
+ const MotionVector mv_to_store = bp.mv.mv[i];
+ const int mv_row = std::abs(mv_to_store.mv[MotionVector::kRow]);
+ const int mv_column = std::abs(mv_to_store.mv[MotionVector::kColumn]);
+ if (reference_frame_to_store > kReferenceFrameIntra &&
+ // kRefMvsLimit equals 0x0FFF, so we can first bitwise OR the two
+ // absolute values and then compare with kRefMvsLimit to save a branch.
+ // The next line is equivalent to:
+ // mv_row <= kRefMvsLimit && mv_column <= kRefMvsLimit
+ (mv_row | mv_column) <= kRefMvsLimit &&
+ reference_info->relative_distance_from[reference_frame_to_store] < 0) {
+ const int row_start8x8 = DivideBy2(row_start4x4);
+ const int row_limit8x8 = DivideBy2(row_limit4x4);
+ const int column_start8x8 = DivideBy2(column_start4x4);
+ const int column_limit8x8 = DivideBy2(column_limit4x4);
+ const int rows = row_limit8x8 - row_start8x8;
+ const int columns = column_limit8x8 - column_start8x8;
+ const ptrdiff_t stride = DivideBy2(current_frame_.columns4x4());
+ ReferenceFrameType* const reference_frame_row_start =
+ &reference_info
+ ->motion_field_reference_frame[row_start8x8][column_start8x8];
+ MotionVector* const mv =
+ &reference_info->motion_field_mv[row_start8x8][column_start8x8];
+
+ // Specialize the cases where columns is 1, 2, 4, 8 or 16. This allows
+ // memset() to be inlined and simplifies std::fill() for these cases.
+ if (columns <= 1) {
+ // Don't change the above condition to (columns == 1).
+ // Condition (columns <= 1) may help the compiler simplify the inlining
+ // of the general case of StoreMotionFieldMvs() by eliminating the
+ // (columns == 0) case.
+ assert(columns == 1);
+ StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+ 1, reference_frame_row_start, mv);
+ } else if (columns == 2) {
+ StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+ 2, reference_frame_row_start, mv);
+ } else if (columns == 4) {
+ StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+ 4, reference_frame_row_start, mv);
+ } else if (columns == 8) {
+ StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+ 8, reference_frame_row_start, mv);
+ } else if (columns == 16) {
+ StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+ 16, reference_frame_row_start, mv);
+ } else if (columns < 16) {
+ // This always-true condition (columns < 16) may help the compiler
+ // simplify the inlining of the following function.
+ // This general case is rare and usually only happens for blocks that
+ // contain the right boundary of the frame.
+ StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+ columns, reference_frame_row_start, mv);
+ } else {
+ assert(false);
+ }
+ return;
+ }
+ }
+}
+
+} // namespace libgav1
diff --git a/src/tile_scratch_buffer.cc b/src/tile_scratch_buffer.cc
new file mode 100644
index 0000000..0b5ac96
--- /dev/null
+++ b/src/tile_scratch_buffer.cc
@@ -0,0 +1,26 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/tile_scratch_buffer.h"
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+#if !LIBGAV1_CXX17
+// static
+constexpr int TileScratchBuffer::kBlockDecodedStride;
+#endif
+
+} // namespace libgav1
diff --git a/src/tile_scratch_buffer.h b/src/tile_scratch_buffer.h
new file mode 100644
index 0000000..3eaf8b8
--- /dev/null
+++ b/src/tile_scratch_buffer.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_TILE_SCRATCH_BUFFER_H_
+#define LIBGAV1_SRC_TILE_SCRATCH_BUFFER_H_
+
+#include <cstdint>
+#include <mutex> // NOLINT (unapproved c++11 header)
+
+#include "src/dsp/constants.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/stack.h"
+
+namespace libgav1 {
+
+// Buffer to facilitate decoding a superblock.
+struct TileScratchBuffer : public MaxAlignedAllocable {
+ static constexpr int kBlockDecodedStride = 34;
+
+ LIBGAV1_MUST_USE_RESULT bool Init(int bitdepth) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ const int pixel_size = (bitdepth == 8) ? 1 : 2;
+#else
+ assert(bitdepth == 8);
+ static_cast<void>(bitdepth);
+ const int pixel_size = 1;
+#endif
+
+ constexpr int unaligned_convolve_buffer_stride =
+ kMaxScaledSuperBlockSizeInPixels + kConvolveBorderLeftTop +
+ kConvolveBorderRight;
+ convolve_block_buffer_stride = Align<ptrdiff_t>(
+ unaligned_convolve_buffer_stride * pixel_size, kMaxAlignment);
+ constexpr int convolve_buffer_height = kMaxScaledSuperBlockSizeInPixels +
+ kConvolveBorderLeftTop +
+ kConvolveBorderBottom;
+
+ convolve_block_buffer = MakeAlignedUniquePtr<uint8_t>(
+ kMaxAlignment, convolve_buffer_height * convolve_block_buffer_stride);
+ return convolve_block_buffer != nullptr;
+ }
+
+ // kCompoundPredictionTypeDiffWeighted prediction mode needs a mask of the
+ // prediction block size. This buffer is used to store that mask. The masks
+ // will be created for the Y plane and will be re-used for the U & V planes.
+ alignas(kMaxAlignment) uint8_t weight_mask[kMaxSuperBlockSizeSquareInPixels];
+
+ // For each instance of the TileScratchBuffer, only one of the following
+ // buffers will be used at any given time, so it is ok to share them in a
+ // union.
+ union {
+ // Buffers used for prediction process.
+ // Compound prediction calculations always output 16-bit values. Depending
+ // on the bitdepth the values may be treated as int16_t or uint16_t. See
+ // src/dsp/convolve.cc and src/dsp/warp.cc for explanations.
+ // Inter/intra calculations output Pixel values.
+ // These buffers always use the block width as the stride, which packs the
+ // values tightly and simplifies loads/stores for small blocks.
+
+ // 10/12 bit compound prediction and 10/12 bit inter/intra prediction.
+ alignas(kMaxAlignment) uint16_t
+ prediction_buffer[2][kMaxSuperBlockSizeSquareInPixels];
+ // 8 bit compound prediction buffer.
+ alignas(kMaxAlignment) int16_t
+ compound_prediction_buffer_8bpp[2][kMaxSuperBlockSizeSquareInPixels];
+
+ // Union usage note: This is used only by functions in the "intra"
+ // prediction path.
+ //
+ // Buffer used for storing subsampled luma samples needed for CFL
+ // prediction. This buffer is used to avoid repetition of the subsampling
+ // for the V plane when it is already done for the U plane.
+ int16_t cfl_luma_buffer[kCflLumaBufferStride][kCflLumaBufferStride];
+ };
+
+ // Buffer used for convolve. The maximum size required for this buffer is:
+ // maximum block height (with scaling and border) = 2 * 128 + 3 + 4 = 263.
+ // maximum block stride (with scaling and border aligned to 16) =
+ // (2 * 128 + 3 + 8 + 5) * pixel_size = 272 * pixel_size.
+ // Where pixel_size is (bitdepth == 8) ? 1 : 2.
+ // Has an alignment of kMaxAlignment when allocated.
+ AlignedUniquePtr<uint8_t> convolve_block_buffer;
+ ptrdiff_t convolve_block_buffer_stride;
+
+ // Flag indicating whether the data in |cfl_luma_buffer| is valid.
+ bool cfl_luma_buffer_valid;
+
+ // Equivalent to BlockDecoded array in the spec. This stores the decoded
+ // state of every 4x4 block in a superblock. It has 1 row/column border on
+ // all 4 sides (hence the 34x34 dimension instead of 32x32). Note that the
+ // spec uses "-1" as an index to access the left and top borders. In the
+ // code, we treat the index (1, 1) as equivalent to the spec's (0, 0). So
+ // all accesses into this array will be offset by +1 when compared with the
+ // spec.
+ bool block_decoded[kMaxPlanes][kBlockDecodedStride][kBlockDecodedStride];
+};
+
+class TileScratchBufferPool {
+ public:
+ void Reset(int bitdepth) {
+ if (bitdepth_ == bitdepth) return;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ == 8 && bitdepth != 8) {
+ // We are going from a pixel size of 1 to a pixel size of 2. So invalidate
+ // the stack.
+ std::lock_guard<std::mutex> lock(mutex_);
+ while (!buffers_.Empty()) {
+ buffers_.Pop();
+ }
+ }
+#endif
+ bitdepth_ = bitdepth;
+ }
+
+ std::unique_ptr<TileScratchBuffer> Get() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (buffers_.Empty()) {
+ std::unique_ptr<TileScratchBuffer> scratch_buffer(new (std::nothrow)
+ TileScratchBuffer);
+ if (scratch_buffer == nullptr || !scratch_buffer->Init(bitdepth_)) {
+ return nullptr;
+ }
+ return scratch_buffer;
+ }
+ return buffers_.Pop();
+ }
+
+ void Release(std::unique_ptr<TileScratchBuffer> scratch_buffer) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ buffers_.Push(std::move(scratch_buffer));
+ }
+
+ private:
+ std::mutex mutex_;
+ // We will never need more than kMaxThreads scratch buffers since that is the
+ // maximum amount of work that will be done at any given time.
+ Stack<std::unique_ptr<TileScratchBuffer>, kMaxThreads> buffers_
+ LIBGAV1_GUARDED_BY(mutex_);
+ int bitdepth_ = 0;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_TILE_SCRATCH_BUFFER_H_
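A minimal usage sketch of the pool defined above: a buffer is borrowed with Get(), used for one superblock, and handed back with Release() so another worker (at most kMaxThreads of them) can reuse the allocation. The worker function name and the surrounding decoder plumbing are assumptions for illustration, not part of this patch.

#include <memory>
#include <utility>

#include "src/tile_scratch_buffer.h"

namespace libgav1 {

// Hypothetical tile worker sketch.
bool DecodeOneSuperBlock(TileScratchBufferPool* const pool) {
  std::unique_ptr<TileScratchBuffer> scratch = pool->Get();
  if (scratch == nullptr) return false;  // Allocation or Init() failed.
  // ... prediction/reconstruction would use scratch->prediction_buffer,
  // scratch->convolve_block_buffer, scratch->block_decoded, etc. ...
  pool->Release(std::move(scratch));  // Return the buffer for reuse.
  return true;
}

}  // namespace libgav1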
diff --git a/src/utils/array_2d.h b/src/utils/array_2d.h
new file mode 100644
index 0000000..2df6241
--- /dev/null
+++ b/src/utils/array_2d.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_ARRAY_2D_H_
+#define LIBGAV1_SRC_UTILS_ARRAY_2D_H_
+
+#include <cassert>
+#include <cstddef>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <type_traits>
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// Exposes a 1D allocated memory buffer as a 2D array.
+template <typename T>
+class Array2DView {
+ public:
+ Array2DView() = default;
+ Array2DView(int rows, int columns, T* const data) {
+ Reset(rows, columns, data);
+ }
+
+ // Copyable and Movable.
+ Array2DView(const Array2DView& rhs) = default;
+ Array2DView& operator=(const Array2DView& rhs) = default;
+
+ void Reset(int rows, int columns, T* const data) {
+ rows_ = rows;
+ columns_ = columns;
+ data_ = data;
+ }
+
+ int rows() const { return rows_; }
+ int columns() const { return columns_; }
+
+ T* operator[](int row) { return const_cast<T*>(GetRow(row)); }
+
+ const T* operator[](int row) const { return GetRow(row); }
+
+ private:
+ const T* GetRow(int row) const {
+ assert(row < rows_);
+ const ptrdiff_t offset = static_cast<ptrdiff_t>(row) * columns_;
+ return data_ + offset;
+ }
+
+ int rows_ = 0;
+ int columns_ = 0;
+ T* data_ = nullptr;
+};
+
+// Allocates and owns the contiguous memory and exposes an Array2DView of
+// dimension |rows| x |columns|.
+template <typename T>
+class Array2D {
+ public:
+ Array2D() = default;
+
+ // Copyable and Movable.
+ Array2D(const Array2D& rhs) = default;
+ Array2D& operator=(const Array2D& rhs) = default;
+
+ LIBGAV1_MUST_USE_RESULT bool Reset(int rows, int columns,
+ bool zero_initialize = true) {
+ size_ = rows * columns;
+ // If T is not a trivial type, we should always reallocate the data_
+ // buffer, so that the destructors of any existing objects are invoked.
+ if (!std::is_trivial<T>::value || allocated_size_ < size_) {
+ // Note: This invokes the global operator new if T is a non-class type,
+ // such as integer or enum types, or a class type that is not derived
+ // from libgav1::Allocable, such as std::unique_ptr. If we enforce a
+ // maximum allocation size or keep track of our own heap memory
+ // consumption, we will need to handle the allocations here that use the
+ // global operator new.
+ if (zero_initialize) {
+ data_.reset(new (std::nothrow) T[size_]());
+ } else {
+ data_.reset(new (std::nothrow) T[size_]);
+ }
+ if (data_ == nullptr) {
+ allocated_size_ = 0;
+ return false;
+ }
+ allocated_size_ = size_;
+ } else if (zero_initialize) {
+ // Cast the data_ pointer to void* to avoid the GCC -Wclass-memaccess
+ // warning. The memset is safe because T is a trivial type.
+ void* dest = data_.get();
+ memset(dest, 0, sizeof(T) * size_);
+ }
+ data_view_.Reset(rows, columns, data_.get());
+ return true;
+ }
+
+ int rows() const { return data_view_.rows(); }
+ int columns() const { return data_view_.columns(); }
+ size_t size() const { return size_; }
+ T* data() { return data_.get(); }
+ const T* data() const { return data_.get(); }
+
+ T* operator[](int row) { return data_view_[row]; }
+
+ const T* operator[](int row) const { return data_view_[row]; }
+
+ private:
+ std::unique_ptr<T[]> data_ = nullptr;
+ size_t allocated_size_ = 0;
+ size_t size_ = 0;
+ Array2DView<T> data_view_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_ARRAY_2D_H_
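As a quick illustration of the two classes above, a small sketch; the function name and the dimensions are made up for the example, not taken from the library.

#include <cstdio>

#include "src/utils/array_2d.h"

namespace libgav1 {

bool Array2DExample() {
  Array2D<int> grid;
  // Allocates 4 * 6 ints and zero-initializes them (the default).
  if (!grid.Reset(/*rows=*/4, /*columns=*/6)) return false;
  grid[2][3] = 7;
  // A non-owning 2D view over the same memory; rows are indexed the same way.
  Array2DView<int> view(grid.rows(), grid.columns(), grid.data());
  printf("%d\n", view[2][3]);  // Prints 7.
  return true;
}

}  // namespace libgav1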
diff --git a/src/utils/bit_mask_set.h b/src/utils/bit_mask_set.h
new file mode 100644
index 0000000..7371753
--- /dev/null
+++ b/src/utils/bit_mask_set.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_BIT_MASK_SET_H_
+#define LIBGAV1_SRC_UTILS_BIT_MASK_SET_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+
+// This class is used to check if a given value is equal to one of the several
+// predetermined values using a bit mask instead of a chain of comparisons and
+// ||s. This usually results in fewer instructions.
+//
+// Usage:
+// constexpr BitMaskSet set(value1, value2);
+// set.Contains(value1) => returns true.
+// set.Contains(value3) => returns false.
+class BitMaskSet {
+ public:
+ explicit constexpr BitMaskSet(uint32_t mask) : mask_(mask) {}
+
+ constexpr BitMaskSet(int v1, int v2) : mask_((1U << v1) | (1U << v2)) {}
+
+ constexpr BitMaskSet(int v1, int v2, int v3)
+ : mask_((1U << v1) | (1U << v2) | (1U << v3)) {}
+
+ constexpr BitMaskSet(int v1, int v2, int v3, int v4)
+ : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4)) {}
+
+ constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5)
+ : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5)) {}
+
+ constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6)
+ : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) |
+ (1U << v6)) {}
+
+ constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6, int v7)
+ : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) |
+ (1U << v6) | (1U << v7)) {}
+
+ constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
+ int v8, int v9)
+ : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) |
+ (1U << v6) | (1U << v7) | (1U << v8) | (1U << v9)) {}
+
+ constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
+ int v8, int v9, int v10)
+ : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) |
+ (1U << v6) | (1U << v7) | (1U << v8) | (1U << v9) | (1U << v10)) {
+ }
+
+ constexpr bool Contains(uint8_t value) const {
+ return MaskContainsValue(mask_, value);
+ }
+
+ static constexpr bool MaskContainsValue(uint32_t mask, uint8_t value) {
+ return ((mask >> value) & 1) != 0;
+ }
+
+ private:
+ const uint32_t mask_;
+};
+
+} // namespace libgav1
+#endif // LIBGAV1_SRC_UTILS_BIT_MASK_SET_H_
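To make the mask arithmetic concrete, a short compile-time sketch; the values 3 and 5 and the variable name are arbitrary choices for the example.

#include "src/utils/bit_mask_set.h"

namespace libgav1 {

// A set containing 3 and 5 has mask_ == (1 << 3) | (1 << 5) == 0x28, so
// Contains() reduces to a single shift-and-mask instead of two comparisons.
constexpr BitMaskSet kExampleSet(3, 5);
static_assert(kExampleSet.Contains(3), "");
static_assert(kExampleSet.Contains(5), "");
static_assert(!kExampleSet.Contains(4), "");
static_assert(BitMaskSet::MaskContainsValue(0x28, 5), "");

}  // namespace libgav1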
diff --git a/src/utils/bit_reader.cc b/src/utils/bit_reader.cc
new file mode 100644
index 0000000..3234128
--- /dev/null
+++ b/src/utils/bit_reader.cc
@@ -0,0 +1,117 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/bit_reader.h"
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace {
+
+bool Assign(int* const value, int assignment, bool return_value) {
+ *value = assignment;
+ return return_value;
+}
+
+// 5.9.29.
+int InverseRecenter(int r, int v) {
+ if (v > (r << 1)) {
+ return v;
+ }
+ if ((v & 1) != 0) {
+ return r - ((v + 1) >> 1);
+ }
+ return r + (v >> 1);
+}
+
+} // namespace
+
+bool BitReader::DecodeSignedSubexpWithReference(int low, int high,
+ int reference, int control,
+ int* const value) {
+ if (!DecodeUnsignedSubexpWithReference(high - low, reference - low, control,
+ value)) {
+ return false;
+ }
+ *value += low;
+ return true;
+}
+
+bool BitReader::DecodeUniform(int n, int* const value) {
+ if (n <= 1) {
+ return Assign(value, 0, true);
+ }
+ const int w = FloorLog2(n) + 1;
+ const int m = (1 << w) - n;
+ assert(w - 1 < 32);
+ const int v = static_cast<int>(ReadLiteral(w - 1));
+ if (v == -1) {
+ return Assign(value, 0, false);
+ }
+ if (v < m) {
+ return Assign(value, v, true);
+ }
+ const int extra_bit = ReadBit();
+ if (extra_bit == -1) {
+ return Assign(value, 0, false);
+ }
+ return Assign(value, (v << 1) - m + extra_bit, true);
+}
+
+bool BitReader::DecodeUnsignedSubexpWithReference(int mx, int reference,
+ int control,
+ int* const value) {
+ int v;
+ if (!DecodeSubexp(mx, control, &v)) return false;
+ if ((reference << 1) <= mx) {
+ *value = InverseRecenter(reference, v);
+ } else {
+ *value = mx - 1 - InverseRecenter(mx - 1 - reference, v);
+ }
+ return true;
+}
+
+bool BitReader::DecodeSubexp(int num_symbols, int control, int* const value) {
+ int i = 0;
+ int mk = 0;
+ while (true) {
+ const int b = (i != 0) ? control + i - 1 : control;
+ if (b >= 32) {
+ return Assign(value, 0, false);
+ }
+ const int a = 1 << b;
+ if (num_symbols <= mk + 3 * a) {
+ if (!DecodeUniform(num_symbols - mk, value)) return false;
+ *value += mk;
+ return true;
+ }
+ const int8_t subexp_more_bits = ReadBit();
+ if (subexp_more_bits == -1) return false;
+ if (subexp_more_bits != 0) {
+ ++i;
+ mk += a;
+ } else {
+ const int subexp_bits = static_cast<int>(ReadLiteral(b));
+ if (subexp_bits == -1) {
+ return Assign(value, 0, false);
+ }
+ return Assign(value, subexp_bits + mk, true);
+ }
+ }
+}
+
+} // namespace libgav1
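For reference, DecodeUniform() implements the ns(n) code from the spec: with |n| = 5 it computes w = 3 and m = 3, so the values 0..2 consume two bits and the values 3..4 consume three. The standalone program below is not part of the library; it only enumerates those code lengths using the same w/m computation.

#include <cstdio>

int main() {
  const int n = 5;
  // Same computation as DecodeUniform(): w = FloorLog2(n) + 1, m = 2^w - n.
  int floor_log2 = 0;
  while ((n >> (floor_log2 + 1)) != 0) ++floor_log2;
  const int w = floor_log2 + 1;
  const int m = (1 << w) - n;
  for (int value = 0; value < n; ++value) {
    // Values below |m| fit in the short (w - 1)-bit code; the rest need the
    // extra bit read by DecodeUniform().
    const int bits = (value < m) ? w - 1 : w;
    printf("value %d uses %d bit(s)\n", value, bits);
  }
  return 0;
}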
diff --git a/src/utils/bit_reader.h b/src/utils/bit_reader.h
new file mode 100644
index 0000000..5a10e12
--- /dev/null
+++ b/src/utils/bit_reader.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_BIT_READER_H_
+#define LIBGAV1_SRC_UTILS_BIT_READER_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+
+class BitReader {
+ public:
+ virtual ~BitReader() = default;
+
+ virtual int ReadBit() = 0;
+ // |num_bits| has to be <= 32. The function returns a value in the range [0,
+ // 2^num_bits - 1] (inclusive) on success and -1 on failure.
+ virtual int64_t ReadLiteral(int num_bits) = 0;
+
+ bool DecodeSignedSubexpWithReference(int low, int high, int reference,
+ int control, int* value); // 5.9.26.
+ // Decodes a nonnegative integer with maximum number of values |n| (i.e.,
+ // output in range 0..n-1) by following the process specified in Section
+ // 4.10.7 ns(n) and Section 4.10.10 NS(n) of the spec.
+ bool DecodeUniform(int n, int* value);
+
+ private:
+ // Helper functions for DecodeSignedSubexpWithReference.
+ bool DecodeUnsignedSubexpWithReference(int mx, int reference, int control,
+ int* value); // 5.9.27.
+ bool DecodeSubexp(int num_symbols, int control, int* value); // 5.9.28.
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_BIT_READER_H_
diff --git a/src/utils/block_parameters_holder.cc b/src/utils/block_parameters_holder.cc
new file mode 100644
index 0000000..3ccdb9b
--- /dev/null
+++ b/src/utils/block_parameters_holder.cc
@@ -0,0 +1,107 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/block_parameters_holder.h"
+
+#include <algorithm>
+
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/parameter_tree.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+namespace {
+
+// Returns the number of super block rows/columns for |value4x4| where value4x4
+// is either rows4x4 or columns4x4.
+int RowsOrColumns4x4ToSuperBlocks(int value4x4, bool use_128x128_superblock) {
+ return use_128x128_superblock ? DivideBy128(MultiplyBy4(value4x4) + 127)
+ : DivideBy64(MultiplyBy4(value4x4) + 63);
+}
+
+} // namespace
+
+bool BlockParametersHolder::Reset(int rows4x4, int columns4x4,
+ bool use_128x128_superblock) {
+ rows4x4_ = rows4x4;
+ columns4x4_ = columns4x4;
+ use_128x128_superblock_ = use_128x128_superblock;
+ if (!block_parameters_cache_.Reset(rows4x4_, columns4x4_)) {
+ LIBGAV1_DLOG(ERROR, "block_parameters_cache_.Reset() failed.");
+ return false;
+ }
+ const int rows =
+ RowsOrColumns4x4ToSuperBlocks(rows4x4_, use_128x128_superblock_);
+ const int columns =
+ RowsOrColumns4x4ToSuperBlocks(columns4x4_, use_128x128_superblock_);
+ const BlockSize sb_size =
+ use_128x128_superblock_ ? kBlock128x128 : kBlock64x64;
+ const int multiplier = kNum4x4BlocksWide[sb_size];
+ if (!trees_.Reset(rows, columns, /*zero_initialize=*/false)) {
+ LIBGAV1_DLOG(ERROR, "trees_.Reset() failed.");
+ return false;
+ }
+ for (int i = 0; i < rows; ++i) {
+ for (int j = 0; j < columns; ++j) {
+ trees_[i][j] =
+ ParameterTree::Create(i * multiplier, j * multiplier, sb_size);
+ if (trees_[i][j] == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Allocation of trees_[%d][%d] failed.", i, j);
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+void BlockParametersHolder::FillCache(int row4x4, int column4x4,
+ BlockSize block_size,
+ BlockParameters* const bp) {
+ int rows = std::min(static_cast<int>(kNum4x4BlocksHigh[block_size]),
+ rows4x4_ - row4x4);
+ const int columns = std::min(static_cast<int>(kNum4x4BlocksWide[block_size]),
+ columns4x4_ - column4x4);
+ auto* bp_dst = &block_parameters_cache_[row4x4][column4x4];
+ // Specialize the common |columns| cases (values in kNum4x4BlocksWide[]) for
+ // better performance.
+ if (columns == 1) {
+ SetBlock<BlockParameters*>(rows, 1, bp, bp_dst, columns4x4_);
+ } else if (columns == 2) {
+ SetBlock<BlockParameters*>(rows, 2, bp, bp_dst, columns4x4_);
+ } else if (columns == 4) {
+ SetBlock<BlockParameters*>(rows, 4, bp, bp_dst, columns4x4_);
+ } else if (columns == 8) {
+ SetBlock<BlockParameters*>(rows, 8, bp, bp_dst, columns4x4_);
+ } else if (columns == 16) {
+ SetBlock<BlockParameters*>(rows, 16, bp, bp_dst, columns4x4_);
+ } else if (columns == 32) {
+ SetBlock<BlockParameters*>(rows, 32, bp, bp_dst, columns4x4_);
+ } else {
+ do {
+ // The following loop has better performance than std::fill(), which has
+ // some overhead in checking for a zero loop count.
+ int x = columns;
+ auto* d = bp_dst;
+ do {
+ *d++ = bp;
+ } while (--x != 0);
+ bp_dst += columns4x4_;
+ } while (--rows != 0);
+ }
+}
+
+} // namespace libgav1
diff --git a/src/utils/block_parameters_holder.h b/src/utils/block_parameters_holder.h
new file mode 100644
index 0000000..35543c3
--- /dev/null
+++ b/src/utils/block_parameters_holder.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_
+#define LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_
+
+#include <memory>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/parameter_tree.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// Holds a 2D array of |ParameterTree| objects. Each tree stores the parameters
+// corresponding to a superblock.
+class BlockParametersHolder {
+ public:
+ BlockParametersHolder() = default;
+
+ // Not copyable or movable.
+ BlockParametersHolder(const BlockParametersHolder&) = delete;
+ BlockParametersHolder& operator=(const BlockParametersHolder&) = delete;
+
+ // If |use_128x128_superblock| is true, 128x128 superblocks will be used,
+ // otherwise 64x64 superblocks will be used.
+ LIBGAV1_MUST_USE_RESULT bool Reset(int rows4x4, int columns4x4,
+ bool use_128x128_superblock);
+
+ // Finds the BlockParameters corresponding to |row4x4| and |column4x4|. This
+ // is done as a simple look up of the |block_parameters_cache_| matrix.
+ // Returns nullptr if the BlockParameters cannot be found.
+ BlockParameters* Find(int row4x4, int column4x4) const {
+ return block_parameters_cache_[row4x4][column4x4];
+ }
+
+ BlockParameters** Address(int row4x4, int column4x4) {
+ return block_parameters_cache_.data() + row4x4 * columns4x4_ + column4x4;
+ }
+
+ BlockParameters* const* Address(int row4x4, int column4x4) const {
+ return block_parameters_cache_.data() + row4x4 * columns4x4_ + column4x4;
+ }
+
+ int columns4x4() const { return columns4x4_; }
+
+ // Returns the ParameterTree corresponding to superblock starting at (|row|,
+ // |column|).
+ ParameterTree* Tree(int row, int column) { return trees_[row][column].get(); }
+
+ // Fills the cache matrix for the block starting at |row4x4|, |column4x4| of
+ // size |block_size| with the pointer |bp|.
+ void FillCache(int row4x4, int column4x4, BlockSize block_size,
+ BlockParameters* bp);
+
+ private:
+ int rows4x4_ = 0;
+ int columns4x4_ = 0;
+ bool use_128x128_superblock_ = false;
+ Array2D<std::unique_ptr<ParameterTree>> trees_;
+
+ // This is a 2D array of size |rows4x4_| * |columns4x4_|. It is filled in by
+ // FillCache() and used by Find() to perform each lookup with exactly one
+ // array access (instead of traversing the entire tree).
+ Array2D<BlockParameters*> block_parameters_cache_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_
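A small sketch of the Reset()/FillCache()/Find() flow described above; the function name and the 16x16 4x4-block dimensions are illustrative, and the BlockParameters object is assumed to come from the caller.

#include "src/utils/block_parameters_holder.h"

namespace libgav1 {

bool BlockParametersHolderExample(BlockParameters* const bp) {
  BlockParametersHolder holder;
  // A 64x64 pixel area is 16x16 4x4 blocks; use 64x64 superblocks.
  if (!holder.Reset(/*rows4x4=*/16, /*columns4x4=*/16,
                    /*use_128x128_superblock=*/false)) {
    return false;
  }
  // Record that the 8x8 block starting at 4x4 position (4, 8) uses |bp| ...
  holder.FillCache(/*row4x4=*/4, /*column4x4=*/8, kBlock8x8, bp);
  // ... and any 4x4 unit inside that block now finds it in one lookup.
  return holder.Find(5, 9) == bp;
}

}  // namespace libgav1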
diff --git a/src/utils/blocking_counter.h b/src/utils/blocking_counter.h
new file mode 100644
index 0000000..6d664f8
--- /dev/null
+++ b/src/utils/blocking_counter.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_BLOCKING_COUNTER_H_
+#define LIBGAV1_SRC_UTILS_BLOCKING_COUNTER_H_
+
+#include <cassert>
+#include <condition_variable> // NOLINT (unapproved c++11 header)
+#include <mutex> // NOLINT (unapproved c++11 header)
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// Implementation of a Blocking Counter that is used for the "fork-join"
+// use case. Typical usage would be as follows:
+// BlockingCounter counter(num_jobs);
+// - spawn the jobs.
+// - call counter.Wait() on the master thread.
+// - worker threads will call counter.Decrement().
+// - master thread will return from counter.Wait() when all workers are
+// complete.
+template <bool has_failure_status>
+class BlockingCounterImpl {
+ public:
+ explicit BlockingCounterImpl(int initial_count)
+ : count_(initial_count), job_failed_(false) {}
+
+ // Increment the counter by |count|. This must be called before Wait() is
+ // called. This must be called from the same thread that will call Wait().
+ void IncrementBy(int count) {
+ assert(count >= 0);
+ std::unique_lock<std::mutex> lock(mutex_);
+ count_ += count;
+ }
+
+ // Decrement the counter by 1. This function can be called only when
+ // |has_failure_status| is false, i.e., when this class is being used via the
+ // |BlockingCounter| alias.
+ void Decrement() {
+ static_assert(!has_failure_status, "");
+ std::unique_lock<std::mutex> lock(mutex_);
+ if (--count_ == 0) {
+ condition_.notify_one();
+ }
+ }
+
+ // Decrement the counter by 1. This function can be called only when
+ // |has_failure_status| is true, i.e., when this class is being used via the
+ // |BlockingCounterWithStatus| alias. |job_succeeded| is used to update the
+ // state of |job_failed_|.
+ void Decrement(bool job_succeeded) {
+ static_assert(has_failure_status, "");
+ std::unique_lock<std::mutex> lock(mutex_);
+ job_failed_ |= !job_succeeded;
+ if (--count_ == 0) {
+ condition_.notify_one();
+ }
+ }
+
+ // Block until the counter becomes 0. This function can be called only once
+ // per object. If |has_failure_status| is true, true is returned if all the
+ // jobs succeeded and false is returned if any of the jobs failed. If
+ // |has_failure_status| is false, this function always returns true.
+ bool Wait() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ condition_.wait(lock, [this]() { return count_ == 0; });
+ // If |has_failure_status| is false, we simply return true.
+ return has_failure_status ? !job_failed_ : true;
+ }
+
+ private:
+ std::mutex mutex_;
+ std::condition_variable condition_;
+ int count_ LIBGAV1_GUARDED_BY(mutex_);
+ bool job_failed_ LIBGAV1_GUARDED_BY(mutex_);
+};
+
+using BlockingCounterWithStatus = BlockingCounterImpl<true>;
+using BlockingCounter = BlockingCounterImpl<false>;
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_BLOCKING_COUNTER_H_
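A minimal fork-join sketch of the usage described in the class comment above; the thread management shown here is only illustrative and is not how the library's own thread pool drives the counter.

#include <thread>
#include <vector>

#include "src/utils/blocking_counter.h"

namespace libgav1 {

void RunJobsAndWait(int num_jobs) {
  BlockingCounter counter(num_jobs);
  std::vector<std::thread> workers;
  for (int i = 0; i < num_jobs; ++i) {
    workers.emplace_back([&counter]() {
      // ... perform the work for this job ...
      counter.Decrement();  // Signal completion of one job.
    });
  }
  counter.Wait();  // Returns once every worker has called Decrement().
  for (auto& worker : workers) worker.join();
}

}  // namespace libgav1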
diff --git a/src/utils/common.h b/src/utils/common.h
new file mode 100644
index 0000000..ae43c2b
--- /dev/null
+++ b/src/utils/common.h
@@ -0,0 +1,534 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_COMMON_H_
+#define LIBGAV1_SRC_UTILS_COMMON_H_
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#pragma intrinsic(_BitScanForward)
+#pragma intrinsic(_BitScanReverse)
+#if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)
+#pragma intrinsic(_BitScanReverse64)
+#define HAVE_BITSCANREVERSE64
+#endif // defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)
+#endif // defined(_MSC_VER)
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <type_traits>
+
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// Aligns |value| to the desired |alignment|. |alignment| must be a power of 2.
+template <typename T>
+inline T Align(T value, T alignment) {
+ assert(alignment != 0);
+ const T alignment_mask = alignment - 1;
+ return (value + alignment_mask) & ~alignment_mask;
+}
+
+// Aligns |addr| to the desired |alignment|. |alignment| must be a power of 2.
+inline uint8_t* AlignAddr(uint8_t* const addr, const uintptr_t alignment) {
+ const auto value = reinterpret_cast<uintptr_t>(addr);
+ return reinterpret_cast<uint8_t*>(Align(value, alignment));
+}
+
+inline int32_t Clip3(int32_t value, int32_t low, int32_t high) {
+ return value < low ? low : (value > high ? high : value);
+}
+
+template <typename Pixel>
+void ExtendLine(void* const line_start, const int width, const int left,
+ const int right) {
+ auto* const start = static_cast<Pixel*>(line_start);
+ const Pixel* src = start;
+ Pixel* dst = start - left;
+ // Copy to left and right borders.
+ Memset(dst, src[0], left);
+ Memset(dst + left + width, src[width - 1], right);
+}
+
+// The following two templates set a block of data, whose rows may not be
+// contiguous in memory, to |value|. Compilers usually generate several
+// branches to handle different cases of |columns| when inlining memset() and
+// std::fill(), and these branches unfortunately end up inside the |rows|
+// loop, so calling these templates directly could be inefficient. It is
+// recommended to specialize common cases of |columns| (such as 1, 2, 4, 8, 16
+// and 32) before falling back to the generic case of |columns|; a sketch of
+// this pattern follows SetBlock() below. The code size may be larger, but the
+// speed gains are significant.
+// Call template MemSetBlock<> when sizeof(|T|) is 1.
+// Call template SetBlock<> when sizeof(|T|) is larger than 1.
+template <typename T>
+void MemSetBlock(int rows, int columns, T value, T* dst, ptrdiff_t stride) {
+ static_assert(sizeof(T) == 1, "");
+ do {
+ memset(dst, value, columns);
+ dst += stride;
+ } while (--rows != 0);
+}
+
+template <typename T>
+void SetBlock(int rows, int columns, T value, T* dst, ptrdiff_t stride) {
+ do {
+ std::fill(dst, dst + columns, value);
+ dst += stride;
+ } while (--rows != 0);
+}
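A sketch of the specialization pattern recommended above. The wrapper name SetRectangle is hypothetical; FillCache() in src/utils/block_parameters_holder.cc (added later in this patch) is the real in-tree example of the idea.

#include <cstddef>

#include "src/utils/common.h"

namespace libgav1 {

// Dispatch the common widths with compile-time constants so the branches on
// |columns| are resolved once, before SetBlock()'s row loop runs.
template <typename T>
void SetRectangle(int rows, int columns, T value, T* dst, ptrdiff_t stride) {
  if (columns == 4) {
    SetBlock<T>(rows, 4, value, dst, stride);
  } else if (columns == 8) {
    SetBlock<T>(rows, 8, value, dst, stride);
  } else if (columns == 16) {
    SetBlock<T>(rows, 16, value, dst, stride);
  } else {
    SetBlock<T>(rows, columns, value, dst, stride);
  }
}

}  // namespace libgav1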
+
+#if defined(__GNUC__)
+
+inline int CountLeadingZeros(uint32_t n) {
+ assert(n != 0);
+ return __builtin_clz(n);
+}
+
+inline int CountLeadingZeros(uint64_t n) {
+ assert(n != 0);
+ return __builtin_clzll(n);
+}
+
+inline int CountTrailingZeros(uint32_t n) {
+ assert(n != 0);
+ return __builtin_ctz(n);
+}
+
+#elif defined(_MSC_VER)
+
+inline int CountLeadingZeros(uint32_t n) {
+ assert(n != 0);
+ unsigned long first_set_bit; // NOLINT(runtime/int)
+ const unsigned char bit_set = _BitScanReverse(&first_set_bit, n);
+ assert(bit_set != 0);
+ static_cast<void>(bit_set);
+ return 31 ^ static_cast<int>(first_set_bit);
+}
+
+inline int CountLeadingZeros(uint64_t n) {
+ assert(n != 0);
+ unsigned long first_set_bit; // NOLINT(runtime/int)
+#if defined(HAVE_BITSCANREVERSE64)
+ const unsigned char bit_set =
+ _BitScanReverse64(&first_set_bit, static_cast<unsigned __int64>(n));
+#else // !defined(HAVE_BITSCANREVERSE64)
+ const auto n_hi = static_cast<unsigned long>(n >> 32); // NOLINT(runtime/int)
+ if (n_hi != 0) {
+ const unsigned char bit_set = _BitScanReverse(&first_set_bit, n_hi);
+ assert(bit_set != 0);
+ static_cast<void>(bit_set);
+ return 31 ^ static_cast<int>(first_set_bit);
+ }
+ const unsigned char bit_set = _BitScanReverse(
+ &first_set_bit, static_cast<unsigned long>(n)); // NOLINT(runtime/int)
+#endif // defined(HAVE_BITSCANREVERSE64)
+ assert(bit_set != 0);
+ static_cast<void>(bit_set);
+ return 63 ^ static_cast<int>(first_set_bit);
+}
+
+#undef HAVE_BITSCANREVERSE64
+
+inline int CountTrailingZeros(uint32_t n) {
+ assert(n != 0);
+ unsigned long first_set_bit; // NOLINT(runtime/int)
+ const unsigned char bit_set = _BitScanForward(&first_set_bit, n);
+ assert(bit_set != 0);
+ static_cast<void>(bit_set);
+ return static_cast<int>(first_set_bit);
+}
+
+#else // !defined(__GNUC__) && !defined(_MSC_VER)
+
+template <const int kMSB, typename T>
+inline int CountLeadingZeros(T n) {
+ assert(n != 0);
+ const T msb = T{1} << kMSB;
+ int count = 0;
+ while ((n & msb) == 0) {
+ ++count;
+ n <<= 1;
+ }
+ return count;
+}
+
+inline int CountLeadingZeros(uint32_t n) { return CountLeadingZeros<31>(n); }
+
+inline int CountLeadingZeros(uint64_t n) { return CountLeadingZeros<63>(n); }
+
+// This is the algorithm on the left in Figure 5-23, Hacker's Delight, Second
+// Edition, page 109. The book says:
+// If the number of trailing 0's is expected to be small or large, then the
+// simple loops shown in Figure 5-23 are quite fast.
+inline int CountTrailingZeros(uint32_t n) {
+ assert(n != 0);
+ // Create a word with 1's at the positions of the trailing 0's in |n|, and
+ // 0's elsewhere (e.g., 01011000 => 00000111).
+ n = ~n & (n - 1);
+ int count = 0;
+ while (n != 0) {
+ ++count;
+ n >>= 1;
+ }
+ return count;
+}
+
+#endif // defined(__GNUC__)
+
+inline int FloorLog2(int32_t n) {
+ assert(n > 0);
+ return 31 ^ CountLeadingZeros(static_cast<uint32_t>(n));
+}
+
+inline int FloorLog2(uint32_t n) {
+ assert(n > 0);
+ return 31 ^ CountLeadingZeros(n);
+}
+
+inline int FloorLog2(int64_t n) {
+ assert(n > 0);
+ return 63 ^ CountLeadingZeros(static_cast<uint64_t>(n));
+}
+
+inline int FloorLog2(uint64_t n) {
+ assert(n > 0);
+ return 63 ^ CountLeadingZeros(n);
+}
+
+inline int CeilLog2(unsigned int n) {
+ // The expression FloorLog2(n - 1) + 1 is undefined not only for n == 0 but
+ // also for n == 1, so this expression must be guarded by the n < 2 test. An
+ // alternative implementation is:
+ // return (n == 0) ? 0 : FloorLog2(n) + static_cast<int>((n & (n - 1)) != 0);
+ return (n < 2) ? 0 : FloorLog2(n - 1) + 1;
+}
+
+inline int RightShiftWithCeiling(int value, int bits) {
+ assert(bits > 0);
+ return (value + (1 << bits) - 1) >> bits;
+}
+
+inline int32_t RightShiftWithRounding(int32_t value, int bits) {
+ assert(bits >= 0);
+ return (value + ((1 << bits) >> 1)) >> bits;
+}
+
+inline uint32_t RightShiftWithRounding(uint32_t value, int bits) {
+ assert(bits >= 0);
+ return (value + ((1 << bits) >> 1)) >> bits;
+}
+
+// This variant is used when |value| can exceed 32 bits, although the final
+// result must always fit into int32_t.
+inline int32_t RightShiftWithRounding(int64_t value, int bits) {
+ assert(bits >= 0);
+ return static_cast<int32_t>((value + ((int64_t{1} << bits) >> 1)) >> bits);
+}
+
+inline int32_t RightShiftWithRoundingSigned(int32_t value, int bits) {
+ assert(bits > 0);
+ // The next line is equivalent to:
+ // return (value >= 0) ? RightShiftWithRounding(value, bits)
+ // : -RightShiftWithRounding(-value, bits);
+ return RightShiftWithRounding(value + (value >> 31), bits);
+}
+
+// This variant is used when |value| can exceed 32 bits, although the final
+// result must always fit into int32_t.
+inline int32_t RightShiftWithRoundingSigned(int64_t value, int bits) {
+ assert(bits > 0);
+ // The next line is equivalent to:
+ // return (value >= 0) ? RightShiftWithRounding(value, bits)
+ // : -RightShiftWithRounding(-value, bits);
+ return RightShiftWithRounding(value + (value >> 63), bits);
+}
+
+constexpr int DivideBy2(int n) { return n >> 1; }
+constexpr int DivideBy4(int n) { return n >> 2; }
+constexpr int DivideBy8(int n) { return n >> 3; }
+constexpr int DivideBy16(int n) { return n >> 4; }
+constexpr int DivideBy32(int n) { return n >> 5; }
+constexpr int DivideBy64(int n) { return n >> 6; }
+constexpr int DivideBy128(int n) { return n >> 7; }
+
+// Convert |value| to unsigned before shifting to avoid undefined behavior with
+// negative values.
+inline int LeftShift(int value, int bits) {
+ assert(bits >= 0);
+ assert(value >= -(int64_t{1} << (31 - bits)));
+ assert(value <= (int64_t{1} << (31 - bits)) - ((bits == 0) ? 1 : 0));
+ return static_cast<int>(static_cast<uint32_t>(value) << bits);
+}
+inline int MultiplyBy2(int n) { return LeftShift(n, 1); }
+inline int MultiplyBy4(int n) { return LeftShift(n, 2); }
+inline int MultiplyBy8(int n) { return LeftShift(n, 3); }
+inline int MultiplyBy16(int n) { return LeftShift(n, 4); }
+inline int MultiplyBy32(int n) { return LeftShift(n, 5); }
+inline int MultiplyBy64(int n) { return LeftShift(n, 6); }
+
+constexpr int Mod32(int n) { return n & 0x1f; }
+constexpr int Mod64(int n) { return n & 0x3f; }
+
+//------------------------------------------------------------------------------
+// Bitstream functions
+
+constexpr bool IsIntraFrame(FrameType type) {
+ return type == kFrameKey || type == kFrameIntraOnly;
+}
+
+inline TransformClass GetTransformClass(TransformType tx_type) {
+ constexpr BitMaskSet kTransformClassVerticalMask(
+ kTransformTypeIdentityDct, kTransformTypeIdentityAdst,
+ kTransformTypeIdentityFlipadst);
+ if (kTransformClassVerticalMask.Contains(tx_type)) {
+ return kTransformClassVertical;
+ }
+ constexpr BitMaskSet kTransformClassHorizontalMask(
+ kTransformTypeDctIdentity, kTransformTypeAdstIdentity,
+ kTransformTypeFlipadstIdentity);
+ if (kTransformClassHorizontalMask.Contains(tx_type)) {
+ return kTransformClassHorizontal;
+ }
+ return kTransformClass2D;
+}
+
+inline int RowOrColumn4x4ToPixel(int row_or_column4x4, Plane plane,
+ int8_t subsampling) {
+ return MultiplyBy4(row_or_column4x4) >> (plane == kPlaneY ? 0 : subsampling);
+}
+
+constexpr PlaneType GetPlaneType(Plane plane) {
+ return static_cast<PlaneType>(plane != kPlaneY);
+}
+
+// 5.11.44.
+constexpr bool IsDirectionalMode(PredictionMode mode) {
+ return mode >= kPredictionModeVertical && mode <= kPredictionModeD67;
+}
+
+// 5.9.3.
+//
+// |a| and |b| are order hints, treated as unsigned order_hint_bits-bit
+// integers. |order_hint_shift_bits| equals (32 - order_hint_bits) % 32.
+// order_hint_bits is at most 8, so |order_hint_shift_bits| is zero or a
+// value between 24 and 31 (inclusive).
+//
+// If |order_hint_shift_bits| is zero, |a| and |b| are both zeros, and the
+// result is zero. If |order_hint_shift_bits| is not zero, returns the
+// signed difference |a| - |b| using "modular arithmetic". More precisely, the
+// signed difference |a| - |b| is treated as a signed order_hint_bits-bit
+// integer and cast to an int. The returned difference is between
+// -(1 << (order_hint_bits - 1)) and (1 << (order_hint_bits - 1)) - 1
+// (inclusive).
+//
+// NOTE: |a| and |b| are the order_hint_bits least significant bits of the
+// actual values. This function returns the signed difference between the
+// actual values. The returned difference is correct as long as the actual
+// values are not more than 1 << (order_hint_bits - 1) - 1 apart.
+//
+// Example: Suppose order_hint_bits is 4 and |order_hint_shift_bits|
+// is 28. Then |a| and |b| are in the range [0, 15], and the actual values for
+// |a| and |b| must not be more than 7 apart. (If the actual values for |a| and
+// |b| are exactly 8 apart, this function cannot tell whether the actual value
+// for |a| is before or after the actual value for |b|.)
+//
+// First, consider the order hints 2 and 6. For this simple case, we have
+// GetRelativeDistance(2, 6, 28) = 2 - 6 = -4, and
+// GetRelativeDistance(6, 2, 28) = 6 - 2 = 4.
+//
+// On the other hand, consider the order hints 2 and 14. The order hints are
+// 12 (> 7) apart, so we need to use the actual values instead. The actual
+// values may be 34 (= 2 mod 16) and 30 (= 14 mod 16), respectively. Therefore
+// we have
+// GetRelativeDistance(2, 14, 28) = 34 - 30 = 4, and
+// GetRelativeDistance(14, 2, 28) = 30 - 34 = -4.
+//
+// The following comments apply only to specific CPUs' SIMD implementations,
+// such as intrinsics code.
+// For the two shift operations in this function, if the SIMD packed data is
+// 16 bits wide, try to use |order_hint_shift_bits| - 16 as the number of bits
+// to shift; if the SIMD packed data is 8 bits wide, try to use
+// |order_hint_shift_bits| - 24 as the number of bits to shift.
+// |order_hint_shift_bits| - 16 and |order_hint_shift_bits| - 24 could be -16
+// or -24. In these cases, diff is 0, and shifting left or right by -16 or -24
+// bits is defined for x86 SIMD instructions and ARM NEON instructions, and
+// the result of shifting 0 is still 0. There is no guarantee that this
+// behavior and result apply to other CPUs' SIMD instructions.
+inline int GetRelativeDistance(const unsigned int a, const unsigned int b,
+ const unsigned int order_hint_shift_bits) {
+ const int diff = a - b;
+ assert(order_hint_shift_bits <= 31);
+ if (order_hint_shift_bits == 0) {
+ assert(a == 0);
+ assert(b == 0);
+ } else {
+ assert(order_hint_shift_bits >= 24); // i.e., order_hint_bits <= 8
+ assert(a < (1u << (32 - order_hint_shift_bits)));
+ assert(b < (1u << (32 - order_hint_shift_bits)));
+ assert(diff < (1 << (32 - order_hint_shift_bits)));
+ assert(diff >= -(1 << (32 - order_hint_shift_bits)));
+ }
+ // Sign extend the result of subtracting the values.
+ // Cast to unsigned int and then left shift to avoid undefined behavior with
+ // negative values. Cast to int to do the sign extension through right shift.
+ // This requires the right shift of a signed integer be an arithmetic shift,
+ // which is true for clang, gcc, and Visual C++.
+ // These two casts do not generate extra instructions.
+ // Don't use LeftShift(diff) since a valid diff may fail its assertions.
+ // For example, in GetRelativeDistance(2, 14, 28), diff equals -12, which is
+ // less than the minimum value allowed by LeftShift(), namely -8.
+ // The next 3 lines are equivalent to:
+ // const int order_hint_bits = Mod32(32 - order_hint_shift_bits);
+ // const int m = (1 << order_hint_bits) >> 1;
+ // return (diff & (m - 1)) - (diff & m);
+ return static_cast<int>(static_cast<unsigned int>(diff)
+ << order_hint_shift_bits) >>
+ order_hint_shift_bits;
+}
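The worked examples in the comment can be checked directly; this tiny helper is not part of the library and only asserts the values given above.

#include <cassert>

#include "src/utils/common.h"

namespace libgav1 {

inline void CheckRelativeDistanceExamples() {
  assert(GetRelativeDistance(2, 6, 28) == -4);
  assert(GetRelativeDistance(6, 2, 28) == 4);
  assert(GetRelativeDistance(2, 14, 28) == 4);
  assert(GetRelativeDistance(14, 2, 28) == -4);
}

}  // namespace libgav1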
+
+// Applies |sign| (must be 0 or -1) to |value|, i.e.,
+// return (sign == 0) ? value : -value;
+// and does so without a branch.
+constexpr int ApplySign(int value, int sign) { return (value ^ sign) - sign; }
+
+// 7.9.3. (without the clamp for numerator and denominator).
+inline void GetMvProjection(const MotionVector& mv, int numerator,
+ int division_multiplier,
+ MotionVector* projection_mv) {
+ // Allow |numerator| to be 0 so that this function can be called
+ // unconditionally. When numerator is 0, |projection_mv| will be 0, and this
+ // is what we want.
+ assert(std::abs(numerator) <= kMaxFrameDistance);
+ for (int i = 0; i < 2; ++i) {
+ projection_mv->mv[i] =
+ Clip3(RightShiftWithRoundingSigned(
+ mv.mv[i] * numerator * division_multiplier, 14),
+ -kProjectionMvClamp, kProjectionMvClamp);
+ }
+}
+
+// 7.9.4.
+constexpr int Project(int value, int delta, int dst_sign) {
+ return value + ApplySign(delta / 64, dst_sign);
+}
+
+inline bool IsBlockSmallerThan8x8(BlockSize size) {
+ return size < kBlock8x8 && size != kBlock4x16;
+}
+
+// Returns true if either the width or the height of the block is equal to
+// four.
+inline bool IsBlockDimension4(BlockSize size) {
+ return size < kBlock8x8 || size == kBlock16x4;
+}
+
+// Converts bitdepth 8, 10, and 12 to array index 0, 1, and 2, respectively.
+constexpr int BitdepthToArrayIndex(int bitdepth) { return (bitdepth - 8) >> 1; }
+
+// Maps a square transform to an index in the range [0, 4]. kTransformSize4x4
+// maps to 0, kTransformSize8x8 maps to 1, and so on.
+inline int TransformSizeToSquareTransformIndex(TransformSize tx_size) {
+ assert(kTransformWidth[tx_size] == kTransformHeight[tx_size]);
+
+ // The values of the square transform sizes happen to be in the right
+ // ranges, so we can just divide them by 4 to get the indexes.
+ static_assert(
+ std::is_unsigned<std::underlying_type<TransformSize>::type>::value, "");
+ static_assert(kTransformSize4x4 < 4, "");
+ static_assert(4 <= kTransformSize8x8 && kTransformSize8x8 < 8, "");
+ static_assert(8 <= kTransformSize16x16 && kTransformSize16x16 < 12, "");
+ static_assert(12 <= kTransformSize32x32 && kTransformSize32x32 < 16, "");
+ static_assert(16 <= kTransformSize64x64 && kTransformSize64x64 < 20, "");
+ return DivideBy4(tx_size);
+}
+
+// Gets the corresponding Y/U/V position, to set and get filter masks
+// in deblock filtering.
+// Returns |luma_position| for the Y plane, whose subsampling must be 0.
+// Returns the odd position for the U/V planes when they are subsampled.
+constexpr int GetDeblockPosition(const int luma_position,
+ const int subsampling) {
+ return luma_position | subsampling;
+}
+
+// Returns the size of the residual buffer required to hold the residual values
+// for a block or frame of size |rows| by |columns| (taking into account
+// |subsampling_x|, |subsampling_y| and |residual_size|). |residual_size| is the
+// number of bytes required to represent one residual value.
+inline size_t GetResidualBufferSize(const int rows, const int columns,
+ const int subsampling_x,
+ const int subsampling_y,
+ const size_t residual_size) {
+ // The subsampling multipliers are:
+ // Both x and y are subsampled: 3 / 2.
+ // Only x or y is subsampled: 2 / 1 (which is equivalent to 4 / 2).
+ // Both x and y are not subsampled: 3 / 1 (which is equivalent to 6 / 2).
+ // So we compute the final subsampling multiplier as follows:
+ // multiplier = (2 + (4 >> subsampling_x >> subsampling_y)) / 2.
+ // Add 32 * |kResidualPaddingVertical| padding to avoid bottom boundary checks
+ // when parsing quantized coefficients.
+ const int subsampling_multiplier_num =
+ 2 + (4 >> subsampling_x >> subsampling_y);
+ const int number_elements =
+ (rows * columns * subsampling_multiplier_num) >> 1;
+ const int tx_padding = 32 * kResidualPaddingVertical;
+ return residual_size * (number_elements + tx_padding);
+}
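For instance, with 4:2:0 subsampling the multiplier works out to (2 + 1) / 2 = 3 / 2, i.e. the luma plane plus two quarter-size chroma planes. A small standalone sketch with an illustrative function name:

#include <cstdio>

#include "src/utils/common.h"

namespace libgav1 {

void PrintResidualBufferSizeExample() {
  // A 64x64 area with 4:2:0 subsampling and 2-byte (high bitdepth) residuals:
  // number_elements = 64 * 64 * 3 / 2 = 6144, plus the transform padding.
  const size_t size = GetResidualBufferSize(
      /*rows=*/64, /*columns=*/64, /*subsampling_x=*/1, /*subsampling_y=*/1,
      /*residual_size=*/2);
  printf("residual buffer bytes: %zu\n", size);
}

}  // namespace libgav1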
+
+// This function is equivalent to:
+// std::min({kTransformWidthLog2[tx_size] - 2,
+// kTransformWidthLog2[left_tx_size] - 2,
+// 2});
+constexpr LoopFilterTransformSizeId GetTransformSizeIdWidth(
+ TransformSize tx_size, TransformSize left_tx_size) {
+ return static_cast<LoopFilterTransformSizeId>(
+ static_cast<int>(tx_size > kTransformSize4x16 &&
+ left_tx_size > kTransformSize4x16) +
+ static_cast<int>(tx_size > kTransformSize8x32 &&
+ left_tx_size > kTransformSize8x32));
+}
+
+// This is used for 7.11.3.4 Block Inter Prediction Process, to select convolve
+// filters.
+inline int GetFilterIndex(const int filter_index, const int length) {
+ if (length <= 4) {
+ if (filter_index == kInterpolationFilterEightTap ||
+ filter_index == kInterpolationFilterEightTapSharp) {
+ return 4;
+ }
+ if (filter_index == kInterpolationFilterEightTapSmooth) {
+ return 5;
+ }
+ }
+ return filter_index;
+}
+
+// This has results identical to RightShiftWithRounding() since |subsampling|
+// can only be 0 or 1.
+constexpr int SubsampledValue(int value, int subsampling) {
+ return (value + subsampling) >> subsampling;
+}
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_COMMON_H_
diff --git a/src/utils/compiler_attributes.h b/src/utils/compiler_attributes.h
new file mode 100644
index 0000000..e122426
--- /dev/null
+++ b/src/utils/compiler_attributes.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_COMPILER_ATTRIBUTES_H_
+#define LIBGAV1_SRC_UTILS_COMPILER_ATTRIBUTES_H_
+
+// A collection of compiler attribute checks and defines to control for
+// compatibility across toolchains.
+
+//------------------------------------------------------------------------------
+// Language version, attribute and feature helpers.
+
+// Detect C++17 support. Visual Studio sets __cplusplus to 199711L by default
+// unless compiled with /Zc:__cplusplus, so use the value controlled by /std
+// instead.
+// https://docs.microsoft.com/en-us/cpp/build/reference/zc-cplusplus
+#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+#define LIBGAV1_CXX17 1
+#else
+#define LIBGAV1_CXX17 0
+#endif
+
+#if defined(__has_attribute)
+#define LIBGAV1_HAS_ATTRIBUTE __has_attribute
+#else
+#define LIBGAV1_HAS_ATTRIBUTE(x) 0
+#endif
+
+#if defined(__has_feature)
+#define LIBGAV1_HAS_FEATURE __has_feature
+#else
+#define LIBGAV1_HAS_FEATURE(x) 0
+#endif
+
+//------------------------------------------------------------------------------
+// Sanitizer attributes.
+
+#if LIBGAV1_HAS_FEATURE(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
+#define LIBGAV1_ASAN 1
+#else
+#define LIBGAV1_ASAN 0
+#endif
+
+#if LIBGAV1_HAS_FEATURE(memory_sanitizer)
+#define LIBGAV1_MSAN 1
+#else
+#define LIBGAV1_MSAN 0
+#endif
+
+#if LIBGAV1_HAS_FEATURE(thread_sanitizer) || defined(__SANITIZE_THREAD__)
+#define LIBGAV1_TSAN 1
+#else
+#define LIBGAV1_TSAN 0
+#endif
+
+//------------------------------------------------------------------------------
+// AddressSanitizer support.
+
+// Define the macros for AddressSanitizer manual memory poisoning. See
+// https://github.com/google/sanitizers/wiki/AddressSanitizerManualPoisoning.
+#if LIBGAV1_ASAN
+#include <sanitizer/asan_interface.h>
+#else
+#define ASAN_POISON_MEMORY_REGION(addr, size) \
+ (static_cast<void>(addr), static_cast<void>(size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size) \
+ (static_cast<void>(addr), static_cast<void>(size))
+#endif
+
+//------------------------------------------------------------------------------
+// Function attributes.
+// GCC: https://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html
+// Clang: https://clang.llvm.org/docs/AttributeReference.html
+
+#if defined(__GNUC__)
+#define LIBGAV1_ALWAYS_INLINE __attribute__((always_inline)) inline
+#elif defined(_MSC_VER)
+#define LIBGAV1_ALWAYS_INLINE __forceinline
+#else
+#define LIBGAV1_ALWAYS_INLINE inline
+#endif
+
+// LIBGAV1_MUST_USE_RESULT
+//
+// Tells the compiler to warn about unused results.
+//
+// When annotating a function, it must appear as the first part of the
+// declaration or definition. The compiler will warn if the return value from
+// such a function is unused:
+//
+// LIBGAV1_MUST_USE_RESULT Sprocket* AllocateSprocket();
+// AllocateSprocket(); // Triggers a warning.
+//
+// When annotating a class, it is equivalent to annotating every function which
+// returns an instance.
+//
+// class LIBGAV1_MUST_USE_RESULT Sprocket {};
+// Sprocket(); // Triggers a warning.
+//
+// Sprocket MakeSprocket();
+// MakeSprocket(); // Triggers a warning.
+//
+// Note that references and pointers are not instances:
+//
+// Sprocket* SprocketPointer();
+// SprocketPointer(); // Does *not* trigger a warning.
+//
+// LIBGAV1_MUST_USE_RESULT allows using cast-to-void to suppress the unused
+// result warning. To support that, warn_unused_result is used only with clang
+// and not with gcc. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66425
+#if LIBGAV1_HAS_ATTRIBUTE(nodiscard)
+#define LIBGAV1_MUST_USE_RESULT [[nodiscard]]
+#elif defined(__clang__) && LIBGAV1_HAS_ATTRIBUTE(warn_unused_result)
+#define LIBGAV1_MUST_USE_RESULT __attribute__((warn_unused_result))
+#else
+#define LIBGAV1_MUST_USE_RESULT
+#endif
+
+// LIBGAV1_PRINTF_ATTRIBUTE
+//
+// Tells the compiler to perform `printf` format string checking if the
+// compiler supports it; see the 'format' attribute in
+// <https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html>.
+//
+// Note: As the GCC manual states, "[s]ince non-static C++ methods
+// have an implicit 'this' argument, the arguments of such methods
+// should be counted from two, not one."
+#if LIBGAV1_HAS_ATTRIBUTE(format) || (defined(__GNUC__) && !defined(__clang__))
+#define LIBGAV1_PRINTF_ATTRIBUTE(string_index, first_to_check) \
+ __attribute__((__format__(__printf__, string_index, first_to_check)))
+#else
+#define LIBGAV1_PRINTF_ATTRIBUTE(string_index, first_to_check)
+#endif
+
+//------------------------------------------------------------------------------
+// Thread annotations.
+
+// LIBGAV1_GUARDED_BY()
+//
+// Documents if a shared field or global variable needs to be protected by a
+// mutex. LIBGAV1_GUARDED_BY() allows the user to specify a particular mutex
+// that should be held when accessing the annotated variable.
+//
+// Although this annotation cannot be applied to local variables, a local
+// variable and its associated mutex can often be combined into a small class
+// or struct, thereby allowing the annotation.
+//
+// Example:
+//
+// class Foo {
+// Mutex mu_;
+// int p1_ LIBGAV1_GUARDED_BY(mu_);
+// ...
+// };
+// TODO(b/132506370): this can be reenabled after a local MutexLock
+// implementation is added with proper thread annotations.
+#if 0 // LIBGAV1_HAS_ATTRIBUTE(guarded_by)
+#define LIBGAV1_GUARDED_BY(x) __attribute__((guarded_by(x)))
+#else
+#define LIBGAV1_GUARDED_BY(x)
+#endif
+
+//------------------------------------------------------------------------------
+
+#undef LIBGAV1_HAS_ATTRIBUTE
+#undef LIBGAV1_HAS_FEATURE
+
+#endif // LIBGAV1_SRC_UTILS_COMPILER_ATTRIBUTES_H_
diff --git a/src/utils/constants.cc b/src/utils/constants.cc
new file mode 100644
index 0000000..80d7acb
--- /dev/null
+++ b/src/utils/constants.cc
@@ -0,0 +1,874 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+const uint8_t k4x4WidthLog2[kMaxBlockSizes] = {0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
+ 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5};
+
+const uint8_t k4x4HeightLog2[kMaxBlockSizes] = {
+ 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4, 1, 2, 3, 4, 2, 3, 4, 5, 4, 5};
+
+const uint8_t kNum4x4BlocksWide[kMaxBlockSizes] = {
+ 1, 1, 1, 2, 2, 2, 2, 4, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32};
+
+const uint8_t kNum4x4BlocksHigh[kMaxBlockSizes] = {
+ 1, 2, 4, 1, 2, 4, 8, 1, 2, 4, 8, 16, 2, 4, 8, 16, 4, 8, 16, 32, 16, 32};
+
+const uint8_t kBlockWidthPixels[kMaxBlockSizes] = {
+ 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16,
+ 16, 32, 32, 32, 32, 64, 64, 64, 64, 128, 128};
+
+const uint8_t kBlockHeightPixels[kMaxBlockSizes] = {
+ 4, 8, 16, 4, 8, 16, 32, 4, 8, 16, 32,
+ 64, 8, 16, 32, 64, 16, 32, 64, 128, 64, 128};
+
+// 9.3 -- Partition_Subsize[]
+const BlockSize kSubSize[kMaxPartitionTypes][kMaxBlockSizes] = {
+ // kPartitionNone
+ {kBlock4x4, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x32,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x64, kBlockInvalid,
+ kBlockInvalid, kBlock128x128},
+ // kPartitionHorizontal
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x4,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x32, kBlockInvalid,
+ kBlockInvalid, kBlock128x64},
+ // kPartitionVertical
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x32,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x64, kBlockInvalid,
+ kBlockInvalid, kBlock64x128},
+ // kPartitionSplit
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x4,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x32, kBlockInvalid,
+ kBlockInvalid, kBlock64x64},
+ // kPartitionHorizontalWithTopSplit
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x4,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x32, kBlockInvalid,
+ kBlockInvalid, kBlock128x64},
+ // kPartitionHorizontalWithBottomSplit
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x4,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x32, kBlockInvalid,
+ kBlockInvalid, kBlock128x64},
+ // kPartitionVerticalWithLeftSplit
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x32,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x64, kBlockInvalid,
+ kBlockInvalid, kBlock64x128},
+ // kPartitionVerticalWithRightSplit
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x32,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x64, kBlockInvalid,
+ kBlockInvalid, kBlock64x128},
+ // kPartitionHorizontal4
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x4,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x16, kBlockInvalid,
+ kBlockInvalid, kBlockInvalid},
+ // kPartitionVertical4
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x32,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x64, kBlockInvalid,
+ kBlockInvalid, kBlockInvalid}};
+
+// 5.11.38 (implemented as a simple lookup; the first dimension is the block
+// size, the second and third are subsampling_x and subsampling_y).
+const BlockSize kPlaneResidualSize[kMaxBlockSizes][2][2] = {
+ {{kBlock4x4, kBlock4x4}, {kBlock4x4, kBlock4x4}},
+ {{kBlock4x8, kBlock4x4}, {kBlockInvalid, kBlock4x4}},
+ {{kBlock4x16, kBlock4x8}, {kBlockInvalid, kBlock4x8}},
+ {{kBlock8x4, kBlockInvalid}, {kBlock4x4, kBlock4x4}},
+ {{kBlock8x8, kBlock8x4}, {kBlock4x8, kBlock4x4}},
+ {{kBlock8x16, kBlock8x8}, {kBlockInvalid, kBlock4x8}},
+ {{kBlock8x32, kBlock8x16}, {kBlockInvalid, kBlock4x16}},
+ {{kBlock16x4, kBlockInvalid}, {kBlock8x4, kBlock8x4}},
+ {{kBlock16x8, kBlockInvalid}, {kBlock8x8, kBlock8x4}},
+ {{kBlock16x16, kBlock16x8}, {kBlock8x16, kBlock8x8}},
+ {{kBlock16x32, kBlock16x16}, {kBlockInvalid, kBlock8x16}},
+ {{kBlock16x64, kBlock16x32}, {kBlockInvalid, kBlock8x32}},
+ {{kBlock32x8, kBlockInvalid}, {kBlock16x8, kBlock16x4}},
+ {{kBlock32x16, kBlockInvalid}, {kBlock16x16, kBlock16x8}},
+ {{kBlock32x32, kBlock32x16}, {kBlock16x32, kBlock16x16}},
+ {{kBlock32x64, kBlock32x32}, {kBlockInvalid, kBlock16x32}},
+ {{kBlock64x16, kBlockInvalid}, {kBlock32x16, kBlock32x8}},
+ {{kBlock64x32, kBlockInvalid}, {kBlock32x32, kBlock32x16}},
+ {{kBlock64x64, kBlock64x32}, {kBlock32x64, kBlock32x32}},
+ {{kBlock64x128, kBlock64x64}, {kBlockInvalid, kBlock32x64}},
+ {{kBlock128x64, kBlockInvalid}, {kBlock64x64, kBlock64x32}},
+ {{kBlock128x128, kBlock128x64}, {kBlock64x128, kBlock64x64}}};
+
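As a hedged illustration of the lookup described above the table (not part of the patch itself; the CheckPlaneResidualSizeExamples name is invented, and only the constants declared in src/utils/constants.h are assumed):

```cpp
#include <cassert>

#include "src/utils/constants.h"

namespace libgav1 {

// Illustrative only: under 4:2:0 subsampling (subsampling_x = subsampling_y =
// 1) a 16x16 block has an 8x8 chroma residual; under 4:2:2 (subsampling_x = 1,
// subsampling_y = 0) it has an 8x16 chroma residual.
inline void CheckPlaneResidualSizeExamples() {
  assert(kPlaneResidualSize[kBlock16x16][1][1] == kBlock8x8);
  assert(kPlaneResidualSize[kBlock16x16][1][0] == kBlock8x16);
}

}  // namespace libgav1
```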
+const int16_t kProjectionMvDivisionLookup[kMaxFrameDistance + 1] = {
+ 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638,
+ 1489, 1365, 1260, 1170, 1092, 1024, 963, 910, 862, 819, 780,
+ 744, 712, 682, 655, 630, 606, 585, 564, 546, 528};
+
+const uint8_t kTransformWidth[kNumTransformSizes] = {
+ 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 32, 32, 32, 32, 64, 64, 64};
+
+const uint8_t kTransformHeight[kNumTransformSizes] = {
+ 4, 8, 16, 4, 8, 16, 32, 4, 8, 16, 32, 64, 8, 16, 32, 64, 16, 32, 64};
+
+const uint8_t kTransformWidth4x4[kNumTransformSizes] = {
+ 1, 1, 1, 2, 2, 2, 2, 4, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16};
+
+const uint8_t kTransformHeight4x4[kNumTransformSizes] = {
+ 1, 2, 4, 1, 2, 4, 8, 1, 2, 4, 8, 16, 2, 4, 8, 16, 4, 8, 16};
+
+const uint8_t kTransformWidthLog2[kNumTransformSizes] = {
+ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6};
+
+const uint8_t kTransformHeightLog2[kNumTransformSizes] = {
+ 2, 3, 4, 2, 3, 4, 5, 2, 3, 4, 5, 6, 3, 4, 5, 6, 4, 5, 6};
+
+// 9.3 -- Split_Tx_Size[]
+const TransformSize kSplitTransformSize[kNumTransformSizes] = {
+ kTransformSize4x4, kTransformSize4x4, kTransformSize4x8,
+ kTransformSize4x4, kTransformSize4x4, kTransformSize8x8,
+ kTransformSize8x16, kTransformSize8x4, kTransformSize8x8,
+ kTransformSize8x8, kTransformSize16x16, kTransformSize16x32,
+ kTransformSize16x8, kTransformSize16x16, kTransformSize16x16,
+ kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x32};
+
+// Square transform of size min(w,h).
+const TransformSize kTransformSizeSquareMin[kNumTransformSizes] = {
+ kTransformSize4x4, kTransformSize4x4, kTransformSize4x4,
+ kTransformSize4x4, kTransformSize8x8, kTransformSize8x8,
+ kTransformSize8x8, kTransformSize4x4, kTransformSize8x8,
+ kTransformSize16x16, kTransformSize16x16, kTransformSize16x16,
+ kTransformSize8x8, kTransformSize16x16, kTransformSize32x32,
+ kTransformSize32x32, kTransformSize16x16, kTransformSize32x32,
+ kTransformSize64x64};
+
+// Square transform of size max(w,h).
+const TransformSize kTransformSizeSquareMax[kNumTransformSizes] = {
+ kTransformSize4x4, kTransformSize8x8, kTransformSize16x16,
+ kTransformSize8x8, kTransformSize8x8, kTransformSize16x16,
+ kTransformSize32x32, kTransformSize16x16, kTransformSize16x16,
+ kTransformSize16x16, kTransformSize32x32, kTransformSize64x64,
+ kTransformSize32x32, kTransformSize32x32, kTransformSize32x32,
+ kTransformSize64x64, kTransformSize64x64, kTransformSize64x64,
+ kTransformSize64x64};
+
+const uint8_t kNumTransformTypesInSet[kNumTransformSets] = {1, 7, 5, 16, 12, 2};
+
+const uint8_t kSgrProjParams[1 << kSgrProjParamsBits][4] = {
+ {2, 12, 1, 4}, {2, 15, 1, 6}, {2, 18, 1, 8}, {2, 21, 1, 9},
+ {2, 24, 1, 10}, {2, 29, 1, 11}, {2, 36, 1, 12}, {2, 45, 1, 13},
+ {2, 56, 1, 14}, {2, 68, 1, 15}, {0, 0, 1, 5}, {0, 0, 1, 8},
+ {0, 0, 1, 11}, {0, 0, 1, 14}, {2, 30, 0, 0}, {2, 75, 0, 0}};
+
+const int8_t kSgrProjMultiplierMin[2] = {-96, -32};
+
+const int8_t kSgrProjMultiplierMax[2] = {31, 95};
+
+const int8_t kWienerTapsMin[3] = {-5, -23, -17};
+
+const int8_t kWienerTapsMax[3] = {10, 8, 46};
+
+// This was modified from Upscale_Filter as defined in AV1 Section 7.16, in
+// order to support 16-bit packed NEON operations.
+// The sign of each tap is: - + - + + - + -
+alignas(16) const uint8_t
+ kUpscaleFilterUnsigned[kSuperResFilterShifts][kSuperResFilterTaps] = {
+ {0, 0, 0, 128, 0, 0, 0, 0}, {0, 0, 1, 128, 2, 1, 0, 0},
+ {0, 1, 3, 127, 4, 2, 1, 0}, {0, 1, 4, 127, 6, 3, 1, 0},
+ {0, 2, 6, 126, 8, 3, 1, 0}, {0, 2, 7, 125, 11, 4, 1, 0},
+ {1, 2, 8, 125, 13, 5, 2, 0}, {1, 3, 9, 124, 15, 6, 2, 0},
+ {1, 3, 10, 123, 18, 6, 2, 1}, {1, 3, 11, 122, 20, 7, 3, 1},
+ {1, 4, 12, 121, 22, 8, 3, 1}, {1, 4, 13, 120, 25, 9, 3, 1},
+ {1, 4, 14, 118, 28, 9, 3, 1}, {1, 4, 15, 117, 30, 10, 4, 1},
+ {1, 5, 16, 116, 32, 11, 4, 1}, {1, 5, 16, 114, 35, 12, 4, 1},
+ {1, 5, 17, 112, 38, 12, 4, 1}, {1, 5, 18, 111, 40, 13, 5, 1},
+ {1, 5, 18, 109, 43, 14, 5, 1}, {1, 6, 19, 107, 45, 14, 5, 1},
+ {1, 6, 19, 105, 48, 15, 5, 1}, {1, 6, 19, 103, 51, 16, 5, 1},
+ {1, 6, 20, 101, 53, 16, 6, 1}, {1, 6, 20, 99, 56, 17, 6, 1},
+ {1, 6, 20, 97, 58, 17, 6, 1}, {1, 6, 20, 95, 61, 18, 6, 1},
+ {2, 7, 20, 93, 64, 18, 6, 2}, {2, 7, 20, 91, 66, 19, 6, 1},
+ {2, 7, 20, 88, 69, 19, 6, 1}, {2, 7, 20, 86, 71, 19, 6, 1},
+ {2, 7, 20, 84, 74, 20, 7, 2}, {2, 7, 20, 81, 76, 20, 7, 1},
+ {2, 7, 20, 79, 79, 20, 7, 2}, {1, 7, 20, 76, 81, 20, 7, 2},
+ {2, 7, 20, 74, 84, 20, 7, 2}, {1, 6, 19, 71, 86, 20, 7, 2},
+ {1, 6, 19, 69, 88, 20, 7, 2}, {1, 6, 19, 66, 91, 20, 7, 2},
+ {2, 6, 18, 64, 93, 20, 7, 2}, {1, 6, 18, 61, 95, 20, 6, 1},
+ {1, 6, 17, 58, 97, 20, 6, 1}, {1, 6, 17, 56, 99, 20, 6, 1},
+ {1, 6, 16, 53, 101, 20, 6, 1}, {1, 5, 16, 51, 103, 19, 6, 1},
+ {1, 5, 15, 48, 105, 19, 6, 1}, {1, 5, 14, 45, 107, 19, 6, 1},
+ {1, 5, 14, 43, 109, 18, 5, 1}, {1, 5, 13, 40, 111, 18, 5, 1},
+ {1, 4, 12, 38, 112, 17, 5, 1}, {1, 4, 12, 35, 114, 16, 5, 1},
+ {1, 4, 11, 32, 116, 16, 5, 1}, {1, 4, 10, 30, 117, 15, 4, 1},
+ {1, 3, 9, 28, 118, 14, 4, 1}, {1, 3, 9, 25, 120, 13, 4, 1},
+ {1, 3, 8, 22, 121, 12, 4, 1}, {1, 3, 7, 20, 122, 11, 3, 1},
+ {1, 2, 6, 18, 123, 10, 3, 1}, {0, 2, 6, 15, 124, 9, 3, 1},
+ {0, 2, 5, 13, 125, 8, 2, 1}, {0, 1, 4, 11, 125, 7, 2, 0},
+ {0, 1, 3, 8, 126, 6, 2, 0}, {0, 1, 3, 6, 127, 4, 1, 0},
+ {0, 1, 2, 4, 127, 3, 1, 0}, {0, 0, 1, 2, 128, 1, 0, 0},
+};
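The comment above the table gives the fixed sign pattern of the taps; the following hedged sketch (the SignedUpscaleTap helper name is invented for illustration) shows how the signed Upscale_Filter taps from the spec could be recovered from this unsigned table:

```cpp
#include <cstdint>

#include "src/utils/constants.h"

namespace libgav1 {

// Illustrative only: reapply the fixed per-tap sign pattern - + - + + - + - to
// an entry of kUpscaleFilterUnsigned to recover the signed tap value.
inline int SignedUpscaleTap(int phase, int tap) {
  static const int8_t kTapSign[kSuperResFilterTaps] = {-1, 1, -1, 1,
                                                       1, -1, 1, -1};
  return kTapSign[tap] * kUpscaleFilterUnsigned[phase][tap];
}

}  // namespace libgav1
```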
+
+alignas(8) const int8_t
+ kWarpedFilters8[3 * kWarpedPixelPrecisionShifts + 1][8] = {
+ // [-1, 0).
+ {0, 0, 127, 1, 0, 0, 0, 0},
+ {0, -1, 127, 2, 0, 0, 0, 0},
+ {1, -3, 127, 4, -1, 0, 0, 0},
+ {1, -4, 126, 6, -2, 1, 0, 0},
+ {1, -5, 126, 8, -3, 1, 0, 0},
+ {1, -6, 125, 11, -4, 1, 0, 0},
+ {1, -7, 124, 13, -4, 1, 0, 0},
+ {2, -8, 123, 15, -5, 1, 0, 0},
+ {2, -9, 122, 18, -6, 1, 0, 0},
+ {2, -10, 121, 20, -6, 1, 0, 0},
+ {2, -11, 120, 22, -7, 2, 0, 0},
+ {2, -12, 119, 25, -8, 2, 0, 0},
+ {3, -13, 117, 27, -8, 2, 0, 0},
+ {3, -13, 116, 29, -9, 2, 0, 0},
+ {3, -14, 114, 32, -10, 3, 0, 0},
+ {3, -15, 113, 35, -10, 2, 0, 0},
+ {3, -15, 111, 37, -11, 3, 0, 0},
+ {3, -16, 109, 40, -11, 3, 0, 0},
+ {3, -16, 108, 42, -12, 3, 0, 0},
+ {4, -17, 106, 45, -13, 3, 0, 0},
+ {4, -17, 104, 47, -13, 3, 0, 0},
+ {4, -17, 102, 50, -14, 3, 0, 0},
+ {4, -17, 100, 52, -14, 3, 0, 0},
+ {4, -18, 98, 55, -15, 4, 0, 0},
+ {4, -18, 96, 58, -15, 3, 0, 0},
+ {4, -18, 94, 60, -16, 4, 0, 0},
+ {4, -18, 91, 63, -16, 4, 0, 0},
+ {4, -18, 89, 65, -16, 4, 0, 0},
+ {4, -18, 87, 68, -17, 4, 0, 0},
+ {4, -18, 85, 70, -17, 4, 0, 0},
+ {4, -18, 82, 73, -17, 4, 0, 0},
+ {4, -18, 80, 75, -17, 4, 0, 0},
+ {4, -18, 78, 78, -18, 4, 0, 0},
+ {4, -17, 75, 80, -18, 4, 0, 0},
+ {4, -17, 73, 82, -18, 4, 0, 0},
+ {4, -17, 70, 85, -18, 4, 0, 0},
+ {4, -17, 68, 87, -18, 4, 0, 0},
+ {4, -16, 65, 89, -18, 4, 0, 0},
+ {4, -16, 63, 91, -18, 4, 0, 0},
+ {4, -16, 60, 94, -18, 4, 0, 0},
+ {3, -15, 58, 96, -18, 4, 0, 0},
+ {4, -15, 55, 98, -18, 4, 0, 0},
+ {3, -14, 52, 100, -17, 4, 0, 0},
+ {3, -14, 50, 102, -17, 4, 0, 0},
+ {3, -13, 47, 104, -17, 4, 0, 0},
+ {3, -13, 45, 106, -17, 4, 0, 0},
+ {3, -12, 42, 108, -16, 3, 0, 0},
+ {3, -11, 40, 109, -16, 3, 0, 0},
+ {3, -11, 37, 111, -15, 3, 0, 0},
+ {2, -10, 35, 113, -15, 3, 0, 0},
+ {3, -10, 32, 114, -14, 3, 0, 0},
+ {2, -9, 29, 116, -13, 3, 0, 0},
+ {2, -8, 27, 117, -13, 3, 0, 0},
+ {2, -8, 25, 119, -12, 2, 0, 0},
+ {2, -7, 22, 120, -11, 2, 0, 0},
+ {1, -6, 20, 121, -10, 2, 0, 0},
+ {1, -6, 18, 122, -9, 2, 0, 0},
+ {1, -5, 15, 123, -8, 2, 0, 0},
+ {1, -4, 13, 124, -7, 1, 0, 0},
+ {1, -4, 11, 125, -6, 1, 0, 0},
+ {1, -3, 8, 126, -5, 1, 0, 0},
+ {1, -2, 6, 126, -4, 1, 0, 0},
+ {0, -1, 4, 127, -3, 1, 0, 0},
+ {0, 0, 2, 127, -1, 0, 0, 0},
+ // [0, 1).
+ {0, 0, 0, 127, 1, 0, 0, 0},
+ {0, 0, -1, 127, 2, 0, 0, 0},
+ {0, 1, -3, 127, 4, -2, 1, 0},
+ {0, 1, -5, 127, 6, -2, 1, 0},
+ {0, 2, -6, 126, 8, -3, 1, 0},
+ {-1, 2, -7, 126, 11, -4, 2, -1},
+ {-1, 3, -8, 125, 13, -5, 2, -1},
+ {-1, 3, -10, 124, 16, -6, 3, -1},
+ {-1, 4, -11, 123, 18, -7, 3, -1},
+ {-1, 4, -12, 122, 20, -7, 3, -1},
+ {-1, 4, -13, 121, 23, -8, 3, -1},
+ {-2, 5, -14, 120, 25, -9, 4, -1},
+ {-1, 5, -15, 119, 27, -10, 4, -1},
+ {-1, 5, -16, 118, 30, -11, 4, -1},
+ {-2, 6, -17, 116, 33, -12, 5, -1},
+ {-2, 6, -17, 114, 35, -12, 5, -1},
+ {-2, 6, -18, 113, 38, -13, 5, -1},
+ {-2, 7, -19, 111, 41, -14, 6, -2},
+ {-2, 7, -19, 110, 43, -15, 6, -2},
+ {-2, 7, -20, 108, 46, -15, 6, -2},
+ {-2, 7, -20, 106, 49, -16, 6, -2},
+ {-2, 7, -21, 104, 51, -16, 7, -2},
+ {-2, 7, -21, 102, 54, -17, 7, -2},
+ {-2, 8, -21, 100, 56, -18, 7, -2},
+ {-2, 8, -22, 98, 59, -18, 7, -2},
+ {-2, 8, -22, 96, 62, -19, 7, -2},
+ {-2, 8, -22, 94, 64, -19, 7, -2},
+ {-2, 8, -22, 91, 67, -20, 8, -2},
+ {-2, 8, -22, 89, 69, -20, 8, -2},
+ {-2, 8, -22, 87, 72, -21, 8, -2},
+ {-2, 8, -21, 84, 74, -21, 8, -2},
+ {-2, 8, -22, 82, 77, -21, 8, -2},
+ {-2, 8, -21, 79, 79, -21, 8, -2},
+ {-2, 8, -21, 77, 82, -22, 8, -2},
+ {-2, 8, -21, 74, 84, -21, 8, -2},
+ {-2, 8, -21, 72, 87, -22, 8, -2},
+ {-2, 8, -20, 69, 89, -22, 8, -2},
+ {-2, 8, -20, 67, 91, -22, 8, -2},
+ {-2, 7, -19, 64, 94, -22, 8, -2},
+ {-2, 7, -19, 62, 96, -22, 8, -2},
+ {-2, 7, -18, 59, 98, -22, 8, -2},
+ {-2, 7, -18, 56, 100, -21, 8, -2},
+ {-2, 7, -17, 54, 102, -21, 7, -2},
+ {-2, 7, -16, 51, 104, -21, 7, -2},
+ {-2, 6, -16, 49, 106, -20, 7, -2},
+ {-2, 6, -15, 46, 108, -20, 7, -2},
+ {-2, 6, -15, 43, 110, -19, 7, -2},
+ {-2, 6, -14, 41, 111, -19, 7, -2},
+ {-1, 5, -13, 38, 113, -18, 6, -2},
+ {-1, 5, -12, 35, 114, -17, 6, -2},
+ {-1, 5, -12, 33, 116, -17, 6, -2},
+ {-1, 4, -11, 30, 118, -16, 5, -1},
+ {-1, 4, -10, 27, 119, -15, 5, -1},
+ {-1, 4, -9, 25, 120, -14, 5, -2},
+ {-1, 3, -8, 23, 121, -13, 4, -1},
+ {-1, 3, -7, 20, 122, -12, 4, -1},
+ {-1, 3, -7, 18, 123, -11, 4, -1},
+ {-1, 3, -6, 16, 124, -10, 3, -1},
+ {-1, 2, -5, 13, 125, -8, 3, -1},
+ {-1, 2, -4, 11, 126, -7, 2, -1},
+ {0, 1, -3, 8, 126, -6, 2, 0},
+ {0, 1, -2, 6, 127, -5, 1, 0},
+ {0, 1, -2, 4, 127, -3, 1, 0},
+ {0, 0, 0, 2, 127, -1, 0, 0},
+ // [1, 2).
+ {0, 0, 0, 1, 127, 0, 0, 0},
+ {0, 0, 0, -1, 127, 2, 0, 0},
+ {0, 0, 1, -3, 127, 4, -1, 0},
+ {0, 0, 1, -4, 126, 6, -2, 1},
+ {0, 0, 1, -5, 126, 8, -3, 1},
+ {0, 0, 1, -6, 125, 11, -4, 1},
+ {0, 0, 1, -7, 124, 13, -4, 1},
+ {0, 0, 2, -8, 123, 15, -5, 1},
+ {0, 0, 2, -9, 122, 18, -6, 1},
+ {0, 0, 2, -10, 121, 20, -6, 1},
+ {0, 0, 2, -11, 120, 22, -7, 2},
+ {0, 0, 2, -12, 119, 25, -8, 2},
+ {0, 0, 3, -13, 117, 27, -8, 2},
+ {0, 0, 3, -13, 116, 29, -9, 2},
+ {0, 0, 3, -14, 114, 32, -10, 3},
+ {0, 0, 3, -15, 113, 35, -10, 2},
+ {0, 0, 3, -15, 111, 37, -11, 3},
+ {0, 0, 3, -16, 109, 40, -11, 3},
+ {0, 0, 3, -16, 108, 42, -12, 3},
+ {0, 0, 4, -17, 106, 45, -13, 3},
+ {0, 0, 4, -17, 104, 47, -13, 3},
+ {0, 0, 4, -17, 102, 50, -14, 3},
+ {0, 0, 4, -17, 100, 52, -14, 3},
+ {0, 0, 4, -18, 98, 55, -15, 4},
+ {0, 0, 4, -18, 96, 58, -15, 3},
+ {0, 0, 4, -18, 94, 60, -16, 4},
+ {0, 0, 4, -18, 91, 63, -16, 4},
+ {0, 0, 4, -18, 89, 65, -16, 4},
+ {0, 0, 4, -18, 87, 68, -17, 4},
+ {0, 0, 4, -18, 85, 70, -17, 4},
+ {0, 0, 4, -18, 82, 73, -17, 4},
+ {0, 0, 4, -18, 80, 75, -17, 4},
+ {0, 0, 4, -18, 78, 78, -18, 4},
+ {0, 0, 4, -17, 75, 80, -18, 4},
+ {0, 0, 4, -17, 73, 82, -18, 4},
+ {0, 0, 4, -17, 70, 85, -18, 4},
+ {0, 0, 4, -17, 68, 87, -18, 4},
+ {0, 0, 4, -16, 65, 89, -18, 4},
+ {0, 0, 4, -16, 63, 91, -18, 4},
+ {0, 0, 4, -16, 60, 94, -18, 4},
+ {0, 0, 3, -15, 58, 96, -18, 4},
+ {0, 0, 4, -15, 55, 98, -18, 4},
+ {0, 0, 3, -14, 52, 100, -17, 4},
+ {0, 0, 3, -14, 50, 102, -17, 4},
+ {0, 0, 3, -13, 47, 104, -17, 4},
+ {0, 0, 3, -13, 45, 106, -17, 4},
+ {0, 0, 3, -12, 42, 108, -16, 3},
+ {0, 0, 3, -11, 40, 109, -16, 3},
+ {0, 0, 3, -11, 37, 111, -15, 3},
+ {0, 0, 2, -10, 35, 113, -15, 3},
+ {0, 0, 3, -10, 32, 114, -14, 3},
+ {0, 0, 2, -9, 29, 116, -13, 3},
+ {0, 0, 2, -8, 27, 117, -13, 3},
+ {0, 0, 2, -8, 25, 119, -12, 2},
+ {0, 0, 2, -7, 22, 120, -11, 2},
+ {0, 0, 1, -6, 20, 121, -10, 2},
+ {0, 0, 1, -6, 18, 122, -9, 2},
+ {0, 0, 1, -5, 15, 123, -8, 2},
+ {0, 0, 1, -4, 13, 124, -7, 1},
+ {0, 0, 1, -4, 11, 125, -6, 1},
+ {0, 0, 1, -3, 8, 126, -5, 1},
+ {0, 0, 1, -2, 6, 126, -4, 1},
+ {0, 0, 0, -1, 4, 127, -3, 1},
+ {0, 0, 0, 0, 2, 127, -1, 0},
+ // dummy, replicate row index 191.
+ {0, 0, 0, 0, 2, 127, -1, 0}};
+
+alignas(16) const int16_t
+ kWarpedFilters[3 * kWarpedPixelPrecisionShifts + 1][8] = {
+ // [-1, 0).
+ {0, 0, 127, 1, 0, 0, 0, 0},
+ {0, -1, 127, 2, 0, 0, 0, 0},
+ {1, -3, 127, 4, -1, 0, 0, 0},
+ {1, -4, 126, 6, -2, 1, 0, 0},
+ {1, -5, 126, 8, -3, 1, 0, 0},
+ {1, -6, 125, 11, -4, 1, 0, 0},
+ {1, -7, 124, 13, -4, 1, 0, 0},
+ {2, -8, 123, 15, -5, 1, 0, 0},
+ {2, -9, 122, 18, -6, 1, 0, 0},
+ {2, -10, 121, 20, -6, 1, 0, 0},
+ {2, -11, 120, 22, -7, 2, 0, 0},
+ {2, -12, 119, 25, -8, 2, 0, 0},
+ {3, -13, 117, 27, -8, 2, 0, 0},
+ {3, -13, 116, 29, -9, 2, 0, 0},
+ {3, -14, 114, 32, -10, 3, 0, 0},
+ {3, -15, 113, 35, -10, 2, 0, 0},
+ {3, -15, 111, 37, -11, 3, 0, 0},
+ {3, -16, 109, 40, -11, 3, 0, 0},
+ {3, -16, 108, 42, -12, 3, 0, 0},
+ {4, -17, 106, 45, -13, 3, 0, 0},
+ {4, -17, 104, 47, -13, 3, 0, 0},
+ {4, -17, 102, 50, -14, 3, 0, 0},
+ {4, -17, 100, 52, -14, 3, 0, 0},
+ {4, -18, 98, 55, -15, 4, 0, 0},
+ {4, -18, 96, 58, -15, 3, 0, 0},
+ {4, -18, 94, 60, -16, 4, 0, 0},
+ {4, -18, 91, 63, -16, 4, 0, 0},
+ {4, -18, 89, 65, -16, 4, 0, 0},
+ {4, -18, 87, 68, -17, 4, 0, 0},
+ {4, -18, 85, 70, -17, 4, 0, 0},
+ {4, -18, 82, 73, -17, 4, 0, 0},
+ {4, -18, 80, 75, -17, 4, 0, 0},
+ {4, -18, 78, 78, -18, 4, 0, 0},
+ {4, -17, 75, 80, -18, 4, 0, 0},
+ {4, -17, 73, 82, -18, 4, 0, 0},
+ {4, -17, 70, 85, -18, 4, 0, 0},
+ {4, -17, 68, 87, -18, 4, 0, 0},
+ {4, -16, 65, 89, -18, 4, 0, 0},
+ {4, -16, 63, 91, -18, 4, 0, 0},
+ {4, -16, 60, 94, -18, 4, 0, 0},
+ {3, -15, 58, 96, -18, 4, 0, 0},
+ {4, -15, 55, 98, -18, 4, 0, 0},
+ {3, -14, 52, 100, -17, 4, 0, 0},
+ {3, -14, 50, 102, -17, 4, 0, 0},
+ {3, -13, 47, 104, -17, 4, 0, 0},
+ {3, -13, 45, 106, -17, 4, 0, 0},
+ {3, -12, 42, 108, -16, 3, 0, 0},
+ {3, -11, 40, 109, -16, 3, 0, 0},
+ {3, -11, 37, 111, -15, 3, 0, 0},
+ {2, -10, 35, 113, -15, 3, 0, 0},
+ {3, -10, 32, 114, -14, 3, 0, 0},
+ {2, -9, 29, 116, -13, 3, 0, 0},
+ {2, -8, 27, 117, -13, 3, 0, 0},
+ {2, -8, 25, 119, -12, 2, 0, 0},
+ {2, -7, 22, 120, -11, 2, 0, 0},
+ {1, -6, 20, 121, -10, 2, 0, 0},
+ {1, -6, 18, 122, -9, 2, 0, 0},
+ {1, -5, 15, 123, -8, 2, 0, 0},
+ {1, -4, 13, 124, -7, 1, 0, 0},
+ {1, -4, 11, 125, -6, 1, 0, 0},
+ {1, -3, 8, 126, -5, 1, 0, 0},
+ {1, -2, 6, 126, -4, 1, 0, 0},
+ {0, -1, 4, 127, -3, 1, 0, 0},
+ {0, 0, 2, 127, -1, 0, 0, 0},
+ // [0, 1).
+ {0, 0, 0, 127, 1, 0, 0, 0},
+ {0, 0, -1, 127, 2, 0, 0, 0},
+ {0, 1, -3, 127, 4, -2, 1, 0},
+ {0, 1, -5, 127, 6, -2, 1, 0},
+ {0, 2, -6, 126, 8, -3, 1, 0},
+ {-1, 2, -7, 126, 11, -4, 2, -1},
+ {-1, 3, -8, 125, 13, -5, 2, -1},
+ {-1, 3, -10, 124, 16, -6, 3, -1},
+ {-1, 4, -11, 123, 18, -7, 3, -1},
+ {-1, 4, -12, 122, 20, -7, 3, -1},
+ {-1, 4, -13, 121, 23, -8, 3, -1},
+ {-2, 5, -14, 120, 25, -9, 4, -1},
+ {-1, 5, -15, 119, 27, -10, 4, -1},
+ {-1, 5, -16, 118, 30, -11, 4, -1},
+ {-2, 6, -17, 116, 33, -12, 5, -1},
+ {-2, 6, -17, 114, 35, -12, 5, -1},
+ {-2, 6, -18, 113, 38, -13, 5, -1},
+ {-2, 7, -19, 111, 41, -14, 6, -2},
+ {-2, 7, -19, 110, 43, -15, 6, -2},
+ {-2, 7, -20, 108, 46, -15, 6, -2},
+ {-2, 7, -20, 106, 49, -16, 6, -2},
+ {-2, 7, -21, 104, 51, -16, 7, -2},
+ {-2, 7, -21, 102, 54, -17, 7, -2},
+ {-2, 8, -21, 100, 56, -18, 7, -2},
+ {-2, 8, -22, 98, 59, -18, 7, -2},
+ {-2, 8, -22, 96, 62, -19, 7, -2},
+ {-2, 8, -22, 94, 64, -19, 7, -2},
+ {-2, 8, -22, 91, 67, -20, 8, -2},
+ {-2, 8, -22, 89, 69, -20, 8, -2},
+ {-2, 8, -22, 87, 72, -21, 8, -2},
+ {-2, 8, -21, 84, 74, -21, 8, -2},
+ {-2, 8, -22, 82, 77, -21, 8, -2},
+ {-2, 8, -21, 79, 79, -21, 8, -2},
+ {-2, 8, -21, 77, 82, -22, 8, -2},
+ {-2, 8, -21, 74, 84, -21, 8, -2},
+ {-2, 8, -21, 72, 87, -22, 8, -2},
+ {-2, 8, -20, 69, 89, -22, 8, -2},
+ {-2, 8, -20, 67, 91, -22, 8, -2},
+ {-2, 7, -19, 64, 94, -22, 8, -2},
+ {-2, 7, -19, 62, 96, -22, 8, -2},
+ {-2, 7, -18, 59, 98, -22, 8, -2},
+ {-2, 7, -18, 56, 100, -21, 8, -2},
+ {-2, 7, -17, 54, 102, -21, 7, -2},
+ {-2, 7, -16, 51, 104, -21, 7, -2},
+ {-2, 6, -16, 49, 106, -20, 7, -2},
+ {-2, 6, -15, 46, 108, -20, 7, -2},
+ {-2, 6, -15, 43, 110, -19, 7, -2},
+ {-2, 6, -14, 41, 111, -19, 7, -2},
+ {-1, 5, -13, 38, 113, -18, 6, -2},
+ {-1, 5, -12, 35, 114, -17, 6, -2},
+ {-1, 5, -12, 33, 116, -17, 6, -2},
+ {-1, 4, -11, 30, 118, -16, 5, -1},
+ {-1, 4, -10, 27, 119, -15, 5, -1},
+ {-1, 4, -9, 25, 120, -14, 5, -2},
+ {-1, 3, -8, 23, 121, -13, 4, -1},
+ {-1, 3, -7, 20, 122, -12, 4, -1},
+ {-1, 3, -7, 18, 123, -11, 4, -1},
+ {-1, 3, -6, 16, 124, -10, 3, -1},
+ {-1, 2, -5, 13, 125, -8, 3, -1},
+ {-1, 2, -4, 11, 126, -7, 2, -1},
+ {0, 1, -3, 8, 126, -6, 2, 0},
+ {0, 1, -2, 6, 127, -5, 1, 0},
+ {0, 1, -2, 4, 127, -3, 1, 0},
+ {0, 0, 0, 2, 127, -1, 0, 0},
+ // [1, 2).
+ {0, 0, 0, 1, 127, 0, 0, 0},
+ {0, 0, 0, -1, 127, 2, 0, 0},
+ {0, 0, 1, -3, 127, 4, -1, 0},
+ {0, 0, 1, -4, 126, 6, -2, 1},
+ {0, 0, 1, -5, 126, 8, -3, 1},
+ {0, 0, 1, -6, 125, 11, -4, 1},
+ {0, 0, 1, -7, 124, 13, -4, 1},
+ {0, 0, 2, -8, 123, 15, -5, 1},
+ {0, 0, 2, -9, 122, 18, -6, 1},
+ {0, 0, 2, -10, 121, 20, -6, 1},
+ {0, 0, 2, -11, 120, 22, -7, 2},
+ {0, 0, 2, -12, 119, 25, -8, 2},
+ {0, 0, 3, -13, 117, 27, -8, 2},
+ {0, 0, 3, -13, 116, 29, -9, 2},
+ {0, 0, 3, -14, 114, 32, -10, 3},
+ {0, 0, 3, -15, 113, 35, -10, 2},
+ {0, 0, 3, -15, 111, 37, -11, 3},
+ {0, 0, 3, -16, 109, 40, -11, 3},
+ {0, 0, 3, -16, 108, 42, -12, 3},
+ {0, 0, 4, -17, 106, 45, -13, 3},
+ {0, 0, 4, -17, 104, 47, -13, 3},
+ {0, 0, 4, -17, 102, 50, -14, 3},
+ {0, 0, 4, -17, 100, 52, -14, 3},
+ {0, 0, 4, -18, 98, 55, -15, 4},
+ {0, 0, 4, -18, 96, 58, -15, 3},
+ {0, 0, 4, -18, 94, 60, -16, 4},
+ {0, 0, 4, -18, 91, 63, -16, 4},
+ {0, 0, 4, -18, 89, 65, -16, 4},
+ {0, 0, 4, -18, 87, 68, -17, 4},
+ {0, 0, 4, -18, 85, 70, -17, 4},
+ {0, 0, 4, -18, 82, 73, -17, 4},
+ {0, 0, 4, -18, 80, 75, -17, 4},
+ {0, 0, 4, -18, 78, 78, -18, 4},
+ {0, 0, 4, -17, 75, 80, -18, 4},
+ {0, 0, 4, -17, 73, 82, -18, 4},
+ {0, 0, 4, -17, 70, 85, -18, 4},
+ {0, 0, 4, -17, 68, 87, -18, 4},
+ {0, 0, 4, -16, 65, 89, -18, 4},
+ {0, 0, 4, -16, 63, 91, -18, 4},
+ {0, 0, 4, -16, 60, 94, -18, 4},
+ {0, 0, 3, -15, 58, 96, -18, 4},
+ {0, 0, 4, -15, 55, 98, -18, 4},
+ {0, 0, 3, -14, 52, 100, -17, 4},
+ {0, 0, 3, -14, 50, 102, -17, 4},
+ {0, 0, 3, -13, 47, 104, -17, 4},
+ {0, 0, 3, -13, 45, 106, -17, 4},
+ {0, 0, 3, -12, 42, 108, -16, 3},
+ {0, 0, 3, -11, 40, 109, -16, 3},
+ {0, 0, 3, -11, 37, 111, -15, 3},
+ {0, 0, 2, -10, 35, 113, -15, 3},
+ {0, 0, 3, -10, 32, 114, -14, 3},
+ {0, 0, 2, -9, 29, 116, -13, 3},
+ {0, 0, 2, -8, 27, 117, -13, 3},
+ {0, 0, 2, -8, 25, 119, -12, 2},
+ {0, 0, 2, -7, 22, 120, -11, 2},
+ {0, 0, 1, -6, 20, 121, -10, 2},
+ {0, 0, 1, -6, 18, 122, -9, 2},
+ {0, 0, 1, -5, 15, 123, -8, 2},
+ {0, 0, 1, -4, 13, 124, -7, 1},
+ {0, 0, 1, -4, 11, 125, -6, 1},
+ {0, 0, 1, -3, 8, 126, -5, 1},
+ {0, 0, 1, -2, 6, 126, -4, 1},
+ {0, 0, 0, -1, 4, 127, -3, 1},
+ {0, 0, 0, 0, 2, 127, -1, 0},
+ // dummy, replicate row index 191.
+ {0, 0, 0, 0, 2, 127, -1, 0}};
+
+// Every value in |kSubPixelFilters| is even. Divide by 2 to simplify
+// calculations by reducing the range by 1 bit.
+alignas(8) const int8_t kHalfSubPixelFilters[6][16][8] = {
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 1, -3, 63, 4, -1, 0, 0},
+ {0, 1, -5, 61, 9, -2, 0, 0},
+ {0, 1, -6, 58, 14, -4, 1, 0},
+ {0, 1, -7, 55, 19, -5, 1, 0},
+ {0, 1, -7, 51, 24, -6, 1, 0},
+ {0, 1, -8, 47, 29, -6, 1, 0},
+ {0, 1, -7, 42, 33, -6, 1, 0},
+ {0, 1, -7, 38, 38, -7, 1, 0},
+ {0, 1, -6, 33, 42, -7, 1, 0},
+ {0, 1, -6, 29, 47, -8, 1, 0},
+ {0, 1, -6, 24, 51, -7, 1, 0},
+ {0, 1, -5, 19, 55, -7, 1, 0},
+ {0, 1, -4, 14, 58, -6, 1, 0},
+ {0, 0, -2, 9, 61, -5, 1, 0},
+ {0, 0, -1, 4, 63, -3, 1, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 1, 14, 31, 17, 1, 0, 0},
+ {0, 0, 13, 31, 18, 2, 0, 0},
+ {0, 0, 11, 31, 20, 2, 0, 0},
+ {0, 0, 10, 30, 21, 3, 0, 0},
+ {0, 0, 9, 29, 22, 4, 0, 0},
+ {0, 0, 8, 28, 23, 5, 0, 0},
+ {0, -1, 8, 27, 24, 6, 0, 0},
+ {0, -1, 7, 26, 26, 7, -1, 0},
+ {0, 0, 6, 24, 27, 8, -1, 0},
+ {0, 0, 5, 23, 28, 8, 0, 0},
+ {0, 0, 4, 22, 29, 9, 0, 0},
+ {0, 0, 3, 21, 30, 10, 0, 0},
+ {0, 0, 2, 20, 31, 11, 0, 0},
+ {0, 0, 2, 18, 31, 13, 0, 0},
+ {0, 0, 1, 17, 31, 14, 1, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {-1, 1, -3, 63, 4, -1, 1, 0},
+ {-1, 3, -6, 62, 8, -3, 2, -1},
+ {-1, 4, -9, 60, 13, -5, 3, -1},
+ {-2, 5, -11, 58, 19, -7, 3, -1},
+ {-2, 5, -11, 54, 24, -9, 4, -1},
+ {-2, 5, -12, 50, 30, -10, 4, -1},
+ {-2, 5, -12, 45, 35, -11, 5, -1},
+ {-2, 6, -12, 40, 40, -12, 6, -2},
+ {-1, 5, -11, 35, 45, -12, 5, -2},
+ {-1, 4, -10, 30, 50, -12, 5, -2},
+ {-1, 4, -9, 24, 54, -11, 5, -2},
+ {-1, 3, -7, 19, 58, -11, 5, -2},
+ {-1, 3, -5, 13, 60, -9, 4, -1},
+ {-1, 2, -3, 8, 62, -6, 3, -1},
+ {0, 1, -1, 4, 63, -3, 1, -1}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 0, 0, 60, 4, 0, 0, 0},
+ {0, 0, 0, 56, 8, 0, 0, 0},
+ {0, 0, 0, 52, 12, 0, 0, 0},
+ {0, 0, 0, 48, 16, 0, 0, 0},
+ {0, 0, 0, 44, 20, 0, 0, 0},
+ {0, 0, 0, 40, 24, 0, 0, 0},
+ {0, 0, 0, 36, 28, 0, 0, 0},
+ {0, 0, 0, 32, 32, 0, 0, 0},
+ {0, 0, 0, 28, 36, 0, 0, 0},
+ {0, 0, 0, 24, 40, 0, 0, 0},
+ {0, 0, 0, 20, 44, 0, 0, 0},
+ {0, 0, 0, 16, 48, 0, 0, 0},
+ {0, 0, 0, 12, 52, 0, 0, 0},
+ {0, 0, 0, 8, 56, 0, 0, 0},
+ {0, 0, 0, 4, 60, 0, 0, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 0, -2, 63, 4, -1, 0, 0},
+ {0, 0, -4, 61, 9, -2, 0, 0},
+ {0, 0, -5, 58, 14, -3, 0, 0},
+ {0, 0, -6, 55, 19, -4, 0, 0},
+ {0, 0, -6, 51, 24, -5, 0, 0},
+ {0, 0, -7, 47, 29, -5, 0, 0},
+ {0, 0, -6, 42, 33, -5, 0, 0},
+ {0, 0, -6, 38, 38, -6, 0, 0},
+ {0, 0, -5, 33, 42, -6, 0, 0},
+ {0, 0, -5, 29, 47, -7, 0, 0},
+ {0, 0, -5, 24, 51, -6, 0, 0},
+ {0, 0, -4, 19, 55, -6, 0, 0},
+ {0, 0, -3, 14, 58, -5, 0, 0},
+ {0, 0, -2, 9, 61, -4, 0, 0},
+ {0, 0, -1, 4, 63, -2, 0, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 0, 15, 31, 17, 1, 0, 0},
+ {0, 0, 13, 31, 18, 2, 0, 0},
+ {0, 0, 11, 31, 20, 2, 0, 0},
+ {0, 0, 10, 30, 21, 3, 0, 0},
+ {0, 0, 9, 29, 22, 4, 0, 0},
+ {0, 0, 8, 28, 23, 5, 0, 0},
+ {0, 0, 7, 27, 24, 6, 0, 0},
+ {0, 0, 6, 26, 26, 6, 0, 0},
+ {0, 0, 6, 24, 27, 7, 0, 0},
+ {0, 0, 5, 23, 28, 8, 0, 0},
+ {0, 0, 4, 22, 29, 9, 0, 0},
+ {0, 0, 3, 21, 30, 10, 0, 0},
+ {0, 0, 2, 20, 31, 11, 0, 0},
+ {0, 0, 2, 18, 31, 13, 0, 0},
+ {0, 0, 1, 17, 31, 15, 0, 0}}};
+
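A hedged check of the halving described above the table (the function name is invented): doubling any row of kHalfSubPixelFilters recovers 7-bit filter taps that sum to 128 (1 << kFilterBits), so every half-filter row sums to 64.

```cpp
#include <cassert>

#include "src/utils/constants.h"

namespace libgav1 {

// Illustrative only: each half-filter row sums to 64; doubling it yields the
// original filter whose taps sum to 128 (1 << kFilterBits).
inline void CheckHalfSubPixelFilterSums() {
  for (int filter = 0; filter < 6; ++filter) {
    for (int subpixel = 0; subpixel < 16; ++subpixel) {
      int sum = 0;
      for (int tap = 0; tap < kSubPixelTaps; ++tap) {
        sum += kHalfSubPixelFilters[filter][subpixel][tap];
      }
      assert(sum == 64);
      assert(2 * sum == (1 << kFilterBits));
    }
  }
}

}  // namespace libgav1
```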
+// Absolute values of |kHalfSubPixelFilters|. Used in situations where we know
+// the pattern of the signs and account for it in other ways.
+const uint8_t kAbsHalfSubPixelFilters[6][16][8] = {
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 1, 3, 63, 4, 1, 0, 0},
+ {0, 1, 5, 61, 9, 2, 0, 0},
+ {0, 1, 6, 58, 14, 4, 1, 0},
+ {0, 1, 7, 55, 19, 5, 1, 0},
+ {0, 1, 7, 51, 24, 6, 1, 0},
+ {0, 1, 8, 47, 29, 6, 1, 0},
+ {0, 1, 7, 42, 33, 6, 1, 0},
+ {0, 1, 7, 38, 38, 7, 1, 0},
+ {0, 1, 6, 33, 42, 7, 1, 0},
+ {0, 1, 6, 29, 47, 8, 1, 0},
+ {0, 1, 6, 24, 51, 7, 1, 0},
+ {0, 1, 5, 19, 55, 7, 1, 0},
+ {0, 1, 4, 14, 58, 6, 1, 0},
+ {0, 0, 2, 9, 61, 5, 1, 0},
+ {0, 0, 1, 4, 63, 3, 1, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 1, 14, 31, 17, 1, 0, 0},
+ {0, 0, 13, 31, 18, 2, 0, 0},
+ {0, 0, 11, 31, 20, 2, 0, 0},
+ {0, 0, 10, 30, 21, 3, 0, 0},
+ {0, 0, 9, 29, 22, 4, 0, 0},
+ {0, 0, 8, 28, 23, 5, 0, 0},
+ {0, 1, 8, 27, 24, 6, 0, 0},
+ {0, 1, 7, 26, 26, 7, 1, 0},
+ {0, 0, 6, 24, 27, 8, 1, 0},
+ {0, 0, 5, 23, 28, 8, 0, 0},
+ {0, 0, 4, 22, 29, 9, 0, 0},
+ {0, 0, 3, 21, 30, 10, 0, 0},
+ {0, 0, 2, 20, 31, 11, 0, 0},
+ {0, 0, 2, 18, 31, 13, 0, 0},
+ {0, 0, 1, 17, 31, 14, 1, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {1, 1, 3, 63, 4, 1, 1, 0},
+ {1, 3, 6, 62, 8, 3, 2, 1},
+ {1, 4, 9, 60, 13, 5, 3, 1},
+ {2, 5, 11, 58, 19, 7, 3, 1},
+ {2, 5, 11, 54, 24, 9, 4, 1},
+ {2, 5, 12, 50, 30, 10, 4, 1},
+ {2, 5, 12, 45, 35, 11, 5, 1},
+ {2, 6, 12, 40, 40, 12, 6, 2},
+ {1, 5, 11, 35, 45, 12, 5, 2},
+ {1, 4, 10, 30, 50, 12, 5, 2},
+ {1, 4, 9, 24, 54, 11, 5, 2},
+ {1, 3, 7, 19, 58, 11, 5, 2},
+ {1, 3, 5, 13, 60, 9, 4, 1},
+ {1, 2, 3, 8, 62, 6, 3, 1},
+ {0, 1, 1, 4, 63, 3, 1, 1}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 0, 0, 60, 4, 0, 0, 0},
+ {0, 0, 0, 56, 8, 0, 0, 0},
+ {0, 0, 0, 52, 12, 0, 0, 0},
+ {0, 0, 0, 48, 16, 0, 0, 0},
+ {0, 0, 0, 44, 20, 0, 0, 0},
+ {0, 0, 0, 40, 24, 0, 0, 0},
+ {0, 0, 0, 36, 28, 0, 0, 0},
+ {0, 0, 0, 32, 32, 0, 0, 0},
+ {0, 0, 0, 28, 36, 0, 0, 0},
+ {0, 0, 0, 24, 40, 0, 0, 0},
+ {0, 0, 0, 20, 44, 0, 0, 0},
+ {0, 0, 0, 16, 48, 0, 0, 0},
+ {0, 0, 0, 12, 52, 0, 0, 0},
+ {0, 0, 0, 8, 56, 0, 0, 0},
+ {0, 0, 0, 4, 60, 0, 0, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 0, 2, 63, 4, 1, 0, 0},
+ {0, 0, 4, 61, 9, 2, 0, 0},
+ {0, 0, 5, 58, 14, 3, 0, 0},
+ {0, 0, 6, 55, 19, 4, 0, 0},
+ {0, 0, 6, 51, 24, 5, 0, 0},
+ {0, 0, 7, 47, 29, 5, 0, 0},
+ {0, 0, 6, 42, 33, 5, 0, 0},
+ {0, 0, 6, 38, 38, 6, 0, 0},
+ {0, 0, 5, 33, 42, 6, 0, 0},
+ {0, 0, 5, 29, 47, 7, 0, 0},
+ {0, 0, 5, 24, 51, 6, 0, 0},
+ {0, 0, 4, 19, 55, 6, 0, 0},
+ {0, 0, 3, 14, 58, 5, 0, 0},
+ {0, 0, 2, 9, 61, 4, 0, 0},
+ {0, 0, 1, 4, 63, 2, 0, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 0, 15, 31, 17, 1, 0, 0},
+ {0, 0, 13, 31, 18, 2, 0, 0},
+ {0, 0, 11, 31, 20, 2, 0, 0},
+ {0, 0, 10, 30, 21, 3, 0, 0},
+ {0, 0, 9, 29, 22, 4, 0, 0},
+ {0, 0, 8, 28, 23, 5, 0, 0},
+ {0, 0, 7, 27, 24, 6, 0, 0},
+ {0, 0, 6, 26, 26, 6, 0, 0},
+ {0, 0, 6, 24, 27, 7, 0, 0},
+ {0, 0, 5, 23, 28, 8, 0, 0},
+ {0, 0, 4, 22, 29, 9, 0, 0},
+ {0, 0, 3, 21, 30, 10, 0, 0},
+ {0, 0, 2, 20, 31, 11, 0, 0},
+ {0, 0, 2, 18, 31, 13, 0, 0},
+ {0, 0, 1, 17, 31, 15, 0, 0}}};
+
+// 9.3 -- Dr_Intra_Derivative[]
+// This is a more compact version of the table from the spec. The value
+// angle / 2 - 1 is used as the lookup index (a small sketch follows the
+// table). Note that angle / 3 - 1 would work too, but the calculation becomes
+// more costly.
+const int16_t kDirectionalIntraPredictorDerivative[44] = {
+ // Approx angle
+ 1023, 0, // 3, ...
+ 547, // 6, ...
+ 372, 0, 0, // 9, ...
+ 273, // 14, ...
+ 215, 0, // 17, ...
+ 178, // 20, ...
+ 151, 0, // 23, ... (113 & 203 are base angles)
+ 132, // 26, ...
+ 116, 0, // 29, ...
+ 102, 0, // 32, ...
+ 90, // 36, ...
+ 80, 0, // 39, ...
+ 71, // 42, ...
+ 64, 0, // 45, ... (45 & 135 are base angles)
+ 57, // 48, ...
+ 51, 0, // 51, ...
+ 45, 0, // 54, ...
+ 40, // 58, ...
+ 35, 0, // 61, ...
+ 31, // 64, ...
+ 27, 0, // 67, ... (67 & 157 are base angles)
+ 23, // 70, ...
+ 19, 0, // 73, ...
+ 15, 0, // 76, ...
+ 11, 0, // 81, ...
+ 7, // 84, ...
+ 3, // 87, ...
+};
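A hedged sketch of the angle / 2 - 1 lookup mentioned above the table (GetDirectionalDerivative is an invented helper name; the decoder's own indexing logic may differ in detail):

```cpp
#include <cassert>
#include <cstdint>

#include "src/utils/constants.h"

namespace libgav1 {

// Illustrative only: maps a prediction angle in degrees to its derivative.
// The 45 degree base angle maps to index 45 / 2 - 1 = 21, whose entry is 64.
inline int16_t GetDirectionalDerivative(int angle) {
  return kDirectionalIntraPredictorDerivative[angle / 2 - 1];
}

inline void CheckDirectionalDerivativeExample() {
  assert(GetDirectionalDerivative(45) == 64);
}

}  // namespace libgav1
```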
+
+const uint8_t kDeblockFilterLevelIndex[kMaxPlanes][kNumLoopFilterTypes] = {
+ {0, 1}, {2, 2}, {3, 3}};
+
+} // namespace libgav1
diff --git a/src/utils/constants.h b/src/utils/constants.h
new file mode 100644
index 0000000..34cf56d
--- /dev/null
+++ b/src/utils/constants.h
@@ -0,0 +1,744 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_CONSTANTS_H_
+#define LIBGAV1_SRC_UTILS_CONSTANTS_H_
+
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/utils/bit_mask_set.h"
+
+namespace libgav1 {
+
+// Returns the number of elements between begin (inclusive) and end (inclusive).
+constexpr int EnumRangeLength(int begin, int end) { return end - begin + 1; }
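Because both endpoints are inclusive, a quick hedged check (the values are chosen only for illustration and the declaration above is assumed to be in scope):

```cpp
// Illustrative only: an inclusive range such as [2, 5] has 4 elements.
static_assert(EnumRangeLength(2, 5) == 4, "inclusive on both ends");
static_assert(EnumRangeLength(0, 0) == 1, "a single-element range");
```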
+
+enum {
+// Maximum number of threads that the library will ever create.
+#if defined(LIBGAV1_MAX_THREADS) && LIBGAV1_MAX_THREADS > 0
+ kMaxThreads = LIBGAV1_MAX_THREADS
+#else
+ kMaxThreads = 128
+#endif
+}; // anonymous enum
+
+enum {
+ kInvalidMvValue = -32768,
+ kCdfMaxProbability = 32768,
+ kBlockWidthCount = 5,
+ kMaxSegments = 8,
+ kMinQuantizer = 0,
+ kMinLossyQuantizer = 1,
+ kMaxQuantizer = 255,
+ // Quantizer matrix is used only when level < 15.
+ kNumQuantizerLevelsForQuantizerMatrix = 15,
+ kFrameLfCount = 4,
+ kMaxLoopFilterValue = 63,
+ kNum4x4In64x64 = 256,
+ kMaxAngleDelta = 3,
+ kDirectionalIntraModes = 8,
+ kMaxSuperBlockSizeLog2 = 7,
+ kMinSuperBlockSizeLog2 = 6,
+ kGlobalMotionReadControl = 3,
+ kSuperResScaleNumerator = 8,
+ kBooleanSymbolCount = 2,
+ kRestorationTypeSymbolCount = 3,
+ kSgrProjParamsBits = 4,
+ kSgrProjPrecisionBits = 7,
+ // Padding on left and right side of a restoration block.
+ // 3 is enough, but padding to 4 is more efficient, and makes the temporary
+ // source buffer 8-pixel aligned.
+ kRestorationHorizontalBorder = 4,
+ // Padding on top and bottom side of a restoration block.
+ kRestorationVerticalBorder = 2,
+ kCdefBorder = 2, // Padding on each side of a cdef block.
+ kConvolveBorderLeftTop = 3, // Left/top padding of a convolve block.
+ // Right/bottom padding of a convolve block. This needs to be 4 at minimum,
+ // but was increased to simplify the SIMD loads in
+ // ConvolveCompoundScale2D_NEON() and ConvolveScale2D_NEON().
+ kConvolveBorderRight = 8,
+ kConvolveBorderBottom = 4,
+ kSubPixelTaps = 8,
+ kWienerFilterBits = 7,
+ kWienerFilterTaps = 7,
+ kMaxPaletteSize = 8,
+ kMinPaletteSize = 2,
+ kMaxPaletteSquare = 64,
+ kBorderPixels = 64,
+ // The final blending process for film grain needs room to overwrite and read
+ // with SIMD instructions. The maximum overwrite is 7 pixels, but the border
+ // is required to be a multiple of 32 by YuvBuffer::Realloc, so that
+ // subsampled chroma borders are 16-aligned.
+ kBorderPixelsFilmGrain = 32,
+ // These constants are the minimum left, right, top, and bottom border sizes
+ // in pixels as an extension of the frame boundary. The minimum border sizes
+ // are derived from the following requirements:
+ // - Warp_C() may read up to 13 pixels before or after a row.
+ // - Warp_NEON() may read up to 13 pixels before a row. It may read up to 14
+ // pixels after a row, but the value of the last read pixel is not used.
+ // - Warp_C() and Warp_NEON() may read up to 13 pixels above the top row and
+ // 13 pixels below the bottom row.
+ kMinLeftBorderPixels = 13,
+ kMinRightBorderPixels = 13,
+ kMinTopBorderPixels = 13,
+ kMinBottomBorderPixels = 13,
+ kWarpedModelPrecisionBits = 16,
+ kMaxRefMvStackSize = 8,
+ kMaxLeastSquaresSamples = 8,
+ kMaxTemporalMvCandidates = 19,
+  // The SIMD implementations of motion vector projection functions always
+ // process 2 or 4 elements together, so we pad the corresponding buffers to
+ // size 20.
+ kMaxTemporalMvCandidatesWithPadding = 20,
+ kMaxSuperBlockSizeInPixels = 128,
+ kMaxScaledSuperBlockSizeInPixels = 128 * 2,
+ kMaxSuperBlockSizeSquareInPixels = 128 * 128,
+ kNum4x4InLoopFilterUnit = 16,
+ kNum4x4InLoopRestorationUnit = 16,
+ kProjectionMvClamp = (1 << 14) - 1, // == 16383
+ kProjectionMvMaxHorizontalOffset = 8,
+ kCdefUnitSize = 64,
+ kCdefUnitSizeWithBorders = kCdefUnitSize + 2 * kCdefBorder,
+ kRestorationUnitOffset = 8,
+ // Loop restoration's processing unit size is fixed as 64x64.
+ kRestorationUnitHeight = 64,
+ kRestorationUnitWidth = 256,
+ kRestorationUnitHeightWithBorders =
+ kRestorationUnitHeight + 2 * kRestorationVerticalBorder,
+ kRestorationUnitWidthWithBorders =
+ kRestorationUnitWidth + 2 * kRestorationHorizontalBorder,
+ kSuperResFilterBits = 6,
+ kSuperResFilterShifts = 1 << kSuperResFilterBits,
+ kSuperResFilterTaps = 8,
+ kSuperResScaleBits = 14,
+ kSuperResExtraBits = kSuperResScaleBits - kSuperResFilterBits,
+ kSuperResScaleMask = (1 << 14) - 1,
+ kSuperResHorizontalBorder = 4,
+ kSuperResVerticalBorder = 1,
+ // The SIMD implementations of superres calculate up to 15 extra upscaled
+  // pixels, which will over-read up to 15 downscaled pixels at the end of each
+ // row. Set the padding to 16 for alignment purposes.
+ kSuperResHorizontalPadding = 16,
+ // TODO(chengchen): consider merging these constants:
+  // kFilterBits, kWienerFilterBits, and kSgrProjPrecisionBits, which are all
+  // 7. They are designed to match AV1 convolution, which increases coefficient
+  // values by up to 7 bits. We could combine them and use kFilterBits only.
+ kFilterBits = 7,
+  // A sub pixel represents a pixel location that is not at an integer
+  // position. Sub pixel positions are expressed in 1/16 (1 << kSubPixelBits)
+  // units of an integer pixel. Sub pixel values are interpolated from adjacent
+  // integer pixel values; the interpolation is a filtering process (see the
+  // sketch after this enum).
+ kSubPixelBits = 4,
+ kSubPixelMask = (1 << kSubPixelBits) - 1,
+ // Precision bits when computing inter prediction locations.
+ kScaleSubPixelBits = 10,
+ kWarpParamRoundingBits = 6,
+ // Number of fractional bits of lookup in divisor lookup table.
+ kDivisorLookupBits = 8,
+ // Number of fractional bits of entries in divisor lookup table.
+ kDivisorLookupPrecisionBits = 14,
+ // Number of phases used in warped filtering.
+ kWarpedPixelPrecisionShifts = 1 << 6,
+ kResidualPaddingVertical = 4,
+ kWedgeMaskMasterSize = 64,
+ kMaxFrameDistance = 31,
+ kReferenceFrameScalePrecision = 14,
+ kNumWienerCoefficients = 3,
+ kLoopFilterMaxModeDeltas = 2,
+ kMaxCdefStrengths = 8,
+ kCdefLargeValue = 0x4000, // Used to indicate where CDEF is not available.
+ kMaxTileColumns = 64,
+ kMaxTileRows = 64,
+ kMaxOperatingPoints = 32,
+ // There can be a maximum of 4 spatial layers and 8 temporal layers.
+ kMaxLayers = 32,
+ // The cache line size should ideally be queried at run time. 64 is a common
+  // cache line size of x86 CPUs. Web searches showed the cache line size of
+  // ARM CPUs is 32 or 64 bytes, so aligning to a 64-byte boundary will work
+  // for all CPUs that we care about, even though it is excessive for some ARM
+  // CPUs.
+ //
+ // On Linux, the cache line size can be looked up with the command:
+ // getconf LEVEL1_DCACHE_LINESIZE
+ kCacheLineSize = 64,
+}; // anonymous enum
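The sub pixel sketch referenced above: a minimal, hedged illustration (the function name and the position value are invented) of splitting a position expressed in 1/16-pel units into an integer pixel and a fractional offset, where the fraction selects a row of the sub-pixel filters.

```cpp
#include <cassert>

#include "src/utils/constants.h"

namespace libgav1 {

// Illustrative only: position 37 in 1/16-pel units is pixel 2 plus a
// fractional offset of 5/16; the fraction indexes a sub-pixel filter row.
inline void CheckSubPixelDecomposition() {
  const int position = 37;  // Hypothetical position in 1/16-pel units.
  const int integer_pixel = position >> kSubPixelBits;  // == 2
  const int fraction = position & kSubPixelMask;        // == 5
  assert(integer_pixel == 2);
  assert(fraction == 5);
  assert(position == (integer_pixel << kSubPixelBits) + fraction);
}

}  // namespace libgav1
```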
+
+enum FrameType : uint8_t {
+ kFrameKey,
+ kFrameInter,
+ kFrameIntraOnly,
+ kFrameSwitch
+};
+
+enum Plane : uint8_t { kPlaneY, kPlaneU, kPlaneV };
+enum : uint8_t { kMaxPlanesMonochrome = kPlaneY + 1, kMaxPlanes = kPlaneV + 1 };
+
+// The plane types, called luma and chroma in the spec.
+enum PlaneType : uint8_t { kPlaneTypeY, kPlaneTypeUV, kNumPlaneTypes };
+
+enum ReferenceFrameType : int8_t {
+ kReferenceFrameNone = -1,
+ kReferenceFrameIntra,
+ kReferenceFrameLast,
+ kReferenceFrameLast2,
+ kReferenceFrameLast3,
+ kReferenceFrameGolden,
+ kReferenceFrameBackward,
+ kReferenceFrameAlternate2,
+ kReferenceFrameAlternate,
+ kNumReferenceFrameTypes,
+ kNumInterReferenceFrameTypes =
+ EnumRangeLength(kReferenceFrameLast, kReferenceFrameAlternate),
+ kNumForwardReferenceTypes =
+ EnumRangeLength(kReferenceFrameLast, kReferenceFrameGolden),
+ kNumBackwardReferenceTypes =
+ EnumRangeLength(kReferenceFrameBackward, kReferenceFrameAlternate)
+};
+
+enum {
+ // Unidirectional compound reference pairs that are signaled explicitly:
+ // {kReferenceFrameLast, kReferenceFrameLast2},
+ // {kReferenceFrameLast, kReferenceFrameLast3},
+ // {kReferenceFrameLast, kReferenceFrameGolden},
+ // {kReferenceFrameBackward, kReferenceFrameAlternate}
+ kExplicitUnidirectionalCompoundReferences = 4,
+ // Other unidirectional compound reference pairs:
+ // {kReferenceFrameLast2, kReferenceFrameLast3},
+ // {kReferenceFrameLast2, kReferenceFrameGolden},
+ // {kReferenceFrameLast3, kReferenceFrameGolden},
+ // {kReferenceFrameBackward, kReferenceFrameAlternate2},
+ // {kReferenceFrameAlternate2, kReferenceFrameAlternate}
+ kUnidirectionalCompoundReferences =
+ kExplicitUnidirectionalCompoundReferences + 5,
+}; // anonymous enum
+
+enum BlockSize : uint8_t {
+ kBlock4x4,
+ kBlock4x8,
+ kBlock4x16,
+ kBlock8x4,
+ kBlock8x8,
+ kBlock8x16,
+ kBlock8x32,
+ kBlock16x4,
+ kBlock16x8,
+ kBlock16x16,
+ kBlock16x32,
+ kBlock16x64,
+ kBlock32x8,
+ kBlock32x16,
+ kBlock32x32,
+ kBlock32x64,
+ kBlock64x16,
+ kBlock64x32,
+ kBlock64x64,
+ kBlock64x128,
+ kBlock128x64,
+ kBlock128x128,
+ kMaxBlockSizes,
+ kBlockInvalid
+};
+
+// Partition types. R: Recursive
+//
+// None Horizontal Vertical Split
+// +-------+ +-------+ +---+---+ +---+---+
+// | | | | | | | | R | R |
+// | | +-------+ | | | +---+---+
+// | | | | | | | | R | R |
+// +-------+ +-------+ +---+---+ +---+---+
+//
+// Horizontal Horizontal Vertical Vertical
+// with top with bottom with left with right
+// split split split split
+// +---+---+ +-------+ +---+---+ +---+---+
+// | | | | | | | | | | |
+// +---+---+ +---+---+ +---+ | | +---+
+// | | | | | | | | | | |
+// +-------+ +---+---+ +---+---+ +---+---+
+//
+// Horizontal4 Vertical4
+// +-----+ +-+-+-+
+// +-----+ | | | |
+// +-----+ | | | |
+// +-----+ +-+-+-+
+enum Partition : uint8_t {
+ kPartitionNone,
+ kPartitionHorizontal,
+ kPartitionVertical,
+ kPartitionSplit,
+ kPartitionHorizontalWithTopSplit,
+ kPartitionHorizontalWithBottomSplit,
+ kPartitionVerticalWithLeftSplit,
+ kPartitionVerticalWithRightSplit,
+ kPartitionHorizontal4,
+ kPartitionVertical4
+};
+enum : uint8_t { kMaxPartitionTypes = kPartitionVertical4 + 1 };
+
+enum PredictionMode : uint8_t {
+ // Intra prediction modes.
+ kPredictionModeDc,
+ kPredictionModeVertical,
+ kPredictionModeHorizontal,
+ kPredictionModeD45,
+ kPredictionModeD135,
+ kPredictionModeD113,
+ kPredictionModeD157,
+ kPredictionModeD203,
+ kPredictionModeD67,
+ kPredictionModeSmooth,
+ kPredictionModeSmoothVertical,
+ kPredictionModeSmoothHorizontal,
+ kPredictionModePaeth,
+ kPredictionModeChromaFromLuma,
+ // Single inter prediction modes.
+ kPredictionModeNearestMv,
+ kPredictionModeNearMv,
+ kPredictionModeGlobalMv,
+ kPredictionModeNewMv,
+ // Compound inter prediction modes.
+ kPredictionModeNearestNearestMv,
+ kPredictionModeNearNearMv,
+ kPredictionModeNearestNewMv,
+ kPredictionModeNewNearestMv,
+ kPredictionModeNearNewMv,
+ kPredictionModeNewNearMv,
+ kPredictionModeGlobalGlobalMv,
+ kPredictionModeNewNewMv,
+ kNumPredictionModes,
+ kNumCompoundInterPredictionModes =
+ EnumRangeLength(kPredictionModeNearestNearestMv, kPredictionModeNewNewMv),
+ kIntraPredictionModesY =
+ EnumRangeLength(kPredictionModeDc, kPredictionModePaeth),
+ kIntraPredictionModesUV =
+ EnumRangeLength(kPredictionModeDc, kPredictionModeChromaFromLuma),
+ kPredictionModeInvalid = 255
+};
+
+enum InterIntraMode : uint8_t {
+ kInterIntraModeDc,
+ kInterIntraModeVertical,
+ kInterIntraModeHorizontal,
+ kInterIntraModeSmooth,
+ kNumInterIntraModes
+};
+
+enum MotionMode : uint8_t {
+ kMotionModeSimple,
+ kMotionModeObmc, // Overlapped block motion compensation.
+ kMotionModeLocalWarp,
+ kNumMotionModes
+};
+
+enum TxMode : uint8_t {
+ kTxModeOnly4x4,
+ kTxModeLargest,
+ kTxModeSelect,
+ kNumTxModes
+};
+
+// These enums are named as kType1Type2 where Type1 is the transform type for
+// the rows and Type2 is the transform type for the columns.
+enum TransformType : uint8_t {
+ kTransformTypeDctDct,
+ kTransformTypeAdstDct,
+ kTransformTypeDctAdst,
+ kTransformTypeAdstAdst,
+ kTransformTypeFlipadstDct,
+ kTransformTypeDctFlipadst,
+ kTransformTypeFlipadstFlipadst,
+ kTransformTypeAdstFlipadst,
+ kTransformTypeFlipadstAdst,
+ kTransformTypeIdentityIdentity,
+ kTransformTypeIdentityDct,
+ kTransformTypeDctIdentity,
+ kTransformTypeIdentityAdst,
+ kTransformTypeAdstIdentity,
+ kTransformTypeIdentityFlipadst,
+ kTransformTypeFlipadstIdentity,
+ kNumTransformTypes
+};
+
+constexpr BitMaskSet kTransformFlipColumnsMask(kTransformTypeFlipadstDct,
+ kTransformTypeFlipadstAdst,
+ kTransformTypeFlipadstIdentity,
+ kTransformTypeFlipadstFlipadst);
+constexpr BitMaskSet kTransformFlipRowsMask(kTransformTypeDctFlipadst,
+ kTransformTypeAdstFlipadst,
+ kTransformTypeIdentityFlipadst,
+ kTransformTypeFlipadstFlipadst);
+
+enum TransformSize : uint8_t {
+ kTransformSize4x4,
+ kTransformSize4x8,
+ kTransformSize4x16,
+ kTransformSize8x4,
+ kTransformSize8x8,
+ kTransformSize8x16,
+ kTransformSize8x32,
+ kTransformSize16x4,
+ kTransformSize16x8,
+ kTransformSize16x16,
+ kTransformSize16x32,
+ kTransformSize16x64,
+ kTransformSize32x8,
+ kTransformSize32x16,
+ kTransformSize32x32,
+ kTransformSize32x64,
+ kTransformSize64x16,
+ kTransformSize64x32,
+ kTransformSize64x64,
+ kNumTransformSizes
+};
+
+enum TransformSet : uint8_t {
+ // DCT Only (1).
+ kTransformSetDctOnly,
+ // 2D-DCT and 2D-ADST without flip (4) + Identity (1) + 1D Horizontal/Vertical
+ // DCT (2) = Total (7).
+ kTransformSetIntra1,
+ // 2D-DCT and 2D-ADST without flip (4) + Identity (1) = Total (5).
+ kTransformSetIntra2,
+ // All transforms = Total (16).
+ kTransformSetInter1,
+ // 2D-DCT and 2D-ADST with flip (9) + Identity (1) + 1D Horizontal/Vertical
+ // DCT (2) = Total (12).
+ kTransformSetInter2,
+ // DCT (1) + Identity (1) = Total (2).
+ kTransformSetInter3,
+ kNumTransformSets
+};
+
+enum TransformClass : uint8_t {
+ kTransformClass2D,
+ kTransformClassHorizontal,
+ kTransformClassVertical,
+ kNumTransformClasses
+};
+
+enum FilterIntraPredictor : uint8_t {
+ kFilterIntraPredictorDc,
+ kFilterIntraPredictorVertical,
+ kFilterIntraPredictorHorizontal,
+ kFilterIntraPredictorD157,
+ kFilterIntraPredictorPaeth,
+ kNumFilterIntraPredictors
+};
+
+enum ObmcDirection : uint8_t {
+ kObmcDirectionVertical,
+ kObmcDirectionHorizontal,
+ kNumObmcDirections
+};
+
+// In AV1 the name of the filter refers to the direction of filter application.
+// Horizontal refers to the column edge and vertical to the row edge.
+enum LoopFilterType : uint8_t {
+ kLoopFilterTypeVertical,
+ kLoopFilterTypeHorizontal,
+ kNumLoopFilterTypes
+};
+
+enum LoopFilterTransformSizeId : uint8_t {
+ kLoopFilterTransformSizeId4x4,
+ kLoopFilterTransformSizeId8x8,
+ kLoopFilterTransformSizeId16x16,
+ kNumLoopFilterTransformSizeIds
+};
+
+enum LoopRestorationType : uint8_t {
+ kLoopRestorationTypeNone,
+ kLoopRestorationTypeSwitchable,
+ kLoopRestorationTypeWiener,
+ kLoopRestorationTypeSgrProj, // self guided projection filter.
+ kNumLoopRestorationTypes
+};
+
+enum CompoundReferenceType : uint8_t {
+ kCompoundReferenceUnidirectional,
+ kCompoundReferenceBidirectional,
+ kNumCompoundReferenceTypes
+};
+
+enum CompoundPredictionType : uint8_t {
+ kCompoundPredictionTypeWedge,
+ kCompoundPredictionTypeDiffWeighted,
+ kCompoundPredictionTypeAverage,
+ kCompoundPredictionTypeIntra,
+ kCompoundPredictionTypeDistance,
+ kNumCompoundPredictionTypes,
+ // Number of compound prediction types that are explicitly signaled in the
+ // bitstream (in the compound_type syntax element).
+ kNumExplicitCompoundPredictionTypes = 2
+};
+
+enum InterpolationFilter : uint8_t {
+ kInterpolationFilterEightTap,
+ kInterpolationFilterEightTapSmooth,
+ kInterpolationFilterEightTapSharp,
+ kInterpolationFilterBilinear,
+ kInterpolationFilterSwitchable,
+ kNumInterpolationFilters,
+ // Number of interpolation filters that can be explicitly signaled in the
+ // compressed headers (when the uncompressed headers allow switchable
+ // interpolation filters) of the bitstream.
+ kNumExplicitInterpolationFilters = EnumRangeLength(
+ kInterpolationFilterEightTap, kInterpolationFilterEightTapSharp)
+};
+
+enum MvJointType : uint8_t {
+ kMvJointTypeZero,
+ kMvJointTypeHorizontalNonZeroVerticalZero,
+ kMvJointTypeHorizontalZeroVerticalNonZero,
+ kMvJointTypeNonZero,
+ kNumMvJointTypes
+};
+
+enum ObuType : int8_t {
+ kObuInvalid = -1,
+ kObuSequenceHeader = 1,
+ kObuTemporalDelimiter = 2,
+ kObuFrameHeader = 3,
+ kObuTileGroup = 4,
+ kObuMetadata = 5,
+ kObuFrame = 6,
+ kObuRedundantFrameHeader = 7,
+ kObuTileList = 8,
+ kObuPadding = 15,
+};
+
+//------------------------------------------------------------------------------
+// ToString()
+//
+// These functions are meant to be used only in debug logging and within tests.
+// They are defined inline to avoid including the strings in the release
+// library when logging is disabled; unreferenced functions will not be added to
+// any object file in that case.
+
+inline const char* ToString(const BlockSize size) {
+ switch (size) {
+ case kBlock4x4:
+ return "kBlock4x4";
+ case kBlock4x8:
+ return "kBlock4x8";
+ case kBlock4x16:
+ return "kBlock4x16";
+ case kBlock8x4:
+ return "kBlock8x4";
+ case kBlock8x8:
+ return "kBlock8x8";
+ case kBlock8x16:
+ return "kBlock8x16";
+ case kBlock8x32:
+ return "kBlock8x32";
+ case kBlock16x4:
+ return "kBlock16x4";
+ case kBlock16x8:
+ return "kBlock16x8";
+ case kBlock16x16:
+ return "kBlock16x16";
+ case kBlock16x32:
+ return "kBlock16x32";
+ case kBlock16x64:
+ return "kBlock16x64";
+ case kBlock32x8:
+ return "kBlock32x8";
+ case kBlock32x16:
+ return "kBlock32x16";
+ case kBlock32x32:
+ return "kBlock32x32";
+ case kBlock32x64:
+ return "kBlock32x64";
+ case kBlock64x16:
+ return "kBlock64x16";
+ case kBlock64x32:
+ return "kBlock64x32";
+ case kBlock64x64:
+ return "kBlock64x64";
+ case kBlock64x128:
+ return "kBlock64x128";
+ case kBlock128x64:
+ return "kBlock128x64";
+ case kBlock128x128:
+ return "kBlock128x128";
+ case kMaxBlockSizes:
+ return "kMaxBlockSizes";
+ case kBlockInvalid:
+ return "kBlockInvalid";
+ }
+ abort();
+}
+
+inline const char* ToString(const InterIntraMode mode) {
+ switch (mode) {
+ case kInterIntraModeDc:
+ return "kInterIntraModeDc";
+ case kInterIntraModeVertical:
+ return "kInterIntraModeVertical";
+ case kInterIntraModeHorizontal:
+ return "kInterIntraModeHorizontal";
+ case kInterIntraModeSmooth:
+ return "kInterIntraModeSmooth";
+ case kNumInterIntraModes:
+ return "kNumInterIntraModes";
+ }
+ abort();
+}
+
+inline const char* ToString(const ObmcDirection direction) {
+ switch (direction) {
+ case kObmcDirectionVertical:
+ return "kObmcDirectionVertical";
+ case kObmcDirectionHorizontal:
+ return "kObmcDirectionHorizontal";
+ case kNumObmcDirections:
+ return "kNumObmcDirections";
+ }
+ abort();
+}
+
+inline const char* ToString(const LoopRestorationType type) {
+ switch (type) {
+ case kLoopRestorationTypeNone:
+ return "kLoopRestorationTypeNone";
+ case kLoopRestorationTypeSwitchable:
+ return "kLoopRestorationTypeSwitchable";
+ case kLoopRestorationTypeWiener:
+ return "kLoopRestorationTypeWiener";
+ case kLoopRestorationTypeSgrProj:
+ return "kLoopRestorationTypeSgrProj";
+ case kNumLoopRestorationTypes:
+ return "kNumLoopRestorationTypes";
+ }
+ abort();
+}
+
+inline const char* ToString(const TransformType type) {
+ switch (type) {
+ case kTransformTypeDctDct:
+ return "kTransformTypeDctDct";
+ case kTransformTypeAdstDct:
+ return "kTransformTypeAdstDct";
+ case kTransformTypeDctAdst:
+ return "kTransformTypeDctAdst";
+ case kTransformTypeAdstAdst:
+ return "kTransformTypeAdstAdst";
+ case kTransformTypeFlipadstDct:
+ return "kTransformTypeFlipadstDct";
+ case kTransformTypeDctFlipadst:
+ return "kTransformTypeDctFlipadst";
+ case kTransformTypeFlipadstFlipadst:
+ return "kTransformTypeFlipadstFlipadst";
+ case kTransformTypeAdstFlipadst:
+ return "kTransformTypeAdstFlipadst";
+ case kTransformTypeFlipadstAdst:
+ return "kTransformTypeFlipadstAdst";
+ case kTransformTypeIdentityIdentity:
+ return "kTransformTypeIdentityIdentity";
+ case kTransformTypeIdentityDct:
+ return "kTransformTypeIdentityDct";
+ case kTransformTypeDctIdentity:
+ return "kTransformTypeDctIdentity";
+ case kTransformTypeIdentityAdst:
+ return "kTransformTypeIdentityAdst";
+ case kTransformTypeAdstIdentity:
+ return "kTransformTypeAdstIdentity";
+ case kTransformTypeIdentityFlipadst:
+ return "kTransformTypeIdentityFlipadst";
+ case kTransformTypeFlipadstIdentity:
+ return "kTransformTypeFlipadstIdentity";
+    // Case to quiet the compiler.
+ case kNumTransformTypes:
+ return "kNumTransformTypes";
+ }
+ abort();
+}
+
+//------------------------------------------------------------------------------
+
+extern const uint8_t k4x4WidthLog2[kMaxBlockSizes];
+
+extern const uint8_t k4x4HeightLog2[kMaxBlockSizes];
+
+extern const uint8_t kNum4x4BlocksWide[kMaxBlockSizes];
+
+extern const uint8_t kNum4x4BlocksHigh[kMaxBlockSizes];
+
+extern const uint8_t kBlockWidthPixels[kMaxBlockSizes];
+
+extern const uint8_t kBlockHeightPixels[kMaxBlockSizes];
+
+extern const BlockSize kSubSize[kMaxPartitionTypes][kMaxBlockSizes];
+
+extern const BlockSize kPlaneResidualSize[kMaxBlockSizes][2][2];
+
+extern const int16_t kProjectionMvDivisionLookup[kMaxFrameDistance + 1];
+
+extern const uint8_t kTransformWidth[kNumTransformSizes];
+
+extern const uint8_t kTransformHeight[kNumTransformSizes];
+
+extern const uint8_t kTransformWidth4x4[kNumTransformSizes];
+
+extern const uint8_t kTransformHeight4x4[kNumTransformSizes];
+
+extern const uint8_t kTransformWidthLog2[kNumTransformSizes];
+
+extern const uint8_t kTransformHeightLog2[kNumTransformSizes];
+
+extern const TransformSize kSplitTransformSize[kNumTransformSizes];
+
+// Square transform of size min(w,h).
+extern const TransformSize kTransformSizeSquareMin[kNumTransformSizes];
+
+// Square transform of size max(w,h).
+extern const TransformSize kTransformSizeSquareMax[kNumTransformSizes];
+
+extern const uint8_t kNumTransformTypesInSet[kNumTransformSets];
+
+extern const uint8_t kSgrProjParams[1 << kSgrProjParamsBits][4];
+
+extern const int8_t kSgrProjMultiplierMin[2];
+
+extern const int8_t kSgrProjMultiplierMax[2];
+
+extern const int8_t kWienerTapsMin[3];
+
+extern const int8_t kWienerTapsMax[3];
+
+extern const uint8_t kUpscaleFilterUnsigned[kSuperResFilterShifts]
+ [kSuperResFilterTaps];
+
+// An int8_t version of the kWarpedFilters array.
+// Note: The array could be removed with a performance penalty.
+extern const int8_t kWarpedFilters8[3 * kWarpedPixelPrecisionShifts + 1][8];
+
+extern const int16_t kWarpedFilters[3 * kWarpedPixelPrecisionShifts + 1][8];
+
+extern const int8_t kHalfSubPixelFilters[6][16][8];
+
+extern const uint8_t kAbsHalfSubPixelFilters[6][16][8];
+
+extern const int16_t kDirectionalIntraPredictorDerivative[44];
+
+extern const uint8_t kDeblockFilterLevelIndex[kMaxPlanes][kNumLoopFilterTypes];
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_CONSTANTS_H_
diff --git a/src/utils/cpu.cc b/src/utils/cpu.cc
new file mode 100644
index 0000000..a6b7057
--- /dev/null
+++ b/src/utils/cpu.cc
@@ -0,0 +1,84 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/cpu.h"
+
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#include <cpuid.h>
+#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+#include <immintrin.h> // _xgetbv
+#include <intrin.h>
+#endif
+
+namespace libgav1 {
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
+ defined(_M_X64)
+namespace {
+
+#if defined(__GNUC__)
+void CpuId(int leaf, uint32_t info[4]) {
+ __cpuid_count(leaf, 0 /*ecx=subleaf*/, info[0], info[1], info[2], info[3]);
+}
+
+uint64_t Xgetbv() {
+ const uint32_t ecx = 0; // ecx specifies the extended control register
+ uint32_t eax;
+ uint32_t edx;
+ __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(ecx));
+ return (static_cast<uint64_t>(edx) << 32) | eax;
+}
+#else // _MSC_VER
+void CpuId(int leaf, uint32_t info[4]) {
+ __cpuidex(reinterpret_cast<int*>(info), leaf, 0 /*ecx=subleaf*/);
+}
+
+uint64_t Xgetbv() { return _xgetbv(0); }
+#endif // __GNUC__
+
+} // namespace
+
+uint32_t GetCpuInfo() {
+ uint32_t info[4];
+
+ // Get the highest feature value cpuid supports
+ CpuId(0, info);
+ const int max_cpuid_value = info[0];
+ if (max_cpuid_value < 1) return 0;
+
+ CpuId(1, info);
+ uint32_t features = 0;
+ if ((info[3] & (1 << 26)) != 0) features |= kSSE2;
+ if ((info[2] & (1 << 9)) != 0) features |= kSSSE3;
+ if ((info[2] & (1 << 19)) != 0) features |= kSSE4_1;
+
+ // Bits 27 (OSXSAVE) & 28 (256-bit AVX)
+ if ((info[2] & (3 << 27)) == (3 << 27)) {
+ // XMM state and YMM state enabled by the OS
+ if ((Xgetbv() & 0x6) == 0x6) {
+ features |= kAVX;
+ if (max_cpuid_value >= 7) {
+ CpuId(7, info);
+ if ((info[1] & (1 << 5)) != 0) features |= kAVX2;
+ }
+ }
+ }
+
+ return features;
+}
+#else
+uint32_t GetCpuInfo() { return 0; }
+#endif // x86 || x86_64
+
+} // namespace libgav1
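A hedged usage sketch for GetCpuInfo(): callers test the returned mask against the CpuFeatures flags declared in src/utils/cpu.h (below) to choose a SIMD path. The program and its messages are illustrative only.

```cpp
#include <cstdint>
#include <cstdio>

#include "src/utils/cpu.h"

int main() {
  const uint32_t features = libgav1::GetCpuInfo();
  if ((features & libgav1::kAVX2) != 0) {
    std::printf("AVX2 code paths are available.\n");
  } else if ((features & libgav1::kSSE4_1) != 0) {
    std::printf("SSE4.1 code paths are available.\n");
  } else {
    std::printf("Falling back to portable C code paths.\n");
  }
  return 0;
}
```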
diff --git a/src/utils/cpu.h b/src/utils/cpu.h
new file mode 100644
index 0000000..630b251
--- /dev/null
+++ b/src/utils/cpu.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_CPU_H_
+#define LIBGAV1_SRC_UTILS_CPU_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+
+#if defined(__i386__) || defined(__x86_64__)
+#define LIBGAV1_X86
+#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+#define LIBGAV1_X86
+#define LIBGAV1_X86_MSVC
+#endif
+
+#if defined(LIBGAV1_X86)
+
+#if !defined(LIBGAV1_ENABLE_SSE4_1)
+#define LIBGAV1_ENABLE_SSE4_1 1
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+#if !defined(LIBGAV1_ENABLE_AVX2)
+#define LIBGAV1_ENABLE_AVX2 1
+#endif // !defined(LIBGAV1_ENABLE_AVX2)
+#else // !LIBGAV1_ENABLE_SSE4_1
+// Disable AVX2 when SSE4.1 is disabled as it may rely on shared components.
+#undef LIBGAV1_ENABLE_AVX2
+#define LIBGAV1_ENABLE_AVX2 0
+#endif // LIBGAV1_ENABLE_SSE4_1
+
+#else // !LIBGAV1_X86
+
+#undef LIBGAV1_ENABLE_AVX2
+#define LIBGAV1_ENABLE_AVX2 0
+#undef LIBGAV1_ENABLE_SSE4_1
+#define LIBGAV1_ENABLE_SSE4_1 0
+
+#endif // LIBGAV1_X86
+
+// For x86, LIBGAV1_TARGETING_* indicates that the source being built targets
+// (at least) that instruction set. This prevents disabling other instruction
+// sets when the current instruction set isn't a global target, e.g., when
+// building *_avx2.cc with -mavx2 but the remaining files without that flag.
+#if LIBGAV1_ENABLE_AVX2 && defined(__AVX2__)
+#define LIBGAV1_TARGETING_AVX2 1
+#else
+#define LIBGAV1_TARGETING_AVX2 0
+#endif
+
+// Note: using LIBGAV1_X86_MSVC here isn't completely correct for Visual
+// Studio, but MSVC provides no equivalent of __SSE4_1__.
+// LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS will be enabled in dsp.h to compensate.
+#if LIBGAV1_ENABLE_SSE4_1 && (defined(__SSE4_1__) || defined(LIBGAV1_X86_MSVC))
+#define LIBGAV1_TARGETING_SSE4_1 1
+#else
+#define LIBGAV1_TARGETING_SSE4_1 0
+#endif
+
+#undef LIBGAV1_X86
+
+#if !defined(LIBGAV1_ENABLE_NEON)
+// TODO(jzern): add support for _M_ARM64.
+#if defined(__ARM_NEON__) || defined(__aarch64__) || \
+ (defined(_MSC_VER) && defined(_M_ARM))
+#define LIBGAV1_ENABLE_NEON 1
+#else
+#define LIBGAV1_ENABLE_NEON 0
+#endif
+#endif // !defined(LIBGAV1_ENABLE_NEON)
+
+enum CpuFeatures : uint8_t {
+ kSSE2 = 1 << 0,
+#define LIBGAV1_CPU_SSE2 (1 << 0)
+ kSSSE3 = 1 << 1,
+#define LIBGAV1_CPU_SSSE3 (1 << 1)
+ kSSE4_1 = 1 << 2,
+#define LIBGAV1_CPU_SSE4_1 (1 << 2)
+ kAVX = 1 << 3,
+#define LIBGAV1_CPU_AVX (1 << 3)
+ kAVX2 = 1 << 4,
+#define LIBGAV1_CPU_AVX2 (1 << 4)
+ kNEON = 1 << 5,
+#define LIBGAV1_CPU_NEON (1 << 5)
+};
+
+// Returns a bit-wise OR of CpuFeatures supported by this platform.
+uint32_t GetCpuInfo();
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_CPU_H_
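
The LIBGAV1_TARGETING_* macros defined above are meant to guard whole translation units that are compiled with per-file instruction-set flags. A sketch of that pattern, with a hypothetical file name and placeholder comments (the real per-ISA dsp sources follow a similar shape):

// some_filter_sse4.cc (hypothetical), compiled with -msse4.1 or under MSVC.
#include "src/utils/cpu.h"

#if LIBGAV1_TARGETING_SSE4_1
#include <smmintrin.h>

namespace libgav1 {

// SSE4.1-specific kernels would live here. Because the guard tests what this
// translation unit itself targets, files built without -msse4.1 keep their
// own (non-SSE4.1) code paths enabled.

}  // namespace libgav1
#endif  // LIBGAV1_TARGETING_SSE4_1
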
diff --git a/src/utils/dynamic_buffer.h b/src/utils/dynamic_buffer.h
new file mode 100644
index 0000000..b51345a
--- /dev/null
+++ b/src/utils/dynamic_buffer.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_DYNAMIC_BUFFER_H_
+#define LIBGAV1_SRC_UTILS_DYNAMIC_BUFFER_H_
+
+#include <memory>
+#include <new>
+
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+template <typename T>
+class DynamicBuffer {
+ public:
+ T* get() { return buffer_.get(); }
+ const T* get() const { return buffer_.get(); }
+
+ // Resizes the buffer so that it can hold at least |size| elements. Existing
+ // contents will be destroyed when resizing to a larger size.
+ //
+ // Returns true on success. If Resize() returns false, then subsequent calls
+ // to get() will return nullptr.
+ bool Resize(size_t size) {
+ if (size <= size_) return true;
+ buffer_.reset(new (std::nothrow) T[size]);
+ if (buffer_ == nullptr) {
+ size_ = 0;
+ return false;
+ }
+ size_ = size;
+ return true;
+ }
+
+ private:
+ std::unique_ptr<T[]> buffer_;
+ size_t size_ = 0;
+};
+
+template <typename T, int alignment>
+class AlignedDynamicBuffer {
+ public:
+ T* get() { return buffer_.get(); }
+
+ // Resizes the buffer so that it can hold at least |size| elements. Existing
+ // contents will be destroyed when resizing to a larger size.
+ //
+ // Returns true on success. If Resize() returns false, then subsequent calls
+ // to get() will return nullptr.
+ bool Resize(size_t size) {
+ if (size <= size_) return true;
+ buffer_ = MakeAlignedUniquePtr<T>(alignment, size);
+ if (buffer_ == nullptr) {
+ size_ = 0;
+ return false;
+ }
+ size_ = size;
+ return true;
+ }
+
+ private:
+ AlignedUniquePtr<T> buffer_;
+ size_t size_ = 0;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_DYNAMIC_BUFFER_H_
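
A short usage sketch; the element type and sizes are arbitrary. Resize() is checked on every call because a failed allocation leaves get() returning nullptr, while a request no larger than the current capacity reuses the existing allocation:

#include <cstdint>

#include "src/utils/dynamic_buffer.h"

bool ScratchBufferExample(size_t size) {
  libgav1::DynamicBuffer<int16_t> scratch;
  if (!scratch.Resize(size)) return false;  // allocation failure
  int16_t* const data = scratch.get();
  // Shrinking (or equal-sized) requests are no-ops, so |data| stays valid.
  if (!scratch.Resize(size / 2)) return false;
  return data == scratch.get();  // true: no reallocation happened
}
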
diff --git a/src/utils/entropy_decoder.cc b/src/utils/entropy_decoder.cc
new file mode 100644
index 0000000..bf21199
--- /dev/null
+++ b/src/utils/entropy_decoder.cc
@@ -0,0 +1,1117 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/entropy_decoder.h"
+
+#include <cassert>
+#include <cstring>
+
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+#if defined(__ARM_NEON__) || defined(__aarch64__) || \
+ (defined(_MSC_VER) && defined(_M_ARM))
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 1
+#else
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 0
+#endif
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+#include <arm_neon.h>
+#endif
+
+#if defined(__SSE2__) || defined(LIBGAV1_X86_MSVC)
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 1
+#else
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 0
+#endif
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+#include <emmintrin.h>
+#endif
+
+namespace libgav1 {
+namespace {
+
+constexpr uint32_t kReadBitMask = ~255;
+constexpr int kCdfPrecision = 6;
+constexpr int kMinimumProbabilityPerSymbol = 4;
+
+// This function computes the "cur" variable as specified inside the do-while
+// loop in Section 8.2.6 of the spec. Its return value decreases monotonically
+// as |index| increases (note that the |cdf| array is sorted in decreasing
+// order).
+uint32_t ScaleCdf(uint32_t values_in_range_shifted, const uint16_t* const cdf,
+ int index, int symbol_count) {
+ return ((values_in_range_shifted * (cdf[index] >> kCdfPrecision)) >> 1) +
+ (kMinimumProbabilityPerSymbol * (symbol_count - index));
+}
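+// A worked example of the formula above, with illustrative values only: when
+// values_in_range_ is 32768 the caller passes values_in_range_shifted == 128
+// (32768 >> 8); with cdf[index] == 16000, symbol_count == 4 and index == 1
+// this returns ((128 * (16000 >> 6)) >> 1) + 4 * (4 - 1) = 16000 + 12 = 16012.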
+
+void UpdateCdf(uint16_t* const cdf, const int symbol_count, const int symbol) {
+ const uint16_t count = cdf[symbol_count];
+ // rate is computed in the spec as:
+ // 3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
+ // In this case cdf[N] is |count|.
+ // Min(FloorLog2(N), 2) is 1 for symbol_count == {2, 3} and 2 for all
+ // symbol_count > 3. So the equation becomes:
+ // 4 + (count > 15) + (count > 31) + (symbol_count > 3).
+ // Note that the largest value for count is 32 (it is not incremented beyond
+ // 32). So using that information:
+ // count >> 4 is 0 for count from 0 to 15.
+ // count >> 4 is 1 for count from 16 to 31.
+  // count >> 4 is 2 for count == 32.
+ // Now, the equation becomes:
+ // 4 + (count >> 4) + (symbol_count > 3).
+ // Since (count >> 4) can only be 0 or 1 or 2, the addition could be replaced
+ // with bitwise or:
+ // (4 | (count >> 4)) + (symbol_count > 3).
+ // but using addition will allow the compiler to eliminate an operation when
+ // symbol_count is known and this function is inlined.
+ const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count > 3);
+ // Hints for further optimizations:
+ //
+ // 1. clang can vectorize this for loop with width 4, even though the loop
+ // contains an if-else statement. Therefore, it may be advantageous to use
+ // "i < symbol_count" as the loop condition when symbol_count is 8, 12, or 16
+ // (a multiple of 4 that's not too small).
+ //
+ // 2. The for loop can be rewritten in the following form, which would enable
+ // clang to vectorize the loop with width 8:
+ //
+ // const int rounding = (1 << rate) - 1;
+ // for (int i = 0; i < symbol_count - 1; ++i) {
+ // const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
+ // cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
+ // }
+ //
+ // The subtraction (a - cdf[i]) relies on the overflow semantics of unsigned
+ // integer arithmetic. The result of the unsigned subtraction is cast to a
+ // signed integer and right-shifted. This requires the right shift of a
+ // signed integer be an arithmetic shift, which is true for clang, gcc, and
+ // Visual C++.
+ assert(symbol_count - 1 > 0);
+ int i = 0;
+ do {
+ if (i < symbol) {
+ cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
+ } else {
+ cdf[i] -= cdf[i] >> rate;
+ }
+ } while (++i < symbol_count - 1);
+ cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+}
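+// A worked example of the loop above, with illustrative values only: for
+// symbol_count == 4, cdf == {28000, 16000, 6000, 0, 20} and symbol == 1,
+// rate is (20 >> 4) + 4 + 1 == 6, so
+//   cdf[0] += (32768 - 28000) >> 6  -> 28074 (pulled toward the maximum),
+//   cdf[1] -= 16000 >> 6            -> 15750 (pulled toward 0),
+//   cdf[2] -= 6000 >> 6             -> 5907,
+// while cdf[3] stays 0 and the count in cdf[4] becomes 21.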
+
+// Define the UpdateCdfN functions. UpdateCdfN is a specialized implementation
+// of UpdateCdf based on the fact that symbol_count == N. UpdateCdfN uses the
+// SIMD instruction sets if available.
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+// The UpdateCdf() method contains the following for loop:
+//
+// for (int i = 0; i < symbol_count - 1; ++i) {
+// if (i < symbol) {
+// cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
+// } else {
+// cdf[i] -= cdf[i] >> rate;
+// }
+// }
+//
+// It can be rewritten in the following two forms, which are amenable to SIMD
+// implementations:
+//
+// const int rounding = (1 << rate) - 1;
+// for (int i = 0; i < symbol_count - 1; ++i) {
+// const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
+// cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
+// }
+//
+// or:
+//
+// const int rounding = (1 << rate) - 1;
+// for (int i = 0; i < symbol_count - 1; ++i) {
+// const uint16_t a = (i < symbol) ? (kCdfMaxProbability - rounding) : 0;
+// cdf[i] -= static_cast<int16_t>(cdf[i] - a) >> rate;
+// }
+//
+// The following ARM NEON implementations use a modified version of the first
+// form, using the comparison mask and unsigned rollover to avoid the need to
+// calculate rounding.
+//
+// The cdf array has symbol_count + 1 elements. The first symbol_count elements
+// are the CDF. The last element is a count that is initialized to 0 and may
+// grow up to 32. The for loop in UpdateCdf updates the CDF in the array. Since
+// cdf[symbol_count - 1] is always 0, the for loop does not update
+// cdf[symbol_count - 1]. However, it would be correct to have the for loop
+// update cdf[symbol_count - 1] anyway: since symbol_count - 1 >= symbol, the
+// for loop would take the else branch when i is symbol_count - 1:
+// cdf[i] -= cdf[i] >> rate;
+// Since cdf[symbol_count - 1] is 0, cdf[symbol_count - 1] would still be 0
+// after the update. The ARM NEON implementations take advantage of this in the
+// following two cases:
+// 1. When symbol_count is 8 or 16, the vectorized code updates the first
+// symbol_count elements in the array.
+// 2. When symbol_count is 7, the vectorized code updates all the 8 elements in
+// the cdf array. Since an invalid CDF value is written into cdf[7], the
+// count in cdf[7] needs to be fixed up after the vectorized code.
+
+void UpdateCdf5(uint16_t* const cdf, const int symbol) {
+ uint16x4_t cdf_vec = vld1_u16(cdf);
+ const uint16_t count = cdf[5];
+ const int rate = (count >> 4) + 5;
+ const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
+ const uint16x4_t index = vcreate_u16(0x0003000200010000);
+ const uint16x4_t symbol_vec = vdup_n_u16(symbol);
+ const uint16x4_t mask = vcge_u16(index, symbol_vec);
+ // i < symbol: 32768, i >= symbol: 65535.
+ const uint16x4_t a = vorr_u16(mask, cdf_max_probability);
+ // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
+ const int16x4_t diff = vreinterpret_s16_u16(vsub_u16(a, cdf_vec));
+ // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
+ const uint16x4_t cdf_offset = vsub_u16(cdf_vec, mask);
+ const int16x4_t negative_rate = vdup_n_s16(-rate);
+ // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
+ const uint16x4_t delta = vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
+ // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
+ // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
+ cdf_vec = vadd_u16(cdf_offset, delta);
+ vst1_u16(cdf, cdf_vec);
+ cdf[5] = count + static_cast<uint16_t>(count < 32);
+}
+
+// This version works for |symbol_count| = 7, 8, or 9.
+// See UpdateCdf5 for implementation details.
+template <int symbol_count>
+void UpdateCdf7To9(uint16_t* const cdf, const int symbol) {
+ static_assert(symbol_count >= 7 && symbol_count <= 9, "");
+ uint16x8_t cdf_vec = vld1q_u16(cdf);
+ const uint16_t count = cdf[symbol_count];
+ const int rate = (count >> 4) + 5;
+ const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+ const uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
+ vcreate_u16(0x0007000600050004));
+ const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+ const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+ const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+ const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
+ const int16x8_t negative_rate = vdupq_n_s16(-rate);
+ const uint16x8_t delta =
+ vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+ cdf_vec = vaddq_u16(cdf_offset, delta);
+ vst1q_u16(cdf, cdf_vec);
+ cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
+}
+
+void UpdateCdf7(uint16_t* const cdf, const int symbol) {
+ UpdateCdf7To9<7>(cdf, symbol);
+}
+
+void UpdateCdf8(uint16_t* const cdf, const int symbol) {
+ UpdateCdf7To9<8>(cdf, symbol);
+}
+
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+ UpdateCdf7To9<9>(cdf, symbol);
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf11(uint16_t* const cdf, const int symbol) {
+ uint16x8_t cdf_vec = vld1q_u16(cdf + 2);
+ const uint16_t count = cdf[11];
+ cdf[11] = count + static_cast<uint16_t>(count < 32);
+ const int rate = (count >> 4) + 5;
+ if (symbol > 1) {
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+ const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+ const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+ const int16x8_t negative_rate = vdupq_n_s16(-rate);
+ const uint16x8_t index = vcombine_u16(vcreate_u16(0x0005000400030002),
+ vcreate_u16(0x0009000800070006));
+ const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+ const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+ const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
+ const uint16x8_t delta =
+ vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+ cdf_vec = vaddq_u16(cdf_offset, delta);
+ vst1q_u16(cdf + 2, cdf_vec);
+ } else {
+ if (symbol != 0) {
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ } else {
+ cdf[0] -= cdf[0] >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ }
+ const int16x8_t negative_rate = vdupq_n_s16(-rate);
+ const uint16x8_t delta = vshlq_u16(cdf_vec, negative_rate);
+ cdf_vec = vsubq_u16(cdf_vec, delta);
+ vst1q_u16(cdf + 2, cdf_vec);
+ }
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf13(uint16_t* const cdf, const int symbol) {
+ uint16x8_t cdf_vec0 = vld1q_u16(cdf);
+ uint16x8_t cdf_vec1 = vld1q_u16(cdf + 4);
+ const uint16_t count = cdf[13];
+ const int rate = (count >> 4) + 5;
+ const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+ const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+ const int16x8_t negative_rate = vdupq_n_s16(-rate);
+
+ uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
+ vcreate_u16(0x0007000600050004));
+ uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+ uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+ int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec0));
+ uint16x8_t cdf_offset = vsubq_u16(cdf_vec0, mask);
+ uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+ cdf_vec0 = vaddq_u16(cdf_offset, delta);
+ vst1q_u16(cdf, cdf_vec0);
+
+ index = vcombine_u16(vcreate_u16(0x0007000600050004),
+ vcreate_u16(0x000b000a00090008));
+ mask = vcgeq_u16(index, symbol_vec);
+ a = vorrq_u16(mask, cdf_max_probability);
+ diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec1));
+ cdf_offset = vsubq_u16(cdf_vec1, mask);
+ delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+ cdf_vec1 = vaddq_u16(cdf_offset, delta);
+ vst1q_u16(cdf + 4, cdf_vec1);
+
+ cdf[13] = count + static_cast<uint16_t>(count < 32);
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf16(uint16_t* const cdf, const int symbol) {
+ uint16x8_t cdf_vec = vld1q_u16(cdf);
+ const uint16_t count = cdf[16];
+ const int rate = (count >> 4) + 5;
+ const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+ const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+ const int16x8_t negative_rate = vdupq_n_s16(-rate);
+
+ uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
+ vcreate_u16(0x0007000600050004));
+ uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+ uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+ int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+ uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
+ uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+ cdf_vec = vaddq_u16(cdf_offset, delta);
+ vst1q_u16(cdf, cdf_vec);
+
+ cdf_vec = vld1q_u16(cdf + 8);
+ index = vcombine_u16(vcreate_u16(0x000b000a00090008),
+ vcreate_u16(0x000f000e000d000c));
+ mask = vcgeq_u16(index, symbol_vec);
+ a = vorrq_u16(mask, cdf_max_probability);
+ diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+ cdf_offset = vsubq_u16(cdf_vec, mask);
+ delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+ cdf_vec = vaddq_u16(cdf_offset, delta);
+ vst1q_u16(cdf + 8, cdf_vec);
+
+ cdf[16] = count + static_cast<uint16_t>(count < 32);
+}
+
+#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+
+inline __m128i LoadLo8(const void* a) {
+ return _mm_loadl_epi64(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadUnaligned16(const void* a) {
+ return _mm_loadu_si128(static_cast<const __m128i*>(a));
+}
+
+inline void StoreLo8(void* a, const __m128i v) {
+ _mm_storel_epi64(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreUnaligned16(void* a, const __m128i v) {
+ _mm_storeu_si128(static_cast<__m128i*>(a), v);
+}
+
+void UpdateCdf5(uint16_t* const cdf, const int symbol) {
+ __m128i cdf_vec = LoadLo8(cdf);
+ const uint16_t count = cdf[5];
+ const int rate = (count >> 4) + 5;
+ const __m128i cdf_max_probability =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
+ const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
+ const __m128i symbol_vec = _mm_shufflelo_epi16(_mm_cvtsi32_si128(symbol), 0);
+ // i >= symbol.
+ const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+ // i < symbol: 32768, i >= symbol: 65535.
+ const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+ // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
+ const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+ // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
+ const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+ // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
+ const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+ // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
+ // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
+ cdf_vec = _mm_add_epi16(cdf_offset, delta);
+ StoreLo8(cdf, cdf_vec);
+ cdf[5] = count + static_cast<uint16_t>(count < 32);
+}
+
+// This version works for |symbol_count| = 7, 8, or 9.
+// See UpdateCdf5 for implementation details.
+template <int symbol_count>
+void UpdateCdf7To9(uint16_t* const cdf, const int symbol) {
+ static_assert(symbol_count >= 7 && symbol_count <= 9, "");
+ __m128i cdf_vec = LoadUnaligned16(cdf);
+ const uint16_t count = cdf[symbol_count];
+ const int rate = (count >> 4) + 5;
+ const __m128i cdf_max_probability =
+ _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+ const __m128i index =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+ const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+ const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+ const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+ const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+ const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+ const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+ cdf_vec = _mm_add_epi16(cdf_offset, delta);
+ StoreUnaligned16(cdf, cdf_vec);
+ cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
+}
+
+void UpdateCdf7(uint16_t* const cdf, const int symbol) {
+ UpdateCdf7To9<7>(cdf, symbol);
+}
+
+void UpdateCdf8(uint16_t* const cdf, const int symbol) {
+ UpdateCdf7To9<8>(cdf, symbol);
+}
+
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+ UpdateCdf7To9<9>(cdf, symbol);
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf11(uint16_t* const cdf, const int symbol) {
+ __m128i cdf_vec = LoadUnaligned16(cdf + 2);
+ const uint16_t count = cdf[11];
+ cdf[11] = count + static_cast<uint16_t>(count < 32);
+ const int rate = (count >> 4) + 5;
+ if (symbol > 1) {
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+ const __m128i cdf_max_probability =
+ _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+ const __m128i index =
+ _mm_set_epi32(0x000a0009, 0x00080007, 0x00060005, 0x00040003);
+ const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+ const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+ const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+ const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+ const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+ const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+ cdf_vec = _mm_add_epi16(cdf_offset, delta);
+ StoreUnaligned16(cdf + 2, cdf_vec);
+ } else {
+ if (symbol != 0) {
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ } else {
+ cdf[0] -= cdf[0] >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ }
+ const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
+ cdf_vec = _mm_sub_epi16(cdf_vec, delta);
+ StoreUnaligned16(cdf + 2, cdf_vec);
+ }
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf13(uint16_t* const cdf, const int symbol) {
+ __m128i cdf_vec0 = LoadLo8(cdf);
+ __m128i cdf_vec1 = LoadUnaligned16(cdf + 4);
+ const uint16_t count = cdf[13];
+ const int rate = (count >> 4) + 5;
+ const __m128i cdf_max_probability =
+ _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+ const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+
+ const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
+ const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+ const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+ const __m128i diff = _mm_sub_epi16(a, cdf_vec0);
+ const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask);
+ const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+ cdf_vec0 = _mm_add_epi16(cdf_offset, delta);
+ StoreLo8(cdf, cdf_vec0);
+
+ const __m128i index1 =
+ _mm_set_epi32(0x000c000b, 0x000a0009, 0x00080007, 0x00060005);
+ const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec);
+ const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability);
+ const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1);
+ const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1);
+ const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
+ cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1);
+ StoreUnaligned16(cdf + 4, cdf_vec1);
+
+ cdf[13] = count + static_cast<uint16_t>(count < 32);
+}
+
+void UpdateCdf16(uint16_t* const cdf, const int symbol) {
+ __m128i cdf_vec0 = LoadUnaligned16(cdf);
+ const uint16_t count = cdf[16];
+ const int rate = (count >> 4) + 5;
+ const __m128i cdf_max_probability =
+ _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+ const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+
+ const __m128i index =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+ const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+ const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+ const __m128i diff = _mm_sub_epi16(a, cdf_vec0);
+ const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask);
+ const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+ cdf_vec0 = _mm_add_epi16(cdf_offset, delta);
+ StoreUnaligned16(cdf, cdf_vec0);
+
+ __m128i cdf_vec1 = LoadUnaligned16(cdf + 8);
+ const __m128i index1 =
+ _mm_set_epi32(0x0010000f, 0x000e000d, 0x000c000b, 0x000a0009);
+ const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec);
+ const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability);
+ const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1);
+ const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1);
+ const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
+ cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1);
+ StoreUnaligned16(cdf + 8, cdf_vec1);
+
+ cdf[16] = count + static_cast<uint16_t>(count < 32);
+}
+
+#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+
+void UpdateCdf5(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 5, symbol);
+}
+
+void UpdateCdf7(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 7, symbol);
+}
+
+void UpdateCdf8(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 8, symbol);
+}
+
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 9, symbol);
+}
+
+void UpdateCdf11(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 11, symbol);
+}
+
+void UpdateCdf13(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 13, symbol);
+}
+
+void UpdateCdf16(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 16, symbol);
+}
+
+#endif // LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+#endif // LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+inline DaalaBitReader::WindowSize HostToBigEndian(
+ const DaalaBitReader::WindowSize x) {
+ static_assert(sizeof(x) == 4 || sizeof(x) == 8, "");
+#if defined(__GNUC__)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ return (sizeof(x) == 8) ? __builtin_bswap64(x) : __builtin_bswap32(x);
+#else
+ return x;
+#endif
+#elif defined(_WIN32)
+ // Note Windows targets are assumed to be little endian.
+ return static_cast<DaalaBitReader::WindowSize>(
+ (sizeof(x) == 8) ? _byteswap_uint64(static_cast<unsigned __int64>(x))
+ : _byteswap_ulong(static_cast<unsigned long>(x)));
+#else
+#error Unknown compiler!
+#endif // defined(__GNUC__)
+}
+
+} // namespace
+
+#if !LIBGAV1_CXX17
+constexpr int DaalaBitReader::kWindowSize; // static.
+#endif
+
+DaalaBitReader::DaalaBitReader(const uint8_t* data, size_t size,
+ bool allow_update_cdf)
+ : data_(data),
+ data_end_(data + size),
+ data_memcpy_end_((size >= sizeof(WindowSize))
+ ? data + size - sizeof(WindowSize) + 1
+ : data),
+ allow_update_cdf_(allow_update_cdf),
+ values_in_range_(kCdfMaxProbability) {
+ if (data_ < data_memcpy_end_) {
+ // This is a simplified version of PopulateBits() which loads 8 extra bits
+ // and skips the unnecessary shifts of value and window_diff_.
+ WindowSize value;
+ memcpy(&value, data_, sizeof(value));
+ data_ += sizeof(value);
+ window_diff_ = HostToBigEndian(value) ^ -1;
+ // Note the initial value of bits_ is larger than kMaxCachedBits as it's
+ // used to restore the most significant 0 bit that would be present after
+ // PopulateBits() when we extract the first symbol value.
+ // As shown in Section 8.2.2 Initialization process for symbol decoder,
+ // which uses a fixed offset to read the symbol values, the most
+ // significant bit is always 0:
+ // The variable numBits is set equal to Min( sz * 8, 15).
+ // The variable buf is read using the f(numBits) parsing process.
+ // The variable paddedBuf is set equal to ( buf << (15 - numBits) ).
+ // The variable SymbolValue is set to ((1 << 15) - 1) ^ paddedBuf.
+ bits_ = kWindowSize - 15;
+ return;
+ }
+ window_diff_ = 0;
+ bits_ = -15;
+ PopulateBits();
+}
+
+// This is similar to the ReadSymbol() implementation but it is optimized based
+// on the following facts:
+// * The probability is fixed at half. So some multiplications can be replaced
+// with bit operations.
+// * Symbol count is fixed at 2.
+int DaalaBitReader::ReadBit() {
+ const uint32_t curr =
+ ((values_in_range_ & kReadBitMask) >> 1) + kMinimumProbabilityPerSymbol;
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+ int bit = 1;
+ if (symbol_value >= curr) {
+ values_in_range_ -= curr;
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+ bit = 0;
+ } else {
+ values_in_range_ = curr;
+ }
+ NormalizeRange();
+ return bit;
+}
+
+int64_t DaalaBitReader::ReadLiteral(int num_bits) {
+ assert(num_bits <= 32);
+ assert(num_bits > 0);
+ uint32_t literal = 0;
+ int bit = num_bits - 1;
+ do {
+ // ARM can combine a shift operation with a constant number of bits with
+ // some other operations, such as the OR operation.
+ // Here is an ARM disassembly example:
+ // orr w1, w0, w1, lsl #1
+    // which left-shifts register w1 by 1 bit and ORs the shifted result with
+    // register w0.
+ // The next 2 lines are equivalent to:
+ // literal |= static_cast<uint32_t>(ReadBit()) << bit;
+ literal <<= 1;
+ literal |= static_cast<uint32_t>(ReadBit());
+ } while (--bit >= 0);
+ return literal;
+}
+
+int DaalaBitReader::ReadSymbol(uint16_t* const cdf, int symbol_count) {
+ const int symbol = ReadSymbolImpl(cdf, symbol_count);
+ if (allow_update_cdf_) {
+ UpdateCdf(cdf, symbol_count, symbol);
+ }
+ return symbol;
+}
+
+bool DaalaBitReader::ReadSymbol(uint16_t* cdf) {
+ assert(cdf[1] == 0);
+ const bool symbol = ReadSymbolImpl(cdf[0]) != 0;
+ if (allow_update_cdf_) {
+ const uint16_t count = cdf[2];
+ // rate is computed in the spec as:
+ // 3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
+ // In this case N is 2 and cdf[N] is |count|. So the equation becomes:
+ // 4 + (count > 15) + (count > 31)
+ // Note that the largest value for count is 32 (it is not incremented beyond
+ // 32). So using that information:
+ // count >> 4 is 0 for count from 0 to 15.
+ // count >> 4 is 1 for count from 16 to 31.
+ // count >> 4 is 2 for count == 32.
+ // Now, the equation becomes:
+ // 4 + (count >> 4).
+ // Since (count >> 4) can only be 0 or 1 or 2, the addition can be replaced
+ // with bitwise or. So the final equation is:
+ // 4 | (count >> 4).
+ const int rate = 4 | (count >> 4);
+ if (symbol) {
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ } else {
+ cdf[0] -= cdf[0] >> rate;
+ }
+ cdf[2] += static_cast<uint16_t>(count < 32);
+ }
+ return symbol;
+}
+
+bool DaalaBitReader::ReadSymbolWithoutCdfUpdate(uint16_t cdf) {
+ return ReadSymbolImpl(cdf) != 0;
+}
+
+template <int symbol_count>
+int DaalaBitReader::ReadSymbol(uint16_t* const cdf) {
+ static_assert(symbol_count >= 3 && symbol_count <= 16, "");
+ if (symbol_count == 3 || symbol_count == 4) {
+ return ReadSymbol3Or4(cdf, symbol_count);
+ }
+ int symbol;
+ if (symbol_count == 8) {
+ symbol = ReadSymbolImpl8(cdf);
+ } else if (symbol_count <= 13) {
+ symbol = ReadSymbolImpl(cdf, symbol_count);
+ } else {
+ symbol = ReadSymbolImplBinarySearch(cdf, symbol_count);
+ }
+ if (allow_update_cdf_) {
+ if (symbol_count == 5) {
+ UpdateCdf5(cdf, symbol);
+ } else if (symbol_count == 7) {
+ UpdateCdf7(cdf, symbol);
+ } else if (symbol_count == 8) {
+ UpdateCdf8(cdf, symbol);
+ } else if (symbol_count == 9) {
+ UpdateCdf9(cdf, symbol);
+ } else if (symbol_count == 11) {
+ UpdateCdf11(cdf, symbol);
+ } else if (symbol_count == 13) {
+ UpdateCdf13(cdf, symbol);
+ } else if (symbol_count == 16) {
+ UpdateCdf16(cdf, symbol);
+ } else {
+ UpdateCdf(cdf, symbol_count, symbol);
+ }
+ }
+ return symbol;
+}
+
+int DaalaBitReader::ReadSymbolImpl(const uint16_t* const cdf,
+ int symbol_count) {
+ assert(cdf[symbol_count - 1] == 0);
+ --symbol_count;
+ uint32_t curr = values_in_range_;
+ int symbol = -1;
+ uint32_t prev;
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+ uint32_t delta = kMinimumProbabilityPerSymbol * symbol_count;
+ // Search through the |cdf| array to determine where the scaled cdf value and
+ // |symbol_value| cross over.
+ do {
+ prev = curr;
+ curr = (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1) +
+ delta;
+ delta -= kMinimumProbabilityPerSymbol;
+ } while (symbol_value < curr);
+ values_in_range_ = prev - curr;
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+ NormalizeRange();
+ return symbol;
+}
+
+int DaalaBitReader::ReadSymbolImplBinarySearch(const uint16_t* const cdf,
+ int symbol_count) {
+ assert(cdf[symbol_count - 1] == 0);
+ assert(symbol_count > 1 && symbol_count <= 16);
+ --symbol_count;
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+ // Search through the |cdf| array to determine where the scaled cdf value and
+ // |symbol_value| cross over. Since the CDFs are sorted, we can use binary
+ // search to do this. Let |symbol| be the index of the first |cdf| array
+ // entry whose scaled cdf value is less than or equal to |symbol_value|. The
+ // binary search maintains the invariant:
+ // low <= symbol <= high + 1
+ // and terminates when low == high + 1.
+ int low = 0;
+ int high = symbol_count - 1;
+ // The binary search maintains the invariants that |prev| is the scaled cdf
+ // value for low - 1 and |curr| is the scaled cdf value for high + 1. (By
+ // convention, the scaled cdf value for -1 is values_in_range_.) When the
+ // binary search terminates, |prev| is the scaled cdf value for symbol - 1
+ // and |curr| is the scaled cdf value for |symbol|.
+ uint32_t prev = values_in_range_;
+ uint32_t curr = 0;
+ const uint32_t values_in_range_shifted = values_in_range_ >> 8;
+ do {
+ const int mid = DivideBy2(low + high);
+ const uint32_t scaled_cdf =
+ ScaleCdf(values_in_range_shifted, cdf, mid, symbol_count);
+ if (symbol_value < scaled_cdf) {
+ low = mid + 1;
+ prev = scaled_cdf;
+ } else {
+ high = mid - 1;
+ curr = scaled_cdf;
+ }
+ } while (low <= high);
+ assert(low == high + 1);
+ // At this point, |low| is the symbol that has been decoded.
+ values_in_range_ = prev - curr;
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+ NormalizeRange();
+ return low;
+}
+
+int DaalaBitReader::ReadSymbolImpl(uint16_t cdf) {
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+ const uint32_t curr =
+ (((values_in_range_ >> 8) * (cdf >> kCdfPrecision)) >> 1) +
+ kMinimumProbabilityPerSymbol;
+ const int symbol = static_cast<int>(symbol_value < curr);
+ if (symbol == 1) {
+ values_in_range_ = curr;
+ } else {
+ values_in_range_ -= curr;
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+ }
+ NormalizeRange();
+ return symbol;
+}
+
+// Equivalent to ReadSymbol(cdf, [3,4]), with the ReadSymbolImpl and UpdateCdf
+// calls inlined.
+int DaalaBitReader::ReadSymbol3Or4(uint16_t* const cdf,
+ const int symbol_count) {
+ assert(cdf[symbol_count - 1] == 0);
+ uint32_t curr = values_in_range_;
+ uint32_t prev;
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+ uint32_t delta = kMinimumProbabilityPerSymbol * (symbol_count - 1);
+ const uint32_t values_in_range_shifted = values_in_range_ >> 8;
+
+ // Search through the |cdf| array to determine where the scaled cdf value and
+ // |symbol_value| cross over. If allow_update_cdf_ is true, update the |cdf|
+ // array.
+ //
+ // The original code is:
+ //
+ // int symbol = -1;
+ // do {
+ // prev = curr;
+ // curr =
+ // ((values_in_range_shifted * (cdf[++symbol] >> kCdfPrecision)) >> 1)
+ // + delta;
+ // delta -= kMinimumProbabilityPerSymbol;
+ // } while (symbol_value < curr);
+ // if (allow_update_cdf_) {
+ // UpdateCdf(cdf, [3,4], symbol);
+ // }
+ //
+ // The do-while loop is unrolled with three or four iterations, and the
+ // UpdateCdf call is inlined and merged into the iterations.
+ int symbol = 0;
+ // Iteration 0.
+ prev = curr;
+ curr =
+ ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
+ if (symbol_value >= curr) {
+ // symbol == 0.
+ if (allow_update_cdf_) {
+ // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/0).
+ const uint16_t count = cdf[symbol_count];
+ cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+ const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
+ if (symbol_count == 4) {
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+ // 1. On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM
+ // NEON code is slower. Consider using the C version if __arm__ is
+ // defined.
+ // 2. The ARM NEON code (compiled for arm64) is slightly slower on
+ // Samsung Galaxy S8+ (SM-G955FD).
+ uint16x4_t cdf_vec = vld1_u16(cdf);
+ const int16x4_t negative_rate = vdup_n_s16(-rate);
+ const uint16x4_t delta = vshl_u16(cdf_vec, negative_rate);
+ cdf_vec = vsub_u16(cdf_vec, delta);
+ vst1_u16(cdf, cdf_vec);
+#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+ __m128i cdf_vec = LoadLo8(cdf);
+ const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
+ cdf_vec = _mm_sub_epi16(cdf_vec, delta);
+ StoreLo8(cdf, cdf_vec);
+#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+ cdf[0] -= cdf[0] >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ cdf[2] -= cdf[2] >> rate;
+#endif
+ } else { // symbol_count == 3.
+ cdf[0] -= cdf[0] >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ }
+ }
+ goto found;
+ }
+ ++symbol;
+ delta -= kMinimumProbabilityPerSymbol;
+ // Iteration 1.
+ prev = curr;
+ curr =
+ ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
+ if (symbol_value >= curr) {
+ // symbol == 1.
+ if (allow_update_cdf_) {
+ // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/1).
+ const uint16_t count = cdf[symbol_count];
+ cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+ const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ if (symbol_count == 4) cdf[2] -= cdf[2] >> rate;
+ }
+ goto found;
+ }
+ ++symbol;
+ if (symbol_count == 4) {
+ delta -= kMinimumProbabilityPerSymbol;
+ // Iteration 2.
+ prev = curr;
+ curr = ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) +
+ delta;
+ if (symbol_value >= curr) {
+ // symbol == 2.
+ if (allow_update_cdf_) {
+ // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/2).
+ const uint16_t count = cdf[4];
+ cdf[4] += static_cast<uint16_t>(count < 32);
+ const int rate = (count >> 4) + 5;
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+ cdf[2] -= cdf[2] >> rate;
+ }
+ goto found;
+ }
+ ++symbol;
+ }
+ // |delta| is 0 for the last iteration.
+ // Iteration 2 (symbol_count == 3) or 3 (symbol_count == 4).
+ prev = curr;
+ // Since cdf[symbol_count - 1] is 0 and |delta| is 0, |curr| is also 0.
+ curr = 0;
+ // symbol == [2,3].
+ if (allow_update_cdf_) {
+ // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/[2,3]).
+ const uint16_t count = cdf[symbol_count];
+ cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+ const int rate = (4 | (count >> 4)) + static_cast<int>(symbol_count == 4);
+ if (symbol_count == 4) {
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+ // On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM NEON
+ // code is a tiny bit slower. Consider using the C version if __arm__ is
+ // defined.
+ uint16x4_t cdf_vec = vld1_u16(cdf);
+ const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
+ const int16x4_t diff =
+ vreinterpret_s16_u16(vsub_u16(cdf_max_probability, cdf_vec));
+ const int16x4_t negative_rate = vdup_n_s16(-rate);
+ const uint16x4_t delta =
+ vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
+ cdf_vec = vadd_u16(cdf_vec, delta);
+ vst1_u16(cdf, cdf_vec);
+ cdf[3] = 0;
+#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+ __m128i cdf_vec = LoadLo8(cdf);
+ const __m128i cdf_max_probability =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
+ const __m128i diff = _mm_sub_epi16(cdf_max_probability, cdf_vec);
+ const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+ cdf_vec = _mm_add_epi16(cdf_vec, delta);
+ StoreLo8(cdf, cdf_vec);
+ cdf[3] = 0;
+#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+ cdf[2] += (kCdfMaxProbability - cdf[2]) >> rate;
+#endif
+ } else { // symbol_count == 3.
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+ }
+ }
+found:
+ // End of unrolled do-while loop.
+
+ values_in_range_ = prev - curr;
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+ NormalizeRange();
+ return symbol;
+}
+
+int DaalaBitReader::ReadSymbolImpl8(const uint16_t* const cdf) {
+ assert(cdf[7] == 0);
+ uint32_t curr = values_in_range_;
+ uint32_t prev;
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+ uint32_t delta = kMinimumProbabilityPerSymbol * 7;
+ // Search through the |cdf| array to determine where the scaled cdf value and
+ // |symbol_value| cross over.
+ //
+ // The original code is:
+ //
+ // int symbol = -1;
+ // do {
+ // prev = curr;
+ // curr =
+ // (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1)
+ // + delta;
+ // delta -= kMinimumProbabilityPerSymbol;
+ // } while (symbol_value < curr);
+ //
+ // The do-while loop is unrolled with eight iterations.
+ int symbol = 0;
+
+#define READ_SYMBOL_ITERATION \
+ prev = curr; \
+ curr = (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + \
+ delta; \
+ if (symbol_value >= curr) goto found; \
+ ++symbol; \
+ delta -= kMinimumProbabilityPerSymbol
+
+ READ_SYMBOL_ITERATION; // Iteration 0.
+ READ_SYMBOL_ITERATION; // Iteration 1.
+ READ_SYMBOL_ITERATION; // Iteration 2.
+ READ_SYMBOL_ITERATION; // Iteration 3.
+ READ_SYMBOL_ITERATION; // Iteration 4.
+ READ_SYMBOL_ITERATION; // Iteration 5.
+
+ // The last two iterations can be simplified, so they don't use the
+ // READ_SYMBOL_ITERATION macro.
+#undef READ_SYMBOL_ITERATION
+
+ // Iteration 6.
+ prev = curr;
+ curr =
+ (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
+ if (symbol_value >= curr) goto found; // symbol == 6.
+ ++symbol;
+ // |delta| is 0 for the last iteration.
+ // Iteration 7.
+ prev = curr;
+ // Since cdf[7] is 0 and |delta| is 0, |curr| is also 0.
+ curr = 0;
+ // symbol == 7.
+found:
+ // End of unrolled do-while loop.
+
+ values_in_range_ = prev - curr;
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+ NormalizeRange();
+ return symbol;
+}
+
+void DaalaBitReader::PopulateBits() {
+ constexpr int kMaxCachedBits = kWindowSize - 16;
+#if defined(__aarch64__)
+ // Fast path: read eight bytes and add the first six bytes to window_diff_.
+ // This fast path makes the following assumptions.
+ // 1. We assume that unaligned load of uint64_t is fast.
+ // 2. When there are enough bytes in data_, the for loop below reads 6 or 7
+ // bytes depending on the value of bits_. This fast path always reads 6
+ // bytes, which results in more calls to PopulateBits(). We assume that
+ // making more calls to a faster PopulateBits() is overall a win.
+ // NOTE: Although this fast path could also be used on x86_64, it hurts
+ // performance (measured on Lenovo ThinkStation P920 running Linux). (The
+ // reason is still unknown.) Therefore this fast path is only used on arm64.
+ static_assert(kWindowSize == 64, "");
+ if (data_ < data_memcpy_end_) {
+ uint64_t value;
+ // arm64 supports unaligned loads, so this memcpy call is compiled to a
+ // single ldr instruction.
+ memcpy(&value, data_, sizeof(value));
+ data_ += kMaxCachedBits >> 3;
+ value = HostToBigEndian(value) ^ -1;
+ value >>= kWindowSize - kMaxCachedBits;
+ window_diff_ = value | (window_diff_ << kMaxCachedBits);
+ bits_ += kMaxCachedBits;
+ return;
+ }
+#endif
+
+ const uint8_t* data = data_;
+ int bits = bits_;
+ WindowSize window_diff = window_diff_;
+
+ int count = kWindowSize - 9 - (bits + 15);
+  // The fast path above, if compiled, would cause clang 8.0.7 to vectorize
+  // this loop. Since -15 <= bits_ <= -1, this loop has only 6 or 7 iterations
+  // when WindowSize is 64 bits. So it is not profitable to vectorize this
+  // loop. Note that clang 8.0.7 does not vectorize this loop if the fast path
+  // above is not compiled.
+
+#ifdef __clang__
+#pragma clang loop vectorize(disable) interleave(disable)
+#endif
+ for (; count >= 0 && data < data_end_; count -= 8) {
+ const uint8_t value = *data++ ^ -1;
+ window_diff = static_cast<WindowSize>(value) | (window_diff << 8);
+ bits += 8;
+ }
+ assert(bits <= kMaxCachedBits);
+ if (data == data_end_) {
+ // Shift in some 1s. This is equivalent to providing fake 0 data bits.
+ window_diff = ((window_diff + 1) << (kMaxCachedBits - bits)) - 1;
+ bits = kMaxCachedBits;
+ }
+
+ data_ = data;
+ bits_ = bits;
+ window_diff_ = window_diff;
+}
+
+void DaalaBitReader::NormalizeRange() {
+ const int bits_used = 15 ^ FloorLog2(values_in_range_);
+ bits_ -= bits_used;
+ values_in_range_ <<= bits_used;
+ if (bits_ < 0) PopulateBits();
+}
+
+// Explicit instantiations.
+template int DaalaBitReader::ReadSymbol<3>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<4>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<5>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<6>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<7>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<8>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<9>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<10>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<11>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<12>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<13>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<14>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<16>(uint16_t* cdf);
+
+} // namespace libgav1
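
A sketch of how the reader is driven end to end. The buffer contents and the starting CDF values below are made up for illustration; real CDF tables come from the AV1 defaults and are adapted in place when allow_update_cdf is true:

#include <cstddef>
#include <cstdint>

#include "src/utils/entropy_decoder.h"

bool DecodeExample(const uint8_t* data, size_t size) {
  libgav1::DaalaBitReader reader(data, size, /*allow_update_cdf=*/true);
  // A 4-symbol CDF: entries 0..3 hold the decreasing CDF with a terminal 0,
  // entry 4 is the adaptation counter.
  uint16_t cdf[5] = {24576, 16384, 8192, 0, 0};
  const int symbol = reader.ReadSymbol<4>(cdf);  // also adapts |cdf|
  const int flag = reader.ReadBit();
  const int64_t literal = reader.ReadLiteral(3);  // three raw bits
  return symbol >= 0 && flag >= 0 && literal >= 0;
}
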
diff --git a/src/utils/entropy_decoder.h b/src/utils/entropy_decoder.h
new file mode 100644
index 0000000..c066b98
--- /dev/null
+++ b/src/utils/entropy_decoder.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_ENTROPY_DECODER_H_
+#define LIBGAV1_SRC_UTILS_ENTROPY_DECODER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "src/utils/bit_reader.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+class DaalaBitReader : public BitReader {
+ public:
+ // WindowSize must be an unsigned integer type with at least 32 bits. Use the
+ // largest type with fast arithmetic. size_t should meet these requirements.
+ using WindowSize = size_t;
+
+ DaalaBitReader(const uint8_t* data, size_t size, bool allow_update_cdf);
+ ~DaalaBitReader() override = default;
+
+ // Move only.
+ DaalaBitReader(DaalaBitReader&& rhs) noexcept;
+ DaalaBitReader& operator=(DaalaBitReader&& rhs) noexcept;
+
+ int ReadBit() final;
+ int64_t ReadLiteral(int num_bits) override;
+ // ReadSymbol() calls for which the |symbol_count| is only known at runtime
+ // will use this variant.
+ int ReadSymbol(uint16_t* cdf, int symbol_count);
+ // ReadSymbol() calls for which the |symbol_count| is equal to 2 (boolean
+ // symbols) will use this variant.
+ bool ReadSymbol(uint16_t* cdf);
+ bool ReadSymbolWithoutCdfUpdate(uint16_t cdf);
+ // Use either linear search or binary search for decoding the symbol depending
+ // on |symbol_count|. ReadSymbol calls for which the |symbol_count| is known
+ // at compile time will use this variant.
+ template <int symbol_count>
+ int ReadSymbol(uint16_t* cdf);
+
+ private:
+ static constexpr int kWindowSize = static_cast<int>(sizeof(WindowSize)) * 8;
+ static_assert(kWindowSize >= 32, "");
+
+ // Reads a symbol using the |cdf| table which contains the probabilities of
+ // each symbol. On a high level, this function does the following:
+ // 1) Scale the |cdf| values.
+ // 2) Find the index in the |cdf| array where the scaled CDF value crosses
+ // the modified |window_diff_| threshold.
+ // 3) That index is the symbol that has been decoded.
+ // 4) Update |window_diff_| and |values_in_range_| based on the symbol that
+ // has been decoded.
+ inline int ReadSymbolImpl(const uint16_t* cdf, int symbol_count);
+ // Similar to ReadSymbolImpl but it uses binary search to perform step 2 in
+ // the comment above. As of now, this function is called when |symbol_count|
+ // is greater than or equal to 14.
+ inline int ReadSymbolImplBinarySearch(const uint16_t* cdf, int symbol_count);
+ // Specialized implementation of ReadSymbolImpl based on the fact that
+ // symbol_count == 2.
+ inline int ReadSymbolImpl(uint16_t cdf);
+ // ReadSymbolN is a specialization of ReadSymbol for symbol_count == N.
+ LIBGAV1_ALWAYS_INLINE int ReadSymbol3Or4(uint16_t* cdf, int symbol_count);
+ // ReadSymbolImplN is a specialization of ReadSymbolImpl for
+ // symbol_count == N.
+ LIBGAV1_ALWAYS_INLINE int ReadSymbolImpl8(const uint16_t* cdf);
+ inline void PopulateBits();
+ // Normalizes the range so that 32768 <= |values_in_range_| < 65536. Also
+ // calls PopulateBits() if necessary.
+ inline void NormalizeRange();
+
+ const uint8_t* data_;
+ const uint8_t* const data_end_;
+ // If |data_| < |data_memcpy_end_|, then we can read sizeof(WindowSize) bytes
+ // from |data_|. Note with sizeof(WindowSize) == 4 this is only used in the
+ // constructor, not PopulateBits().
+ const uint8_t* const data_memcpy_end_;
+ const bool allow_update_cdf_;
+ // Number of cached bits of data in the current value.
+ int bits_;
+ // Number of values in the current range. Declared as uint32_t for better
+ // performance but only the lower 16 bits are used.
+ uint32_t values_in_range_;
+ // The difference between the high end of the current range and the coded
+ // value minus 1. The 16 bits above |bits_| of this variable are used to
+ // decode the next symbol. It is filled in whenever |bits_| is less than 0.
+ // Note this implementation differs from the spec as it trades the need to
+ // shift in 1s in NormalizeRange() with an extra shift in PopulateBits(),
+ // which occurs less frequently.
+ WindowSize window_diff_;
+};
+
+extern template int DaalaBitReader::ReadSymbol<3>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<4>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<5>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<6>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<7>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<8>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<9>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<10>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<11>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<12>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<13>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<14>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<16>(uint16_t* cdf);
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_ENTROPY_DECODER_H_
diff --git a/src/utils/executor.cc b/src/utils/executor.cc
new file mode 100644
index 0000000..6934057
--- /dev/null
+++ b/src/utils/executor.cc
@@ -0,0 +1,21 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/executor.h"
+
+namespace libgav1 {
+
+Executor::~Executor() = default;
+
+} // namespace libgav1
diff --git a/src/utils/executor.h b/src/utils/executor.h
new file mode 100644
index 0000000..21abdf8
--- /dev/null
+++ b/src/utils/executor.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_EXECUTOR_H_
+#define LIBGAV1_SRC_UTILS_EXECUTOR_H_
+
+#include <functional>
+
+namespace libgav1 {
+
+class Executor {
+ public:
+ virtual ~Executor();
+
+ // Schedules the specified "callback" for execution in this executor.
+ // Depending on the subclass implementation, this may block in some
+ // situations.
+ virtual void Schedule(std::function<void()> callback) = 0;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_EXECUTOR_H_
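
The interface is small enough that a trivial subclass shows the contract. This InlineExecutor is a hypothetical example rather than a class shipped by the library; it simply runs each callback synchronously on the calling thread:

#include <functional>

#include "src/utils/executor.h"

namespace {

class InlineExecutor : public libgav1::Executor {
 public:
  // Runs the callback immediately; a real executor would typically hand it
  // off to a worker thread instead.
  void Schedule(std::function<void()> callback) override { callback(); }
};

}  // namespace
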
diff --git a/src/utils/libgav1_utils.cmake b/src/utils/libgav1_utils.cmake
new file mode 100644
index 0000000..8b6ec4b
--- /dev/null
+++ b/src/utils/libgav1_utils.cmake
@@ -0,0 +1,72 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_UTILS_LIBGAV1_UTILS_CMAKE_)
+ return()
+endif() # LIBGAV1_UTILS_LIBGAV1_UTILS_CMAKE_
+set(LIBGAV1_UTILS_LIBGAV1_UTILS_CMAKE_ 1)
+
+list(APPEND libgav1_utils_sources
+ "${libgav1_source}/utils/array_2d.h"
+ "${libgav1_source}/utils/bit_mask_set.h"
+ "${libgav1_source}/utils/bit_reader.cc"
+ "${libgav1_source}/utils/bit_reader.h"
+ "${libgav1_source}/utils/block_parameters_holder.cc"
+ "${libgav1_source}/utils/block_parameters_holder.h"
+ "${libgav1_source}/utils/blocking_counter.h"
+ "${libgav1_source}/utils/common.h"
+ "${libgav1_source}/utils/compiler_attributes.h"
+ "${libgav1_source}/utils/constants.cc"
+ "${libgav1_source}/utils/constants.h"
+ "${libgav1_source}/utils/cpu.cc"
+ "${libgav1_source}/utils/cpu.h"
+ "${libgav1_source}/utils/dynamic_buffer.h"
+ "${libgav1_source}/utils/entropy_decoder.cc"
+ "${libgav1_source}/utils/entropy_decoder.h"
+ "${libgav1_source}/utils/executor.cc"
+ "${libgav1_source}/utils/executor.h"
+ "${libgav1_source}/utils/logging.cc"
+ "${libgav1_source}/utils/logging.h"
+ "${libgav1_source}/utils/memory.h"
+ "${libgav1_source}/utils/parameter_tree.cc"
+ "${libgav1_source}/utils/parameter_tree.h"
+ "${libgav1_source}/utils/queue.h"
+ "${libgav1_source}/utils/raw_bit_reader.cc"
+ "${libgav1_source}/utils/raw_bit_reader.h"
+ "${libgav1_source}/utils/reference_info.h"
+ "${libgav1_source}/utils/segmentation.cc"
+ "${libgav1_source}/utils/segmentation.h"
+ "${libgav1_source}/utils/segmentation_map.cc"
+ "${libgav1_source}/utils/segmentation_map.h"
+ "${libgav1_source}/utils/stack.h"
+ "${libgav1_source}/utils/threadpool.cc"
+ "${libgav1_source}/utils/threadpool.h"
+ "${libgav1_source}/utils/types.h"
+ "${libgav1_source}/utils/unbounded_queue.h"
+ "${libgav1_source}/utils/vector.h")
+
+macro(libgav1_add_utils_targets)
+ libgav1_add_library(NAME
+ libgav1_utils
+ TYPE
+ OBJECT
+ SOURCES
+ ${libgav1_utils_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_include_paths}
+ ${libgav1_gtest_include_paths})
+
+endmacro()
diff --git a/src/utils/logging.cc b/src/utils/logging.cc
new file mode 100644
index 0000000..9a43c22
--- /dev/null
+++ b/src/utils/logging.cc
@@ -0,0 +1,65 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/logging.h"
+
+#include <cstdarg>
+#include <cstdio>
+#include <sstream>
+#include <thread> // NOLINT (unapproved c++11 header)
+
+#if !defined(LIBGAV1_LOG_LEVEL)
+#define LIBGAV1_LOG_LEVEL (1 << 30)
+#endif
+
+namespace libgav1 {
+namespace internal {
+#if LIBGAV1_ENABLE_LOGGING
+namespace {
+
+const char* LogSeverityName(LogSeverity severity) {
+ switch (severity) {
+ case LogSeverity::kInfo:
+ return "INFO";
+ case LogSeverity::kError:
+ return "ERROR";
+ case LogSeverity::kWarning:
+ return "WARNING";
+ }
+ return "UNKNOWN";
+}
+
+} // namespace
+
+void Log(LogSeverity severity, const char* file, int line, const char* format,
+ ...) {
+ if (LIBGAV1_LOG_LEVEL < static_cast<int>(severity)) return;
+ std::ostringstream ss;
+ ss << std::hex << std::this_thread::get_id();
+ fprintf(stderr, "%s %s %s:%d] ", LogSeverityName(severity), ss.str().c_str(),
+ file, line);
+
+ va_list ap;
+ va_start(ap, format);
+ vfprintf(stderr, format, ap);
+ va_end(ap);
+ fprintf(stderr, "\n");
+}
+#else // !LIBGAV1_ENABLE_LOGGING
+void Log(LogSeverity /*severity*/, const char* /*file*/, int /*line*/,
+ const char* /*format*/, ...) {}
+#endif // LIBGAV1_ENABLE_LOGGING
+
+} // namespace internal
+} // namespace libgav1
diff --git a/src/utils/logging.h b/src/utils/logging.h
new file mode 100644
index 0000000..48928db
--- /dev/null
+++ b/src/utils/logging.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_LOGGING_H_
+#define LIBGAV1_SRC_UTILS_LOGGING_H_
+
+#include <cstddef>
+
+#include "src/utils/compiler_attributes.h"
+
+#if !defined(LIBGAV1_ENABLE_LOGGING)
+#if defined(NDEBUG) || defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#define LIBGAV1_ENABLE_LOGGING 0
+#else
+#define LIBGAV1_ENABLE_LOGGING 1
+#endif
+#endif
+
+#if LIBGAV1_ENABLE_LOGGING
+// LIBGAV1_DLOG(severity, printf-format-string)
+// Debug logging that can optionally be enabled in release builds by explicitly
+// setting LIBGAV1_ENABLE_LOGGING.
+// Severity is given as an all-caps version of enum LogSeverity with the
+// leading 'k' removed: LIBGAV1_DLOG(INFO, "...");
+#define LIBGAV1_DLOG(severity, ...) \
+ do { \
+ constexpr const char* libgav1_logging_internal_basename = \
+ ::libgav1::internal::Basename(__FILE__, sizeof(__FILE__) - 1); \
+ ::libgav1::internal::Log(LIBGAV1_LOGGING_INTERNAL_##severity, \
+ libgav1_logging_internal_basename, __LINE__, \
+ __VA_ARGS__); \
+ } while (0)
+#else
+#define LIBGAV1_DLOG(severity, ...) \
+ do { \
+ } while (0)
+#endif // LIBGAV1_ENABLE_LOGGING
+
+#define LIBGAV1_LOGGING_INTERNAL_ERROR ::libgav1::internal::LogSeverity::kError
+#define LIBGAV1_LOGGING_INTERNAL_WARNING \
+ ::libgav1::internal::LogSeverity::kWarning
+#define LIBGAV1_LOGGING_INTERNAL_INFO ::libgav1::internal::LogSeverity::kInfo
+
+namespace libgav1 {
+namespace internal {
+
+enum class LogSeverity : int {
+ kError,
+ kWarning,
+ kInfo,
+};
+
+// Helper function to implement LIBGAV1_DLOG
+// Logs |format, ...| at |severity| level, reporting it as called from
+// |file|:|line|.
+void Log(libgav1::internal::LogSeverity severity, const char* file, int line,
+ const char* format, ...) LIBGAV1_PRINTF_ATTRIBUTE(4, 5);
+
+// Compile-time function to get the 'base' file_name, that is, the part of
+// a file_name after the last '/' or '\' path separator. The search starts at
+// the end of the string; the second parameter is the length of the string.
+constexpr const char* Basename(const char* file_name, size_t offset) {
+ return (offset == 0 || file_name[offset - 1] == '/' ||
+ file_name[offset - 1] == '\\')
+ ? file_name + offset
+ : Basename(file_name, offset - 1);
+}
+
+} // namespace internal
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_LOGGING_H_
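
A short usage sketch of the macro described above; AllocateExample and its |size| parameter are hypothetical and only illustrate the severity spelling and the printf-style arguments. When LIBGAV1_ENABLE_LOGGING is 0 the macro expands to an empty statement, so the call has no release-build cost.

#include <cstddef>
#include <cstdlib>

#include "src/utils/logging.h"

namespace libgav1 {

bool AllocateExample(size_t size) {
  void* const ptr = malloc(size);
  if (ptr == nullptr) {
    // Severity is written without the LogSeverity::k prefix, as noted in the
    // macro comment above.
    LIBGAV1_DLOG(ERROR, "Allocation of %zu bytes failed.", size);
    return false;
  }
  free(ptr);
  return true;
}

}  // namespace libgav1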
diff --git a/src/utils/memory.h b/src/utils/memory.h
new file mode 100644
index 0000000..219a83f
--- /dev/null
+++ b/src/utils/memory.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_MEMORY_H_
+#define LIBGAV1_SRC_UTILS_MEMORY_H_
+
+#if defined(__ANDROID__) || defined(_MSC_VER)
+#include <malloc.h>
+#endif
+
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <new>
+
+namespace libgav1 {
+
+enum {
+// The byte alignment required for buffers used with SIMD code to be read or
+// written with aligned operations.
+#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \
+ defined(_M_X64)
+ kMaxAlignment = 32, // extended alignment is safe on x86.
+#else
+ kMaxAlignment = alignof(max_align_t),
+#endif
+};
+
+// AlignedAlloc, AlignedFree
+//
+// void* AlignedAlloc(size_t alignment, size_t size);
+// Allocate aligned memory.
+// |alignment| must be a power of 2.
+// Unlike posix_memalign(), |alignment| may be smaller than sizeof(void*).
+// Unlike aligned_alloc(), |size| does not need to be a multiple of
+// |alignment|.
+// The returned pointer should be freed by AlignedFree().
+//
+// void AlignedFree(void* aligned_memory);
+// Free aligned memory.
+
+#if defined(_MSC_VER) // MSVC
+
+inline void* AlignedAlloc(size_t alignment, size_t size) {
+ return _aligned_malloc(size, alignment);
+}
+
+inline void AlignedFree(void* aligned_memory) { _aligned_free(aligned_memory); }
+
+#else // !defined(_MSC_VER)
+
+inline void* AlignedAlloc(size_t alignment, size_t size) {
+#if defined(__ANDROID__)
+ // Although posix_memalign() was introduced in Android API level 17, it is
+ // more convenient to use memalign(). Unlike glibc, Android does not consider
+ // memalign() an obsolete function.
+ return memalign(alignment, size);
+#else // !defined(__ANDROID__)
+ void* ptr = nullptr;
+ // posix_memalign requires that the requested alignment be at least
+ // sizeof(void*). In this case, fall back on malloc which should return
+ // memory aligned to at least the size of a pointer.
+ const size_t required_alignment = sizeof(void*);
+ if (alignment < required_alignment) return malloc(size);
+ const int error = posix_memalign(&ptr, alignment, size);
+ if (error != 0) {
+ errno = error;
+ return nullptr;
+ }
+ return ptr;
+#endif // defined(__ANDROID__)
+}
+
+inline void AlignedFree(void* aligned_memory) { free(aligned_memory); }
+
+#endif // defined(_MSC_VER)
+
+inline void Memset(uint8_t* const dst, int value, size_t count) {
+ memset(dst, value, count);
+}
+
+inline void Memset(uint16_t* const dst, int value, size_t count) {
+ for (size_t i = 0; i < count; ++i) {
+ dst[i] = static_cast<uint16_t>(value);
+ }
+}
+
+struct MallocDeleter {
+ void operator()(void* ptr) const { free(ptr); }
+};
+
+struct AlignedDeleter {
+ void operator()(void* ptr) const { AlignedFree(ptr); }
+};
+
+template <typename T>
+using AlignedUniquePtr = std::unique_ptr<T, AlignedDeleter>;
+
+// Allocates aligned memory for an array of |count| elements of type T.
+template <typename T>
+inline AlignedUniquePtr<T> MakeAlignedUniquePtr(size_t alignment,
+ size_t count) {
+ return AlignedUniquePtr<T>(
+ static_cast<T*>(AlignedAlloc(alignment, count * sizeof(T))));
+}
+
+// A base class with custom new and delete operators. The exception-throwing
+// new operators are deleted. The "new (std::nothrow)" form must be used.
+//
+// The new operators return nullptr if the requested size is greater than
+// 0x40000000 bytes (1 GB). TODO(wtc): Make the maximum allocable memory size
+// a compile-time configuration macro.
+//
+// See https://en.cppreference.com/w/cpp/memory/new/operator_new and
+// https://en.cppreference.com/w/cpp/memory/new/operator_delete.
+//
+// NOTE: The allocation and deallocation functions are static member functions
+// whether the keyword 'static' is used or not.
+struct Allocable {
+ // Class-specific allocation functions.
+ static void* operator new(size_t size) = delete;
+ static void* operator new[](size_t size) = delete;
+
+ // Class-specific non-throwing allocation functions
+ static void* operator new(size_t size, const std::nothrow_t& tag) noexcept {
+ if (size > 0x40000000) return nullptr;
+ return ::operator new(size, tag);
+ }
+ static void* operator new[](size_t size, const std::nothrow_t& tag) noexcept {
+ if (size > 0x40000000) return nullptr;
+ return ::operator new[](size, tag);
+ }
+
+ // Class-specific deallocation functions.
+ static void operator delete(void* ptr) noexcept { ::operator delete(ptr); }
+ static void operator delete[](void* ptr) noexcept {
+ ::operator delete[](ptr);
+ }
+
+ // Only called if new (std::nothrow) is used and the constructor throws an
+ // exception.
+ static void operator delete(void* ptr, const std::nothrow_t& tag) noexcept {
+ ::operator delete(ptr, tag);
+ }
+ // Only called if new[] (std::nothrow) is used and the constructor throws an
+ // exception.
+ static void operator delete[](void* ptr, const std::nothrow_t& tag) noexcept {
+ ::operator delete[](ptr, tag);
+ }
+};
+
+// A variant of Allocable that forces allocations to be aligned to
+// kMaxAlignment bytes. This is intended for use with classes that use
+// alignas() with this value. C++17 aligned new/delete are used if available,
+// otherwise we use AlignedAlloc/Free.
+struct MaxAlignedAllocable {
+ // Class-specific allocation functions.
+ static void* operator new(size_t size) = delete;
+ static void* operator new[](size_t size) = delete;
+
+ // Class-specific non-throwing allocation functions
+ static void* operator new(size_t size, const std::nothrow_t& tag) noexcept {
+ if (size > 0x40000000) return nullptr;
+#ifdef __cpp_aligned_new
+ return ::operator new(size, std::align_val_t(kMaxAlignment), tag);
+#else
+ static_cast<void>(tag);
+ return AlignedAlloc(kMaxAlignment, size);
+#endif
+ }
+ static void* operator new[](size_t size, const std::nothrow_t& tag) noexcept {
+ if (size > 0x40000000) return nullptr;
+#ifdef __cpp_aligned_new
+ return ::operator new[](size, std::align_val_t(kMaxAlignment), tag);
+#else
+ static_cast<void>(tag);
+ return AlignedAlloc(kMaxAlignment, size);
+#endif
+ }
+
+ // Class-specific deallocation functions.
+ static void operator delete(void* ptr) noexcept {
+#ifdef __cpp_aligned_new
+ ::operator delete(ptr, std::align_val_t(kMaxAlignment));
+#else
+ AlignedFree(ptr);
+#endif
+ }
+ static void operator delete[](void* ptr) noexcept {
+#ifdef __cpp_aligned_new
+ ::operator delete[](ptr, std::align_val_t(kMaxAlignment));
+#else
+ AlignedFree(ptr);
+#endif
+ }
+
+ // Only called if new (std::nothrow) is used and the constructor throws an
+ // exception.
+ static void operator delete(void* ptr, const std::nothrow_t& tag) noexcept {
+#ifdef __cpp_aligned_new
+ ::operator delete(ptr, std::align_val_t(kMaxAlignment), tag);
+#else
+ static_cast<void>(tag);
+ AlignedFree(ptr);
+#endif
+ }
+ // Only called if new[] (std::nothrow) is used and the constructor throws an
+ // exception.
+ static void operator delete[](void* ptr, const std::nothrow_t& tag) noexcept {
+#ifdef __cpp_aligned_new
+ ::operator delete[](ptr, std::align_val_t(kMaxAlignment), tag);
+#else
+ static_cast<void>(tag);
+ AlignedFree(ptr);
+#endif
+ }
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_MEMORY_H_
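
A sketch of how these helpers compose; ScratchBuffer and UseScratch are hypothetical names. Deriving from Allocable deletes the throwing operator new, so only the nothrow form compiles, and MakeAlignedUniquePtr pairs AlignedAlloc with AlignedFree via AlignedDeleter.

#include <cstdint>
#include <memory>
#include <new>

#include "src/utils/memory.h"

namespace libgav1 {

// A hypothetical frame-level scratch buffer.
struct ScratchBuffer : public Allocable {
  uint16_t data[1024];
};

bool UseScratch() {
  // The plain "new ScratchBuffer" form would not compile; only the nothrow
  // form is available, and it returns nullptr on failure.
  std::unique_ptr<ScratchBuffer> scratch(new (std::nothrow) ScratchBuffer());
  if (scratch == nullptr) return false;

  // 64 uint16_t values aligned to kMaxAlignment bytes, released through
  // AlignedFree() by AlignedDeleter.
  AlignedUniquePtr<uint16_t> row =
      MakeAlignedUniquePtr<uint16_t>(kMaxAlignment, /*count=*/64);
  if (row == nullptr) return false;
  Memset(row.get(), 0, 64);
  return true;
}

}  // namespace libgav1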
diff --git a/src/utils/parameter_tree.cc b/src/utils/parameter_tree.cc
new file mode 100644
index 0000000..9426ce6
--- /dev/null
+++ b/src/utils/parameter_tree.cc
@@ -0,0 +1,133 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/parameter_tree.h"
+
+#include <cassert>
+#include <memory>
+#include <new>
+
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// static
+std::unique_ptr<ParameterTree> ParameterTree::Create(int row4x4, int column4x4,
+ BlockSize block_size,
+ bool is_leaf) {
+ std::unique_ptr<ParameterTree> tree(
+ new (std::nothrow) ParameterTree(row4x4, column4x4, block_size));
+ if (tree != nullptr && is_leaf && !tree->SetPartitionType(kPartitionNone)) {
+ tree = nullptr;
+ }
+ return tree;
+}
+
+bool ParameterTree::SetPartitionType(Partition partition) {
+ assert(!partition_type_set_);
+ partition_ = partition;
+ partition_type_set_ = true;
+ const int block_width4x4 = kNum4x4BlocksWide[block_size_];
+ const int half_block4x4 = block_width4x4 >> 1;
+ const int quarter_block4x4 = half_block4x4 >> 1;
+ const BlockSize sub_size = kSubSize[partition][block_size_];
+ const BlockSize split_size = kSubSize[kPartitionSplit][block_size_];
+ assert(partition == kPartitionNone || sub_size != kBlockInvalid);
+ switch (partition) {
+ case kPartitionNone:
+ parameters_.reset(new (std::nothrow) BlockParameters());
+ return parameters_ != nullptr;
+ case kPartitionHorizontal:
+ children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true);
+ children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
+ sub_size, true);
+ return children_[0] != nullptr && children_[1] != nullptr;
+ case kPartitionVertical:
+ children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true);
+ children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
+ sub_size, true);
+ return children_[0] != nullptr && children_[1] != nullptr;
+ case kPartitionSplit:
+ children_[0] =
+ ParameterTree::Create(row4x4_, column4x4_, sub_size, false);
+ children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
+ sub_size, false);
+ children_[2] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
+ sub_size, false);
+ children_[3] = ParameterTree::Create(
+ row4x4_ + half_block4x4, column4x4_ + half_block4x4, sub_size, false);
+ return children_[0] != nullptr && children_[1] != nullptr &&
+ children_[2] != nullptr && children_[3] != nullptr;
+ case kPartitionHorizontalWithTopSplit:
+ assert(split_size != kBlockInvalid);
+ children_[0] =
+ ParameterTree::Create(row4x4_, column4x4_, split_size, true);
+ children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
+ split_size, true);
+ children_[2] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
+ sub_size, true);
+ return children_[0] != nullptr && children_[1] != nullptr &&
+ children_[2] != nullptr;
+ case kPartitionHorizontalWithBottomSplit:
+ assert(split_size != kBlockInvalid);
+ children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true);
+ children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
+ split_size, true);
+ children_[2] =
+ ParameterTree::Create(row4x4_ + half_block4x4,
+ column4x4_ + half_block4x4, split_size, true);
+ return children_[0] != nullptr && children_[1] != nullptr &&
+ children_[2] != nullptr;
+ case kPartitionVerticalWithLeftSplit:
+ assert(split_size != kBlockInvalid);
+ children_[0] =
+ ParameterTree::Create(row4x4_, column4x4_, split_size, true);
+ children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
+ split_size, true);
+ children_[2] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
+ sub_size, true);
+ return children_[0] != nullptr && children_[1] != nullptr &&
+ children_[2] != nullptr;
+ case kPartitionVerticalWithRightSplit:
+ assert(split_size != kBlockInvalid);
+ children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true);
+ children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
+ split_size, true);
+ children_[2] =
+ ParameterTree::Create(row4x4_ + half_block4x4,
+ column4x4_ + half_block4x4, split_size, true);
+ return children_[0] != nullptr && children_[1] != nullptr &&
+ children_[2] != nullptr;
+ case kPartitionHorizontal4:
+ for (int i = 0; i < 4; ++i) {
+ children_[i] = ParameterTree::Create(row4x4_ + i * quarter_block4x4,
+ column4x4_, sub_size, true);
+ if (children_[i] == nullptr) return false;
+ }
+ return true;
+ default:
+ assert(partition == kPartitionVertical4);
+ for (int i = 0; i < 4; ++i) {
+ children_[i] = ParameterTree::Create(
+ row4x4_, column4x4_ + i * quarter_block4x4, sub_size, true);
+ if (children_[i] == nullptr) return false;
+ }
+ return true;
+ }
+}
+
+} // namespace libgav1
diff --git a/src/utils/parameter_tree.h b/src/utils/parameter_tree.h
new file mode 100644
index 0000000..935f3eb
--- /dev/null
+++ b/src/utils/parameter_tree.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_
+#define LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_
+
+#include <cassert>
+#include <memory>
+
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+class ParameterTree : public Allocable {
+ public:
+ // Creates a parameter tree to store the parameters of a block of size
+ // |block_size| starting at coordinates |row4x4| and |column4x4|. If |is_leaf|
+ // is set to true, the memory will be allocated for the BlockParameters for
+ // this node. Otherwise, no memory will be allocated. If |is_leaf| is set to
+ // false, |block_size| must be a square block, i.e.,
+ // kBlockWidthPixels[block_size] must be equal to
+ // kBlockHeightPixels[block_size].
+ static std::unique_ptr<ParameterTree> Create(int row4x4, int column4x4,
+ BlockSize block_size,
+ bool is_leaf = false);
+
+ // Move only (not Copyable).
+ ParameterTree(ParameterTree&& other) = default;
+ ParameterTree& operator=(ParameterTree&& other) = default;
+ ParameterTree(const ParameterTree&) = delete;
+ ParameterTree& operator=(const ParameterTree&) = delete;
+
+ // Set the partition type of the current node to |partition|.
+ // if (partition == kPartitionNone) {
+ // Memory will be allocated for the BlockParameters for this node.
+ // } else if (partition != kPartitionSplit) {
+ // The appropriate child nodes will be populated and memory will be
+ // allocated for the BlockParameters of the children.
+ // } else {
+ // The appropriate child nodes will be populated but they are considered to
+ // be hanging, i.e., future calls to SetPartitionType() on the child nodes
+ // will have to set them or their descendants to a terminal type.
+ // }
+ // This function must be called only once per node.
+ LIBGAV1_MUST_USE_RESULT bool SetPartitionType(Partition partition);
+
+ // Basic getters.
+ int row4x4() const { return row4x4_; }
+ int column4x4() const { return column4x4_; }
+ BlockSize block_size() const { return block_size_; }
+ Partition partition() const { return partition_; }
+ ParameterTree* children(int index) const {
+ assert(index < 4);
+ return children_[index].get();
+ }
+ // Returns the BlockParameters object of the current node if one exists.
+ // Otherwise returns nullptr. This function will return a valid
+ // BlockParameters object only for leaf nodes.
+ BlockParameters* parameters() const { return parameters_.get(); }
+
+ private:
+ ParameterTree(int row4x4, int column4x4, BlockSize block_size)
+ : row4x4_(row4x4), column4x4_(column4x4), block_size_(block_size) {}
+
+ Partition partition_ = kPartitionNone;
+ std::unique_ptr<BlockParameters> parameters_ = nullptr;
+ int row4x4_ = -1;
+ int column4x4_ = -1;
+ BlockSize block_size_ = kBlockInvalid;
+ bool partition_type_set_ = false;
+
+ // Child values are defined as follows for various partition types:
+ // * Horizontal: 0 top partition; 1 bottom partition; 2 nullptr; 3 nullptr;
+ // * Vertical: 0 left partition; 1 right partition; 2 nullptr; 3 nullptr;
+  //  * Split: 0 top-left partition; 1 top-right partition; 2 bottom-left
+  //    partition; 3 bottom-right partition;
+ // * HorizontalWithTopSplit: 0 top-left partition; 1 top-right partition; 2
+ // bottom partition; 3 nullptr;
+ // * HorizontalWithBottomSplit: 0 top partition; 1 bottom-left partition; 2
+ // bottom-right partition; 3 nullptr;
+ // * VerticalWithLeftSplit: 0 top-left partition; 1 bottom-left partition; 2
+ // right partition; 3 nullptr;
+  //  * VerticalWithRightSplit: 0 left partition; 1 top-right partition; 2
+ // bottom-right partition; 3 nullptr;
+ // * Horizontal4: 0 top partition; 1 second top partition; 2 third top
+ // partition; 3 bottom partition;
+ // * Vertical4: 0 left partition; 1 second left partition; 2 third left
+ // partition; 3 right partition;
+ std::unique_ptr<ParameterTree> children_[4] = {};
+
+ friend class ParameterTreeTest;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_
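
A sketch of the intended call pattern, assuming kBlock64x64 is the 64x64 entry of the BlockSize enum in constants.h (not shown in this hunk); BuildExampleTree is a hypothetical helper. A split root leaves its children hanging, so each child still needs its own terminal partition type.

#include <memory>

#include "src/utils/parameter_tree.h"

namespace libgav1 {

bool BuildExampleTree() {
  // Root node for a 64x64 superblock at block position (0, 0).
  std::unique_ptr<ParameterTree> root = ParameterTree::Create(
      /*row4x4=*/0, /*column4x4=*/0, kBlock64x64, /*is_leaf=*/false);
  if (root == nullptr) return false;
  // kPartitionSplit populates four hanging 32x32 children.
  if (!root->SetPartitionType(kPartitionSplit)) return false;
  for (int i = 0; i < 4; ++i) {
    // Terminate each child; this allocates its BlockParameters.
    if (!root->children(i)->SetPartitionType(kPartitionNone)) return false;
  }
  return true;
}

}  // namespace libgav1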
diff --git a/src/utils/queue.h b/src/utils/queue.h
new file mode 100644
index 0000000..cffb9ca
--- /dev/null
+++ b/src/utils/queue.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_QUEUE_H_
+#define LIBGAV1_SRC_UTILS_QUEUE_H_
+
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <new>
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// A FIFO queue of a fixed capacity.
+//
+// WARNING: No error checking is performed.
+template <typename T>
+class Queue {
+ public:
+ LIBGAV1_MUST_USE_RESULT bool Init(size_t capacity) {
+ elements_.reset(new (std::nothrow) T[capacity]);
+ if (elements_ == nullptr) return false;
+ capacity_ = capacity;
+ return true;
+ }
+
+ // Pushes the element |value| to the end of the queue. It is an error to call
+ // Push() when the queue is full.
+ void Push(T&& value) {
+ assert(size_ < capacity_);
+ elements_[end_++] = std::move(value);
+ if (end_ == capacity_) end_ = 0;
+ ++size_;
+ }
+
+ // Removes the element at the front of the queue. It is an error to call Pop()
+ // when the queue is empty.
+ void Pop() {
+ assert(size_ != 0);
+ const T element = std::move(elements_[begin_++]);
+ static_cast<void>(element);
+ if (begin_ == capacity_) begin_ = 0;
+ --size_;
+ }
+
+ // Returns a reference to the element at the front of the queue. It is an
+ // error to call Front() when the queue is empty.
+ T& Front() {
+ assert(size_ != 0);
+ return elements_[begin_];
+ }
+
+ // Returns a reference to the element at the back of the queue. It is an error
+ // to call Back() when the queue is empty.
+ T& Back() {
+ assert(size_ != 0);
+ const size_t back = ((end_ == 0) ? capacity_ : end_) - 1;
+ return elements_[back];
+ }
+
+ // Clears the queue.
+ void Clear() {
+ while (!Empty()) {
+ Pop();
+ }
+ }
+
+ // Returns true if the queue is empty.
+ bool Empty() const { return size_ == 0; }
+
+ // Returns true if the queue is full.
+ bool Full() const { return size_ >= capacity_; }
+
+ // Returns the number of elements in the queue.
+ size_t Size() const { return size_; }
+
+ private:
+ // An array of |capacity| elements. Used as a circular array.
+ std::unique_ptr<T[]> elements_;
+ size_t capacity_ = 0;
+ // The index of the element to be removed by Pop().
+ size_t begin_ = 0;
+ // The index where the new element is inserted by Push().
+ size_t end_ = 0;
+ size_t size_ = 0;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_QUEUE_H_
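
A small usage sketch with an arbitrary capacity and values; because the class performs no error checking, the caller stays within the capacity and checks Empty() before reading.

#include <cstdio>

#include "src/utils/queue.h"

namespace libgav1 {

bool QueueExample() {
  Queue<int> queue;
  if (!queue.Init(/*capacity=*/4)) return false;
  for (int i = 0; i < 3; ++i) {
    queue.Push(i + 1);  // 1, 2, 3; never more than |capacity| elements.
  }
  while (!queue.Empty()) {
    printf("front=%d back=%d size=%zu\n", queue.Front(), queue.Back(),
           queue.Size());
    queue.Pop();
  }
  return true;
}

}  // namespace libgav1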
diff --git a/src/utils/raw_bit_reader.cc b/src/utils/raw_bit_reader.cc
new file mode 100644
index 0000000..15e980d
--- /dev/null
+++ b/src/utils/raw_bit_reader.cc
@@ -0,0 +1,224 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/raw_bit_reader.h"
+
+#include <cassert>
+#include <limits>
+
+#include "src/utils/common.h"
+#include "src/utils/logging.h"
+
+// Note <cinttypes> is only needed when logging is enabled (for the PRI*
+// macros). It depends on the definition of LIBGAV1_ENABLE_LOGGING from
+// logging.h, thus the non-standard header ordering.
+#if LIBGAV1_ENABLE_LOGGING
+#include <cinttypes>
+#endif
+
+namespace libgav1 {
+namespace {
+
+constexpr int kMaximumLeb128Size = 8;
+constexpr uint8_t kLeb128ValueByteMask = 0x7f;
+constexpr uint8_t kLeb128TerminationByteMask = 0x80;
+
+uint8_t Mod8(size_t n) {
+  // The last 3 bits give the value of n mod 8.
+ return n & 0x07;
+}
+
+size_t DivideBy8(size_t n, bool ceil) { return (n + (ceil ? 7 : 0)) >> 3; }
+
+} // namespace
+
+RawBitReader::RawBitReader(const uint8_t* data, size_t size)
+ : data_(data), bit_offset_(0), size_(size) {
+ assert(data_ != nullptr || size_ == 0);
+}
+
+int RawBitReader::ReadBitImpl() {
+ const size_t byte_offset = DivideBy8(bit_offset_, false);
+ const uint8_t byte = data_[byte_offset];
+ const uint8_t shift = 7 - Mod8(bit_offset_);
+ ++bit_offset_;
+ return static_cast<int>((byte >> shift) & 0x01);
+}
+
+int RawBitReader::ReadBit() {
+ if (Finished()) return -1;
+ return ReadBitImpl();
+}
+
+int64_t RawBitReader::ReadLiteral(int num_bits) {
+ assert(num_bits <= 32);
+ if (!CanReadLiteral(num_bits)) return -1;
+ assert(num_bits > 0);
+ uint32_t literal = 0;
+ int bit = num_bits - 1;
+ do {
+ // ARM can combine a shift operation with a constant number of bits with
+ // some other operations, such as the OR operation.
+ // Here is an ARM disassembly example:
+ // orr w1, w0, w1, lsl #1
+ // which left shifts register w1 by 1 bit and OR the shift result with
+ // register w0.
+ // The next 2 lines are equivalent to:
+ // literal |= static_cast<uint32_t>(ReadBitImpl()) << bit;
+ literal <<= 1;
+ literal |= static_cast<uint32_t>(ReadBitImpl());
+ } while (--bit >= 0);
+ return literal;
+}
+
+bool RawBitReader::ReadInverseSignedLiteral(int num_bits, int* const value) {
+ assert(num_bits + 1 < 32);
+ *value = static_cast<int>(ReadLiteral(num_bits + 1));
+ if (*value == -1) return false;
+ const int sign_bit = 1 << num_bits;
+ if ((*value & sign_bit) != 0) {
+ *value -= 2 * sign_bit;
+ }
+ return true;
+}
+
+bool RawBitReader::ReadLittleEndian(int num_bytes, size_t* const value) {
+ // We must be at a byte boundary.
+ assert(Mod8(bit_offset_) == 0);
+ assert(num_bytes <= 4);
+ static_assert(sizeof(size_t) >= 4, "");
+ if (value == nullptr) return false;
+ size_t byte_offset = DivideBy8(bit_offset_, false);
+ if (Finished() || byte_offset + num_bytes > size_) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits to read Little Endian value.");
+ return false;
+ }
+ *value = 0;
+ for (int i = 0; i < num_bytes; ++i) {
+ const size_t byte = data_[byte_offset];
+ *value |= (byte << (i * 8));
+ ++byte_offset;
+ }
+ bit_offset_ = byte_offset * 8;
+ return true;
+}
+
+bool RawBitReader::ReadUnsignedLeb128(size_t* const value) {
+ // We must be at a byte boundary.
+ assert(Mod8(bit_offset_) == 0);
+ if (value == nullptr) return false;
+ uint64_t value64 = 0;
+ for (int i = 0; i < kMaximumLeb128Size; ++i) {
+ if (Finished()) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits to read LEB128 value.");
+ return false;
+ }
+ const size_t byte_offset = DivideBy8(bit_offset_, false);
+ const uint8_t byte = data_[byte_offset];
+ bit_offset_ += 8;
+ value64 |= static_cast<uint64_t>(byte & kLeb128ValueByteMask) << (i * 7);
+ if ((byte & kLeb128TerminationByteMask) == 0) {
+ if (value64 != static_cast<size_t>(value64) ||
+ value64 > std::numeric_limits<uint32_t>::max()) {
+ LIBGAV1_DLOG(
+ ERROR, "LEB128 value (%" PRIu64 ") exceeded uint32_t maximum (%u).",
+ value64, std::numeric_limits<uint32_t>::max());
+ return false;
+ }
+ *value = static_cast<size_t>(value64);
+ return true;
+ }
+ }
+ LIBGAV1_DLOG(
+ ERROR,
+ "Exceeded kMaximumLeb128Size (%d) when trying to read LEB128 value",
+ kMaximumLeb128Size);
+ return false;
+}
+
+bool RawBitReader::ReadUvlc(uint32_t* const value) {
+ if (value == nullptr) return false;
+ int leading_zeros = 0;
+ while (true) {
+ const int bit = ReadBit();
+ if (bit == -1) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits to read uvlc value.");
+ return false;
+ }
+ if (bit == 1) break;
+ ++leading_zeros;
+ if (leading_zeros == 32) {
+ LIBGAV1_DLOG(ERROR,
+ "Exceeded maximum size (32) when trying to read uvlc value");
+ return false;
+ }
+ }
+ int literal;
+ if (leading_zeros != 0) {
+ literal = static_cast<int>(ReadLiteral(leading_zeros));
+ if (literal == -1) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits to read uvlc value.");
+ return false;
+ }
+ literal += (1U << leading_zeros) - 1;
+ } else {
+ literal = 0;
+ }
+ *value = literal;
+ return true;
+}
+
+bool RawBitReader::AlignToNextByte() {
+ while ((bit_offset_ & 7) != 0) {
+ if (ReadBit() != 0) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool RawBitReader::VerifyAndSkipTrailingBits(size_t num_bits) {
+ if (ReadBit() != 1) return false;
+ for (size_t i = 0; i < num_bits - 1; ++i) {
+ if (ReadBit() != 0) return false;
+ }
+ return true;
+}
+
+bool RawBitReader::SkipBytes(size_t num_bytes) {
+ // If we are not at a byte boundary, return false.
+ return ((bit_offset_ & 7) != 0) ? false : SkipBits(num_bytes * 8);
+}
+
+bool RawBitReader::SkipBits(size_t num_bits) {
+ // If the reader is already finished, return false.
+ if (Finished()) return false;
+ // If skipping |num_bits| runs out of buffer, return false.
+ const size_t bit_offset = bit_offset_ + num_bits - 1;
+ if (DivideBy8(bit_offset, false) >= size_) return false;
+ bit_offset_ += num_bits;
+ return true;
+}
+
+bool RawBitReader::CanReadLiteral(size_t num_bits) const {
+ if (Finished()) return false;
+ const size_t bit_offset = bit_offset_ + num_bits - 1;
+ return DivideBy8(bit_offset, false) < size_;
+}
+
+bool RawBitReader::Finished() const {
+ return DivideBy8(bit_offset_, false) >= size_;
+}
+
+} // namespace libgav1
diff --git a/src/utils/raw_bit_reader.h b/src/utils/raw_bit_reader.h
new file mode 100644
index 0000000..76e7bfa
--- /dev/null
+++ b/src/utils/raw_bit_reader.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_RAW_BIT_READER_H_
+#define LIBGAV1_SRC_UTILS_RAW_BIT_READER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "src/utils/bit_reader.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+class RawBitReader : public BitReader, public Allocable {
+ public:
+ RawBitReader(const uint8_t* data, size_t size);
+ ~RawBitReader() override = default;
+
+ int ReadBit() override;
+ int64_t ReadLiteral(int num_bits) override; // f(n) in the spec.
+ bool ReadInverseSignedLiteral(int num_bits,
+ int* value); // su(1+num_bits) in the spec.
+ bool ReadLittleEndian(int num_bytes,
+ size_t* value); // le(n) in the spec.
+ bool ReadUnsignedLeb128(size_t* value); // leb128() in the spec.
+ // Reads a variable length unsigned number and stores it in |*value|. On a
+ // successful return, |*value| is in the range of 0 to UINT32_MAX − 1,
+ // inclusive.
+ bool ReadUvlc(uint32_t* value); // uvlc() in the spec.
+ bool Finished() const;
+ size_t bit_offset() const { return bit_offset_; }
+ // Return the bytes consumed so far (rounded up).
+ size_t byte_offset() const { return (bit_offset() + 7) >> 3; }
+ size_t size() const { return size_; }
+ // Move to the next byte boundary if not already at one. Return false if any
+ // of the bits being skipped over is non-zero. Return true otherwise. If this
+ // function returns false, the reader is left in an undefined state and must
+ // not be used further. section 5.3.5.
+ bool AlignToNextByte();
+ // Make sure that the trailing bits structure is as expected and skip over it.
+ // section 5.3.4.
+ bool VerifyAndSkipTrailingBits(size_t num_bits);
+ // Skip |num_bytes| bytes. This only works if the current position is at a
+ // byte boundary. The function returns false if the current position is not at
+ // a byte boundary or if skipping |num_bytes| causes the reader to run out of
+ // buffer. Returns true otherwise.
+ bool SkipBytes(size_t num_bytes);
+ // Skip |num_bits| bits. The function returns false if skipping |num_bits|
+ // causes the reader to run out of buffer. Returns true otherwise.
+ bool SkipBits(size_t num_bits);
+
+ private:
+ // Returns true if it is safe to read a literal of size |num_bits|.
+ bool CanReadLiteral(size_t num_bits) const;
+ int ReadBitImpl();
+
+ const uint8_t* const data_;
+ size_t bit_offset_;
+ const size_t size_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_RAW_BIT_READER_H_
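
A sketch that exercises ReadLiteral(), AlignToNextByte() and ReadLittleEndian() on a hand-made three-byte buffer; the byte values are arbitrary. The alignment step succeeds only because the skipped bits are zero, as the function's contract requires.

#include <cstdint>

#include "src/utils/raw_bit_reader.h"

namespace libgav1 {

bool RawBitReaderExample() {
  const uint8_t data[] = {0xA0, 0x34, 0x12};
  RawBitReader reader(data, sizeof(data));
  // Reads the top 4 bits of 0xA0: 0xA.
  const int64_t literal = reader.ReadLiteral(/*num_bits=*/4);
  if (literal != 0xA) return false;
  // Skips the remaining 4 bits of the first byte; they are all zero, so this
  // succeeds and leaves the reader at a byte boundary.
  if (!reader.AlignToNextByte()) return false;
  size_t value = 0;
  // Reads the next two bytes little-endian: 0x34, 0x12 -> 0x1234.
  if (!reader.ReadLittleEndian(/*num_bytes=*/2, &value)) return false;
  return value == 0x1234;
}

}  // namespace libgav1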
diff --git a/src/utils/reference_info.h b/src/utils/reference_info.h
new file mode 100644
index 0000000..a660791
--- /dev/null
+++ b/src/utils/reference_info.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_
+#define LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_
+
+#include <array>
+#include <cstdint>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// This struct collects some members related to reference frames in one place to
+// make it easier to pass them as parameters to some dsp functions.
+struct ReferenceInfo {
+ // Initialize |motion_field_reference_frame| so that
+ // Tile::StoreMotionFieldMvsIntoCurrentFrame() can skip some updates when
+ // the updates are the same as the initialized value.
+ // Set to kReferenceFrameIntra instead of kReferenceFrameNone to simplify
+ // branch conditions in motion field projection.
+  // The following initialization of contiguous memory is very fast. It is not
+  // recommended to make the initialization multi-threaded, unless the memory
+  // which needs to be initialized in each thread is still contiguous.
+ LIBGAV1_MUST_USE_RESULT bool Reset(int rows, int columns) {
+ return motion_field_reference_frame.Reset(rows, columns,
+ /*zero_initialize=*/true) &&
+ motion_field_mv.Reset(
+ rows, columns,
+#if LIBGAV1_MSAN
+ // It is set in Tile::StoreMotionFieldMvsIntoCurrentFrame() only
+ // for qualified blocks. In MotionFieldProjectionKernel() dsp
+            // optimizations, it is read whether or not it was set.
+ /*zero_initialize=*/true
+#else
+ /*zero_initialize=*/false
+#endif
+ );
+ }
+
+ // All members are used by inter frames only.
+ // For intra frames, they are not initialized.
+
+ std::array<uint8_t, kNumReferenceFrameTypes> order_hint;
+
+ // An example when |relative_distance_from| does not equal
+ // -|relative_distance_to|:
+ // |relative_distance_from| = GetRelativeDistance(7, 71, 25) = -64
+ // -|relative_distance_to| = -GetRelativeDistance(71, 7, 25) = 64
+ // This is why we need both |relative_distance_from| and
+ // |relative_distance_to|.
+ // |relative_distance_from|: Relative distances from reference frames to this
+ // frame.
+ std::array<int8_t, kNumReferenceFrameTypes> relative_distance_from;
+ // |relative_distance_to|: Relative distances to reference frames.
+ std::array<int8_t, kNumReferenceFrameTypes> relative_distance_to;
+
+ // Skip motion field projection of specific types of frames if their
+ // |relative_distance_to| is negative or too large.
+ std::array<bool, kNumReferenceFrameTypes> skip_references;
+ // Lookup table to get motion field projection division multiplier of specific
+ // types of frames. Derived from kProjectionMvDivisionLookup.
+ std::array<int16_t, kNumReferenceFrameTypes> projection_divisions;
+
+ // The current frame's |motion_field_reference_frame| and |motion_field_mv_|
+ // are guaranteed to be allocated only when refresh_frame_flags is not 0.
+ // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds
+ // to MfRefFrames[i * 2 + 1][j * 2 + 1] in the spec.
+ Array2D<ReferenceFrameType> motion_field_reference_frame;
+ // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds
+ // to MfMvs[i * 2 + 1][j * 2 + 1] in the spec.
+ Array2D<MotionVector> motion_field_mv;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_
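
A sketch of the Reset() call, assuming the caller passes the halved frame dimensions so that the arrays end up (rows4x4 / 2) x (columns4x4 / 2) as described in the member comments above; ResetReferenceInfo is a hypothetical wrapper.

#include "src/utils/reference_info.h"

namespace libgav1 {

bool ResetReferenceInfo(ReferenceInfo* const info, int rows4x4,
                        int columns4x4) {
  // The motion field is stored at 8x8 granularity, hence the halving.
  return info->Reset(rows4x4 >> 1, columns4x4 >> 1);
}

}  // namespace libgav1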
diff --git a/src/utils/segmentation.cc b/src/utils/segmentation.cc
new file mode 100644
index 0000000..75fa776
--- /dev/null
+++ b/src/utils/segmentation.cc
@@ -0,0 +1,31 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/segmentation.h"
+
+namespace libgav1 {
+
+const int8_t kSegmentationFeatureBits[kSegmentFeatureMax] = {8, 6, 6, 6,
+ 6, 3, 0, 0};
+const int kSegmentationFeatureMaxValues[kSegmentFeatureMax] = {
+ 255,
+ kMaxLoopFilterValue,
+ kMaxLoopFilterValue,
+ kMaxLoopFilterValue,
+ kMaxLoopFilterValue,
+ 7,
+ 0,
+ 0};
+
+} // namespace libgav1
diff --git a/src/utils/segmentation.h b/src/utils/segmentation.h
new file mode 100644
index 0000000..67ff74c
--- /dev/null
+++ b/src/utils/segmentation.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_SEGMENTATION_H_
+#define LIBGAV1_SRC_UTILS_SEGMENTATION_H_
+
+#include <cstdint>
+
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+extern const int8_t kSegmentationFeatureBits[kSegmentFeatureMax];
+extern const int kSegmentationFeatureMaxValues[kSegmentFeatureMax];
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_SEGMENTATION_H_
diff --git a/src/utils/segmentation_map.cc b/src/utils/segmentation_map.cc
new file mode 100644
index 0000000..4284ca2
--- /dev/null
+++ b/src/utils/segmentation_map.cc
@@ -0,0 +1,49 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/segmentation_map.h"
+
+#include <cassert>
+#include <cstring>
+#include <new>
+
+namespace libgav1 {
+
+bool SegmentationMap::Allocate(int32_t rows4x4, int32_t columns4x4) {
+ rows4x4_ = rows4x4;
+ columns4x4_ = columns4x4;
+ segment_id_buffer_.reset(new (std::nothrow) int8_t[rows4x4_ * columns4x4_]);
+ if (segment_id_buffer_ == nullptr) return false;
+ segment_id_.Reset(rows4x4_, columns4x4_, segment_id_buffer_.get());
+ return true;
+}
+
+void SegmentationMap::Clear() {
+ memset(segment_id_buffer_.get(), 0, rows4x4_ * columns4x4_);
+}
+
+void SegmentationMap::CopyFrom(const SegmentationMap& from) {
+ assert(rows4x4_ == from.rows4x4_ && columns4x4_ == from.columns4x4_);
+ memcpy(segment_id_buffer_.get(), from.segment_id_buffer_.get(),
+ rows4x4_ * columns4x4_);
+}
+
+void SegmentationMap::FillBlock(int row4x4, int column4x4, int block_width4x4,
+ int block_height4x4, int8_t segment_id) {
+ for (int y = 0; y < block_height4x4; ++y) {
+ memset(&segment_id_[row4x4 + y][column4x4], segment_id, block_width4x4);
+ }
+}
+
+} // namespace libgav1
diff --git a/src/utils/segmentation_map.h b/src/utils/segmentation_map.h
new file mode 100644
index 0000000..499be24
--- /dev/null
+++ b/src/utils/segmentation_map.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_
+#define LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_
+
+#include <cstdint>
+#include <memory>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// SegmentationMap stores the segment id associated with each 4x4 block in the
+// frame.
+class SegmentationMap {
+ public:
+ SegmentationMap() = default;
+
+ // Not copyable or movable
+ SegmentationMap(const SegmentationMap&) = delete;
+ SegmentationMap& operator=(const SegmentationMap&) = delete;
+
+ // Allocates an internal buffer of the given dimensions to hold the
+ // segmentation map. The memory in the buffer is not initialized. Returns
+ // true on success, false on failure (for example, out of memory).
+ LIBGAV1_MUST_USE_RESULT bool Allocate(int32_t rows4x4, int32_t columns4x4);
+
+ int8_t segment_id(int row4x4, int column4x4) const {
+ return segment_id_[row4x4][column4x4];
+ }
+
+ // Sets every element in the segmentation map to 0.
+ void Clear();
+
+ // Copies the entire segmentation map. |from| must be of the same dimensions.
+ void CopyFrom(const SegmentationMap& from);
+
+ // Sets the region of segmentation map covered by the block to |segment_id|.
+ // The block is located at |row4x4|, |column4x4| and has dimensions
+ // |block_width4x4| and |block_height4x4|.
+ void FillBlock(int row4x4, int column4x4, int block_width4x4,
+ int block_height4x4, int8_t segment_id);
+
+ private:
+ int32_t rows4x4_ = 0;
+ int32_t columns4x4_ = 0;
+
+ // segment_id_ is a rows4x4_ by columns4x4_ 2D array. The underlying data
+ // buffer is dynamically allocated and owned by segment_id_buffer_.
+ std::unique_ptr<int8_t[]> segment_id_buffer_;
+ Array2DView<int8_t> segment_id_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_
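
A usage sketch with hypothetical dimensions: a 16x16 grid of 4x4 blocks (a 64x64-pixel frame), one 16x16-pixel block of which is tagged with segment id 2.

#include "src/utils/segmentation_map.h"

namespace libgav1 {

bool SegmentationMapExample() {
  SegmentationMap map;
  if (!map.Allocate(/*rows4x4=*/16, /*columns4x4=*/16)) return false;
  map.Clear();
  // Tag the 4x4-unit block whose top-left 4x4 index is (4, 8).
  map.FillBlock(/*row4x4=*/4, /*column4x4=*/8, /*block_width4x4=*/4,
                /*block_height4x4=*/4, /*segment_id=*/2);
  return map.segment_id(5, 9) == 2;
}

}  // namespace libgav1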
diff --git a/src/utils/stack.h b/src/utils/stack.h
new file mode 100644
index 0000000..39133b9
--- /dev/null
+++ b/src/utils/stack.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_STACK_H_
+#define LIBGAV1_SRC_UTILS_STACK_H_
+
+#include <cassert>
+#include <utility>
+
+namespace libgav1 {
+
+// A LIFO stack of a fixed capacity. The elements are moved using std::move, so
+// the element type T has to be movable.
+//
+// WARNING: No error checking is performed.
+template <typename T, int capacity>
+class Stack {
+ public:
+ // Pushes the element |value| to the top of the stack. It is an error to call
+ // Push() when the stack is full.
+ void Push(T value) {
+ ++top_;
+ assert(top_ < capacity);
+ elements_[top_] = std::move(value);
+ }
+
+ // Returns the element at the top of the stack and removes it from the stack.
+ // It is an error to call Pop() when the stack is empty.
+ T Pop() {
+ assert(top_ >= 0);
+ return std::move(elements_[top_--]);
+ }
+
+ // Returns true if the stack is empty.
+ bool Empty() const { return top_ < 0; }
+
+ private:
+ static_assert(capacity > 0, "");
+ T elements_[capacity];
+ // The array index of the top of the stack. The stack is empty if top_ is -1.
+ int top_ = -1;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_STACK_H_
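
A small sketch with an arbitrary capacity of 8; since the class performs no error checking, the caller must stay within the capacity.

#include "src/utils/stack.h"

namespace libgav1 {

int StackExample() {
  Stack<int, 8> stack;
  stack.Push(3);
  stack.Push(7);
  int sum = 0;
  while (!stack.Empty()) {
    sum += stack.Pop();  // Pops 7, then 3.
  }
  return sum;  // 10
}

}  // namespace libgav1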
diff --git a/src/utils/threadpool.cc b/src/utils/threadpool.cc
new file mode 100644
index 0000000..8c8f4fe
--- /dev/null
+++ b/src/utils/threadpool.cc
@@ -0,0 +1,323 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/threadpool.h"
+
+#if defined(_MSC_VER)
+#include <process.h>
+#include <windows.h>
+#else // defined(_MSC_VER)
+#include <pthread.h>
+#endif // defined(_MSC_VER)
+#if defined(__ANDROID__) || defined(__GLIBC__)
+#include <sys/types.h>
+#include <unistd.h>
+#endif
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <new>
+#include <utility>
+
+#if defined(__ANDROID__)
+#include <chrono> // NOLINT (unapproved c++11 header)
+#endif
+
+// The glibc wrapper for the gettid() system call was added in glibc 2.30.
+// Emulate it for older versions of glibc.
+#if defined(__GLIBC_PREREQ)
+#if !__GLIBC_PREREQ(2, 30)
+
+#include <sys/syscall.h>
+
+static pid_t gettid() { return static_cast<pid_t>(syscall(SYS_gettid)); }
+
+#endif
+#endif // defined(__GLIBC_PREREQ)
+
+namespace libgav1 {
+
+#if defined(__ANDROID__)
+namespace {
+
+using Clock = std::chrono::steady_clock;
+using Duration = Clock::duration;
+constexpr Duration kBusyWaitDuration =
+ std::chrono::duration_cast<Duration>(std::chrono::duration<double>(2e-3));
+
+} // namespace
+#endif // defined(__ANDROID__)
+
+// static
+std::unique_ptr<ThreadPool> ThreadPool::Create(int num_threads) {
+ return Create(/*name_prefix=*/"", num_threads);
+}
+
+// static
+std::unique_ptr<ThreadPool> ThreadPool::Create(const char name_prefix[],
+ int num_threads) {
+ if (name_prefix == nullptr || num_threads <= 0) return nullptr;
+ std::unique_ptr<WorkerThread*[]> threads(new (std::nothrow)
+ WorkerThread*[num_threads]);
+ if (threads == nullptr) return nullptr;
+ std::unique_ptr<ThreadPool> pool(new (std::nothrow) ThreadPool(
+ name_prefix, std::move(threads), num_threads));
+ if (pool != nullptr && !pool->StartWorkers()) {
+ pool = nullptr;
+ }
+ return pool;
+}
+
+ThreadPool::ThreadPool(const char name_prefix[],
+ std::unique_ptr<WorkerThread*[]> threads,
+ int num_threads)
+ : threads_(std::move(threads)), num_threads_(num_threads) {
+ threads_[0] = nullptr;
+ assert(name_prefix != nullptr);
+ const size_t name_prefix_len =
+ std::min(strlen(name_prefix), sizeof(name_prefix_) - 1);
+ memcpy(name_prefix_, name_prefix, name_prefix_len);
+ name_prefix_[name_prefix_len] = '\0';
+}
+
+ThreadPool::~ThreadPool() { Shutdown(); }
+
+void ThreadPool::Schedule(std::function<void()> closure) {
+ LockMutex();
+ if (!queue_.GrowIfNeeded()) {
+ // queue_ is full and we can't grow it. Run |closure| directly.
+ UnlockMutex();
+ closure();
+ return;
+ }
+ queue_.Push(std::move(closure));
+ UnlockMutex();
+ SignalOne();
+}
+
+int ThreadPool::num_threads() const { return num_threads_; }
+
+// A simple implementation that mirrors the non-portable Thread. We may
+// choose to expand this in the future as a portable implementation of
+// Thread, or replace it at such a time as one is implemented.
+class ThreadPool::WorkerThread : public Allocable {
+ public:
+ // Creates and starts a thread that runs pool->WorkerFunction().
+ explicit WorkerThread(ThreadPool* pool);
+
+ // Not copyable or movable.
+ WorkerThread(const WorkerThread&) = delete;
+ WorkerThread& operator=(const WorkerThread&) = delete;
+
+ // REQUIRES: Join() must have been called if Start() was called and
+ // succeeded.
+ ~WorkerThread() = default;
+
+ LIBGAV1_MUST_USE_RESULT bool Start();
+
+ // Joins with the running thread.
+ void Join();
+
+ private:
+#if defined(_MSC_VER)
+ static unsigned int __stdcall ThreadBody(void* arg);
+#else
+ static void* ThreadBody(void* arg);
+#endif
+
+ void SetupName();
+ void Run();
+
+ ThreadPool* pool_;
+#if defined(_MSC_VER)
+ HANDLE handle_;
+#else
+ pthread_t thread_;
+#endif
+};
+
+ThreadPool::WorkerThread::WorkerThread(ThreadPool* pool) : pool_(pool) {}
+
+#if defined(_MSC_VER)
+
+bool ThreadPool::WorkerThread::Start() {
+ // Since our code calls the C run-time library (CRT), use _beginthreadex
+ // rather than CreateThread. Microsoft documentation says "If a thread
+ // created using CreateThread calls the CRT, the CRT may terminate the
+ // process in low-memory conditions."
+ uintptr_t handle = _beginthreadex(
+ /*security=*/nullptr, /*stack_size=*/0, ThreadBody, this,
+ /*initflag=*/CREATE_SUSPENDED, /*thrdaddr=*/nullptr);
+ if (handle == 0) return false;
+ handle_ = reinterpret_cast<HANDLE>(handle);
+ ResumeThread(handle_);
+ return true;
+}
+
+void ThreadPool::WorkerThread::Join() {
+ WaitForSingleObject(handle_, INFINITE);
+ CloseHandle(handle_);
+}
+
+unsigned int ThreadPool::WorkerThread::ThreadBody(void* arg) {
+ auto* thread = static_cast<WorkerThread*>(arg);
+ thread->Run();
+ return 0;
+}
+
+void ThreadPool::WorkerThread::SetupName() {
+ // Not currently supported on Windows.
+}
+
+#else // defined(_MSC_VER)
+
+bool ThreadPool::WorkerThread::Start() {
+ return pthread_create(&thread_, nullptr, ThreadBody, this) == 0;
+}
+
+void ThreadPool::WorkerThread::Join() { pthread_join(thread_, nullptr); }
+
+void* ThreadPool::WorkerThread::ThreadBody(void* arg) {
+ auto* thread = static_cast<WorkerThread*>(arg);
+ thread->Run();
+ return nullptr;
+}
+
+void ThreadPool::WorkerThread::SetupName() {
+ if (pool_->name_prefix_[0] != '\0') {
+#if defined(__APPLE__)
+ // Apple's version of pthread_setname_np takes one argument and operates on
+ // the current thread only. Also, pthread_mach_thread_np is Apple-specific.
+ // The maximum size of the |name| buffer was noted in the Chromium source
+ // code and was confirmed by experiments.
+ char name[64];
+ mach_port_t id = pthread_mach_thread_np(pthread_self());
+ int rv = snprintf(name, sizeof(name), "%s/%" PRId64, pool_->name_prefix_,
+ static_cast<int64_t>(id));
+ assert(rv >= 0);
+ rv = pthread_setname_np(name);
+ assert(rv == 0);
+ static_cast<void>(rv);
+#elif defined(__ANDROID__) || defined(__GLIBC__)
+ // If the |name| buffer is longer than 16 bytes, pthread_setname_np fails
+ // with error 34 (ERANGE) on Android.
+ char name[16];
+ pid_t id = gettid();
+ int rv = snprintf(name, sizeof(name), "%s/%" PRId64, pool_->name_prefix_,
+ static_cast<int64_t>(id));
+ assert(rv >= 0);
+ rv = pthread_setname_np(pthread_self(), name);
+ assert(rv == 0);
+ static_cast<void>(rv);
+#endif
+ }
+}
+
+#endif // defined(_MSC_VER)
+
+void ThreadPool::WorkerThread::Run() {
+ SetupName();
+ pool_->WorkerFunction();
+}
+
+bool ThreadPool::StartWorkers() {
+ if (!queue_.Init()) return false;
+ for (int i = 0; i < num_threads_; ++i) {
+ threads_[i] = new (std::nothrow) WorkerThread(this);
+ if (threads_[i] == nullptr) return false;
+ if (!threads_[i]->Start()) {
+ delete threads_[i];
+ threads_[i] = nullptr;
+ return false;
+ }
+ }
+ return true;
+}
+
+void ThreadPool::WorkerFunction() {
+ LockMutex();
+ while (true) {
+ if (queue_.Empty()) {
+ if (exit_threads_) {
+ break; // Queue is empty and exit was requested.
+ }
+#if defined(__ANDROID__)
+      // On Android, if we go to a condition variable wait right away, the
+      // CPU governor kicks in and starts shutting the cores down. So we do a
+      // very short busy wait to see if we get our next job within that
+      // period. This significantly improves the performance of the common
+      // cases of tile parallel decoding. If we don't receive a job within the
+      // busy wait window, we then fall back to a condition wait as usual.
+ UnlockMutex();
+ bool found_job = false;
+ const auto wait_start = Clock::now();
+ while (Clock::now() - wait_start < kBusyWaitDuration) {
+ LockMutex();
+ if (!queue_.Empty()) {
+ found_job = true;
+ break;
+ }
+ UnlockMutex();
+ }
+ // If |found_job| is true, we simply continue since we already hold the
+ // mutex and we know for sure that the |queue_| is not empty.
+ if (found_job) continue;
+      // Since |found_job| was false, the mutex is not being held at this
+      // point.
+ LockMutex();
+ // Ensure that the queue is still empty.
+ if (!queue_.Empty()) continue;
+ if (exit_threads_) {
+ break; // Queue is empty and exit was requested.
+ }
+#endif // defined(__ANDROID__)
+ // Queue is still empty, wait for signal or broadcast.
+ Wait();
+ } else {
+ // Take a job from the queue.
+ std::function<void()> job = std::move(queue_.Front());
+ queue_.Pop();
+
+ UnlockMutex();
+      // Note that it is good practice to surround this with a try/catch so
+      // the thread pool is not left in a bad state if the job throws an
+      // exception. This is omitted here because Google3 does not allow
+      // exceptions.
+ std::move(job)();
+ job = nullptr;
+
+ LockMutex();
+ }
+ }
+ UnlockMutex();
+}
+
+void ThreadPool::Shutdown() {
+  // Tell the worker threads to exit.
+ LockMutex();
+ exit_threads_ = true;
+ UnlockMutex();
+ SignalAll();
+
+ // Join all workers. This will block.
+ for (int i = 0; i < num_threads_; ++i) {
+ if (threads_[i] == nullptr) break;
+ threads_[i]->Join();
+ delete threads_[i];
+ }
+}
+
+} // namespace libgav1
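The Android busy wait in WorkerFunction() above is easier to see in isolation. Below is a minimal, self-contained sketch of the same idea (poll the queue briefly before falling back to a condition-variable wait); the names and the 2 ms window are illustrative assumptions, standing in for the pool's own members and its kBusyWaitDuration constant.

#include <chrono>
#include <condition_variable>
#include <deque>
#include <functional>
#include <mutex>

namespace busy_wait_sketch {

std::mutex mutex;
std::condition_variable condition;
std::deque<std::function<void()>> queue;  // Guarded by |mutex|.

// Blocks until a job is available and returns it. |lock| must hold |mutex|.
std::function<void()> NextJob(std::unique_lock<std::mutex>* lock) {
  // Assumed value; the real pool uses kBusyWaitDuration defined elsewhere.
  constexpr auto kBusyWaitWindow = std::chrono::milliseconds(2);
  const auto start = std::chrono::steady_clock::now();
  // Busy wait: re-check the queue for a short period so that the CPU governor
  // does not throttle the core between short-lived jobs.
  while (queue.empty() &&
         std::chrono::steady_clock::now() - start < kBusyWaitWindow) {
    lock->unlock();
    lock->lock();
  }
  // Fall back to a real condition-variable wait once the window expires.
  condition.wait(*lock, [] { return !queue.empty(); });
  std::function<void()> job = std::move(queue.front());
  queue.pop_front();
  return job;
}

}  // namespace busy_wait_sketch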
diff --git a/src/utils/threadpool.h b/src/utils/threadpool.h
new file mode 100644
index 0000000..fac875e
--- /dev/null
+++ b/src/utils/threadpool.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_THREADPOOL_H_
+#define LIBGAV1_SRC_UTILS_THREADPOOL_H_
+
+#include <functional>
+#include <memory>
+
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
+#endif
+
+#if !defined(LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+#if defined(__ANDROID__) || (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE)
+#define LIBGAV1_THREADPOOL_USE_STD_MUTEX 1
+#else
+#define LIBGAV1_THREADPOOL_USE_STD_MUTEX 0
+#endif
+#endif
+
+#if LIBGAV1_THREADPOOL_USE_STD_MUTEX
+#include <condition_variable> // NOLINT (unapproved c++11 header)
+#include <mutex> // NOLINT (unapproved c++11 header)
+#else
+// absl::Mutex & absl::CondVar are significantly faster than the pthread
+// variants on platforms other than Android. iOS may deadlock on Shutdown()
+// using absl, see b/142251739.
+#include "absl/base/thread_annotations.h"
+#include "absl/synchronization/mutex.h"
+#endif
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/executor.h"
+#include "src/utils/memory.h"
+#include "src/utils/unbounded_queue.h"
+
+namespace libgav1 {
+
+// An implementation of ThreadPool using POSIX threads (pthreads) or Windows
+// threads.
+//
+// - The pool allocates a fixed number of worker threads on instantiation.
+// - The worker threads will pick up work jobs as they arrive.
+// - If all workers are busy, work jobs are queued for later execution.
+//
+// The thread pool is shut down when the pool is destroyed.
+//
+// Example usage of the thread pool:
+// {
+// std::unique_ptr<ThreadPool> pool = ThreadPool::Create(4);
+// for (int i = 0; i < 100; ++i) { // Dispatch 100 jobs.
+// pool->Schedule([&my_data]() { MyFunction(&my_data); });
+// }
+// } // ThreadPool gets destroyed only when all jobs are done.
+class ThreadPool : public Executor, public Allocable {
+ public:
+ // Creates the thread pool with the specified number of worker threads.
+ // If num_threads is 1, the closures are run in FIFO order.
+ static std::unique_ptr<ThreadPool> Create(int num_threads);
+
+ // Like the above factory method, but also sets the name prefix for threads.
+ static std::unique_ptr<ThreadPool> Create(const char name_prefix[],
+ int num_threads);
+
+  // The destructor shuts down the thread pool; all queued jobs are executed
+  // before it returns. Note that after shutdown, the thread pool does not
+  // accept further jobs.
+ ~ThreadPool() override;
+
+ // Adds the specified "closure" to the queue for processing. If worker threads
+ // are available, "closure" will run immediately. Otherwise "closure" is
+ // queued for later execution.
+ //
+ // NOTE: If the internal queue is full and cannot be resized because of an
+ // out-of-memory error, the current thread runs "closure" before returning
+ // from Schedule(). For our use cases, this seems better than the
+ // alternatives:
+ // 1. Return a failure status.
+ // 2. Have the current thread wait until the queue is not full.
+ void Schedule(std::function<void()> closure) override;
+
+ int num_threads() const;
+
+ private:
+ class WorkerThread;
+
+ // Creates the thread pool with the specified number of worker threads.
+ // If num_threads is 1, the closures are run in FIFO order.
+ ThreadPool(const char name_prefix[], std::unique_ptr<WorkerThread*[]> threads,
+ int num_threads);
+
+ // Starts the worker pool.
+ LIBGAV1_MUST_USE_RESULT bool StartWorkers();
+
+ void WorkerFunction();
+
+ // Shuts down the thread pool, i.e. worker threads finish their work and
+ // pick up new jobs until the queue is empty. This call will block until
+ // the shutdown is complete.
+ //
+ // Note: If a worker encounters an empty queue after this call, it will exit.
+ // Other workers might still be running, and if the queue fills up again, the
+ // thread pool will continue to operate with a decreased number of workers.
+ // It is up to the caller to prevent adding new jobs.
+ void Shutdown();
+
+#if LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+ void LockMutex() { queue_mutex_.lock(); }
+ void UnlockMutex() { queue_mutex_.unlock(); }
+
+ void Wait() {
+ std::unique_lock<std::mutex> queue_lock(queue_mutex_, std::adopt_lock);
+ condition_.wait(queue_lock);
+ queue_lock.release();
+ }
+
+ void SignalOne() { condition_.notify_one(); }
+ void SignalAll() { condition_.notify_all(); }
+
+ std::condition_variable condition_;
+ std::mutex queue_mutex_;
+
+#else // !LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+ void LockMutex() ABSL_EXCLUSIVE_LOCK_FUNCTION() { queue_mutex_.Lock(); }
+ void UnlockMutex() ABSL_UNLOCK_FUNCTION() { queue_mutex_.Unlock(); }
+ void Wait() { condition_.Wait(&queue_mutex_); }
+ void SignalOne() { condition_.Signal(); }
+ void SignalAll() { condition_.SignalAll(); }
+
+ absl::CondVar condition_;
+ absl::Mutex queue_mutex_;
+
+#endif // LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+ UnboundedQueue<std::function<void()>> queue_ LIBGAV1_GUARDED_BY(queue_mutex_);
+ // If not all the worker threads are created, the first entry after the
+ // created worker threads is a null pointer.
+ const std::unique_ptr<WorkerThread*[]> threads_;
+
+ bool exit_threads_ LIBGAV1_GUARDED_BY(queue_mutex_) = false;
+ const int num_threads_ = 0;
+ // name_prefix_ is a C string, whose length is restricted to 16 characters,
+ // including the terminating null byte ('\0'). This restriction comes from
+ // the Linux pthread_setname_np() function.
+ char name_prefix_[16];
+};
+
+} // namespace libgav1
+
+#undef LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+#endif // LIBGAV1_SRC_UTILS_THREADPOOL_H_
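A minimal usage sketch against the interface declared above (Create() with a name prefix, Schedule(), and destruction waiting for outstanding jobs). The nullptr check on Create() is an assumption that creation can fail, in line with the fallible StartWorkers(); the function and variable names are illustrative.

#include <atomic>
#include <memory>

#include "src/utils/threadpool.h"

// Sketch: sums |size| ints on a 2-thread pool. Returns false if the pool
// cannot be created (assumed to be reported as a null pointer from Create()).
bool SumOnPool(const int* data, int size, int* result) {
  std::unique_ptr<libgav1::ThreadPool> pool =
      libgav1::ThreadPool::Create("sum", /*num_threads=*/2);
  if (pool == nullptr) return false;
  std::atomic<int> sum(0);
  for (int i = 0; i < size; ++i) {
    const int value = data[i];
    pool->Schedule([&sum, value]() { sum.fetch_add(value); });
  }
  pool.reset();  // The destructor blocks until all scheduled jobs are done.
  *result = sum.load();
  return true;
}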
diff --git a/src/utils/types.h b/src/utils/types.h
new file mode 100644
index 0000000..374f06b
--- /dev/null
+++ b/src/utils/types.h
@@ -0,0 +1,525 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_TYPES_H_
+#define LIBGAV1_SRC_UTILS_TYPES_H_
+
+#include <array>
+#include <cstdint>
+#include <memory>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+struct MotionVector : public Allocable {
+ static constexpr int kRow = 0;
+ static constexpr int kColumn = 1;
+
+ MotionVector() = default;
+ MotionVector(const MotionVector& mv) = default;
+
+ MotionVector& operator=(const MotionVector& rhs) {
+ mv32 = rhs.mv32;
+ return *this;
+ }
+
+ bool operator==(const MotionVector& rhs) const { return mv32 == rhs.mv32; }
+
+ union {
+    // Motion vectors will always fit in int16_t and using int16_t here
+    // instead of int saves significant memory since some of the frame-sized
+    // structures store motion vectors.
+ int16_t mv[2];
+    // A uint32_t view into the |mv| array. Useful for cases where both
+    // components of the motion vector have to be copied or compared with a
+    // single 32-bit instruction.
+ uint32_t mv32;
+ };
+};
+
+union CompoundMotionVector {
+ CompoundMotionVector() = default;
+ CompoundMotionVector(const CompoundMotionVector& mv) = default;
+
+ CompoundMotionVector& operator=(const CompoundMotionVector& rhs) {
+ mv64 = rhs.mv64;
+ return *this;
+ }
+
+ bool operator==(const CompoundMotionVector& rhs) const {
+ return mv64 == rhs.mv64;
+ }
+
+ MotionVector mv[2];
+  // A uint64_t view into the |mv| array. Useful for cases where both motion
+  // vectors have to be copied or compared with a single 64-bit instruction.
+ uint64_t mv64;
+};
+
+// Stores the motion information used for motion field estimation.
+struct TemporalMotionField : public Allocable {
+ Array2D<MotionVector> mv;
+ Array2D<int8_t> reference_offset;
+};
+
+// MvContexts contains the contexts used to decode the portions of an inter
+// block's mode info that determine the y_mode field in BlockParameters.
+//
+// The contexts in the struct correspond to the ZeroMvContext, RefMvContext,
+// and NewMvContext variables in the spec.
+struct MvContexts {
+ int zero_mv;
+ int reference_mv;
+ int new_mv;
+};
+
+struct PaletteModeInfo {
+ uint8_t size[kNumPlaneTypes];
+ uint16_t color[kMaxPlanes][kMaxPaletteSize];
+};
+
+// Stores the parameters used by the prediction process. The members of the
+// struct are filled in when parsing the bitstream and used when the prediction
+// is computed. The information in this struct is associated with a single
+// block.
+// While both BlockParameters and PredictionParameters store information
+// pertaining to a Block, the only difference is that BlockParameters outlives
+// the block itself (for example, some of the variables in BlockParameters are
+// used to compute the context for reading elements in the subsequent blocks).
+struct PredictionParameters : public Allocable {
+  // Restores the index in the unsorted mv stack from the least significant 3
+  // bits of the sorted |weight_index_stack|.
+ const MotionVector& reference_mv(int stack_index) const {
+ return ref_mv_stack[7 - (weight_index_stack[stack_index] & 7)];
+ }
+ const MotionVector& reference_mv(int stack_index, int mv_index) const {
+ return compound_ref_mv_stack[7 - (weight_index_stack[stack_index] & 7)]
+ .mv[mv_index];
+ }
+
+ void IncreaseWeight(ptrdiff_t index, int weight) {
+ weight_index_stack[index] += weight << 3;
+ }
+
+ void SetWeightIndexStackEntry(int index, int weight) {
+ weight_index_stack[index] = (weight << 3) + 7 - index;
+ }
+
+ bool use_filter_intra;
+ FilterIntraPredictor filter_intra_mode;
+ int angle_delta[kNumPlaneTypes];
+ int8_t cfl_alpha_u;
+ int8_t cfl_alpha_v;
+ int max_luma_width;
+ int max_luma_height;
+ Array2D<uint8_t> color_index_map[kNumPlaneTypes];
+ bool use_intra_block_copy;
+ InterIntraMode inter_intra_mode;
+ bool is_wedge_inter_intra;
+ int wedge_index;
+ int wedge_sign;
+ bool mask_is_inverse;
+ MotionMode motion_mode;
+ CompoundPredictionType compound_prediction_type;
+ union {
+ // |ref_mv_stack| and |compound_ref_mv_stack| are not sorted after
+ // construction. reference_mv() must be called to get the correct element.
+ MotionVector ref_mv_stack[kMaxRefMvStackSize];
+ CompoundMotionVector compound_ref_mv_stack[kMaxRefMvStackSize];
+ };
+  // The least significant 3 bits of |weight_index_stack| store the index
+  // information, and the other bits store the weight. The index information
+  // is actually 7 - index to make the descending order sort stable (preserves
+  // the original order for elements with the same weight). Sorting an int16_t
+  // array is much faster than sorting a struct array with weight and index
+  // stored separately.
+ int16_t weight_index_stack[kMaxRefMvStackSize];
+ // In the spec, the weights of all the nearest mvs are incremented by a bonus
+ // weight which is larger than any natural weight, and later the weights of
+ // the mvs are compared with this bonus weight to determine their contexts. We
+ // replace this procedure by introducing |nearest_mv_count|, which records the
+ // count of the nearest mvs. Since all the nearest mvs are in the beginning of
+ // the mv stack, the index of a mv in the mv stack can be compared with
+ // |nearest_mv_count| to get that mv's context.
+ int nearest_mv_count;
+ int ref_mv_count;
+ int ref_mv_index;
+ MotionVector global_mv[2];
+ int num_warp_samples;
+ int warp_estimate_candidates[kMaxLeastSquaresSamples][4];
+};
+
+// A lot of BlockParameters objects are created, so the smallest type is used
+// for each field. The ranges of some fields are documented to justify why
+// their types are large enough.
+struct BlockParameters : public Allocable {
+ BlockSize size;
+ bool skip;
+ // True means that this block will use some default settings (that
+ // correspond to compound prediction) and so most of the mode info is
+ // skipped. False means that the mode info is not skipped.
+ bool skip_mode;
+ bool is_inter;
+ bool is_explicit_compound_type; // comp_group_idx in the spec.
+ bool is_compound_type_average; // compound_idx in the spec.
+ bool is_global_mv_block;
+ bool use_predicted_segment_id; // only valid with temporal update enabled.
+ int8_t segment_id; // segment_id is in the range [0, 7].
+ PredictionMode y_mode;
+ PredictionMode uv_mode;
+ TransformSize transform_size;
+ TransformSize uv_transform_size;
+ InterpolationFilter interpolation_filter[2];
+ ReferenceFrameType reference_frame[2];
+ // The index of this array is as follows:
+ // 0 - Y plane vertical filtering.
+ // 1 - Y plane horizontal filtering.
+ // 2 - U plane (both directions).
+ // 3 - V plane (both directions).
+ uint8_t deblock_filter_level[kFrameLfCount];
+ CompoundMotionVector mv;
+ PaletteModeInfo palette_mode_info;
+ // When |Tile::split_parse_and_decode_| is true, each block gets its own
+ // instance of |prediction_parameters|. When it is false, all the blocks point
+ // to |Tile::prediction_parameters_|. This field is valid only as long as the
+ // block is *being* decoded. The lifetime and usage of this field can be
+ // better understood by following its flow in tile.cc.
+ std::unique_ptr<PredictionParameters> prediction_parameters;
+};
+
+// A five dimensional array used to store the wedge masks. The dimensions are:
+// - block_size_index (returned by GetWedgeBlockSizeIndex() in prediction.cc).
+// - flip_sign (0 or 1).
+// - wedge_index (0 to 15).
+// - the remaining two dimensions form a 2d array of block_width by
+//   block_height for each combination of the first three indices.
+using WedgeMaskArray =
+ std::array<std::array<std::array<Array2D<uint8_t>, 16>, 2>, 9>;
+
+enum GlobalMotionTransformationType : uint8_t {
+ kGlobalMotionTransformationTypeIdentity,
+ kGlobalMotionTransformationTypeTranslation,
+ kGlobalMotionTransformationTypeRotZoom,
+ kGlobalMotionTransformationTypeAffine,
+ kNumGlobalMotionTransformationTypes
+};
+
+// Global motion and warped motion parameters. See the paper for more info:
+// S. Parker, Y. Chen, D. Barker, P. de Rivaz, D. Mukherjee, "Global and locally
+// adaptive warped motion compensation in video compression", Proc. IEEE
+// International Conference on Image Processing (ICIP), pp. 275-279, Sep. 2017.
+struct GlobalMotion {
+ GlobalMotionTransformationType type;
+ int32_t params[6];
+
+ // Represent two shearing operations. Computed from |params| by SetupShear().
+ //
+ // The least significant six (= kWarpParamRoundingBits) bits are all zeros.
+ // (This means alpha, beta, gamma, and delta could be represented by a 10-bit
+ // signed integer.) The minimum value is INT16_MIN (= -32768) and the maximum
+ // value is 32704 = 0x7fc0, the largest int16_t value whose least significant
+ // six bits are all zeros.
+ //
+ // Valid warp parameters (as validated by SetupShear()) have smaller ranges.
+ // Their absolute values are less than 2^14 (= 16384). (This follows from
+ // the warpValid check at the end of Section 7.11.3.6.)
+ //
+ // NOTE: Section 7.11.3.6 of the spec allows a maximum value of 32768, which
+ // is outside the range of int16_t. When cast to int16_t, 32768 becomes
+ // -32768. This potential int16_t overflow does not matter because either
+  // 32768 or -32768 causes SetupShear() to return false.
+ int16_t alpha;
+ int16_t beta;
+ int16_t gamma;
+ int16_t delta;
+};
+
+// Loop filter parameters:
+//
+// If level[0] and level[1] are both equal to 0, the loop filter process is
+// not invoked.
+//
+// |sharpness| and |delta_enabled| are only used by the loop filter process.
+//
+// The |ref_deltas| and |mode_deltas| arrays are used not only by the loop
+// filter process but also by the reference frame update and loading
+// processes. The loop filter process uses |ref_deltas| and |mode_deltas| only
+// when |delta_enabled| is true.
+struct LoopFilter {
+ // Contains loop filter strength values in the range of [0, 63].
+ std::array<int8_t, kFrameLfCount> level;
+ // Indicates the sharpness level in the range of [0, 7].
+ int8_t sharpness;
+ // Whether the filter level depends on the mode and reference frame used to
+ // predict a block.
+ bool delta_enabled;
+ // Whether additional syntax elements were read that specify which mode and
+ // reference frame deltas are to be updated. loop_filter_delta_update field in
+ // Section 5.9.11 of the spec.
+ bool delta_update;
+ // Contains the adjustment needed for the filter level based on the chosen
+ // reference frame, in the range of [-64, 63].
+ std::array<int8_t, kNumReferenceFrameTypes> ref_deltas;
+ // Contains the adjustment needed for the filter level based on the chosen
+ // mode, in the range of [-64, 63].
+ std::array<int8_t, kLoopFilterMaxModeDeltas> mode_deltas;
+};
+
+struct Delta {
+ bool present;
+ uint8_t scale;
+ bool multi;
+};
+
+struct Cdef {
+ uint8_t damping; // damping value from the spec + (bitdepth - 8).
+ uint8_t bits;
+ // All the strength values are the values from the spec and left shifted by
+ // (bitdepth - 8).
+ uint8_t y_primary_strength[kMaxCdefStrengths];
+ uint8_t y_secondary_strength[kMaxCdefStrengths];
+ uint8_t uv_primary_strength[kMaxCdefStrengths];
+ uint8_t uv_secondary_strength[kMaxCdefStrengths];
+};
+
+struct TileInfo {
+ bool uniform_spacing;
+ int sb_rows;
+ int sb_columns;
+ int tile_count;
+ int tile_columns_log2;
+ int tile_columns;
+ int tile_column_start[kMaxTileColumns + 1];
+ // This field is not used by libgav1, but is populated for use by some
+ // hardware decoders. So it must not be removed.
+ int tile_column_width_in_superblocks[kMaxTileColumns + 1];
+ int tile_rows_log2;
+ int tile_rows;
+ int tile_row_start[kMaxTileRows + 1];
+ // This field is not used by libgav1, but is populated for use by some
+ // hardware decoders. So it must not be removed.
+ int tile_row_height_in_superblocks[kMaxTileRows + 1];
+ int16_t context_update_id;
+ uint8_t tile_size_bytes;
+};
+
+struct LoopRestoration {
+ LoopRestorationType type[kMaxPlanes];
+ int unit_size_log2[kMaxPlanes];
+};
+
+// Stores the quantization parameters of Section 5.9.12.
+struct QuantizerParameters {
+ // base_index is in the range [0, 255].
+ uint8_t base_index;
+ int8_t delta_dc[kMaxPlanes];
+ // delta_ac[kPlaneY] is always 0.
+ int8_t delta_ac[kMaxPlanes];
+ bool use_matrix;
+ // The |matrix_level| array is used only when |use_matrix| is true.
+ // matrix_level[plane] specifies the level in the quantizer matrix that
+ // should be used for decoding |plane|. The quantizer matrix has 15 levels,
+ // from 0 to 14. The range of matrix_level[plane] is [0, 15]. If
+ // matrix_level[plane] is 15, the quantizer matrix is not used.
+ int8_t matrix_level[kMaxPlanes];
+};
+
+// The corresponding segment feature constants in the AV1 spec are named
+// SEG_LVL_xxx.
+enum SegmentFeature : uint8_t {
+ kSegmentFeatureQuantizer,
+ kSegmentFeatureLoopFilterYVertical,
+ kSegmentFeatureLoopFilterYHorizontal,
+ kSegmentFeatureLoopFilterU,
+ kSegmentFeatureLoopFilterV,
+ kSegmentFeatureReferenceFrame,
+ kSegmentFeatureSkip,
+ kSegmentFeatureGlobalMv,
+ kSegmentFeatureMax
+};
+
+struct Segmentation {
+ // 5.11.14.
+ // Returns true if the feature is enabled in the segment.
+ bool FeatureActive(int segment_id, SegmentFeature feature) const {
+ return enabled && segment_id < kMaxSegments &&
+ feature_enabled[segment_id][feature];
+ }
+
+ // Returns true if the feature is signed.
+ static bool FeatureSigned(SegmentFeature feature) {
+ // Only the first five segment features are signed, so this comparison
+ // suffices.
+ return feature <= kSegmentFeatureLoopFilterV;
+ }
+
+ bool enabled;
+ bool update_map;
+ bool update_data;
+ bool temporal_update;
+ // True if the segment id will be read before the skip syntax element. False
+ // if the skip syntax element will be read first.
+ bool segment_id_pre_skip;
+ // The highest numbered segment id that has some enabled feature. Used as
+ // the upper bound for decoding segment ids.
+ int8_t last_active_segment_id;
+
+ bool feature_enabled[kMaxSegments][kSegmentFeatureMax];
+ int16_t feature_data[kMaxSegments][kSegmentFeatureMax];
+ bool lossless[kMaxSegments];
+ // Cached values of get_qindex(1, segmentId), to be consumed by
+ // Tile::ReadTransformType(). The values are in the range [0, 255].
+ uint8_t qindex[kMaxSegments];
+};
+
+// Section 6.8.20.
+// Note: In the spec, the film grain section uses YCbCr to denote variable
+// names, such as num_cb_points, num_cr_points. To keep it consistent with
+// other parts of the code, we use YUV, i.e., num_u_points, num_v_points, etc.
+struct FilmGrainParams {
+ bool apply_grain;
+ bool update_grain;
+ bool chroma_scaling_from_luma;
+ bool overlap_flag;
+ bool clip_to_restricted_range;
+
+ uint8_t num_y_points; // [0, 14].
+ uint8_t num_u_points; // [0, 10].
+ uint8_t num_v_points; // [0, 10].
+  // Must be in [0, 255] (10/12 bit: /= 4 or 16). Must be in increasing order.
+ uint8_t point_y_value[14];
+ uint8_t point_y_scaling[14];
+ uint8_t point_u_value[10];
+ uint8_t point_u_scaling[10];
+ uint8_t point_v_value[10];
+ uint8_t point_v_scaling[10];
+
+ uint8_t chroma_scaling; // [8, 11].
+ uint8_t auto_regression_coeff_lag; // [0, 3].
+ int8_t auto_regression_coeff_y[24]; // [-128, 127]
+ int8_t auto_regression_coeff_u[25]; // [-128, 127]
+ int8_t auto_regression_coeff_v[25]; // [-128, 127]
+ // Shift value: auto regression coeffs range
+ // 6: [-2, 2)
+ // 7: [-1, 1)
+ // 8: [-0.5, 0.5)
+ // 9: [-0.25, 0.25)
+ uint8_t auto_regression_shift;
+
+ uint16_t grain_seed;
+ int reference_index;
+ int grain_scale_shift;
+ // These multipliers are encoded as nonnegative values by adding 128 first.
+ // The 128 is subtracted during parsing.
+ int8_t u_multiplier; // [-128, 127]
+ int8_t u_luma_multiplier; // [-128, 127]
+ // These offsets are encoded as nonnegative values by adding 256 first. The
+ // 256 is subtracted during parsing.
+ int16_t u_offset; // [-256, 255]
+ int8_t v_multiplier; // [-128, 127]
+ int8_t v_luma_multiplier; // [-128, 127]
+ int16_t v_offset; // [-256, 255]
+};
+
+struct ObuFrameHeader {
+ uint16_t display_frame_id;
+ uint16_t current_frame_id;
+ int64_t frame_offset;
+ uint16_t expected_frame_id[kNumInterReferenceFrameTypes];
+ int32_t width;
+ int32_t height;
+ int32_t columns4x4;
+ int32_t rows4x4;
+ // The render size (render_width and render_height) is a hint to the
+ // application about the desired display size. It has no effect on the
+ // decoding process.
+ int32_t render_width;
+ int32_t render_height;
+ int32_t upscaled_width;
+ LoopRestoration loop_restoration;
+ uint32_t buffer_removal_time[kMaxOperatingPoints];
+ uint32_t frame_presentation_time;
+ // Note: global_motion[0] (for kReferenceFrameIntra) is not used.
+ std::array<GlobalMotion, kNumReferenceFrameTypes> global_motion;
+ TileInfo tile_info;
+ QuantizerParameters quantizer;
+ Segmentation segmentation;
+ bool show_existing_frame;
+ // frame_to_show is in the range [0, 7]. Only used if show_existing_frame is
+ // true.
+ int8_t frame_to_show;
+ FrameType frame_type;
+ bool show_frame;
+ bool showable_frame;
+ bool error_resilient_mode;
+ bool enable_cdf_update;
+ bool frame_size_override_flag;
+ // The order_hint syntax element in the uncompressed header. If
+ // show_existing_frame is false, the OrderHint variable in the spec is equal
+ // to this field, and so this field can be used in place of OrderHint when
+ // show_existing_frame is known to be false, such as during tile decoding.
+ uint8_t order_hint;
+ int8_t primary_reference_frame;
+ bool render_and_frame_size_different;
+ bool use_superres;
+ uint8_t superres_scale_denominator;
+ bool allow_screen_content_tools;
+ bool allow_intrabc;
+ bool frame_refs_short_signaling;
+ // A bitmask that specifies which reference frame slots will be updated with
+ // the current frame after it is decoded.
+ uint8_t refresh_frame_flags;
+ static_assert(sizeof(ObuFrameHeader::refresh_frame_flags) * 8 ==
+ kNumReferenceFrameTypes,
+ "");
+ bool found_reference;
+ int8_t force_integer_mv;
+ bool allow_high_precision_mv;
+ InterpolationFilter interpolation_filter;
+ bool is_motion_mode_switchable;
+ bool use_ref_frame_mvs;
+ bool enable_frame_end_update_cdf;
+ // True if all segments are losslessly encoded at the coded resolution.
+ bool coded_lossless;
+ // True if all segments are losslessly encoded at the upscaled resolution.
+ bool upscaled_lossless;
+ TxMode tx_mode;
+ // True means that the mode info for inter blocks contains the syntax
+ // element comp_mode that indicates whether to use single or compound
+ // prediction. False means that all inter blocks will use single prediction.
+ bool reference_mode_select;
+ // The frames to use for compound prediction when skip_mode is true.
+ ReferenceFrameType skip_mode_frame[2];
+ bool skip_mode_present;
+ bool reduced_tx_set;
+ bool allow_warped_motion;
+ Delta delta_q;
+ Delta delta_lf;
+ // A valid value of reference_frame_index[i] is in the range [0, 7]. -1
+ // indicates an invalid value.
+ int8_t reference_frame_index[kNumInterReferenceFrameTypes];
+ // The ref_order_hint[ i ] syntax element in the uncompressed header.
+ // Specifies the expected output order hint for each reference frame.
+ uint8_t reference_order_hint[kNumReferenceFrameTypes];
+ LoopFilter loop_filter;
+ Cdef cdef;
+ FilmGrainParams film_grain_params;
+};
+
+} // namespace libgav1
+#endif // LIBGAV1_SRC_UTILS_TYPES_H_
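The |weight_index_stack| packing documented in PredictionParameters above can be checked with a few lines of standalone code. This is a sketch with plain ints outside the real struct; the stack size of 8 is assumed only because the index must fit in the least significant 3 bits.

#include <cassert>
#include <cstdint>

// entry = (weight << 3) + 7 - index, as in SetWeightIndexStackEntry() above.
// The original index is recovered as 7 - (entry & 7), as in reference_mv(),
// and the weight as entry >> 3.
int main() {
  int16_t entries[8];
  for (int index = 0; index < 8; ++index) {
    const int weight = 2 * index;  // Any nonnegative weight.
    entries[index] = static_cast<int16_t>((weight << 3) + 7 - index);
  }
  for (int index = 0; index < 8; ++index) {
    const int recovered_index = 7 - (entries[index] & 7);
    const int recovered_weight = entries[index] >> 3;
    assert(recovered_index == index);
    assert(recovered_weight == 2 * index);
    static_cast<void>(recovered_index);
    static_cast<void>(recovered_weight);
  }
  return 0;
}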
diff --git a/src/utils/unbounded_queue.h b/src/utils/unbounded_queue.h
new file mode 100644
index 0000000..fa0d303
--- /dev/null
+++ b/src/utils/unbounded_queue.h
@@ -0,0 +1,245 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_UNBOUNDED_QUEUE_H_
+#define LIBGAV1_SRC_UTILS_UNBOUNDED_QUEUE_H_
+
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+// A FIFO queue of an unbounded capacity.
+//
+// This implementation uses the general approach used in std::deque
+// implementations. See, for example,
+// https://stackoverflow.com/questions/6292332/what-really-is-a-deque-in-stl
+//
+// It is much simpler because it just needs to support the queue interface.
+// The blocks are chained into a circular list, not managed by a "map". It
+// does not shrink the internal buffer.
+//
+// An alternative implementation approach is a resizable circular array. See,
+// for example, ResizingArrayQueue.java in https://algs4.cs.princeton.edu/code/
+// and base::circular_deque in Chromium's base/containers library.
+template <typename T>
+class UnboundedQueue {
+ public:
+ UnboundedQueue() = default;
+
+ // Move only.
+ UnboundedQueue(UnboundedQueue&& other)
+ : first_block_(other.first_block_),
+ front_(other.front_),
+ last_block_(other.last_block_),
+ back_(other.back_) {
+ other.first_block_ = nullptr;
+ other.front_ = 0;
+ other.last_block_ = nullptr;
+ other.back_ = 0;
+ }
+ UnboundedQueue& operator=(UnboundedQueue&& other) {
+ if (this != &other) {
+ Destroy();
+ first_block_ = other.first_block_;
+ front_ = other.front_;
+ last_block_ = other.last_block_;
+ back_ = other.back_;
+ other.first_block_ = nullptr;
+ other.front_ = 0;
+ other.last_block_ = nullptr;
+ other.back_ = 0;
+ }
+ return *this;
+ }
+
+ ~UnboundedQueue() { Destroy(); }
+
+ // Allocates two Blocks upfront because most access patterns require at
+ // least two Blocks. Returns false if the allocation of the Blocks failed.
+ LIBGAV1_MUST_USE_RESULT bool Init() {
+ std::unique_ptr<Block> new_block0(new (std::nothrow) Block);
+ std::unique_ptr<Block> new_block1(new (std::nothrow) Block);
+ if (new_block0 == nullptr || new_block1 == nullptr) return false;
+ first_block_ = last_block_ = new_block0.release();
+ new_block1->next = first_block_;
+ last_block_->next = new_block1.release();
+ return true;
+ }
+
+ // Checks if the queue has room for a new element. If the queue is full,
+ // tries to grow it. Returns false if the queue is full and the attempt to
+ // grow it failed.
+ //
+ // NOTE: GrowIfNeeded() must be called before each call to Push(). This
+ // inconvenient design is necessary to guarantee a successful Push() call.
+ //
+ // Push(T&& value) is often called with the argument std::move(value). The
+ // moved-from object |value| won't be usable afterwards, so it would be
+ // problematic if Push(T&& value) failed and we lost access to the original
+ // |value| object.
+ LIBGAV1_MUST_USE_RESULT bool GrowIfNeeded() {
+ assert(last_block_ != nullptr);
+ if (back_ == kBlockCapacity) {
+ if (last_block_->next == first_block_) {
+ // All Blocks are in use.
+ std::unique_ptr<Block> new_block(new (std::nothrow) Block);
+ if (new_block == nullptr) return false;
+ new_block->next = first_block_;
+ last_block_->next = new_block.release();
+ }
+ last_block_ = last_block_->next;
+ back_ = 0;
+ }
+ return true;
+ }
+
+ // Pushes the element |value| to the end of the queue. It is an error to call
+ // Push() when the queue is full.
+ void Push(const T& value) {
+ assert(last_block_ != nullptr);
+ assert(back_ < kBlockCapacity);
+ T* elements = reinterpret_cast<T*>(last_block_->buffer);
+ new (&elements[back_++]) T(value);
+ }
+
+ void Push(T&& value) {
+ assert(last_block_ != nullptr);
+ assert(back_ < kBlockCapacity);
+ T* elements = reinterpret_cast<T*>(last_block_->buffer);
+ new (&elements[back_++]) T(std::move(value));
+ }
+
+ // Returns the element at the front of the queue. It is an error to call
+ // Front() when the queue is empty.
+ T& Front() {
+ assert(!Empty());
+ T* elements = reinterpret_cast<T*>(first_block_->buffer);
+ return elements[front_];
+ }
+
+ const T& Front() const {
+ assert(!Empty());
+ T* elements = reinterpret_cast<T*>(first_block_->buffer);
+ return elements[front_];
+ }
+
+ // Removes the element at the front of the queue from the queue. It is an
+ // error to call Pop() when the queue is empty.
+ void Pop() {
+ assert(!Empty());
+ T* elements = reinterpret_cast<T*>(first_block_->buffer);
+ elements[front_++].~T();
+ if (front_ == kBlockCapacity) {
+ // The first block has become empty.
+ front_ = 0;
+ if (first_block_ == last_block_) {
+ // Only one Block is in use. Simply reset back_.
+ back_ = 0;
+ } else {
+ first_block_ = first_block_->next;
+ }
+ }
+ }
+
+ // Returns true if the queue is empty.
+ bool Empty() const { return first_block_ == last_block_ && front_ == back_; }
+
+ private:
+ // kBlockCapacity is the maximum number of elements each Block can hold.
+ // sizeof(void*) is subtracted from 2048 to account for the |next| pointer in
+ // the Block struct.
+ //
+ // In Linux x86_64, sizeof(std::function<void()>) is 32, so each Block can
+ // hold 63 std::function<void()> objects.
+ //
+ // NOTE: The corresponding value in <deque> in libc++ revision
+ // 245b5ba3448b9d3f6de5962066557e253a6bc9a4 is:
+ // template <class _ValueType, class _DiffType>
+ // struct __deque_block_size {
+ // static const _DiffType value =
+ // sizeof(_ValueType) < 256 ? 4096 / sizeof(_ValueType) : 16;
+ // };
+ //
+ // Note that 4096 / 256 = 16, so apparently this expression is intended to
+ // ensure the block size is at least 4096 bytes and each block can hold at
+ // least 16 elements.
+ static constexpr size_t kBlockCapacity =
+ (sizeof(T) < 128) ? (2048 - sizeof(void*)) / sizeof(T) : 16;
+
+ struct Block : public Allocable {
+ alignas(T) char buffer[kBlockCapacity * sizeof(T)];
+ Block* next;
+ };
+
+ void Destroy() {
+ if (first_block_ == nullptr) return; // An uninitialized queue.
+
+    // First free the unused blocks, which are located after last_block_ and
+    // before first_block_.
+ Block* block = last_block_->next;
+ // Cut the circular list open after last_block_.
+ last_block_->next = nullptr;
+ while (block != first_block_) {
+ Block* next = block->next;
+ delete block;
+ block = next;
+ }
+
+    // Then free the used blocks, destructing the elements they contain first.
+ while (block != nullptr) {
+ const size_t begin = (block == first_block_) ? front_ : 0;
+ const size_t end = (block == last_block_) ? back_ : kBlockCapacity;
+ T* elements = reinterpret_cast<T*>(block->buffer);
+ for (size_t i = begin; i < end; ++i) {
+ elements[i].~T();
+ }
+ Block* next = block->next;
+ delete block;
+ block = next;
+ }
+ }
+
+ // Blocks are chained in a circular singly-linked list. If the list of Blocks
+ // is empty, both first_block_ and last_block_ are null pointers. If the list
+ // is nonempty, first_block_ points to the first used Block and last_block_
+ // points to the last used Block.
+ //
+  // Invariant: If Init() is called and succeeds, the list of Blocks is always
+  // nonempty. This allows all methods (except the destructor) to avoid null
+  // pointer checks for first_block_ and last_block_.
+ Block* first_block_ = nullptr;
+ // The index of the element in first_block_ to be removed by Pop().
+ size_t front_ = 0;
+ Block* last_block_ = nullptr;
+ // The index in last_block_ where the new element is inserted by Push().
+ size_t back_ = 0;
+};
+
+#if !LIBGAV1_CXX17
+template <typename T>
+constexpr size_t UnboundedQueue<T>::kBlockCapacity;
+#endif
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_UNBOUNDED_QUEUE_H_
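A minimal usage sketch of the contract documented above: Init() once, GrowIfNeeded() before every Push(), and Front()/Pop() to drain. (With sizeof(std::function<void()>) == 32 on Linux x86_64, kBlockCapacity works out to (2048 - 8) / 32 = 63, matching the comment above.) The function name is illustrative.

#include <functional>
#include <utility>

#include "src/utils/unbounded_queue.h"

// Sketch: queues 100 closures and runs them in FIFO order. Returns false on
// allocation failure, mirroring the queue's own error reporting.
bool RunQueuedClosures() {
  libgav1::UnboundedQueue<std::function<void()>> queue;
  if (!queue.Init()) return false;
  for (int i = 0; i < 100; ++i) {
    if (!queue.GrowIfNeeded()) return false;  // Must precede every Push().
    queue.Push([i]() { static_cast<void>(i); /* job i */ });
  }
  while (!queue.Empty()) {
    std::function<void()> job = std::move(queue.Front());
    queue.Pop();
    job();
  }
  return true;
}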
diff --git a/src/utils/vector.h b/src/utils/vector.h
new file mode 100644
index 0000000..e211240
--- /dev/null
+++ b/src/utils/vector.h
@@ -0,0 +1,352 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// libgav1::Vector implementation
+
+#ifndef LIBGAV1_SRC_UTILS_VECTOR_H_
+#define LIBGAV1_SRC_UTILS_VECTOR_H_
+
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <iterator>
+#include <type_traits>
+#include <utility>
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace internal {
+
+static constexpr size_t kMinVectorAllocation = 16;
+
+// Returns the smallest power of two greater than or equal to 'value'.
+inline size_t NextPow2(size_t value) {
+ if (value == 0) return 0;
+ --value;
+ for (size_t i = 1; i < sizeof(size_t) * 8; i *= 2) value |= value >> i;
+ return value + 1;
+}
+
+// Returns the smallest capacity greater than or equal to 'value'.
+inline size_t NextCapacity(size_t value) {
+ if (value == 0) return 0;
+ if (value <= kMinVectorAllocation) return kMinVectorAllocation;
+ return NextPow2(value);
+}
+
+//------------------------------------------------------------------------------
+// Data structure equivalent to std::vector, but it returns false and reverts
+// to its last valid state on memory allocation failure.
+// std::vector with a custom allocator does not fill this need without
+// exceptions.
+
+template <typename T>
+class VectorBase {
+ public:
+ using iterator = T*;
+ using const_iterator = const T*;
+
+ VectorBase() noexcept = default;
+ // Move only.
+ VectorBase(const VectorBase&) = delete;
+ VectorBase& operator=(const VectorBase&) = delete;
+ VectorBase(VectorBase&& other) noexcept
+ : items_(other.items_),
+ capacity_(other.capacity_),
+ num_items_(other.num_items_) {
+ other.items_ = nullptr;
+ other.capacity_ = 0;
+ other.num_items_ = 0;
+ }
+ VectorBase& operator=(VectorBase&& other) noexcept {
+ if (this != &other) {
+ clear();
+ free(items_);
+ items_ = other.items_;
+ capacity_ = other.capacity_;
+ num_items_ = other.num_items_;
+ other.items_ = nullptr;
+ other.capacity_ = 0;
+ other.num_items_ = 0;
+ }
+ return *this;
+ }
+ ~VectorBase() {
+ clear();
+ free(items_);
+ }
+
+ // Reallocates just enough memory if needed so that 'new_cap' items can fit.
+ LIBGAV1_MUST_USE_RESULT bool reserve(size_t new_cap) {
+ if (capacity_ < new_cap) {
+ T* const new_items = static_cast<T*>(malloc(new_cap * sizeof(T)));
+ if (new_items == nullptr) return false;
+ if (num_items_ > 0) {
+ if (std::is_trivial<T>::value) {
+ // Cast |new_items| and |items_| to void* to avoid the GCC
+ // -Wclass-memaccess warning and additionally the
+ // bugprone-undefined-memory-manipulation clang-tidy warning. The
+ // memcpy is safe because T is a trivial type.
+ memcpy(static_cast<void*>(new_items),
+ static_cast<const void*>(items_), num_items_ * sizeof(T));
+ } else {
+ for (size_t i = 0; i < num_items_; ++i) {
+ new (&new_items[i]) T(std::move(items_[i]));
+ items_[i].~T();
+ }
+ }
+ }
+ free(items_);
+ items_ = new_items;
+ capacity_ = new_cap;
+ }
+ return true;
+ }
+
+ // Reallocates less memory so that only the existing items can fit.
+ bool shrink_to_fit() {
+ if (capacity_ == num_items_) return true;
+ if (num_items_ == 0) {
+ free(items_);
+ items_ = nullptr;
+ capacity_ = 0;
+ return true;
+ }
+ const size_t previous_capacity = capacity_;
+ capacity_ = 0; // Force reserve() to allocate and copy.
+ if (reserve(num_items_)) return true;
+ capacity_ = previous_capacity;
+ return false;
+ }
+
+ // Constructs a new item by copy constructor. May reallocate if
+ // 'resize_if_needed'.
+ LIBGAV1_MUST_USE_RESULT bool push_back(const T& value,
+ bool resize_if_needed = true) {
+ if (num_items_ >= capacity_ &&
+ (!resize_if_needed ||
+ !reserve(internal::NextCapacity(num_items_ + 1)))) {
+ return false;
+ }
+ new (&items_[num_items_]) T(value);
+ ++num_items_;
+ return true;
+ }
+
+ // Constructs a new item by copy constructor. reserve() must have been called
+ // with a sufficient capacity.
+ //
+ // WARNING: No error checking is performed.
+ void push_back_unchecked(const T& value) {
+ assert(num_items_ < capacity_);
+ new (&items_[num_items_]) T(value);
+ ++num_items_;
+ }
+
+ // Constructs a new item by move constructor. May reallocate if
+ // 'resize_if_needed'.
+ LIBGAV1_MUST_USE_RESULT bool push_back(T&& value,
+ bool resize_if_needed = true) {
+ if (num_items_ >= capacity_ &&
+ (!resize_if_needed ||
+ !reserve(internal::NextCapacity(num_items_ + 1)))) {
+ return false;
+ }
+ new (&items_[num_items_]) T(std::move(value));
+ ++num_items_;
+ return true;
+ }
+
+ // Constructs a new item by move constructor. reserve() must have been called
+ // with a sufficient capacity.
+ //
+ // WARNING: No error checking is performed.
+ void push_back_unchecked(T&& value) {
+ assert(num_items_ < capacity_);
+ new (&items_[num_items_]) T(std::move(value));
+ ++num_items_;
+ }
+
+ // Constructs a new item in place by forwarding the arguments args... to the
+ // constructor. May reallocate.
+ template <typename... Args>
+ LIBGAV1_MUST_USE_RESULT bool emplace_back(Args&&... args) {
+ if (num_items_ >= capacity_ &&
+ !reserve(internal::NextCapacity(num_items_ + 1))) {
+ return false;
+ }
+ new (&items_[num_items_]) T(std::forward<Args>(args)...);
+ ++num_items_;
+ return true;
+ }
+
+ // Destructs the last item.
+ void pop_back() {
+ --num_items_;
+ items_[num_items_].~T();
+ }
+
+ // Destructs the item at 'pos'.
+ void erase(iterator pos) { erase(pos, pos + 1); }
+
+ // Destructs the items in [first,last).
+ void erase(iterator first, iterator last) {
+ for (iterator it = first; it != last; ++it) it->~T();
+ if (last != end()) {
+ if (std::is_trivial<T>::value) {
+ // Cast |first| and |last| to void* to avoid the GCC
+ // -Wclass-memaccess warning and additionally the
+ // bugprone-undefined-memory-manipulation clang-tidy warning. The
+ // memmove is safe because T is a trivial type.
+ memmove(static_cast<void*>(first), static_cast<const void*>(last),
+ (end() - last) * sizeof(T));
+ } else {
+ for (iterator it_src = last, it_dst = first; it_src != end();
+ ++it_src, ++it_dst) {
+ new (it_dst) T(std::move(*it_src));
+ it_src->~T();
+ }
+ }
+ }
+ num_items_ -= std::distance(first, last);
+ }
+
+ // Destructs all the items.
+ void clear() { erase(begin(), end()); }
+
+ // Destroys (including deallocating) all the items.
+ void reset() {
+ clear();
+ if (!shrink_to_fit()) assert(false);
+ }
+
+ // Accessors
+ bool empty() const { return (num_items_ == 0); }
+ size_t size() const { return num_items_; }
+ size_t capacity() const { return capacity_; }
+
+ T* data() { return items_; }
+ T& front() { return items_[0]; }
+ T& back() { return items_[num_items_ - 1]; }
+ T& operator[](size_t i) { return items_[i]; }
+ T& at(size_t i) { return items_[i]; }
+ const T* data() const { return items_; }
+ const T& front() const { return items_[0]; }
+ const T& back() const { return items_[num_items_ - 1]; }
+ const T& operator[](size_t i) const { return items_[i]; }
+ const T& at(size_t i) const { return items_[i]; }
+
+ iterator begin() { return &items_[0]; }
+ const_iterator begin() const { return &items_[0]; }
+ iterator end() { return &items_[num_items_]; }
+ const_iterator end() const { return &items_[num_items_]; }
+
+ void swap(VectorBase& b) {
+ // Although not necessary here, adding "using std::swap;" and then calling
+ // swap() without namespace qualification is recommended. See Effective
+ // C++, Item 25.
+ using std::swap;
+ swap(items_, b.items_);
+ swap(capacity_, b.capacity_);
+ swap(num_items_, b.num_items_);
+ }
+
+ protected:
+ T* items_ = nullptr;
+ size_t capacity_ = 0;
+ size_t num_items_ = 0;
+};
+
+} // namespace internal
+
+//------------------------------------------------------------------------------
+
+// Vector class that does *NOT* construct the content on resize().
+// Should be reserved for plain old data.
+template <typename T>
+class VectorNoCtor : public internal::VectorBase<T> {
+ public:
+ // Creates or destructs items so that 'new_num_items' exist.
+  // Allocated memory grows to the next power-of-two number of items as needed.
+ LIBGAV1_MUST_USE_RESULT bool resize(size_t new_num_items) {
+ using super = internal::VectorBase<T>;
+ if (super::num_items_ < new_num_items) {
+ if (super::capacity_ < new_num_items) {
+ if (!super::reserve(internal::NextCapacity(new_num_items))) {
+ return false;
+ }
+ }
+ super::num_items_ = new_num_items;
+ } else {
+ while (super::num_items_ > new_num_items) {
+ --super::num_items_;
+ super::items_[super::num_items_].~T();
+ }
+ }
+ return true;
+ }
+};
+
+// This generic vector class will call the constructors.
+template <typename T>
+class Vector : public internal::VectorBase<T> {
+ public:
+ // Constructs or destructs items so that 'new_num_items' exist.
+  // Allocated memory grows to the next power-of-two number of items as needed.
+ LIBGAV1_MUST_USE_RESULT bool resize(size_t new_num_items) {
+ using super = internal::VectorBase<T>;
+ if (super::num_items_ < new_num_items) {
+ if (super::capacity_ < new_num_items) {
+ if (!super::reserve(internal::NextCapacity(new_num_items))) {
+ return false;
+ }
+ }
+ while (super::num_items_ < new_num_items) {
+ new (&super::items_[super::num_items_]) T();
+ ++super::num_items_;
+ }
+ } else {
+ while (super::num_items_ > new_num_items) {
+ --super::num_items_;
+ super::items_[super::num_items_].~T();
+ }
+ }
+ return true;
+ }
+};
+
+//------------------------------------------------------------------------------
+
+// Define non-member swap() functions in the namespace in which VectorNoCtor
+// and Vector are implemented. See Effective C++, Item 25.
+
+template <typename T>
+void swap(VectorNoCtor<T>& a, VectorNoCtor<T>& b) {
+ a.swap(b);
+}
+
+template <typename T>
+void swap(Vector<T>& a, Vector<T>& b) {
+ a.swap(b);
+}
+
+//------------------------------------------------------------------------------
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_VECTOR_H_
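A minimal usage sketch of the failure-returning interface above: reserve() followed by push_back_unchecked() on the pre-reserved portion, and push_back() where growth may be needed. The function name is illustrative.

#include <utility>

#include "src/utils/vector.h"

// Sketch: fills |out| with the first |count| squares plus one extra element.
// Every allocation is checked instead of throwing.
bool FillSquares(int count, libgav1::Vector<int>* out) {
  libgav1::Vector<int> values;
  if (!values.reserve(count)) return false;
  for (int i = 0; i < count; ++i) {
    values.push_back_unchecked(i * i);  // Capacity was reserved above.
  }
  // Growth path: capacity is extended (to the next power of two) as needed.
  if (!values.push_back(count * count)) return false;
  *out = std::move(values);
  return true;
}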
diff --git a/src/version.cc b/src/version.cc
new file mode 100644
index 0000000..8d1e5a9
--- /dev/null
+++ b/src/version.cc
@@ -0,0 +1,39 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/version.h"
+
+#define LIBGAV1_TOSTRING(x) #x
+#define LIBGAV1_STRINGIFY(x) LIBGAV1_TOSTRING(x)
+#define LIBGAV1_DOT_SEPARATED(M, m, p) M##.##m##.##p
+#define LIBGAV1_DOT_SEPARATED_VERSION(M, m, p) LIBGAV1_DOT_SEPARATED(M, m, p)
+#define LIBGAV1_DOT_VERSION \
+ LIBGAV1_DOT_SEPARATED_VERSION(LIBGAV1_MAJOR_VERSION, LIBGAV1_MINOR_VERSION, \
+ LIBGAV1_PATCH_VERSION)
+
+#define LIBGAV1_VERSION_STRING LIBGAV1_STRINGIFY(LIBGAV1_DOT_VERSION)
+
+extern "C" {
+
+int Libgav1GetVersion() { return LIBGAV1_VERSION; }
+const char* Libgav1GetVersionString() { return LIBGAV1_VERSION_STRING; }
+
+const char* Libgav1GetBuildConfiguration() {
+ // TODO(jzern): cmake can generate the detail or in other cases we could
+ // produce one based on the known defines along with the defaults based on
+ // the toolchain, e.g., LIBGAV1_ENABLE_NEON from cpu.h.
+ return "Not available.";
+}
+
+} // extern "C"
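A worked trace of the macros above, assuming hypothetical version components 0, 16 and 0 (the real values come from src/gav1/version.h), followed by a trivial caller of the exported functions:

// LIBGAV1_DOT_SEPARATED(0, 16, 0)         -> 0.16.0 (token pasting)
// LIBGAV1_DOT_VERSION                     -> 0.16.0
// LIBGAV1_STRINGIFY(LIBGAV1_DOT_VERSION)  -> "0.16.0"
// The two-level TOSTRING/STRINGIFY pair forces the version macros to be
// expanded before the '#' operator turns them into a string literal.
#include <cstdio>

#include "src/gav1/version.h"

int main() {
  std::printf("libgav1 %s (numeric %d)\n", Libgav1GetVersionString(),
              Libgav1GetVersion());
  return 0;
}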
diff --git a/src/warp_prediction.cc b/src/warp_prediction.cc
new file mode 100644
index 0000000..dd06317
--- /dev/null
+++ b/src/warp_prediction.cc
@@ -0,0 +1,244 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/warp_prediction.h"
+
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/tile.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kWarpModelTranslationClamp = 1 << 23;
+constexpr int kWarpModelAffineClamp = 1 << 13;
+constexpr int kLargestMotionVectorDiff = 256;
+
+constexpr uint16_t kDivisorLookup[257] = {
+ 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
+ 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
+ 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
+ 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
+ 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
+ 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
+ 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
+ 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
+ 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
+ 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
+ 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
+ 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
+ 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
+ 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
+ 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
+ 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732,
+ 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489,
+ 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259,
+ 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039,
+ 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830,
+ 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630,
+ 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439,
+ 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257,
+ 8240, 8224, 8208, 8192};
+
+// Number of fractional bits used to index the divisor lookup table.
+constexpr int kDivisorLookupBits = 8;
+// Number of fractional bits of entries in divisor lookup table.
+constexpr int kDivisorLookupPrecisionBits = 14;
+
+// 7.11.3.7.
+template <typename T>
+void GenerateApproximateDivisor(T value, int16_t* division_factor,
+ int16_t* division_shift) {
+ const int n = FloorLog2(std::abs(value));
+ const T e = std::abs(value) - (static_cast<T>(1) << n);
+ const int entry = (n > kDivisorLookupBits)
+ ? RightShiftWithRounding(e, n - kDivisorLookupBits)
+ : static_cast<int>(e << (kDivisorLookupBits - n));
+ *division_shift = n + kDivisorLookupPrecisionBits;
+ *division_factor =
+ (value < 0) ? -kDivisorLookup[entry] : kDivisorLookup[entry];
+}
+
+// 7.11.3.8.
+int LeastSquareProduct(int a, int b) { return ((a * b) >> 2) + a + b; }
+
+// 7.11.3.8.
+int DiagonalClamp(int32_t value) {
+ return Clip3(value,
+ (1 << kWarpedModelPrecisionBits) - kWarpModelAffineClamp + 1,
+ (1 << kWarpedModelPrecisionBits) + kWarpModelAffineClamp - 1);
+}
+
+// 7.11.3.8.
+int NonDiagonalClamp(int32_t value) {
+ return Clip3(value, -kWarpModelAffineClamp + 1, kWarpModelAffineClamp - 1);
+}
+
+int16_t GetShearParameter(int value) {
+ return static_cast<int16_t>(
+ LeftShift(RightShiftWithRoundingSigned(Clip3(value, INT16_MIN, INT16_MAX),
+ kWarpParamRoundingBits),
+ kWarpParamRoundingBits));
+}
+
+} // namespace
+
+bool SetupShear(GlobalMotion* const warp_params) {
+ int16_t division_shift;
+ int16_t division_factor;
+ const auto* const params = warp_params->params;
+ GenerateApproximateDivisor<int32_t>(params[2], &division_factor,
+ &division_shift);
+ const int alpha = params[2] - (1 << kWarpedModelPrecisionBits);
+ const int beta = params[3];
+ const int64_t v = LeftShift(params[4], kWarpedModelPrecisionBits);
+ const int gamma =
+ RightShiftWithRoundingSigned(v * division_factor, division_shift);
+ const int64_t w = static_cast<int64_t>(params[3]) * params[4];
+ const int delta =
+ params[5] -
+ RightShiftWithRoundingSigned(w * division_factor, division_shift) -
+ (1 << kWarpedModelPrecisionBits);
+
+ warp_params->alpha = GetShearParameter(alpha);
+ warp_params->beta = GetShearParameter(beta);
+ warp_params->gamma = GetShearParameter(gamma);
+ warp_params->delta = GetShearParameter(delta);
+ if ((4 * std::abs(warp_params->alpha) + 7 * std::abs(warp_params->beta) >=
+ (1 << kWarpedModelPrecisionBits)) ||
+ (4 * std::abs(warp_params->gamma) + 4 * std::abs(warp_params->delta) >=
+ (1 << kWarpedModelPrecisionBits))) {
+ return false; // NOLINT (easier condition to understand).
+ }
+
+ return true;
+}
+
+bool WarpEstimation(const int num_samples, const int block_width4x4,
+ const int block_height4x4, const int row4x4,
+ const int column4x4, const MotionVector& mv,
+ const int candidates[kMaxLeastSquaresSamples][4],
+ GlobalMotion* const warp_params) {
+  // Each entry of |a| fits into int32_t, but |a| is declared as int64_t so
+  // that the products in the computation below are evaluated in 64 bits
+  // without explicit casts.
+ int64_t a[2][2] = {};
+ int bx[2] = {};
+ int by[2] = {};
+
+  // Note: for simplicity, the spec always uses absolute coordinates in the
+  // warp estimation process: subpixel_mid_x, subpixel_mid_y, and candidates
+  // are relative to the top left of the frame. In contrast, libaom uses a
+  // mixture of coordinate systems; in av1/common/warped_motion.c,
+  // find_affine_int() uses coordinates relative to the top left of the block.
+  // mid_y/mid_x: the row/column coordinate of the center of the block.
+ const int mid_y = MultiplyBy4(row4x4) + MultiplyBy2(block_height4x4) - 1;
+ const int mid_x = MultiplyBy4(column4x4) + MultiplyBy2(block_width4x4) - 1;
+ const int subpixel_mid_y = MultiplyBy8(mid_y);
+ const int subpixel_mid_x = MultiplyBy8(mid_x);
+ const int reference_subpixel_mid_y =
+ subpixel_mid_y + mv.mv[MotionVector::kRow];
+ const int reference_subpixel_mid_x =
+ subpixel_mid_x + mv.mv[MotionVector::kColumn];
+
+ for (int i = 0; i < num_samples; ++i) {
+    // candidates[][0] and candidates[][1] are the row/column coordinates of
+    // the sample point in this block, relative to the top left of the frame.
+    // candidates[][2] and candidates[][3] are the row/column coordinates of
+    // the sample point in the reference block, relative to the top left of
+    // the frame.
+ // sy/sx: the row/column coordinates of the sample point, with center of
+ // the block as origin.
+ const int sy = candidates[i][0] - subpixel_mid_y;
+ const int sx = candidates[i][1] - subpixel_mid_x;
+ // dy/dx: the row/column coordinates of the sample point in the reference
+ // block, with center of the reference block as origin.
+ const int dy = candidates[i][2] - reference_subpixel_mid_y;
+ const int dx = candidates[i][3] - reference_subpixel_mid_x;
+ if (std::abs(sx - dx) < kLargestMotionVectorDiff &&
+ std::abs(sy - dy) < kLargestMotionVectorDiff) {
+ a[0][0] += LeastSquareProduct(sx, sx) + 8;
+ a[0][1] += LeastSquareProduct(sx, sy) + 4;
+ a[1][1] += LeastSquareProduct(sy, sy) + 8;
+ bx[0] += LeastSquareProduct(sx, dx) + 8;
+ bx[1] += LeastSquareProduct(sy, dx) + 4;
+ by[0] += LeastSquareProduct(sx, dy) + 4;
+ by[1] += LeastSquareProduct(sy, dy) + 8;
+ }
+ }
+
+ // a[0][1] == a[1][0], because the matrix is symmetric. We don't have to
+ // compute a[1][0].
+ const int64_t determinant = a[0][0] * a[1][1] - a[0][1] * a[0][1];
+ if (determinant == 0) return false;
+
+ int16_t division_shift;
+ int16_t division_factor;
+ GenerateApproximateDivisor<int64_t>(determinant, &division_factor,
+ &division_shift);
+
+ division_shift -= kWarpedModelPrecisionBits;
+
+ const int64_t params_2 = a[1][1] * bx[0] - a[0][1] * bx[1];
+ const int64_t params_3 = -a[0][1] * bx[0] + a[0][0] * bx[1];
+ const int64_t params_4 = a[1][1] * by[0] - a[0][1] * by[1];
+ const int64_t params_5 = -a[0][1] * by[0] + a[0][0] * by[1];
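+  // params_2/params_3 and params_4/params_5 are the entries of adj(A) * bx and
+  // adj(A) * by, where A is the symmetric matrix |a|. Dividing by the
+  // determinant below (via the approximate divisor) solves the 2x2 normal
+  // equations, scaled to kWarpedModelPrecisionBits precision.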
+ auto* const params = warp_params->params;
+
+ if (division_shift <= 0) {
+ division_factor <<= -division_shift;
+ params[2] = static_cast<int32_t>(params_2) * division_factor;
+ params[3] = static_cast<int32_t>(params_3) * division_factor;
+ params[4] = static_cast<int32_t>(params_4) * division_factor;
+ params[5] = static_cast<int32_t>(params_5) * division_factor;
+ } else {
+ params[2] = RightShiftWithRoundingSigned(params_2 * division_factor,
+ division_shift);
+ params[3] = RightShiftWithRoundingSigned(params_3 * division_factor,
+ division_shift);
+ params[4] = RightShiftWithRoundingSigned(params_4 * division_factor,
+ division_shift);
+ params[5] = RightShiftWithRoundingSigned(params_5 * division_factor,
+ division_shift);
+ }
+
+ params[2] = DiagonalClamp(params[2]);
+ params[3] = NonDiagonalClamp(params[3]);
+ params[4] = NonDiagonalClamp(params[4]);
+ params[5] = DiagonalClamp(params[5]);
+
+ const int vx =
+ mv.mv[MotionVector::kColumn] * (1 << (kWarpedModelPrecisionBits - 3)) -
+ (mid_x * (params[2] - (1 << kWarpedModelPrecisionBits)) +
+ mid_y * params[3]);
+ const int vy =
+ mv.mv[MotionVector::kRow] * (1 << (kWarpedModelPrecisionBits - 3)) -
+ (mid_x * params[4] +
+ mid_y * (params[5] - (1 << kWarpedModelPrecisionBits)));
+ params[0] =
+ Clip3(vx, -kWarpModelTranslationClamp, kWarpModelTranslationClamp - 1);
+ params[1] =
+ Clip3(vy, -kWarpModelTranslationClamp, kWarpModelTranslationClamp - 1);
+
+ params[6] = 0;
+ params[7] = 0;
+ return true;
+}
+
+} // namespace libgav1
diff --git a/src/warp_prediction.h b/src/warp_prediction.h
new file mode 100644
index 0000000..6c86df3
--- /dev/null
+++ b/src/warp_prediction.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_WARP_PREDICTION_H_
+#define LIBGAV1_SRC_WARP_PREDICTION_H_
+
+#include "src/obu_parser.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// Sets the alpha, beta, gamma, delta fields in warp_params using the
+// warp_params->params array as input (only array entries at indexes 2, 3, 4,
+// 5 are used). Returns whether alpha, beta, gamma, delta are valid.
+bool SetupShear(GlobalMotion* warp_params); // 7.11.3.6.
+
+// Computes local warp parameters by performing a least squares fit.
+// Returns whether the computed parameters are valid.
+bool WarpEstimation(int num_samples, int block_width4x4, int block_height4x4,
+ int row4x4, int column4x4, const MotionVector& mv,
+ const int candidates[kMaxLeastSquaresSamples][4],
+ GlobalMotion* warp_params); // 7.11.3.8.
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_WARP_PREDICTION_H_
diff --git a/src/yuv_buffer.cc b/src/yuv_buffer.cc
new file mode 100644
index 0000000..c74e140
--- /dev/null
+++ b/src/yuv_buffer.cc
@@ -0,0 +1,201 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/yuv_buffer.h"
+
+#include <cassert>
+#include <cstddef>
+#include <new>
+
+#include "src/frame_buffer_utils.h"
+#include "src/utils/common.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+
+// Size conventions:
+// * Widths, heights, and border sizes are in pixels.
+// * Strides and plane sizes are in bytes.
+//
+// YuvBuffer objects may be reused through the BufferPool. Realloc() must
+// assume that data members (except buffer_alloc_ and buffer_alloc_size_) may
+// contain stale values from the previous use, and must set all data members
+// from scratch. In particular, Realloc() must not rely on the initial values
+// of data members set by the YuvBuffer constructor.
+bool YuvBuffer::Realloc(int bitdepth, bool is_monochrome, int width, int height,
+ int8_t subsampling_x, int8_t subsampling_y,
+ int left_border, int right_border, int top_border,
+ int bottom_border,
+ GetFrameBufferCallback get_frame_buffer,
+ void* callback_private_data,
+ void** buffer_private_data) {
+  // Only support border sizes that are a multiple of 2. The restriction is
+  // required because the borders may be subsampled in the chroma planes.
+ if (((left_border | right_border | top_border | bottom_border) & 1) != 0) {
+ LIBGAV1_DLOG(ERROR,
+ "Borders must be a multiple of 2: left_border = %d, "
+ "right_border = %d, top_border = %d, bottom_border = %d.",
+ left_border, right_border, top_border, bottom_border);
+ return false;
+ }
+
+ // Every row in the plane buffers needs to be kFrameBufferRowAlignment-byte
+ // aligned. Since the strides are multiples of kFrameBufferRowAlignment bytes,
+ // it suffices to just make the plane buffers kFrameBufferRowAlignment-byte
+ // aligned.
+ const int plane_align = kFrameBufferRowAlignment;
+ const int uv_width =
+ is_monochrome ? 0 : SubsampledValue(width, subsampling_x);
+ const int uv_height =
+ is_monochrome ? 0 : SubsampledValue(height, subsampling_y);
+ const int uv_left_border = is_monochrome ? 0 : left_border >> subsampling_x;
+ const int uv_right_border = is_monochrome ? 0 : right_border >> subsampling_x;
+ const int uv_top_border = is_monochrome ? 0 : top_border >> subsampling_y;
+ const int uv_bottom_border =
+ is_monochrome ? 0 : bottom_border >> subsampling_y;
+
+ if (get_frame_buffer != nullptr) {
+ assert(buffer_private_data != nullptr);
+
+ const Libgav1ImageFormat image_format =
+ ComposeImageFormat(is_monochrome, subsampling_x, subsampling_y);
+ FrameBuffer frame_buffer;
+ if (get_frame_buffer(callback_private_data, bitdepth, image_format, width,
+ height, left_border, right_border, top_border,
+ bottom_border, kFrameBufferRowAlignment,
+ &frame_buffer) != kStatusOk) {
+ return false;
+ }
+
+ if (frame_buffer.plane[0] == nullptr ||
+ (!is_monochrome && frame_buffer.plane[1] == nullptr) ||
+ (!is_monochrome && frame_buffer.plane[2] == nullptr)) {
+ assert(false && "The get_frame_buffer callback malfunctioned.");
+ LIBGAV1_DLOG(ERROR, "The get_frame_buffer callback malfunctioned.");
+ return false;
+ }
+
+ stride_[kPlaneY] = frame_buffer.stride[0];
+ stride_[kPlaneU] = frame_buffer.stride[1];
+ stride_[kPlaneV] = frame_buffer.stride[2];
+ buffer_[kPlaneY] = frame_buffer.plane[0];
+ buffer_[kPlaneU] = frame_buffer.plane[1];
+ buffer_[kPlaneV] = frame_buffer.plane[2];
+ *buffer_private_data = frame_buffer.private_data;
+ } else {
+ assert(callback_private_data == nullptr);
+ assert(buffer_private_data == nullptr);
+
+ // Calculate y_stride (in bytes). It is padded to a multiple of
+ // kFrameBufferRowAlignment bytes.
+ int y_stride = width + left_border + right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) y_stride *= sizeof(uint16_t);
+#endif
+ y_stride = Align(y_stride, kFrameBufferRowAlignment);
+ // Size of the Y plane in bytes.
+ const uint64_t y_plane_size = (height + top_border + bottom_border) *
+ static_cast<uint64_t>(y_stride) +
+ (plane_align - 1);
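+    // For example, with bitdepth 8, width 20, height 6, and all four borders
+    // equal to 2 (the case drawn in yuv_buffer.h), y_stride is
+    // Align(20 + 2 + 2, 16) == 32 and y_plane_size is
+    // (6 + 2 + 2) * 32 + 15 == 335 bytes.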
+
+ // Calculate uv_stride (in bytes). It is padded to a multiple of
+ // kFrameBufferRowAlignment bytes.
+ int uv_stride = uv_width + uv_left_border + uv_right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) uv_stride *= sizeof(uint16_t);
+#endif
+ uv_stride = Align(uv_stride, kFrameBufferRowAlignment);
+ // Size of the U or V plane in bytes.
+ const uint64_t uv_plane_size =
+ is_monochrome ? 0
+ : (uv_height + uv_top_border + uv_bottom_border) *
+ static_cast<uint64_t>(uv_stride) +
+ (plane_align - 1);
+
+    // Carve the (unaligned) y_buffer, u_buffer, and v_buffer pointers out of a
+    // single allocation (buffer_alloc_).
+ uint8_t* y_buffer = nullptr;
+ uint8_t* u_buffer = nullptr;
+ uint8_t* v_buffer = nullptr;
+
+ const uint64_t frame_size = y_plane_size + 2 * uv_plane_size;
+ if (frame_size > buffer_alloc_size_) {
+      // Allocation to hold a larger frame, or the first allocation.
+      // Reject sizes that do not fit in size_t (possible on 32-bit targets).
+      if (frame_size != static_cast<size_t>(frame_size)) return false;
+
+ buffer_alloc_.reset(new (std::nothrow)
+ uint8_t[static_cast<size_t>(frame_size)]);
+ if (buffer_alloc_ == nullptr) {
+ buffer_alloc_size_ = 0;
+ return false;
+ }
+
+ buffer_alloc_size_ = static_cast<size_t>(frame_size);
+ }
+
+ y_buffer = buffer_alloc_.get();
+ if (!is_monochrome) {
+ u_buffer = y_buffer + y_plane_size;
+ v_buffer = u_buffer + uv_plane_size;
+ }
+
+ stride_[kPlaneY] = y_stride;
+ stride_[kPlaneU] = stride_[kPlaneV] = uv_stride;
+
+ int left_border_bytes = left_border;
+ int uv_left_border_bytes = uv_left_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) {
+ left_border_bytes *= sizeof(uint16_t);
+ uv_left_border_bytes *= sizeof(uint16_t);
+ }
+#endif
+ buffer_[kPlaneY] = AlignAddr(
+ y_buffer + (top_border * y_stride) + left_border_bytes, plane_align);
+ buffer_[kPlaneU] =
+ AlignAddr(u_buffer + (uv_top_border * uv_stride) + uv_left_border_bytes,
+ plane_align);
+ buffer_[kPlaneV] =
+ AlignAddr(v_buffer + (uv_top_border * uv_stride) + uv_left_border_bytes,
+ plane_align);
+ }
+
+ y_width_ = width;
+ y_height_ = height;
+ left_border_[kPlaneY] = left_border;
+ right_border_[kPlaneY] = right_border;
+ top_border_[kPlaneY] = top_border;
+ bottom_border_[kPlaneY] = bottom_border;
+
+ uv_width_ = uv_width;
+ uv_height_ = uv_height;
+ left_border_[kPlaneU] = left_border_[kPlaneV] = uv_left_border;
+ right_border_[kPlaneU] = right_border_[kPlaneV] = uv_right_border;
+ top_border_[kPlaneU] = top_border_[kPlaneV] = uv_top_border;
+ bottom_border_[kPlaneU] = bottom_border_[kPlaneV] = uv_bottom_border;
+
+ subsampling_x_ = subsampling_x;
+ subsampling_y_ = subsampling_y;
+
+ bitdepth_ = bitdepth;
+ is_monochrome_ = is_monochrome;
+ assert(!is_monochrome || stride_[kPlaneU] == 0);
+ assert(!is_monochrome || stride_[kPlaneV] == 0);
+ assert(!is_monochrome || buffer_[kPlaneU] == nullptr);
+ assert(!is_monochrome || buffer_[kPlaneV] == nullptr);
+
+ return true;
+}
+
+} // namespace libgav1
diff --git a/src/yuv_buffer.h b/src/yuv_buffer.h
new file mode 100644
index 0000000..b9e8cd3
--- /dev/null
+++ b/src/yuv_buffer.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_YUV_BUFFER_H_
+#define LIBGAV1_SRC_YUV_BUFFER_H_
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "src/gav1/frame_buffer.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+class YuvBuffer {
+ public:
+ // Allocates the buffer. Returns true on success. Returns false on failure.
+ //
+ // * |width| and |height| are the image dimensions in pixels.
+ // * |subsampling_x| and |subsampling_y| (either 0 or 1) specify the
+ // subsampling of the width and height of the chroma planes, respectively.
+ // * |left_border|, |right_border|, |top_border|, and |bottom_border| are
+ // the sizes (in pixels) of the borders on the left, right, top, and
+ // bottom sides, respectively. The four border sizes must all be a
+ // multiple of 2.
+ // * If |get_frame_buffer| is not null, it is invoked to allocate the memory.
+ // If |get_frame_buffer| is null, YuvBuffer allocates the memory directly
+ // and ignores the |callback_private_data| and |buffer_private_data|
+ // parameters, which should be null.
+ //
+ // NOTE: The strides are a multiple of 16. Since the first row in each plane
+ // is 16-byte aligned, subsequent rows are also 16-byte aligned.
+ //
+ // Example: bitdepth=8 width=20 height=6 left/right/top/bottom_border=2. The
+ // diagram below shows how Realloc() allocates the data buffer for the Y
+ // plane.
+ //
+ // 16-byte aligned
+ // |
+ // v
+ // ++++++++++++++++++++++++pppppppp
+ // ++++++++++++++++++++++++pppppppp
+ // ++01234567890123456789++pppppppp
+ // ++11234567890123456789++pppppppp
+ // ++21234567890123456789++pppppppp
+ // ++31234567890123456789++pppppppp
+ // ++41234567890123456789++pppppppp
+ // ++51234567890123456789++pppppppp
+ // ++++++++++++++++++++++++pppppppp
+ // ++++++++++++++++++++++++pppppppp
+ // | |
+ // |<-- stride (multiple of 16) ->|
+ //
+ // The video frame has 6 rows of 20 pixels each. Each row is shown as the
+ // pattern r1234567890123456789, where |r| is 0, 1, 2, 3, 4, 5.
+ //
+ // Realloc() first adds a border of 2 pixels around the video frame. The
+ // border pixels are shown as '+'.
+ //
+ // Each row is then padded to a multiple of the default alignment in bytes,
+ // which is 16. The padding bytes are shown as lowercase 'p'. (Since
+ // |bitdepth| is 8 in this example, each pixel is one byte.) The padded size
+ // in bytes is the stride. In this example, the stride is 32 bytes.
+ //
+ // Finally, Realloc() aligns the first byte of frame data, which is the '0'
+ // pixel/byte in the upper left corner of the frame, to the default (16-byte)
+ // alignment boundary.
+ //
+ // TODO(wtc): Add a check for width and height limits to defend against
+ // invalid bitstreams.
+ bool Realloc(int bitdepth, bool is_monochrome, int width, int height,
+ int8_t subsampling_x, int8_t subsampling_y, int left_border,
+ int right_border, int top_border, int bottom_border,
+ GetFrameBufferCallback get_frame_buffer,
+ void* callback_private_data, void** buffer_private_data);
+
+ int bitdepth() const { return bitdepth_; }
+
+ bool is_monochrome() const { return is_monochrome_; }
+
+ int8_t subsampling_x() const { return subsampling_x_; }
+ int8_t subsampling_y() const { return subsampling_y_; }
+
+ int width(int plane) const {
+ return (plane == kPlaneY) ? y_width_ : uv_width_;
+ }
+ int height(int plane) const {
+ return (plane == kPlaneY) ? y_height_ : uv_height_;
+ }
+
+ // Returns border sizes in pixels.
+ int left_border(int plane) const { return left_border_[plane]; }
+ int right_border(int plane) const { return right_border_[plane]; }
+ int top_border(int plane) const { return top_border_[plane]; }
+ int bottom_border(int plane) const { return bottom_border_[plane]; }
+
+ // Returns the alignment of frame buffer row in bytes.
+ int alignment() const { return kFrameBufferRowAlignment; }
+
+  // Back up the current set of warnings and disable -Warray-bounds for the
+  // following three functions, as the compiler cannot, in all cases, determine
+  // whether |plane| is within [0, kMaxPlanes), e.g., with a variable-based for
+  // loop.
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
+ // Returns the data buffer for |plane|.
+ uint8_t* data(int plane) {
+ assert(plane >= 0);
+ assert(static_cast<size_t>(plane) < std::extent<decltype(buffer_)>::value);
+ return buffer_[plane];
+ }
+ const uint8_t* data(int plane) const {
+ assert(plane >= 0);
+ assert(static_cast<size_t>(plane) < std::extent<decltype(buffer_)>::value);
+ return buffer_[plane];
+ }
+
+ // Returns the stride in bytes for |plane|.
+ int stride(int plane) const {
+ assert(plane >= 0);
+ assert(static_cast<size_t>(plane) < std::extent<decltype(stride_)>::value);
+ return stride_[plane];
+ }
+ // Restore the previous set of compiler warnings.
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+ private:
+ static constexpr int kFrameBufferRowAlignment = 16;
+ int bitdepth_ = 0;
+ bool is_monochrome_ = false;
+
+ // y_width_ and y_height_ are the |width| and |height| arguments passed to the
+ // Realloc() method.
+ //
+ // uv_width_ and uv_height_ are computed from y_width_ and y_height_ as
+ // follows:
+ // uv_width_ = (y_width_ + subsampling_x_) >> subsampling_x_
+ // uv_height_ = (y_height_ + subsampling_y_) >> subsampling_y_
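+  // For example, with 4:2:0 subsampling (subsampling_x_ == subsampling_y_ ==
+  // 1), y_width_ == 21 gives uv_width_ == (21 + 1) >> 1 == 11.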
+ int y_width_ = 0;
+ int uv_width_ = 0;
+ int y_height_ = 0;
+ int uv_height_ = 0;
+
+ int left_border_[kMaxPlanes] = {};
+ int right_border_[kMaxPlanes] = {};
+ int top_border_[kMaxPlanes] = {};
+ int bottom_border_[kMaxPlanes] = {};
+
+ int stride_[kMaxPlanes] = {};
+ uint8_t* buffer_[kMaxPlanes] = {};
+
+ // buffer_alloc_ and buffer_alloc_size_ are only used if the
+ // get_frame_buffer callback is null and we allocate the buffer ourselves.
+ std::unique_ptr<uint8_t[]> buffer_alloc_;
+ size_t buffer_alloc_size_ = 0;
+
+ int8_t subsampling_x_ = 0; // 0 or 1.
+ int8_t subsampling_y_ = 0; // 0 or 1.
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_YUV_BUFFER_H_
diff --git a/tests/fuzzer/decoder_fuzzer.cc b/tests/fuzzer/decoder_fuzzer.cc
new file mode 100644
index 0000000..236fd3c
--- /dev/null
+++ b/tests/fuzzer/decoder_fuzzer.cc
@@ -0,0 +1,87 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "examples/file_reader.h"
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_interface.h"
+#include "src/gav1/decoder.h"
+#include "tests/fuzzer/fuzzer_temp_file.h"
+
+namespace {
+
+#if defined(LIBGAV1_EXHAUSTIVE_FUZZING)
+// Set a large upper bound to give more coverage of a single input; this value
+// should be larger than most of the frame counts in the corpus.
+constexpr int kMaxFrames = 100;
+constexpr size_t kMaxDataSize = 400 * 1024;
+#else
+// Restrict the number of frames to improve fuzzer throughput.
+constexpr int kMaxFrames = 5;
+constexpr size_t kMaxDataSize = 200 * 1024;
+#endif
+
+void Decode(const uint8_t* const data, const size_t size,
+ libgav1::Decoder* const decoder) {
+ decoder->EnqueueFrame(data, size, /*user_private_data=*/0,
+ /*buffer_private_data=*/nullptr);
+ const libgav1::DecoderBuffer* buffer;
+ decoder->DequeueFrame(&buffer);
+}
+
+} // namespace
+
+// Always returns 0. Nonzero return values are reserved by libFuzzer for future
+// use.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+ // Reject large chunks of data to improve fuzzer throughput.
+ if (size > kMaxDataSize) return 0;
+
+ libgav1::Decoder decoder;
+ libgav1::DecoderSettings settings = {};
+ // Use the low byte of the width to seed the number of threads.
+ // We use both nibbles of the lower byte as this results in values != 1 much
+ // more quickly than using the lower nibble alone.
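+  // For example, data[12] == 0x2A yields ((0x02 | 0x2A) & 0xF) + 1 == 11
+  // threads.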
+ settings.threads = (size >= 13) ? ((data[12] >> 4 | data[12]) & 0xF) + 1 : 1;
+ if (decoder.Init(&settings) != libgav1::kStatusOk) return 0;
+
+ // Treat the input as a raw OBU stream.
+ Decode(data, size, &decoder);
+
+ // Use the first frame from an IVF to bypass any read errors from the parser.
+ static constexpr size_t kIvfHeaderSize =
+ libgav1::kIvfFileHeaderSize + libgav1::kIvfFrameHeaderSize;
+ if (size >= kIvfHeaderSize) {
+ Decode(data + kIvfHeaderSize, size - kIvfHeaderSize, &decoder);
+ }
+
+ FuzzerTemporaryFile tempfile(data, size);
+ auto file_reader =
+ libgav1::FileReader::Open(tempfile.filename(), /*error_tolerant=*/true);
+ if (file_reader == nullptr) return 0;
+
+ std::vector<uint8_t> buffer;
+ int decoded_frames = 0;
+ do {
+ if (!file_reader->ReadTemporalUnit(&buffer, nullptr)) break;
+ Decode(buffer.data(), buffer.size(), &decoder);
+ if (++decoded_frames >= kMaxFrames) break;
+ } while (!file_reader->IsEndOfFile());
+
+ return 0;
+}
diff --git a/tests/fuzzer/decoder_fuzzer_frame_parallel.cc b/tests/fuzzer/decoder_fuzzer_frame_parallel.cc
new file mode 100644
index 0000000..d1b1c54
--- /dev/null
+++ b/tests/fuzzer/decoder_fuzzer_frame_parallel.cc
@@ -0,0 +1,139 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstddef>
+#include <cstdint>
+#include <deque>
+#include <memory>
+#include <vector>
+
+#include "examples/file_reader.h"
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_interface.h"
+#include "src/gav1/decoder.h"
+#include "src/gav1/status_code.h"
+#include "tests/fuzzer/fuzzer_temp_file.h"
+
+namespace {
+
+#if defined(LIBGAV1_EXHAUSTIVE_FUZZING)
+// Set a large upper bound on the input size to give more coverage of a single
+// input.
+constexpr size_t kMaxDataSize = 400 * 1024;
+#else
+constexpr size_t kMaxDataSize = 200 * 1024;
+#endif
+
+using InputBuffer = std::vector<uint8_t>;
+
+struct InputBuffers {
+ ~InputBuffers() {
+ for (auto& buffer : free_buffers) {
+ delete buffer;
+ }
+ }
+ std::deque<InputBuffer*> free_buffers;
+};
+
+void ReleaseInputBuffer(void* callback_private_data,
+ void* buffer_private_data) {
+ auto* const test = static_cast<InputBuffers*>(callback_private_data);
+ test->free_buffers.push_back(static_cast<InputBuffer*>(buffer_private_data));
+}
+
+} // namespace
+
+// Always returns 0. Nonzero return values are reserved by libFuzzer for future
+// use.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+ // Reject large chunks of data to improve fuzzer throughput.
+ if (size > kMaxDataSize) return 0;
+
+  // Note that |input_buffers| has to outlive the |decoder| object since the
+  // |release_input_buffer| callback could be invoked from the |decoder|'s
+  // destructor.
+ InputBuffers input_buffers;
+
+ libgav1::Decoder decoder;
+ libgav1::DecoderSettings settings = {};
+  // Use 33 + the low byte of the width to seed the number of threads. This
+ // ensures that we will trigger the frame parallel path in most cases.
+ // We use both nibbles of the lower byte as this results in values != 1 much
+ // more quickly than using the lower nibble alone.
+ settings.threads =
+ 33 + ((size >= 13) ? ((data[12] >> 4 | data[12]) & 0xF) + 1 : 1);
+
+ settings.frame_parallel = true;
+ settings.blocking_dequeue = true;
+ settings.callback_private_data = &input_buffers;
+ settings.release_input_buffer = ReleaseInputBuffer;
+ if (decoder.Init(&settings) != libgav1::kStatusOk) return 0;
+
+ FuzzerTemporaryFile tempfile(data, size);
+ auto file_reader =
+ libgav1::FileReader::Open(tempfile.filename(), /*error_tolerant=*/true);
+ if (file_reader == nullptr) return 0;
+
+ InputBuffer* input_buffer = nullptr;
+ bool dequeue_finished = false;
+
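+  // Enqueue/dequeue loop: keep enqueuing temporal units until the decoder
+  // returns kStatusTryAgain, then dequeue decoded frames until it returns
+  // kStatusNothingToDequeue, and repeat until the file is exhausted and the
+  // decoder is fully drained.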
+ do {
+ if (input_buffer == nullptr && !file_reader->IsEndOfFile()) {
+ if (input_buffers.free_buffers.empty()) {
+ auto* const buffer = new (std::nothrow) InputBuffer();
+ if (buffer == nullptr) {
+ break;
+ }
+ input_buffers.free_buffers.push_back(buffer);
+ }
+ input_buffer = input_buffers.free_buffers.front();
+ input_buffers.free_buffers.pop_front();
+ if (!file_reader->ReadTemporalUnit(input_buffer, nullptr)) {
+ break;
+ }
+ }
+
+ if (input_buffer != nullptr) {
+ libgav1::StatusCode status =
+ decoder.EnqueueFrame(input_buffer->data(), input_buffer->size(),
+ /*user_private_data=*/0,
+ /*buffer_private_data=*/input_buffer);
+ if (status == libgav1::kStatusOk) {
+ input_buffer = nullptr;
+ // Continue to enqueue frames until we get a kStatusTryAgain status.
+ continue;
+ }
+ if (status != libgav1::kStatusTryAgain) {
+ break;
+ }
+ }
+
+ const libgav1::DecoderBuffer* buffer;
+ libgav1::StatusCode status = decoder.DequeueFrame(&buffer);
+ if (status == libgav1::kStatusNothingToDequeue) {
+ dequeue_finished = true;
+ } else if (status == libgav1::kStatusOk) {
+ dequeue_finished = false;
+ } else {
+ break;
+ }
+ } while (input_buffer != nullptr || !file_reader->IsEndOfFile() ||
+ !dequeue_finished);
+
+ if (input_buffer != nullptr) {
+ input_buffers.free_buffers.push_back(input_buffer);
+ }
+
+ return 0;
+}
diff --git a/tests/fuzzer/fuzzer_temp_file.h b/tests/fuzzer/fuzzer_temp_file.h
new file mode 100644
index 0000000..5d12bbe
--- /dev/null
+++ b/tests/fuzzer/fuzzer_temp_file.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2020 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_TESTS_FUZZER_FUZZER_TEMP_FILE_H_
+#define LIBGAV1_TESTS_FUZZER_FUZZER_TEMP_FILE_H_
+
+// Adapter utility from fuzzer input to a temporary file, for fuzzing APIs that
+// require a file instead of an input buffer.
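+//
+// Typical usage from a fuzz target (a sketch; ConsumeFile() stands in for
+// whatever file-based API is being fuzzed):
+//
+//   FuzzerTemporaryFile tempfile(data, size);
+//   ConsumeFile(tempfile.filename());
+//   // The temporary file is deleted when |tempfile| goes out of scope.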
+
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+// Pure-C interface for creating and cleaning up temporary files.
+
+static char* fuzzer_get_tmpfile_with_suffix(const uint8_t* data, size_t size,
+ const char* suffix) {
+ if (suffix == NULL) { // NOLINT (this could be a C compilation unit)
+ suffix = "";
+ }
+ const size_t suffix_len = strlen(suffix);
+ if (suffix_len > INT_MAX) { // mkstemps takes int for suffixlen param
+ perror("Suffix too long");
+ abort();
+ }
+
+#ifdef __ANDROID__
+ const char* leading_temp_path =
+ "/data/local/tmp/generate_temporary_file.XXXXXX";
+#else
+ const char* leading_temp_path = "/tmp/generate_temporary_file.XXXXXX";
+#endif
+ const size_t buffer_sz = strlen(leading_temp_path) + suffix_len + 1;
+ char* filename_buffer =
+ (char*)malloc(buffer_sz); // NOLINT (this could be a C compilation unit)
+ if (!filename_buffer) {
+ perror("Failed to allocate file name buffer.");
+ abort();
+ }
+
+ if (snprintf(filename_buffer, buffer_sz, "%s%s", leading_temp_path, suffix) >=
+ buffer_sz) {
+ perror("File name buffer too short.");
+ abort();
+ }
+
+ const int file_descriptor = mkstemps(filename_buffer, suffix_len);
+ if (file_descriptor < 0) {
+ perror("Failed to make temporary file.");
+ abort();
+ }
+ FILE* file = fdopen(file_descriptor, "wb");
+ if (!file) {
+ perror("Failed to open file descriptor.");
+ close(file_descriptor);
+ abort();
+ }
+ const size_t bytes_written = fwrite(data, sizeof(uint8_t), size, file);
+ if (bytes_written < size) {
+ close(file_descriptor);
+ fprintf(stderr, "Failed to write all bytes to file (%zu out of %zu)",
+ bytes_written, size);
+ abort();
+ }
+ fclose(file);
+ return filename_buffer;
+}
+
+static char* fuzzer_get_tmpfile(
+ const uint8_t* data,
+ size_t size) { // NOLINT (people include this .inc file directly)
+ return fuzzer_get_tmpfile_with_suffix(data, size, NULL); // NOLINT
+}
+
+static void fuzzer_release_tmpfile(char* filename) {
+ if (unlink(filename) != 0) {
+ perror("WARNING: Failed to delete temporary file.");
+ }
+ free(filename);
+}
+
+// C++ RAII object for creating temporary files.
+
+#ifdef __cplusplus
+class FuzzerTemporaryFile {
+ public:
+ FuzzerTemporaryFile(const uint8_t* data, size_t size)
+ : original_filename_(fuzzer_get_tmpfile(data, size)) {
+ filename_ = strdup(original_filename_);
+ if (!filename_) {
+ perror("Failed to allocate file name copy.");
+ abort();
+ }
+ }
+
+ FuzzerTemporaryFile(const uint8_t* data, size_t size, const char* suffix)
+ : original_filename_(fuzzer_get_tmpfile_with_suffix(data, size, suffix)) {
+ filename_ = strdup(original_filename_);
+ if (!filename_) {
+ perror("Failed to allocate file name copy.");
+ abort();
+ }
+ }
+
+ ~FuzzerTemporaryFile() {
+ free(filename_);
+ fuzzer_release_tmpfile(original_filename_);
+ }
+
+  FuzzerTemporaryFile(const FuzzerTemporaryFile& other) = delete;
+  FuzzerTemporaryFile& operator=(const FuzzerTemporaryFile& other) = delete;
+
+  FuzzerTemporaryFile(FuzzerTemporaryFile&& other) = delete;
+  FuzzerTemporaryFile& operator=(FuzzerTemporaryFile&& other) = delete;
+
+ const char* filename() const { return filename_; }
+
+ // Returns a mutable pointer to the file name. Should be used sparingly, only
+ // in case the fuzzed API demands it or when making a mutable copy is
+ // inconvenient (e.g., in auto-generated code).
+ char* mutable_filename() const { return filename_; }
+
+ private:
+ char* original_filename_;
+
+ // A mutable copy of the original filename, returned by the accessor. This
+ // guarantees that the original filename can always be used to release the
+ // temporary path.
+ char* filename_;
+};
+#endif // __cplusplus
+#endif // LIBGAV1_TESTS_FUZZER_FUZZER_TEMP_FILE_H_
diff --git a/tests/fuzzer/obu_parser_fuzzer.cc b/tests/fuzzer/obu_parser_fuzzer.cc
new file mode 100644
index 0000000..634a802
--- /dev/null
+++ b/tests/fuzzer/obu_parser_fuzzer.cc
@@ -0,0 +1,89 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "examples/file_reader.h"
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_interface.h"
+#include "src/buffer_pool.h"
+#include "src/decoder_impl.h"
+#include "src/decoder_state.h"
+#include "src/internal_frame_buffer_list.h"
+#include "src/obu_parser.h"
+#include "tests/fuzzer/fuzzer_temp_file.h"
+
+namespace {
+
+#if defined(LIBGAV1_EXHAUSTIVE_FUZZING)
+// Set a large upper bound to give more coverage of a single input; this value
+// should be larger than most of the frame counts in the corpus.
+constexpr int kMaxFrames = 100;
+constexpr size_t kMaxDataSize = 400 * 1024;
+#else
+// Restrict the number of frames and OBUs to improve fuzzer throughput.
+constexpr int kMaxFrames = 5;
+constexpr size_t kMaxDataSize = 200 * 1024;
+#endif
+
+inline void ParseObu(const uint8_t* const data, size_t size) {
+ libgav1::InternalFrameBufferList buffer_list;
+ libgav1::BufferPool buffer_pool(libgav1::OnInternalFrameBufferSizeChanged,
+ libgav1::GetInternalFrameBuffer,
+ libgav1::ReleaseInternalFrameBuffer,
+ &buffer_list);
+ libgav1::DecoderState decoder_state;
+ libgav1::ObuParser parser(data, size, 0, &buffer_pool, &decoder_state);
+ libgav1::RefCountedBufferPtr current_frame;
+ int parsed_frames = 0;
+ while (parser.HasData()) {
+ if (parser.ParseOneFrame(&current_frame) != libgav1::kStatusOk) break;
+ if (++parsed_frames >= kMaxFrames) break;
+ }
+}
+
+} // namespace
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+ // Reject large chunks of data to improve fuzzer throughput.
+ if (size > kMaxDataSize) return 0;
+
+ // Treat the input as a raw OBU stream.
+ ParseObu(data, size);
+
+ // Use the first frame from an IVF to bypass any read errors from the parser.
+ static constexpr size_t kIvfHeaderSize =
+ libgav1::kIvfFileHeaderSize + libgav1::kIvfFrameHeaderSize;
+ if (size >= kIvfHeaderSize) {
+ ParseObu(data + kIvfHeaderSize, size - kIvfHeaderSize);
+ }
+
+ FuzzerTemporaryFile tempfile(data, size);
+ auto file_reader =
+ libgav1::FileReader::Open(tempfile.filename(), /*error_tolerant=*/true);
+ if (file_reader == nullptr) return 0;
+
+ std::vector<uint8_t> buffer;
+ int parsed_frames = 0;
+ do {
+ if (!file_reader->ReadTemporalUnit(&buffer, nullptr)) break;
+ ParseObu(buffer.data(), buffer.size());
+ if (++parsed_frames >= kMaxFrames) break;
+ } while (!file_reader->IsEndOfFile());
+
+ return 0;
+}