author    qinxialei <xialeiqin@gmail.com>  2020-10-29 11:26:59 +0800
committer qinxialei <xialeiqin@gmail.com>  2020-10-29 11:26:59 +0800
commit    e8d277081293b6fb2a5d469616baaa7a06f52496 (patch)
tree      1179bb07d3927d1837d4a90bd81b2034c4c696a9
Import Upstream version 0.16.0
-rw-r--r--  .gitignore  2
-rw-r--r--  AUTHORS  6
-rw-r--r--  CMakeLists.txt  126
-rw-r--r--  CONTRIBUTING.md  27
-rw-r--r--  LICENSE  202
-rw-r--r--  README.md  168
-rw-r--r--  cmake/libgav1-config.cmake.template  2
-rw-r--r--  cmake/libgav1.pc.template  11
-rw-r--r--  cmake/libgav1_build_definitions.cmake  150
-rw-r--r--  cmake/libgav1_cpu_detection.cmake  49
-rw-r--r--  cmake/libgav1_flags.cmake  251
-rw-r--r--  cmake/libgav1_helpers.cmake  134
-rw-r--r--  cmake/libgav1_install.cmake  60
-rw-r--r--  cmake/libgav1_intrinsics.cmake  135
-rw-r--r--  cmake/libgav1_options.cmake  55
-rw-r--r--  cmake/libgav1_sanitizer.cmake  45
-rw-r--r--  cmake/libgav1_targets.cmake  347
-rw-r--r--  cmake/libgav1_variables.cmake  78
-rw-r--r--  cmake/toolchains/aarch64-linux-gnu.cmake  28
-rw-r--r--  cmake/toolchains/android.cmake  53
-rw-r--r--  cmake/toolchains/arm-linux-gnueabihf.cmake  29
-rw-r--r--  codereview.settings  4
-rw-r--r--  examples/file_reader.cc  186
-rw-r--r--  examples/file_reader.h  100
-rw-r--r--  examples/file_reader_constants.cc  23
-rw-r--r--  examples/file_reader_constants.h  39
-rw-r--r--  examples/file_reader_factory.cc  51
-rw-r--r--  examples/file_reader_factory.h  51
-rw-r--r--  examples/file_reader_interface.h  63
-rw-r--r--  examples/file_writer.cc  183
-rw-r--r--  examples/file_writer.h  102
-rw-r--r--  examples/gav1_decode.cc  452
-rw-r--r--  examples/gav1_decode_cv_pixel_buffer_pool.cc  278
-rw-r--r--  examples/gav1_decode_cv_pixel_buffer_pool.h  73
-rw-r--r--  examples/ivf_parser.cc  96
-rw-r--r--  examples/ivf_parser.h  57
-rw-r--r--  examples/libgav1_examples.cmake  63
-rw-r--r--  examples/logging.h  65
-rw-r--r--  src/buffer_pool.cc  218
-rw-r--r--  src/buffer_pool.h  399
-rw-r--r--  src/decoder.cc  119
-rw-r--r--  src/decoder_impl.cc  1661
-rw-r--r--  src/decoder_impl.h  266
-rw-r--r--  src/decoder_settings.cc  33
-rw-r--r--  src/decoder_state.h  89
-rw-r--r--  src/dsp/arm/average_blend_neon.cc  146
-rw-r--r--  src/dsp/arm/average_blend_neon.h  36
-rw-r--r--  src/dsp/arm/cdef_neon.cc  697
-rw-r--r--  src/dsp/arm/cdef_neon.h  38
-rw-r--r--  src/dsp/arm/common_neon.h  777
-rw-r--r--  src/dsp/arm/convolve_neon.cc  3105
-rw-r--r--  src/dsp/arm/convolve_neon.h  50
-rw-r--r--  src/dsp/arm/distance_weighted_blend_neon.cc  203
-rw-r--r--  src/dsp/arm/distance_weighted_blend_neon.h  39
-rw-r--r--  src/dsp/arm/film_grain_neon.cc  1188
-rw-r--r--  src/dsp/arm/film_grain_neon.h  47
-rw-r--r--  src/dsp/arm/intra_edge_neon.cc  301
-rw-r--r--  src/dsp/arm/intra_edge_neon.h  39
-rw-r--r--  src/dsp/arm/intrapred_cfl_neon.cc  479
-rw-r--r--  src/dsp/arm/intrapred_directional_neon.cc  926
-rw-r--r--  src/dsp/arm/intrapred_filter_intra_neon.cc  176
-rw-r--r--  src/dsp/arm/intrapred_neon.cc  1144
-rw-r--r--  src/dsp/arm/intrapred_neon.h  418
-rw-r--r--  src/dsp/arm/intrapred_smooth_neon.cc  616
-rw-r--r--  src/dsp/arm/inverse_transform_neon.cc  3128
-rw-r--r--  src/dsp/arm/inverse_transform_neon.h  52
-rw-r--r--  src/dsp/arm/loop_filter_neon.cc  1190
-rw-r--r--  src/dsp/arm/loop_filter_neon.h  53
-rw-r--r--  src/dsp/arm/loop_restoration_neon.cc  1901
-rw-r--r--  src/dsp/arm/loop_restoration_neon.h  40
-rw-r--r--  src/dsp/arm/mask_blend_neon.cc  444
-rw-r--r--  src/dsp/arm/mask_blend_neon.h  41
-rw-r--r--  src/dsp/arm/motion_field_projection_neon.cc  393
-rw-r--r--  src/dsp/arm/motion_field_projection_neon.h  39
-rw-r--r--  src/dsp/arm/motion_vector_search_neon.cc  267
-rw-r--r--  src/dsp/arm/motion_vector_search_neon.h  39
-rw-r--r--  src/dsp/arm/obmc_neon.cc  392
-rw-r--r--  src/dsp/arm/obmc_neon.h  38
-rw-r--r--  src/dsp/arm/super_res_neon.cc  166
-rw-r--r--  src/dsp/arm/super_res_neon.h  37
-rw-r--r--  src/dsp/arm/warp_neon.cc  453
-rw-r--r--  src/dsp/arm/warp_neon.h  37
-rw-r--r--  src/dsp/arm/weight_mask_neon.cc  463
-rw-r--r--  src/dsp/arm/weight_mask_neon.h  52
-rw-r--r--  src/dsp/average_blend.cc  101
-rw-r--r--  src/dsp/average_blend.h  47
-rw-r--r--  src/dsp/cdef.cc  306
-rw-r--r--  src/dsp/cdef.h  47
-rw-r--r--  src/dsp/cdef.inc  29
-rw-r--r--  src/dsp/common.h  82
-rw-r--r--  src/dsp/constants.cc  103
-rw-r--r--  src/dsp/constants.h  71
-rw-r--r--  src/dsp/convolve.cc  876
-rw-r--r--  src/dsp/convolve.h  49
-rw-r--r--  src/dsp/convolve.inc  50
-rw-r--r--  src/dsp/distance_weighted_blend.cc  101
-rw-r--r--  src/dsp/distance_weighted_blend.h  47
-rw-r--r--  src/dsp/dsp.cc  150
-rw-r--r--  src/dsp/dsp.h  910
-rw-r--r--  src/dsp/film_grain.cc  870
-rw-r--r--  src/dsp/film_grain.h  39
-rw-r--r--  src/dsp/film_grain_common.h  78
-rw-r--r--  src/dsp/intra_edge.cc  115
-rw-r--r--  src/dsp/intra_edge.h  48
-rw-r--r--  src/dsp/intrapred.cc  2911
-rw-r--r--  src/dsp/intrapred.h  49
-rw-r--r--  src/dsp/inverse_transform.cc  1636
-rw-r--r--  src/dsp/inverse_transform.h  47
-rw-r--r--  src/dsp/inverse_transform.inc  64
-rw-r--r--  src/dsp/libgav1_dsp.cmake  176
-rw-r--r--  src/dsp/loop_filter.cc  616
-rw-r--r--  src/dsp/loop_filter.h  47
-rw-r--r--  src/dsp/loop_restoration.cc  936
-rw-r--r--  src/dsp/loop_restoration.h  85
-rw-r--r--  src/dsp/mask_blend.cc  207
-rw-r--r--  src/dsp/mask_blend.h  49
-rw-r--r--  src/dsp/motion_field_projection.cc  138
-rw-r--r--  src/dsp/motion_field_projection.h  48
-rw-r--r--  src/dsp/motion_vector_search.cc  211
-rw-r--r--  src/dsp/motion_vector_search.h  49
-rw-r--r--  src/dsp/obmc.cc  125
-rw-r--r--  src/dsp/obmc.h  47
-rw-r--r--  src/dsp/obmc.inc  32
-rw-r--r--  src/dsp/super_res.cc  109
-rw-r--r--  src/dsp/super_res.h  47
-rw-r--r--  src/dsp/warp.cc  475
-rw-r--r--  src/dsp/warp.h  47
-rw-r--r--  src/dsp/weight_mask.cc  227
-rw-r--r--  src/dsp/weight_mask.h  47
-rw-r--r--  src/dsp/x86/average_blend_sse4.cc  156
-rw-r--r--  src/dsp/x86/average_blend_sse4.h  41
-rw-r--r--  src/dsp/x86/cdef_sse4.cc  728
-rw-r--r--  src/dsp/x86/cdef_sse4.h  45
-rw-r--r--  src/dsp/x86/common_avx2.h  138
-rw-r--r--  src/dsp/x86/common_sse4.h  265
-rw-r--r--  src/dsp/x86/convolve_avx2.cc  534
-rw-r--r--  src/dsp/x86/convolve_avx2.h  43
-rw-r--r--  src/dsp/x86/convolve_sse4.cc  2830
-rw-r--r--  src/dsp/x86/convolve_sse4.h  75
-rw-r--r--  src/dsp/x86/distance_weighted_blend_sse4.cc  230
-rw-r--r--  src/dsp/x86/distance_weighted_blend_sse4.h  41
-rw-r--r--  src/dsp/x86/intra_edge_sse4.cc  270
-rw-r--r--  src/dsp/x86/intra_edge_sse4.h  46
-rw-r--r--  src/dsp/x86/intrapred_cfl_sse4.cc  976
-rw-r--r--  src/dsp/x86/intrapred_smooth_sse4.cc  2662
-rw-r--r--  src/dsp/x86/intrapred_sse4.cc  3535
-rw-r--r--  src/dsp/x86/intrapred_sse4.h  1060
-rw-r--r--  src/dsp/x86/inverse_transform_sse4.cc  3086
-rw-r--r--  src/dsp/x86/inverse_transform_sse4.h  89
-rw-r--r--  src/dsp/x86/loop_filter_sse4.cc  2256
-rw-r--r--  src/dsp/x86/loop_filter_sse4.h  119
-rw-r--r--  src/dsp/x86/loop_restoration_10bit_avx2.cc  592
-rw-r--r--  src/dsp/x86/loop_restoration_10bit_sse4.cc  551
-rw-r--r--  src/dsp/x86/loop_restoration_avx2.cc  2902
-rw-r--r--  src/dsp/x86/loop_restoration_avx2.h  52
-rw-r--r--  src/dsp/x86/loop_restoration_sse4.cc  2549
-rw-r--r--  src/dsp/x86/loop_restoration_sse4.h  52
-rw-r--r--  src/dsp/x86/mask_blend_sse4.cc  447
-rw-r--r--  src/dsp/x86/mask_blend_sse4.h  60
-rw-r--r--  src/dsp/x86/motion_field_projection_sse4.cc  397
-rw-r--r--  src/dsp/x86/motion_field_projection_sse4.h  41
-rw-r--r--  src/dsp/x86/motion_vector_search_sse4.cc  262
-rw-r--r--  src/dsp/x86/motion_vector_search_sse4.h  41
-rw-r--r--  src/dsp/x86/obmc_sse4.cc  329
-rw-r--r--  src/dsp/x86/obmc_sse4.h  43
-rw-r--r--  src/dsp/x86/super_res_sse4.cc  166
-rw-r--r--  src/dsp/x86/super_res_sse4.h  38
-rw-r--r--  src/dsp/x86/transpose_sse4.h  307
-rw-r--r--  src/dsp/x86/warp_sse4.cc  525
-rw-r--r--  src/dsp/x86/warp_sse4.h  44
-rw-r--r--  src/dsp/x86/weight_mask_sse4.cc  464
-rw-r--r--  src/dsp/x86/weight_mask_sse4.h  104
-rw-r--r--  src/film_grain.cc  817
-rw-r--r--  src/film_grain.h  193
-rw-r--r--  src/frame_buffer.cc  151
-rw-r--r--  src/frame_buffer_utils.h  78
-rw-r--r--  src/frame_scratch_buffer.h  113
-rw-r--r--  src/gav1/decoder.h  148
-rw-r--r--  src/gav1/decoder_buffer.h  279
-rw-r--r--  src/gav1/decoder_settings.h  144
-rw-r--r--  src/gav1/frame_buffer.h  177
-rw-r--r--  src/gav1/status_code.h  118
-rw-r--r--  src/gav1/symbol_visibility.h  88
-rw-r--r--  src/gav1/version.h  71
-rw-r--r--  src/inter_intra_masks.inc  581
-rw-r--r--  src/internal_frame_buffer_list.cc  122
-rw-r--r--  src/internal_frame_buffer_list.h  81
-rw-r--r--  src/libgav1_decoder.cmake  157
-rw-r--r--  src/loop_restoration_info.cc  240
-rw-r--r--  src/loop_restoration_info.h  104
-rw-r--r--  src/motion_vector.cc  1001
-rw-r--r--  src/motion_vector.h  59
-rw-r--r--  src/obu_parser.cc  2885
-rw-r--r--  src/obu_parser.h  406
-rw-r--r--  src/post_filter.h  565
-rw-r--r--  src/post_filter/cdef.cc  660
-rw-r--r--  src/post_filter/deblock.cc  523
-rw-r--r--  src/post_filter/deblock_thresholds.inc  85
-rw-r--r--  src/post_filter/loop_restoration.cc  172
-rw-r--r--  src/post_filter/post_filter.cc  601
-rw-r--r--  src/post_filter/super_res.cc  199
-rw-r--r--  src/prediction_mask.cc  236
-rw-r--r--  src/prediction_mask.h  41
-rw-r--r--  src/quantizer.cc  269
-rw-r--r--  src/quantizer.h  74
-rw-r--r--  src/quantizer_tables.inc  3080
-rw-r--r--  src/reconstruction.cc  190
-rw-r--r--  src/reconstruction.h  54
-rw-r--r--  src/residual_buffer_pool.cc  142
-rw-r--r--  src/residual_buffer_pool.h  203
-rw-r--r--  src/scan_tables.inc  440
-rw-r--r--  src/status_code.cc  57
-rw-r--r--  src/symbol_decoder_context.cc  322
-rw-r--r--  src/symbol_decoder_context.h  301
-rw-r--r--  src/symbol_decoder_context_cdfs.inc  2509
-rw-r--r--  src/threading_strategy.cc  222
-rw-r--r--  src/threading_strategy.h  131
-rw-r--r--  src/tile.h  914
-rw-r--r--  src/tile/bitstream/mode_info.cc  1303
-rw-r--r--  src/tile/bitstream/palette.cc  319
-rw-r--r--  src/tile/bitstream/partition.cc  148
-rw-r--r--  src/tile/bitstream/transform_size.cc  222
-rw-r--r--  src/tile/prediction.cc  1361
-rw-r--r--  src/tile/tile.cc  2573
-rw-r--r--  src/tile_scratch_buffer.cc  26
-rw-r--r--  src/tile_scratch_buffer.h  160
-rw-r--r--  src/utils/array_2d.h  131
-rw-r--r--  src/utils/bit_mask_set.h  79
-rw-r--r--  src/utils/bit_reader.cc  117
-rw-r--r--  src/utils/bit_reader.h  49
-rw-r--r--  src/utils/block_parameters_holder.cc  107
-rw-r--r--  src/utils/block_parameters_holder.h  85
-rw-r--r--  src/utils/blocking_counter.h  97
-rw-r--r--  src/utils/common.h  534
-rw-r--r--  src/utils/compiler_attributes.h  181
-rw-r--r--  src/utils/constants.cc  874
-rw-r--r--  src/utils/constants.h  744
-rw-r--r--  src/utils/cpu.cc  84
-rw-r--r--  src/utils/cpu.h  107
-rw-r--r--  src/utils/dynamic_buffer.h  82
-rw-r--r--  src/utils/entropy_decoder.cc  1117
-rw-r--r--  src/utils/entropy_decoder.h  123
-rw-r--r--  src/utils/executor.cc  21
-rw-r--r--  src/utils/executor.h  36
-rw-r--r--  src/utils/libgav1_utils.cmake  72
-rw-r--r--  src/utils/logging.cc  65
-rw-r--r--  src/utils/logging.h  85
-rw-r--r--  src/utils/memory.h  237
-rw-r--r--  src/utils/parameter_tree.cc  133
-rw-r--r--  src/utils/parameter_tree.h  113
-rw-r--r--  src/utils/queue.h  105
-rw-r--r--  src/utils/raw_bit_reader.cc  224
-rw-r--r--  src/utils/raw_bit_reader.h  78
-rw-r--r--  src/utils/reference_info.h  92
-rw-r--r--  src/utils/segmentation.cc  31
-rw-r--r--  src/utils/segmentation.h  32
-rw-r--r--  src/utils/segmentation_map.cc  49
-rw-r--r--  src/utils/segmentation_map.h  71
-rw-r--r--  src/utils/stack.h  59
-rw-r--r--  src/utils/threadpool.cc  323
-rw-r--r--  src/utils/threadpool.h  167
-rw-r--r--  src/utils/types.h  525
-rw-r--r--  src/utils/unbounded_queue.h  245
-rw-r--r--  src/utils/vector.h  352
-rw-r--r--  src/version.cc  39
-rw-r--r--  src/warp_prediction.cc  244
-rw-r--r--  src/warp_prediction.h  40
-rw-r--r--  src/yuv_buffer.cc  201
-rw-r--r--  src/yuv_buffer.h  183
-rw-r--r--  tests/fuzzer/decoder_fuzzer.cc  87
-rw-r--r--  tests/fuzzer/decoder_fuzzer_frame_parallel.cc  139
-rw-r--r--  tests/fuzzer/fuzzer_temp_file.h  148
-rw-r--r--  tests/fuzzer/obu_parser_fuzzer.cc  89
273 files changed, 102925 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..87ccf24
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/build
+/third_party
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..d92ea0a
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,6 @@
+# This is the list of libgav1 authors for copyright purposes.
+#
+# This does not necessarily list everyone who has contributed code, since in
+# some cases, their employer may be the copyright holder. To see the full list
+# of contributors, see the revision history in source control.
+Google LLC
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..5d00ae6
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,126 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# libgav1 requires modern CMake.
+cmake_minimum_required(VERSION 3.7.1 FATAL_ERROR)
+
+# libgav1 requires C++11.
+set(CMAKE_CXX_STANDARD 11)
+set(ABSL_CXX_STANDARD 11)
+
+project(libgav1 CXX)
+
+set(libgav1_root "${CMAKE_CURRENT_SOURCE_DIR}")
+set(libgav1_build "${CMAKE_BINARY_DIR}")
+
+if("${libgav1_root}" STREQUAL "${libgav1_build}")
+ message(
+ FATAL_ERROR
+ "Building from within the libgav1 source tree is not supported.\n"
+ "Hint: Run these commands\n" "$ rm -rf CMakeCache.txt CMakeFiles\n"
+ "$ mkdir -p ../libgav1_build\n" "$ cd ../libgav1_build\n"
+ "And re-run CMake from the libgav1_build directory.")
+endif()
+
+set(libgav1_examples "${libgav1_root}/examples")
+set(libgav1_source "${libgav1_root}/src")
+
+include(FindThreads)
+
+include("${libgav1_examples}/libgav1_examples.cmake")
+include("${libgav1_root}/cmake/libgav1_build_definitions.cmake")
+include("${libgav1_root}/cmake/libgav1_cpu_detection.cmake")
+include("${libgav1_root}/cmake/libgav1_flags.cmake")
+include("${libgav1_root}/cmake/libgav1_helpers.cmake")
+include("${libgav1_root}/cmake/libgav1_install.cmake")
+include("${libgav1_root}/cmake/libgav1_intrinsics.cmake")
+include("${libgav1_root}/cmake/libgav1_options.cmake")
+include("${libgav1_root}/cmake/libgav1_sanitizer.cmake")
+include("${libgav1_root}/cmake/libgav1_targets.cmake")
+include("${libgav1_root}/cmake/libgav1_variables.cmake")
+include("${libgav1_source}/dsp/libgav1_dsp.cmake")
+include("${libgav1_source}/libgav1_decoder.cmake")
+include("${libgav1_source}/utils/libgav1_utils.cmake")
+
+libgav1_option(NAME LIBGAV1_ENABLE_OPTIMIZATIONS HELPSTRING
+ "Enables optimized code." VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_AVX2 HELPSTRING
+ "Enables avx2 optimizations." VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_NEON HELPSTRING "Enables neon optimizations."
+ VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_SSE4_1 HELPSTRING
+ "Enables sse4.1 optimizations." VALUE ON)
+libgav1_option(
+ NAME LIBGAV1_VERBOSE HELPSTRING
+ "Enables verbose build system output. Higher numbers are more verbose." VALUE
+ OFF)
+
+if(NOT CMAKE_BUILD_TYPE)
+ set(CMAKE_BUILD_TYPE Release)
+endif()
+
+libgav1_optimization_detect()
+libgav1_set_build_definitions()
+libgav1_set_cxx_flags()
+libgav1_configure_sanitizer()
+
+# Supported bit depth.
+libgav1_track_configuration_variable(LIBGAV1_MAX_BITDEPTH)
+
+# C++ and linker flags.
+libgav1_track_configuration_variable(LIBGAV1_CXX_FLAGS)
+libgav1_track_configuration_variable(LIBGAV1_EXE_LINKER_FLAGS)
+
+# Sanitizer integration.
+libgav1_track_configuration_variable(LIBGAV1_SANITIZE)
+
+# Generated source file directory.
+libgav1_track_configuration_variable(LIBGAV1_GENERATED_SOURCES_DIRECTORY)
+
+# Controls use of std::mutex and absl::Mutex in ThreadPool.
+libgav1_track_configuration_variable(LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+
+if(LIBGAV1_VERBOSE)
+ libgav1_dump_cmake_flag_variables()
+ libgav1_dump_tracked_configuration_variables()
+ libgav1_dump_options()
+endif()
+
+set(libgav1_abseil_build "${libgav1_build}/abseil")
+set(libgav1_gtest_build "${libgav1_build}/gtest")
+
+# Compiler/linker flags must be lists, but come in from the environment as
+# strings. Break them up:
+if(NOT "${LIBGAV1_CXX_FLAGS}" STREQUAL "")
+ separate_arguments(LIBGAV1_CXX_FLAGS)
+endif()
+if(NOT "${LIBGAV1_EXE_LINKER_FLAGS}" STREQUAL "")
+ separate_arguments(LIBGAV1_EXE_LINKER_FLAGS)
+endif()
+
+add_subdirectory("${libgav1_root}/third_party/abseil-cpp"
+ "${libgav1_abseil_build}" EXCLUDE_FROM_ALL)
+
+libgav1_reset_target_lists()
+libgav1_add_dsp_targets()
+libgav1_add_decoder_targets()
+libgav1_add_examples_targets()
+libgav1_add_utils_targets()
+libgav1_setup_install_target()
+
+if(LIBGAV1_VERBOSE)
+ libgav1_dump_cmake_flag_variables()
+ libgav1_dump_tracked_configuration_variables()
+ libgav1_dump_options()
+endif()
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..69140ff
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,27 @@
+# How to Contribute
+
+We'd love to accept your patches and contributions to this project. There are
+just a few small guidelines you need to follow.
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted one
+(even if it was for a different project), you probably don't need to do it
+again.
+
+## Code reviews
+
+All submissions, including submissions by project members, require review. We
+use a [Gerrit](https://www.gerritcodereview.com) instance hosted at
+https://chromium-review.googlesource.com for this purpose.
+
+## Community Guidelines
+
+This project follows
+[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8ab8eab
--- /dev/null
+++ b/README.md
@@ -0,0 +1,168 @@
+# libgav1 -- an AV1 decoder
+
+libgav1 is a Main profile (0) & High profile (1) compliant AV1 decoder. More
+information on the AV1 video format can be found at
+[aomedia.org](https://aomedia.org).
+
+[TOC]
+
+## Building
+
+### Prerequisites
+
+1. A C++11 compiler. gcc 6+, clang 7+ or Microsoft Visual Studio 2017+ are
+ recommended.
+
+2. [CMake >= 3.7.1](https://cmake.org/download/)
+
+3. [Abseil](https://abseil.io)
+
+ From within the libgav1 directory:
+
+ ```shell
+ $ git clone https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp
+ ```
+
+### Compile
+
+```shell
+ $ mkdir build && cd build
+ $ cmake -G "Unix Makefiles" ..
+ $ make
+```
+
+Configuration options:
+
+* `LIBGAV1_MAX_BITDEPTH`: defines the maximum supported bitdepth (8, 10;
+ default: 10).
+* `LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS`: define to a non-zero value to disable
+ [symbol reduction](#symbol-reduction) in an optimized build to keep all
+ versions of dsp functions available. Automatically defined in
+ `src/dsp/dsp.h` if unset.
+* `LIBGAV1_ENABLE_AVX2`: define to a non-zero value to enable avx2
+ optimizations. Automatically defined in `src/utils/cpu.h` if unset.
+* `LIBGAV1_ENABLE_NEON`: define to a non-zero value to enable NEON
+ optimizations. Automatically defined in `src/utils/cpu.h` if unset.
+* `LIBGAV1_ENABLE_SSE4_1`: define to a non-zero value to enable sse4.1
+ optimizations. Automatically defined in `src/utils/cpu.h` if unset. Note
+ setting this to 0 will also disable AVX2.
+* `LIBGAV1_ENABLE_LOGGING`: define to 0/1 to control debug logging.
+ Automatically defined in `src/utils/logging.h` if unset.
+* `LIBGAV1_EXAMPLES_ENABLE_LOGGING`: define to 0/1 to control error logging in
+ the examples. Automatically defined in `examples/logging.h` if unset.
+* `LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK`: define to 1 to enable transform
+ coefficient range checks.
+* `LIBGAV1_LOG_LEVEL`: controls the maximum allowed log level, see `enum
+ LogSeverity` in `src/utils/logging.h`. Automatically defined in
+ `src/utils/logging.cc` if unset.
+* `LIBGAV1_THREADPOOL_USE_STD_MUTEX`: controls use of std::mutex and
+ absl::Mutex in ThreadPool. Defining this to 1 will remove any Abseil
+ dependency from the core library. Automatically defined in
+ `src/utils/threadpool.h` if unset.
+* `LIBGAV1_MAX_THREADS`: sets the number of threads that the library is
+ allowed to create. Has to be an integer > 0. Otherwise this is ignored.
+ The default value is 128.
+* `LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER`: the threshold multiplier that
+ is used to determine when to use frame parallel decoding. Frame parallel
+ decoding will be used if |threads| > |tile_count| * this multiplier. Has to
+ be an integer > 0. The default value is 4. This is an advanced setting
+ intended for testing purposes.
+
+For additional options see:
+
+```shell
+ $ cmake .. -LH
+```
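+
+As a sketch, configuration options that the CMake files read (for example
+`LIBGAV1_MAX_BITDEPTH`, `LIBGAV1_ENABLE_AVX2` and
+`LIBGAV1_THREADPOOL_USE_STD_MUTEX`) can be passed at configure time with `-D`;
+the values below are only illustrative:
+
+```shell
+ $ cmake .. -DLIBGAV1_MAX_BITDEPTH=8 -DLIBGAV1_ENABLE_AVX2=0 \
+     -DLIBGAV1_THREADPOOL_USE_STD_MUTEX=1
+ $ make
+```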
+
+## Testing
+
+* `gav1_decode` can be used to decode IVF files, see `gav1_decode --help` for
+ options. Note: tools like [FFmpeg](https://ffmpeg.org) can be used to
+ convert other container formats to IVF.
+
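+As a sketch of the FFmpeg note above (assuming FFmpeg is installed; the file
+names are placeholders):
+
+```shell
+ # Remux an AV1 stream from WebM into IVF without re-encoding.
+ $ ffmpeg -i clip.webm -c:v copy -f ivf clip.ivf
+ # List gav1_decode's options before decoding.
+ $ ./gav1_decode --help
+```
+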
+## Development
+
+### Contributing
+
+See [CONTRIBUTING.md](CONTRIBUTING.md) for details on how to submit patches.
+
+### Style
+
+libgav1 follows the
+[Google C++ style guide](https://google.github.io/styleguide/cppguide.html) with
+formatting enforced by `clang-format`.
+
+### Comments
+
+Comments of the form '`// X.Y(.Z).`', '`Section X.Y(.Z).`' or '`... in the
+spec`' reference the relevant section(s) in the
+[AV1 specification](http://aomediacodec.github.io/av1-spec/av1-spec.pdf).
+
+### DSP structure
+
+* `src/dsp/dsp.cc` defines the main entry point: `libgav1::dsp::DspInit()`.
+ This handles cpu-detection and initializing each logical unit which populate
+ `libgav1::dsp::Dsp` function tables.
+* `src/dsp/dsp.h` contains function and type definitions for all logical units
+ (e.g., intra-predictors)
+* `src/utils/cpu.h` contains definitions for cpu-detection
+* base implementations are located in `src/dsp/*.{h,cc}` with platform
+ specific optimizations in sub-folders
+* unit tests define `DISABLED_Speed` test(s) to allow timing of individual
+ functions
+
+#### Symbol reduction
+
+Based on the build configuration unneeded lesser optimizations are removed using
+a hierarchical include and define system. Each logical unit in `src/dsp` should
+include all platform specific headers in descending order to allow higher level
+optimizations to disable lower level ones. See `src/dsp/loop_filter.h` for an
+example.
+
+Each function receives a new define which can be checked in platform specific
+headers. The format is: `LIBGAV1_<Dsp-table>_FunctionName` or
+`LIBGAV1_<Dsp-table>_[sub-table-index1][...-indexN]`, e.g.,
+`LIBGAV1_Dsp8bpp_AverageBlend`,
+`LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc`. The Dsp-table name is of
+the form `Dsp<bitdepth>bpp` e.g. `Dsp10bpp` for bitdepth == 10 (bpp stands for
+bits per pixel). The indices correspond to enum values used as lookups with
+leading 'k' removed. Platform specific headers then should first check if the
+symbol is defined and if not set the value to the corresponding
+`LIBGAV1_CPU_<arch>` value from `src/utils/cpu.h`.
+
+```
+ #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc
+ #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+ #endif
+```
+
+Within each module the code should check if the symbol is defined to its
+specific architecture or forced via `LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS` before
+defining the function. The `DSP_ENABLED_(8|10)BPP_*` macros are available to
+simplify this check for optimized code.
+
+```
+ #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
+ ...
+
+ // In unoptimized code use the following structure; there's no equivalent
+ // define for LIBGAV1_CPU_C as it would require duplicating the function
+ // defines used in optimized code for only a small benefit to this
+ // boilerplate.
+ #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ ...
+ #else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcFill
+ ...
+```
+
+## Bugs
+
+Please report all bugs to the issue tracker:
+https://issuetracker.google.com/issues/new?component=750480&template=1355007
+
+## Discussion
+
+Email: gav1-devel@googlegroups.com
+
+Web: https://groups.google.com/forum/#!forum/gav1-devel
diff --git a/cmake/libgav1-config.cmake.template b/cmake/libgav1-config.cmake.template
new file mode 100644
index 0000000..dc253d3
--- /dev/null
+++ b/cmake/libgav1-config.cmake.template
@@ -0,0 +1,2 @@
+set(LIBGAV1_INCLUDE_DIRS "@LIBGAV1_INCLUDE_DIRS@")
+set(LIBGAV1_LIBRARIES "gav1")
diff --git a/cmake/libgav1.pc.template b/cmake/libgav1.pc.template
new file mode 100644
index 0000000..c571a43
--- /dev/null
+++ b/cmake/libgav1.pc.template
@@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: @PROJECT_NAME@
+Description: AV1 decoder library (@LIBGAV1_MAX_BITDEPTH@-bit).
+Version: @LIBGAV1_VERSION@
+Cflags: -I${includedir}
+Libs: -L${libdir} -lgav1
+Libs.private: @CMAKE_THREAD_LIBS_INIT@
diff --git a/cmake/libgav1_build_definitions.cmake b/cmake/libgav1_build_definitions.cmake
new file mode 100644
index 0000000..b170e7e
--- /dev/null
+++ b/cmake/libgav1_build_definitions.cmake
@@ -0,0 +1,150 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_ 1)
+
+macro(libgav1_set_build_definitions)
+ string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type_lowercase)
+
+ libgav1_load_version_info()
+ set(LIBGAV1_SOVERSION 0)
+
+ list(APPEND libgav1_include_paths "${libgav1_root}" "${libgav1_root}/src"
+ "${libgav1_build}" "${libgav1_root}/third_party/abseil-cpp")
+ list(APPEND libgav1_gtest_include_paths
+ "third_party/googletest/googlemock/include"
+ "third_party/googletest/googletest/include"
+ "third_party/googletest/googletest")
+ list(APPEND libgav1_test_include_paths ${libgav1_include_paths}
+ ${libgav1_gtest_include_paths})
+ list(APPEND libgav1_defines "LIBGAV1_CMAKE=1"
+ "LIBGAV1_FLAGS_SRCDIR=\"${libgav1_root}\""
+ "LIBGAV1_FLAGS_TMPDIR=\"/tmp\"")
+
+ if(MSVC OR WIN32)
+ list(APPEND libgav1_defines "_CRT_SECURE_NO_DEPRECATE=1" "NOMINMAX=1")
+ endif()
+
+ if(ANDROID)
+ if(CMAKE_ANDROID_ARCH_ABI STREQUAL "armeabi-v7a")
+ set(CMAKE_ANDROID_ARM_MODE ON)
+ endif()
+
+ if(build_type_lowercase MATCHES "rel")
+ list(APPEND libgav1_base_cxx_flags "-fno-stack-protector")
+ endif()
+ endif()
+
+ list(APPEND libgav1_base_cxx_flags "-Wall" "-Wextra" "-Wmissing-declarations"
+ "-Wno-sign-compare" "-fvisibility=hidden"
+ "-fvisibility-inlines-hidden")
+
+ if(BUILD_SHARED_LIBS)
+ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+ set(libgav1_dependency libgav1_shared)
+ else()
+ set(libgav1_dependency libgav1_static)
+ endif()
+
+ list(APPEND libgav1_clang_cxx_flags "-Wextra-semi" "-Wmissing-prototypes"
+ "-Wshorten-64-to-32")
+
+ if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6")
+ # Quiet warnings in copy-list-initialization where {} elision has always
+ # been allowed.
+ list(APPEND libgav1_clang_cxx_flags "-Wno-missing-braces")
+ endif()
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8)
+ list(APPEND libgav1_clang_cxx_flags "-Wextra-semi-stmt")
+ endif()
+ endif()
+
+ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "7")
+ # Quiet warnings due to potential snprintf() truncation in threadpool.cc.
+ list(APPEND libgav1_base_cxx_flags "-Wno-format-truncation")
+
+ if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7")
+ # Quiet gcc 6 vs 7 abi warnings:
+ # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728
+ list(APPEND libgav1_base_cxx_flags "-Wno-psabi")
+ list(APPEND ABSL_GCC_FLAGS "-Wno-psabi")
+ endif()
+ endif()
+ endif()
+
+ if(build_type_lowercase MATCHES "rel")
+ # TODO(tomfinegan): this value is only a concern for the core library and
+ # can be made smaller if the test targets are avoided.
+ list(APPEND libgav1_base_cxx_flags "-Wstack-usage=196608")
+ endif()
+
+ list(APPEND libgav1_msvc_cxx_flags
+ # Warning level 3.
+ "/W3"
+ # Disable warning C4018:
+ # '<comparison operator>' signed/unsigned mismatch
+ "/wd4018"
+ # Disable warning C4244:
+ # 'argument': conversion from '<double/int>' to
+ # '<float/smaller int type>', possible loss of data
+ "/wd4244"
+ # Disable warning C4267:
+ # '=': conversion from '<double/int>' to
+ # '<float/smaller int type>', possible loss of data
+ "/wd4267"
+ # Disable warning C4309:
+ # 'argument': truncation of constant value
+ "/wd4309"
+ # Disable warning C4551:
+ # function call missing argument list
+ "/wd4551")
+
+ if(BUILD_SHARED_LIBS)
+ list(APPEND libgav1_msvc_cxx_flags
+ # Disable warning C4251:
+ # 'libgav1::DecoderImpl class member' needs to have
+ # dll-interface to be used by clients of class
+ # 'libgav1::Decoder'.
+ "/wd4251")
+ endif()
+
+ if(NOT LIBGAV1_MAX_BITDEPTH)
+ set(LIBGAV1_MAX_BITDEPTH 10)
+ elseif(NOT LIBGAV1_MAX_BITDEPTH EQUAL 8 AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 10)
+ libgav1_die("LIBGAV1_MAX_BITDEPTH must be 8 or 10.")
+ endif()
+
+ list(APPEND libgav1_defines "LIBGAV1_MAX_BITDEPTH=${LIBGAV1_MAX_BITDEPTH}")
+
+ if(DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+ if(NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX EQUAL 0
+ AND NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX EQUAL 1)
+ libgav1_die("LIBGAV1_THREADPOOL_USE_STD_MUTEX must be 0 or 1.")
+ endif()
+
+ list(APPEND libgav1_defines
+ "LIBGAV1_THREADPOOL_USE_STD_MUTEX=${LIBGAV1_THREADPOOL_USE_STD_MUTEX}")
+ endif()
+
+ # Source file names ending in these suffixes will have the appropriate
+ # compiler flags added to their compile commands to enable intrinsics.
+ set(libgav1_avx2_source_file_suffix "avx2.cc")
+ set(libgav1_neon_source_file_suffix "neon.cc")
+ set(libgav1_sse4_source_file_suffix "sse4.cc")
+endmacro()
diff --git a/cmake/libgav1_cpu_detection.cmake b/cmake/libgav1_cpu_detection.cmake
new file mode 100644
index 0000000..e17e27c
--- /dev/null
+++ b/cmake/libgav1_cpu_detection.cmake
@@ -0,0 +1,49 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_ 1)
+
+# Detect optimizations available for the current target CPU.
+macro(libgav1_optimization_detect)
+ if(LIBGAV1_ENABLE_OPTIMIZATIONS)
+ string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" cpu_lowercase)
+ if(cpu_lowercase MATCHES "^arm|^aarch64")
+ set(libgav1_have_neon ON)
+ elseif(cpu_lowercase MATCHES "^x86|amd64")
+ set(libgav1_have_avx2 ON)
+ set(libgav1_have_sse4 ON)
+ endif()
+ endif()
+
+ if(libgav1_have_avx2 AND LIBGAV1_ENABLE_AVX2)
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_AVX2=1")
+ else()
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_AVX2=0")
+ endif()
+
+ if(libgav1_have_neon AND LIBGAV1_ENABLE_NEON)
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_NEON=1")
+ else()
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_NEON=0")
+ endif()
+
+ if(libgav1_have_sse4 AND LIBGAV1_ENABLE_SSE4_1)
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_SSE4_1=1")
+ else()
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_SSE4_1=0")
+ endif()
+endmacro()
diff --git a/cmake/libgav1_flags.cmake b/cmake/libgav1_flags.cmake
new file mode 100644
index 0000000..2d8d9a6
--- /dev/null
+++ b/cmake/libgav1_flags.cmake
@@ -0,0 +1,251 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_ 1)
+
+include(CheckCXXCompilerFlag)
+include(CheckCXXSourceCompiles)
+
+# Adds compiler flags specified by FLAGS to the sources specified by SOURCES:
+#
+# libgav1_set_compiler_flags_for_sources(SOURCES <sources> FLAGS <flags>)
+macro(libgav1_set_compiler_flags_for_sources)
+ unset(compiler_SOURCES)
+ unset(compiler_FLAGS)
+ unset(optional_args)
+ unset(single_value_args)
+ set(multi_value_args SOURCES FLAGS)
+ cmake_parse_arguments(compiler "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT (compiler_SOURCES AND compiler_FLAGS))
+ libgav1_die("libgav1_set_compiler_flags_for_sources: SOURCES and "
+ "FLAGS required.")
+ endif()
+
+ set_source_files_properties(${compiler_SOURCES} PROPERTIES COMPILE_FLAGS
+ ${compiler_FLAGS})
+
+ if(LIBGAV1_VERBOSE GREATER 1)
+ foreach(source ${compiler_SOURCES})
+ foreach(flag ${compiler_FLAGS})
+ message("libgav1_set_compiler_flags_for_sources: source:${source} "
+ "flag:${flag}")
+ endforeach()
+ endforeach()
+ endif()
+endmacro()
+
+# Tests compiler flags stored in list(s) specified by FLAG_LIST_VAR_NAMES, adds
+# flags to $LIBGAV1_CXX_FLAGS when tests pass. Terminates configuration if
+# FLAG_REQUIRED is specified and any flag check fails.
+#
+# ~~~
+# libgav1_test_cxx_flag(<FLAG_LIST_VAR_NAMES <flag list variable(s)>>
+# [FLAG_REQUIRED])
+# ~~~
+macro(libgav1_test_cxx_flag)
+ unset(cxx_test_FLAG_LIST_VAR_NAMES)
+ unset(cxx_test_FLAG_REQUIRED)
+ unset(single_value_args)
+ set(optional_args FLAG_REQUIRED)
+ set(multi_value_args FLAG_LIST_VAR_NAMES)
+ cmake_parse_arguments(cxx_test "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT cxx_test_FLAG_LIST_VAR_NAMES)
+ libgav1_die("libgav1_test_cxx_flag: FLAG_LIST_VAR_NAMES required")
+ endif()
+
+ unset(cxx_flags)
+ foreach(list_var ${cxx_test_FLAG_LIST_VAR_NAMES})
+ if(LIBGAV1_VERBOSE)
+ message("libgav1_test_cxx_flag: adding ${list_var} to cxx_flags")
+ endif()
+ list(APPEND cxx_flags ${${list_var}})
+ endforeach()
+
+ if(LIBGAV1_VERBOSE)
+ message("CXX test: all flags: ${cxx_flags}")
+ endif()
+
+ unset(all_cxx_flags)
+ list(APPEND all_cxx_flags ${LIBGAV1_CXX_FLAGS} ${cxx_flags})
+
+ # Turn off output from check_cxx_source_compiles. Print status directly
+ # instead since the logging messages from check_cxx_source_compiles can be
+ # quite confusing.
+ set(CMAKE_REQUIRED_QUIET TRUE)
+
+ # Run the actual compile test.
+ unset(libgav1_all_cxx_flags_pass CACHE)
+ message("--- Running combined CXX flags test, flags: ${all_cxx_flags}")
+ check_cxx_compiler_flag("${all_cxx_flags}" libgav1_all_cxx_flags_pass)
+
+ if(cxx_test_FLAG_REQUIRED AND NOT libgav1_all_cxx_flags_pass)
+ libgav1_die("Flag test failed for required flag(s): "
+ "${all_cxx_flags} and FLAG_REQUIRED specified.")
+ endif()
+
+ if(libgav1_all_cxx_flags_pass)
+ # Test passed: update the global flag list used by the libgav1 target
+ # creation wrappers.
+ set(LIBGAV1_CXX_FLAGS ${cxx_flags})
+ list(REMOVE_DUPLICATES LIBGAV1_CXX_FLAGS)
+
+ if(LIBGAV1_VERBOSE)
+ message("LIBGAV1_CXX_FLAGS=${LIBGAV1_CXX_FLAGS}")
+ endif()
+
+ message("--- Passed combined CXX flags test")
+ else()
+ message("--- Failed combined CXX flags test, testing flags individually.")
+
+ if(cxx_flags)
+ message("--- Testing flags from $cxx_flags: " "${cxx_flags}")
+ foreach(cxx_flag ${cxx_flags})
+ # Between 3.17.0 and 3.18.2 check_cxx_compiler_flag() sets a normal
+ # variable at parent scope while check_cxx_source_compiles() continues
+ # to set an internal cache variable, so we unset both to avoid the
+ # failure / success state persisting between checks. See
+ # https://gitlab.kitware.com/cmake/cmake/-/issues/21207.
+ unset(cxx_flag_test_passed)
+ unset(cxx_flag_test_passed CACHE)
+ message("--- Testing flag: ${cxx_flag}")
+ check_cxx_compiler_flag("${cxx_flag}" cxx_flag_test_passed)
+
+ if(cxx_flag_test_passed)
+ message("--- Passed test for ${cxx_flag}")
+ else()
+ list(REMOVE_ITEM cxx_flags ${cxx_flag})
+ message("--- Failed test for ${cxx_flag}, flag removed.")
+ endif()
+ endforeach()
+
+ set(LIBGAV1_CXX_FLAGS ${cxx_flags})
+ endif()
+ endif()
+
+ if(LIBGAV1_CXX_FLAGS)
+ list(REMOVE_DUPLICATES LIBGAV1_CXX_FLAGS)
+ endif()
+endmacro()
+
+# Tests executable linker flags stored in list specified by FLAG_LIST_VAR_NAME,
+# adds flags to $LIBGAV1_EXE_LINKER_FLAGS when test passes. Terminates
+# configuration when flag check fails. libgav1_set_cxx_flags() must be called
+# before calling this macro because it assumes $LIBGAV1_CXX_FLAGS contains only
+# valid CXX flags.
+#
+# libgav1_test_exe_linker_flag(<FLAG_LIST_VAR_NAME <flag list variable)>)
+macro(libgav1_test_exe_linker_flag)
+ unset(link_FLAG_LIST_VAR_NAME)
+ unset(optional_args)
+ unset(multi_value_args)
+ set(single_value_args FLAG_LIST_VAR_NAME)
+ cmake_parse_arguments(link "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT link_FLAG_LIST_VAR_NAME)
+ libgav1_die("libgav1_test_link_flag: FLAG_LIST_VAR_NAME required")
+ endif()
+
+ libgav1_set_and_stringify(DEST linker_flags SOURCE_VARS
+ ${link_FLAG_LIST_VAR_NAME})
+
+ if(LIBGAV1_VERBOSE)
+ message("EXE LINKER test: all flags: ${linker_flags}")
+ endif()
+
+ # Tests of $LIBGAV1_CXX_FLAGS have already passed. Include them with the
+ # linker test.
+ libgav1_set_and_stringify(DEST CMAKE_REQUIRED_FLAGS SOURCE_VARS
+ LIBGAV1_CXX_FLAGS)
+
+ # Cache the global exe linker flags.
+ if(CMAKE_EXE_LINKER_FLAGS)
+ set(cached_CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS})
+ libgav1_set_and_stringify(DEST CMAKE_EXE_LINKER_FLAGS SOURCE
+ ${linker_flags})
+ endif()
+
+ libgav1_set_and_stringify(DEST CMAKE_EXE_LINKER_FLAGS SOURCE ${linker_flags}
+ ${CMAKE_EXE_LINKER_FLAGS})
+
+ # Turn off output from check_cxx_source_compiles. Print status directly
+ # instead since the logging messages from check_cxx_source_compiles can be
+ # quite confusing.
+ set(CMAKE_REQUIRED_QUIET TRUE)
+
+ message("--- Running EXE LINKER test for flags: ${linker_flags}")
+
+ unset(linker_flag_test_passed CACHE)
+ set(libgav1_cxx_main "\nint main() { return 0; }")
+ check_cxx_source_compiles("${libgav1_cxx_main}" linker_flag_test_passed)
+
+ if(NOT linker_flag_test_passed)
+ libgav1_die("EXE LINKER test failed.")
+ endif()
+
+ message("--- Passed EXE LINKER flag test.")
+
+ # Restore cached global exe linker flags.
+ if(cached_CMAKE_EXE_LINKER_FLAGS)
+ set(CMAKE_EXE_LINKER_FLAGS cached_CMAKE_EXE_LINKER_FLAGS)
+ else()
+ unset(CMAKE_EXE_LINKER_FLAGS)
+ endif()
+endmacro()
+
+# Runs the libgav1 compiler tests. This macro builds up the list of list var(s)
+# that is passed to libgav1_test_cxx_flag().
+#
+# Note: libgav1_set_build_definitions() must be called before this macro.
+macro(libgav1_set_cxx_flags)
+ unset(cxx_flag_lists)
+
+ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+ list(APPEND cxx_flag_lists libgav1_base_cxx_flags)
+ endif()
+
+ # Append clang flags after the base set to allow -Wno* overrides to take
+ # effect. Some of the base flags may enable a large set of warnings, e.g.,
+ # -Wall.
+ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ list(APPEND cxx_flag_lists libgav1_clang_cxx_flags)
+ endif()
+
+ if(MSVC)
+ list(APPEND cxx_flag_lists libgav1_msvc_cxx_flags)
+ endif()
+
+ if(LIBGAV1_VERBOSE)
+ if(cxx_flag_lists)
+ libgav1_set_and_stringify(DEST cxx_flags SOURCE_VARS ${cxx_flag_lists})
+ message("libgav1_set_cxx_flags: internal CXX flags: ${cxx_flags}")
+ endif()
+ endif()
+
+ if(LIBGAV1_CXX_FLAGS)
+ list(APPEND cxx_flag_lists LIBGAV1_CXX_FLAGS)
+ if(LIBGAV1_VERBOSE)
+ message("libgav1_set_cxx_flags: user CXX flags: ${LIBGAV1_CXX_FLAGS}")
+ endif()
+ endif()
+
+ libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES ${cxx_flag_lists})
+endmacro()
diff --git a/cmake/libgav1_helpers.cmake b/cmake/libgav1_helpers.cmake
new file mode 100644
index 0000000..76d8d67
--- /dev/null
+++ b/cmake/libgav1_helpers.cmake
@@ -0,0 +1,134 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_ 1)
+
+# Kills build generation using message(FATAL_ERROR) and outputs all data passed
+# to the console via use of $ARGN.
+macro(libgav1_die)
+ message(FATAL_ERROR ${ARGN})
+endmacro()
+
+# Converts semi-colon delimited list variable(s) to string. Output is written to
+# variable supplied via the DEST parameter. Input is from an expanded variable
+# referenced by SOURCE and/or variable(s) referenced by SOURCE_VARS.
+macro(libgav1_set_and_stringify)
+ set(optional_args)
+ set(single_value_args DEST SOURCE_VAR)
+ set(multi_value_args SOURCE SOURCE_VARS)
+ cmake_parse_arguments(sas "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT sas_DEST OR NOT (sas_SOURCE OR sas_SOURCE_VARS))
+ libgav1_die("libgav1_set_and_stringify: DEST and at least one of SOURCE "
+ "SOURCE_VARS required.")
+ endif()
+
+ unset(${sas_DEST})
+
+ if(sas_SOURCE)
+ # $sas_SOURCE is one or more expanded variables, just copy the values to
+ # $sas_DEST.
+ set(${sas_DEST} "${sas_SOURCE}")
+ endif()
+
+ if(sas_SOURCE_VARS)
+ # $sas_SOURCE_VARS is one or more variable names. Each iteration expands a
+ # variable and appends it to $sas_DEST.
+ foreach(source_var ${sas_SOURCE_VARS})
+ set(${sas_DEST} "${${sas_DEST}} ${${source_var}}")
+ endforeach()
+
+ # Because $sas_DEST can be empty when entering this scope leading whitespace
+ # can be introduced to $sas_DEST on the first iteration of the above loop.
+ # Remove it:
+ string(STRIP "${${sas_DEST}}" ${sas_DEST})
+ endif()
+
+ # Lists in CMake are simply semicolon delimited strings, so stringification is
+ # just a find and replace of the semicolon.
+ string(REPLACE ";" " " ${sas_DEST} "${${sas_DEST}}")
+
+ if(LIBGAV1_VERBOSE GREATER 1)
+ message("libgav1_set_and_stringify: ${sas_DEST}=${${sas_DEST}}")
+ endif()
+endmacro()
+
+# Creates a dummy source file in $LIBGAV1_GENERATED_SOURCES_DIRECTORY and adds
+# it to the specified target. Optionally adds its path to a list variable.
+#
+# libgav1_create_dummy_source_file(<TARGET <target> BASENAME <basename of file>>
+# [LISTVAR <list variable>])
+macro(libgav1_create_dummy_source_file)
+ set(optional_args)
+ set(single_value_args TARGET BASENAME LISTVAR)
+ set(multi_value_args)
+ cmake_parse_arguments(cdsf "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT cdsf_TARGET OR NOT cdsf_BASENAME)
+ libgav1_die(
+ "libgav1_create_dummy_source_file: TARGET and BASENAME required.")
+ endif()
+
+ if(NOT LIBGAV1_GENERATED_SOURCES_DIRECTORY)
+ set(LIBGAV1_GENERATED_SOURCES_DIRECTORY "${libgav1_build}/gen_src")
+ endif()
+
+ set(dummy_source_dir "${LIBGAV1_GENERATED_SOURCES_DIRECTORY}")
+ set(dummy_source_file
+ "${dummy_source_dir}/libgav1_${cdsf_TARGET}_${cdsf_BASENAME}.cc")
+ set(dummy_source_code
+ "// Generated file. DO NOT EDIT!\n"
+ "// C++ source file created for target ${cdsf_TARGET}. \n"
+ "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void);\n"
+ "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void) {}\n")
+ file(WRITE "${dummy_source_file}" "${dummy_source_code}")
+
+ target_sources(${cdsf_TARGET} PRIVATE ${dummy_source_file})
+
+ if(cdsf_LISTVAR)
+ list(APPEND ${cdsf_LISTVAR} "${dummy_source_file}")
+ endif()
+endmacro()
+
+# Loads the version components from $libgav1_source/gav1/version.h and sets the
+# corresponding CMake variables:
+# - LIBGAV1_MAJOR_VERSION
+# - LIBGAV1_MINOR_VERSION
+# - LIBGAV1_PATCH_VERSION
+# - LIBGAV1_VERSION, which is:
+# - $LIBGAV1_MAJOR_VERSION.$LIBGAV1_MINOR_VERSION.$LIBGAV1_PATCH_VERSION
+macro(libgav1_load_version_info)
+ file(STRINGS "${libgav1_source}/gav1/version.h" version_file_strings)
+ foreach(str ${version_file_strings})
+ if(str MATCHES "#define LIBGAV1_")
+ if(str MATCHES "#define LIBGAV1_MAJOR_VERSION ")
+ string(REPLACE "#define LIBGAV1_MAJOR_VERSION " "" LIBGAV1_MAJOR_VERSION
+ "${str}")
+ elseif(str MATCHES "#define LIBGAV1_MINOR_VERSION ")
+ string(REPLACE "#define LIBGAV1_MINOR_VERSION " "" LIBGAV1_MINOR_VERSION
+ "${str}")
+ elseif(str MATCHES "#define LIBGAV1_PATCH_VERSION ")
+ string(REPLACE "#define LIBGAV1_PATCH_VERSION " "" LIBGAV1_PATCH_VERSION
+ "${str}")
+ endif()
+ endif()
+ endforeach()
+ set(LIBGAV1_VERSION "${LIBGAV1_MAJOR_VERSION}.${LIBGAV1_MINOR_VERSION}")
+ set(LIBGAV1_VERSION "${LIBGAV1_VERSION}.${LIBGAV1_PATCH_VERSION}")
+endmacro()
diff --git a/cmake/libgav1_install.cmake b/cmake/libgav1_install.cmake
new file mode 100644
index 0000000..b7f6006
--- /dev/null
+++ b/cmake/libgav1_install.cmake
@@ -0,0 +1,60 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_ 1)
+
+# Sets up the Libgav1 install targets. Must be called after the static library
+# target is created.
+macro(libgav1_setup_install_target)
+ if(NOT (MSVC OR XCODE))
+ include(GNUInstallDirs)
+
+ # pkg-config: libgav1.pc
+ set(prefix "${CMAKE_INSTALL_PREFIX}")
+ set(exec_prefix "\${prefix}")
+ set(libdir "\${prefix}/${CMAKE_INSTALL_LIBDIR}")
+ set(includedir "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
+ set(libgav1_lib_name "libgav1")
+
+ configure_file("${libgav1_root}/cmake/libgav1.pc.template"
+ "${libgav1_build}/libgav1.pc" @ONLY NEWLINE_STYLE UNIX)
+ install(FILES "${libgav1_build}/libgav1.pc"
+ DESTINATION "${prefix}/${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+
+ # CMake config: libgav1-config.cmake
+ set(LIBGAV1_INCLUDE_DIRS "${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
+ configure_file("${libgav1_root}/cmake/libgav1-config.cmake.template"
+ "${libgav1_build}/libgav1-config.cmake" @ONLY
+ NEWLINE_STYLE UNIX)
+ install(
+ FILES "${libgav1_build}/libgav1-config.cmake"
+ DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_DATAROOTDIR}/cmake")
+
+ install(
+ FILES ${libgav1_api_includes}
+ DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/gav1")
+
+ install(TARGETS gav1_decode DESTINATION
+ "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}")
+ install(TARGETS libgav1_static DESTINATION
+ "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
+ if(BUILD_SHARED_LIBS)
+ install(TARGETS libgav1_shared DESTINATION
+ "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
+ endif()
+ endif()
+endmacro()
diff --git a/cmake/libgav1_intrinsics.cmake b/cmake/libgav1_intrinsics.cmake
new file mode 100644
index 0000000..a2e9ddb
--- /dev/null
+++ b/cmake/libgav1_intrinsics.cmake
@@ -0,0 +1,135 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_ 1)
+
+# Returns the compiler flag for the SIMD intrinsics suffix specified by the
+# SUFFIX argument via the variable specified by the VARIABLE argument:
+# libgav1_get_intrinsics_flag_for_suffix(SUFFIX <suffix> VARIABLE <var name>)
+macro(libgav1_get_intrinsics_flag_for_suffix)
+ unset(intrinsics_SUFFIX)
+ unset(intrinsics_VARIABLE)
+ unset(optional_args)
+ unset(multi_value_args)
+ set(single_value_args SUFFIX VARIABLE)
+ cmake_parse_arguments(intrinsics "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT (intrinsics_SUFFIX AND intrinsics_VARIABLE))
+ message(FATAL_ERROR "libgav1_get_intrinsics_flag_for_suffix: SUFFIX and "
+ "VARIABLE required.")
+ endif()
+
+ if(intrinsics_SUFFIX MATCHES "neon")
+ if(NOT MSVC)
+ set(${intrinsics_VARIABLE} "${LIBGAV1_NEON_INTRINSICS_FLAG}")
+ endif()
+ elseif(intrinsics_SUFFIX MATCHES "avx2")
+ if(MSVC)
+ set(${intrinsics_VARIABLE} "/arch:AVX2")
+ else()
+ set(${intrinsics_VARIABLE} "-mavx2")
+ endif()
+ elseif(intrinsics_SUFFIX MATCHES "sse4")
+ if(NOT MSVC)
+ set(${intrinsics_VARIABLE} "-msse4.1")
+ endif()
+ else()
+    message(FATAL_ERROR "libgav1_get_intrinsics_flag_for_suffix: Unknown "
+                        "intrinsics suffix: ${intrinsics_SUFFIX}")
+ endif()
+
+ if(LIBGAV1_VERBOSE GREATER 1)
+ message("libgav1_get_intrinsics_flag_for_suffix: "
+ "suffix:${intrinsics_SUFFIX} flag:${${intrinsics_VARIABLE}}")
+ endif()
+endmacro()
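A usage sketch, assuming the suffix string matches the
libgav1_*_source_file_suffix values used elsewhere in the build; the output
variable name sse4_flag is invented for the example.

    libgav1_get_intrinsics_flag_for_suffix(SUFFIX sse4 VARIABLE sse4_flag)
    # On non-MSVC compilers sse4_flag is now "-msse4.1"; on MSVC it is left
    # unchanged, since MSVC needs no dedicated flag for SSE4.1 intrinsics.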
+
+# Processes the source files specified by SOURCES and adds intrinsics flags to
+# them as necessary for the target specified by TARGET:
+# libgav1_process_intrinsics_sources(TARGET <target> SOURCES <sources>)
+#
+# Detects the need for intrinsics flags using the source file name suffix.
+# Currently supports AVX2, SSE4.1 and NEON.
+macro(libgav1_process_intrinsics_sources)
+ unset(arg_TARGET)
+ unset(arg_SOURCES)
+ unset(optional_args)
+ set(single_value_args TARGET)
+ set(multi_value_args SOURCES)
+ cmake_parse_arguments(arg "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+ if(NOT (arg_TARGET AND arg_SOURCES))
+ message(FATAL_ERROR "libgav1_process_intrinsics_sources: TARGET and "
+ "SOURCES required.")
+ endif()
+
+ if(LIBGAV1_ENABLE_AVX2 AND libgav1_have_avx2)
+ unset(avx2_sources)
+ list(APPEND avx2_sources ${arg_SOURCES})
+
+ list(FILTER avx2_sources INCLUDE REGEX
+ "${libgav1_avx2_source_file_suffix}$")
+
+ if(avx2_sources)
+ unset(avx2_flags)
+ libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+ ${libgav1_avx2_source_file_suffix}
+ VARIABLE avx2_flags)
+ if(avx2_flags)
+ libgav1_set_compiler_flags_for_sources(SOURCES ${avx2_sources} FLAGS
+ ${avx2_flags})
+ endif()
+ endif()
+ endif()
+
+ if(LIBGAV1_ENABLE_SSE4_1 AND libgav1_have_sse4)
+ unset(sse4_sources)
+ list(APPEND sse4_sources ${arg_SOURCES})
+
+ list(FILTER sse4_sources INCLUDE REGEX
+ "${libgav1_sse4_source_file_suffix}$")
+
+ if(sse4_sources)
+ unset(sse4_flags)
+ libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+ ${libgav1_sse4_source_file_suffix}
+ VARIABLE sse4_flags)
+ if(sse4_flags)
+ libgav1_set_compiler_flags_for_sources(SOURCES ${sse4_sources} FLAGS
+ ${sse4_flags})
+ endif()
+ endif()
+ endif()
+
+ if(LIBGAV1_ENABLE_NEON AND libgav1_have_neon)
+ unset(neon_sources)
+ list(APPEND neon_sources ${arg_SOURCES})
+ list(FILTER neon_sources INCLUDE REGEX
+ "${libgav1_neon_source_file_suffix}$")
+
+ if(neon_sources AND LIBGAV1_NEON_INTRINSICS_FLAG)
+ unset(neon_flags)
+ libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+ ${libgav1_neon_source_file_suffix}
+ VARIABLE neon_flags)
+ if(neon_flags)
+ libgav1_set_compiler_flags_for_sources(SOURCES ${neon_sources} FLAGS
+ ${neon_flags})
+ endif()
+ endif()
+ endif()
+endmacro()
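A sketch of a typical call, mirroring how libgav1_add_executable() and
libgav1_add_library() below invoke this macro; the target and list names are
placeholders.

    libgav1_process_intrinsics_sources(TARGET libgav1_example_dsp
                                       SOURCES ${example_dsp_sources})
    # Sources whose names end in the configured AVX2, SSE4.1 or NEON suffixes
    # receive the matching compiler flag, provided the corresponding
    # LIBGAV1_ENABLE_* option and CPU support are enabled.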
diff --git a/cmake/libgav1_options.cmake b/cmake/libgav1_options.cmake
new file mode 100644
index 0000000..6327bee
--- /dev/null
+++ b/cmake/libgav1_options.cmake
@@ -0,0 +1,55 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_ 1)
+
+# Simple wrapper for CMake's builtin option command that tracks libgav1's build
+# options in the list variable $libgav1_options.
+macro(libgav1_option)
+ unset(option_NAME)
+ unset(option_HELPSTRING)
+ unset(option_VALUE)
+ unset(optional_args)
+ unset(multi_value_args)
+ set(single_value_args NAME HELPSTRING VALUE)
+ cmake_parse_arguments(option "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT (option_NAME AND option_HELPSTRING AND DEFINED option_VALUE))
+    message(FATAL_ERROR "libgav1_option: NAME, HELPSTRING and VALUE required.")
+ endif()
+
+ option(${option_NAME} ${option_HELPSTRING} ${option_VALUE})
+
+ if(LIBGAV1_VERBOSE GREATER 2)
+ message("--------- libgav1_option ---------\n"
+ "option_NAME=${option_NAME}\n"
+ "option_HELPSTRING=${option_HELPSTRING}\n"
+ "option_VALUE=${option_VALUE}\n"
+ "------------------------------------------\n")
+ endif()
+
+ list(APPEND libgav1_options ${option_NAME})
+ list(REMOVE_DUPLICATES libgav1_options)
+endmacro()
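A usage sketch; the option name and default value shown here are invented for
the example.

    libgav1_option(NAME LIBGAV1_EXAMPLE_ENABLE_EXTRA_CHECKS
                   HELPSTRING "Enables extra example checks."
                   VALUE OFF)
    # The option is created via CMake's option() command and recorded in
    # ${libgav1_options}, so libgav1_dump_options() below will print it.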
+
+# Dumps the $libgav1_options list via CMake message command.
+macro(libgav1_dump_options)
+ foreach(option_name ${libgav1_options})
+ message("${option_name}: ${${option_name}}")
+ endforeach()
+endmacro()
diff --git a/cmake/libgav1_sanitizer.cmake b/cmake/libgav1_sanitizer.cmake
new file mode 100644
index 0000000..4bb2263
--- /dev/null
+++ b/cmake/libgav1_sanitizer.cmake
@@ -0,0 +1,45 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_ 1)
+
+macro(libgav1_configure_sanitizer)
+ if(LIBGAV1_SANITIZE AND NOT MSVC)
+ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ if(LIBGAV1_SANITIZE MATCHES "cfi")
+ list(APPEND LIBGAV1_CXX_FLAGS "-flto" "-fno-sanitize-trap=cfi")
+ list(APPEND LIBGAV1_EXE_LINKER_FLAGS "-flto" "-fno-sanitize-trap=cfi"
+ "-fuse-ld=gold")
+ endif()
+
+ if(${CMAKE_SIZEOF_VOID_P} EQUAL 4
+ AND LIBGAV1_SANITIZE MATCHES "integer|undefined")
+ list(APPEND LIBGAV1_EXE_LINKER_FLAGS "--rtlib=compiler-rt" "-lgcc_s")
+ endif()
+ endif()
+
+ list(APPEND LIBGAV1_CXX_FLAGS "-fsanitize=${LIBGAV1_SANITIZE}")
+ list(APPEND LIBGAV1_EXE_LINKER_FLAGS "-fsanitize=${LIBGAV1_SANITIZE}")
+
+ # Make sanitizer callstacks accurate.
+ list(APPEND LIBGAV1_CXX_FLAGS "-fno-omit-frame-pointer"
+ "-fno-optimize-sibling-calls")
+
+ libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES LIBGAV1_CXX_FLAGS FLAG_REQUIRED)
+ libgav1_test_exe_linker_flag(FLAG_LIST_VAR_NAME LIBGAV1_EXE_LINKER_FLAGS)
+ endif()
+endmacro()
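A configuration sketch, assuming a Clang or GCC build; LIBGAV1_SANITIZE is
normally supplied on the cmake command line, and "address" is only one example
value.

    set(LIBGAV1_SANITIZE "address")
    libgav1_configure_sanitizer()
    # LIBGAV1_CXX_FLAGS and LIBGAV1_EXE_LINKER_FLAGS now carry
    # -fsanitize=address plus the frame-pointer flags, assuming the compiler
    # and linker accept them.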
diff --git a/cmake/libgav1_targets.cmake b/cmake/libgav1_targets.cmake
new file mode 100644
index 0000000..78b4865
--- /dev/null
+++ b/cmake/libgav1_targets.cmake
@@ -0,0 +1,347 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_
+set(LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_ 1)
+
+# Resets list variables used to track libgav1 targets.
+macro(libgav1_reset_target_lists)
+ unset(libgav1_targets)
+ unset(libgav1_exe_targets)
+ unset(libgav1_lib_targets)
+ unset(libgav1_objlib_targets)
+ unset(libgav1_sources)
+ unset(libgav1_test_targets)
+endmacro()
+
+# Creates an executable target. The target name is passed as a parameter to the
+# NAME argument, and the sources are passed as a parameter to the SOURCES
+# argument:
+# libgav1_add_executable(NAME <name> SOURCES <sources> [optional args])
+#
+# Optional args:
+# cmake-format: off
+# - OUTPUT_NAME: Override output file basename. Target basename defaults to
+# NAME.
+# - TEST: Flag. Presence means treat executable as a test.
+# - DEFINES: List of preprocessor macro definitions.
+# - INCLUDES: list of include directories for the target.
+# - COMPILE_FLAGS: list of compiler flags for the target.
+# - LINK_FLAGS: List of linker flags for the target.
+# - OBJLIB_DEPS: List of CMake object library target dependencies.
+# - LIB_DEPS: List of CMake library dependencies.
+# cmake-format: on
+#
+# Sources passed to this macro are added to $libgav1_test_sources when TEST is
+# specified. Otherwise sources are added to $libgav1_sources.
+#
+# Targets passed to this macro are always added to $libgav1_targets. When TEST
+# is specified, targets are also added to the list $libgav1_test_targets.
+# Otherwise targets are added to $libgav1_exe_targets.
+macro(libgav1_add_executable)
+ unset(exe_TEST)
+ unset(exe_TEST_DEFINES_MAIN)
+ unset(exe_NAME)
+ unset(exe_OUTPUT_NAME)
+ unset(exe_SOURCES)
+ unset(exe_DEFINES)
+ unset(exe_INCLUDES)
+ unset(exe_COMPILE_FLAGS)
+ unset(exe_LINK_FLAGS)
+ unset(exe_OBJLIB_DEPS)
+ unset(exe_LIB_DEPS)
+ set(optional_args TEST)
+ set(single_value_args NAME OUTPUT_NAME)
+ set(multi_value_args SOURCES DEFINES INCLUDES COMPILE_FLAGS LINK_FLAGS
+ OBJLIB_DEPS LIB_DEPS)
+
+ cmake_parse_arguments(exe "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(LIBGAV1_VERBOSE GREATER 1)
+ message("--------- libgav1_add_executable ---------\n"
+ "exe_TEST=${exe_TEST}\n"
+ "exe_TEST_DEFINES_MAIN=${exe_TEST_DEFINES_MAIN}\n"
+ "exe_NAME=${exe_NAME}\n"
+ "exe_OUTPUT_NAME=${exe_OUTPUT_NAME}\n"
+ "exe_SOURCES=${exe_SOURCES}\n"
+ "exe_DEFINES=${exe_DEFINES}\n"
+ "exe_INCLUDES=${exe_INCLUDES}\n"
+ "exe_COMPILE_FLAGS=${exe_COMPILE_FLAGS}\n"
+ "exe_LINK_FLAGS=${exe_LINK_FLAGS}\n"
+ "exe_OBJLIB_DEPS=${exe_OBJLIB_DEPS}\n"
+ "exe_LIB_DEPS=${exe_LIB_DEPS}\n"
+ "------------------------------------------\n")
+ endif()
+
+ if(NOT (exe_NAME AND exe_SOURCES))
+ message(FATAL_ERROR "libgav1_add_executable: NAME and SOURCES required.")
+ endif()
+
+ list(APPEND libgav1_targets ${exe_NAME})
+ if(exe_TEST)
+ list(APPEND libgav1_test_targets ${exe_NAME})
+ list(APPEND libgav1_test_sources ${exe_SOURCES})
+ else()
+ list(APPEND libgav1_exe_targets ${exe_NAME})
+ list(APPEND libgav1_sources ${exe_SOURCES})
+ endif()
+
+ add_executable(${exe_NAME} ${exe_SOURCES})
+
+ if(exe_OUTPUT_NAME)
+ set_target_properties(${exe_NAME} PROPERTIES OUTPUT_NAME ${exe_OUTPUT_NAME})
+ endif()
+
+ libgav1_process_intrinsics_sources(TARGET ${exe_NAME} SOURCES ${exe_SOURCES})
+
+ if(exe_DEFINES)
+ target_compile_definitions(${exe_NAME} PRIVATE ${exe_DEFINES})
+ endif()
+
+ if(exe_INCLUDES)
+ target_include_directories(${exe_NAME} PRIVATE ${exe_INCLUDES})
+ endif()
+
+ if(exe_COMPILE_FLAGS OR LIBGAV1_CXX_FLAGS)
+ target_compile_options(${exe_NAME}
+ PRIVATE ${exe_COMPILE_FLAGS} ${LIBGAV1_CXX_FLAGS})
+ endif()
+
+ if(exe_LINK_FLAGS OR LIBGAV1_EXE_LINKER_FLAGS)
+ set_target_properties(${exe_NAME}
+ PROPERTIES LINK_FLAGS ${exe_LINK_FLAGS}
+ ${LIBGAV1_EXE_LINKER_FLAGS})
+ endif()
+
+ if(exe_OBJLIB_DEPS)
+ foreach(objlib_dep ${exe_OBJLIB_DEPS})
+ target_sources(${exe_NAME} PRIVATE $<TARGET_OBJECTS:${objlib_dep}>)
+ endforeach()
+ endif()
+
+ if(CMAKE_THREAD_LIBS_INIT)
+ list(APPEND exe_LIB_DEPS ${CMAKE_THREAD_LIBS_INIT})
+ endif()
+
+ if(BUILD_SHARED_LIBS AND (MSVC OR WIN32))
+    target_compile_definitions(${exe_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0")
+ endif()
+
+ if(exe_LIB_DEPS)
+ unset(exe_static)
+ if("${CMAKE_EXE_LINKER_FLAGS} ${LIBGAV1_EXE_LINKER_FLAGS}" MATCHES "static")
+ set(exe_static ON)
+ endif()
+
+ if(exe_static AND CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+ # Third party dependencies can introduce dependencies on system and test
+ # libraries. Since the target created here is an executable, and CMake
+ # does not provide a method of controlling order of link dependencies,
+ # wrap all of the dependencies of this target in start/end group flags to
+ # ensure that dependencies of third party targets can be resolved when
+ # those dependencies happen to be resolved by dependencies of the current
+ # target.
+ list(INSERT exe_LIB_DEPS 0 -Wl,--start-group)
+ list(APPEND exe_LIB_DEPS -Wl,--end-group)
+ endif()
+ target_link_libraries(${exe_NAME} PRIVATE ${exe_LIB_DEPS})
+ endif()
+endmacro()
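A sketch of a hypothetical caller; the target name and source file are
illustrative, while libgav1_static is the static library target created
elsewhere in this build.

    libgav1_add_executable(NAME gav1_example_tool
                           SOURCES example_tool_main.cc
                           INCLUDES "${libgav1_root}" "${libgav1_build}"
                           LIB_DEPS libgav1_static)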
+
+# Creates a library target of the specified type. The target name is passed as a
+# parameter to the NAME argument, the type as a parameter to the TYPE argument,
+# and the sources are passed as a parameter to the SOURCES argument:
+# libgav1_add_library(NAME <name> TYPE <type> SOURCES <sources> [optional args])
+#
+# Optional args:
+# cmake-format: off
+# - OUTPUT_NAME: Override output file basename. Target basename defaults to
+# NAME. OUTPUT_NAME is ignored when BUILD_SHARED_LIBS is enabled and CMake
+# is generating a build for which MSVC or WIN32 are true. This is to avoid
+# output basename collisions with DLL import libraries.
+# - TEST: Flag. Presence means treat library as a test.
+# - DEFINES: List of preprocessor macro definitions.
+# - INCLUDES: list of include directories for the target.
+# - COMPILE_FLAGS: list of compiler flags for the target.
+# - LINK_FLAGS: List of linker flags for the target.
+# - OBJLIB_DEPS: List of CMake object library target dependencies.
+# - LIB_DEPS: List of CMake library dependencies.
+# - PUBLIC_INCLUDES: List of include paths to export to dependents.
+# cmake-format: on
+#
+# Sources passed to the macro are added to the lists tracking libgav1 sources:
+# cmake-format: off
+# - When TEST is specified sources are added to $libgav1_test_sources.
+# - Otherwise sources are added to $libgav1_sources.
+# cmake-format: on
+#
+# Targets passed to this macro are added to the lists tracking libgav1 targets:
+# cmake-format: off
+# - Targets are always added to $libgav1_targets.
+# - When the TEST flag is specified, targets are added to
+# $libgav1_test_targets.
+# - When TEST is not specified:
+# - Libraries of type SHARED are added to $libgav1_dylib_targets.
+# - Libraries of type OBJECT are added to $libgav1_objlib_targets.
+# - Libraries of type STATIC are added to $libgav1_lib_targets.
+# cmake-format: on
+macro(libgav1_add_library)
+ unset(lib_TEST)
+ unset(lib_NAME)
+ unset(lib_OUTPUT_NAME)
+ unset(lib_TYPE)
+ unset(lib_SOURCES)
+ unset(lib_DEFINES)
+ unset(lib_INCLUDES)
+ unset(lib_COMPILE_FLAGS)
+ unset(lib_LINK_FLAGS)
+ unset(lib_OBJLIB_DEPS)
+ unset(lib_LIB_DEPS)
+ unset(lib_PUBLIC_INCLUDES)
+ set(optional_args TEST)
+ set(single_value_args NAME OUTPUT_NAME TYPE)
+ set(multi_value_args SOURCES DEFINES INCLUDES COMPILE_FLAGS LINK_FLAGS
+ OBJLIB_DEPS LIB_DEPS PUBLIC_INCLUDES)
+
+ cmake_parse_arguments(lib "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(LIBGAV1_VERBOSE GREATER 1)
+ message("--------- libgav1_add_library ---------\n"
+ "lib_TEST=${lib_TEST}\n"
+ "lib_NAME=${lib_NAME}\n"
+ "lib_OUTPUT_NAME=${lib_OUTPUT_NAME}\n"
+ "lib_TYPE=${lib_TYPE}\n"
+ "lib_SOURCES=${lib_SOURCES}\n"
+ "lib_DEFINES=${lib_DEFINES}\n"
+ "lib_INCLUDES=${lib_INCLUDES}\n"
+ "lib_COMPILE_FLAGS=${lib_COMPILE_FLAGS}\n"
+ "lib_LINK_FLAGS=${lib_LINK_FLAGS}\n"
+ "lib_OBJLIB_DEPS=${lib_OBJLIB_DEPS}\n"
+ "lib_LIB_DEPS=${lib_LIB_DEPS}\n"
+ "lib_PUBLIC_INCLUDES=${lib_PUBLIC_INCLUDES}\n"
+ "---------------------------------------\n")
+ endif()
+
+ if(NOT (lib_NAME AND lib_TYPE AND lib_SOURCES))
+ message(FATAL_ERROR "libgav1_add_library: NAME, TYPE and SOURCES required.")
+ endif()
+
+ list(APPEND libgav1_targets ${lib_NAME})
+ if(lib_TEST)
+ list(APPEND libgav1_test_targets ${lib_NAME})
+ list(APPEND libgav1_test_sources ${lib_SOURCES})
+ else()
+ list(APPEND libgav1_sources ${lib_SOURCES})
+ if(lib_TYPE STREQUAL OBJECT)
+ list(APPEND libgav1_objlib_targets ${lib_NAME})
+ elseif(lib_TYPE STREQUAL SHARED)
+ list(APPEND libgav1_dylib_targets ${lib_NAME})
+ elseif(lib_TYPE STREQUAL STATIC)
+ list(APPEND libgav1_lib_targets ${lib_NAME})
+ else()
+ message(WARNING "libgav1_add_library: Unhandled type: ${lib_TYPE}")
+ endif()
+ endif()
+
+ add_library(${lib_NAME} ${lib_TYPE} ${lib_SOURCES})
+ libgav1_process_intrinsics_sources(TARGET ${lib_NAME} SOURCES ${lib_SOURCES})
+
+ if(lib_OUTPUT_NAME)
+ if(NOT (BUILD_SHARED_LIBS AND (MSVC OR WIN32)))
+ set_target_properties(${lib_NAME}
+ PROPERTIES OUTPUT_NAME ${lib_OUTPUT_NAME})
+ endif()
+ endif()
+
+ if(lib_DEFINES)
+ target_compile_definitions(${lib_NAME} PRIVATE ${lib_DEFINES})
+ endif()
+
+ if(lib_INCLUDES)
+ target_include_directories(${lib_NAME} PRIVATE ${lib_INCLUDES})
+ endif()
+
+ if(lib_PUBLIC_INCLUDES)
+ target_include_directories(${lib_NAME} PUBLIC ${lib_PUBLIC_INCLUDES})
+ endif()
+
+ if(lib_COMPILE_FLAGS OR LIBGAV1_CXX_FLAGS)
+ target_compile_options(${lib_NAME}
+ PRIVATE ${lib_COMPILE_FLAGS} ${LIBGAV1_CXX_FLAGS})
+ endif()
+
+ if(lib_LINK_FLAGS)
+ set_target_properties(${lib_NAME} PROPERTIES LINK_FLAGS ${lib_LINK_FLAGS})
+ endif()
+
+ if(lib_OBJLIB_DEPS)
+ foreach(objlib_dep ${lib_OBJLIB_DEPS})
+ target_sources(${lib_NAME} PRIVATE $<TARGET_OBJECTS:${objlib_dep}>)
+ endforeach()
+ endif()
+
+ if(lib_LIB_DEPS)
+ if(lib_TYPE STREQUAL STATIC)
+ set(link_type PUBLIC)
+ else()
+ set(link_type PRIVATE)
+ if(lib_TYPE STREQUAL SHARED AND CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+ # The libgav1 shared object uses the static libgav1 as input to turn it
+ # into a shared object. Include everything from the static library in
+ # the shared object.
+ if(APPLE)
+ list(INSERT lib_LIB_DEPS 0 -Wl,-force_load)
+ else()
+ list(INSERT lib_LIB_DEPS 0 -Wl,--whole-archive)
+ list(APPEND lib_LIB_DEPS -Wl,--no-whole-archive)
+ endif()
+ endif()
+ endif()
+ target_link_libraries(${lib_NAME} ${link_type} ${lib_LIB_DEPS})
+ endif()
+
+ if(NOT MSVC AND lib_NAME MATCHES "^lib")
+ # Non-MSVC generators prepend lib to static lib target file names. Libgav1
+ # already includes lib in its name. Avoid naming output files liblib*.
+ set_target_properties(${lib_NAME} PROPERTIES PREFIX "")
+ endif()
+
+ if(lib_TYPE STREQUAL SHARED AND NOT MSVC)
+ set_target_properties(${lib_NAME} PROPERTIES SOVERSION ${LIBGAV1_SOVERSION})
+ endif()
+
+ if(BUILD_SHARED_LIBS AND (MSVC OR WIN32))
+ if(lib_TYPE STREQUAL SHARED)
+ target_compile_definitions(${lib_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=1")
+ else()
+ target_compile_definitions(${lib_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0")
+ endif()
+ endif()
+
+ # Determine if $lib_NAME is a header only target.
+ set(sources_list ${lib_SOURCES})
+ list(FILTER sources_list INCLUDE REGEX cc$)
+ if(NOT sources_list)
+ if(NOT XCODE)
+ # This is a header only target. Tell CMake the link language.
+ set_target_properties(${lib_NAME} PROPERTIES LINKER_LANGUAGE CXX)
+ else()
+ # The Xcode generator ignores LINKER_LANGUAGE. Add a dummy cc file.
+ libgav1_create_dummy_source_file(TARGET ${lib_NAME} BASENAME ${lib_NAME})
+ endif()
+ endif()
+endmacro()
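A sketch of a hypothetical object-library target; the target name and source
files are placeholders.

    libgav1_add_library(NAME libgav1_example_utils
                        TYPE OBJECT
                        SOURCES example_utils.cc example_utils.h
                        INCLUDES "${libgav1_root}")
    # The target lands in ${libgav1_objlib_targets}, and a dependent target can
    # pull in its objects via OBJLIB_DEPS libgav1_example_utils.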
diff --git a/cmake/libgav1_variables.cmake b/cmake/libgav1_variables.cmake
new file mode 100644
index 0000000..0dd0f37
--- /dev/null
+++ b/cmake/libgav1_variables.cmake
@@ -0,0 +1,78 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_ 1)
+
+# Halts generation when $variable_name does not refer to a directory that
+# exists.
+macro(libgav1_variable_must_be_directory variable_name)
+ if("${variable_name}" STREQUAL "")
+ message(
+ FATAL_ERROR
+ "Empty variable_name passed to libgav1_variable_must_be_directory.")
+ endif()
+
+ if("${${variable_name}}" STREQUAL "")
+ message(
+ FATAL_ERROR
+ "Empty variable ${variable_name} is required to build libgav1.")
+ endif()
+
+ if(NOT IS_DIRECTORY "${${variable_name}}")
+ message(
+ FATAL_ERROR
+ "${variable_name}, which is ${${variable_name}}, does not refer to a\n"
+ "directory.")
+ endif()
+endmacro()
+
+# Adds $var_name to the tracked variables list.
+macro(libgav1_track_configuration_variable var_name)
+ if(LIBGAV1_VERBOSE GREATER 2)
+ message("---- libgav1_track_configuration_variable ----\n"
+ "var_name=${var_name}\n"
+ "----------------------------------------------\n")
+ endif()
+
+ list(APPEND libgav1_configuration_variables ${var_name})
+ list(REMOVE_DUPLICATES libgav1_configuration_variables)
+endmacro()
+
+# Logs current C++ and executable linker flags via CMake's message command.
+macro(libgav1_dump_cmake_flag_variables)
+ unset(flag_variables)
+ list(APPEND flag_variables "CMAKE_CXX_FLAGS_INIT" "CMAKE_CXX_FLAGS"
+ "CMAKE_EXE_LINKER_FLAGS_INIT" "CMAKE_EXE_LINKER_FLAGS")
+ if(CMAKE_BUILD_TYPE)
+ list(APPEND flag_variables "CMAKE_BUILD_TYPE"
+ "CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}_INIT"
+ "CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}"
+ "CMAKE_EXE_LINKER_FLAGS_${CMAKE_BUILD_TYPE}_INIT"
+ "CMAKE_EXE_LINKER_FLAGS_${CMAKE_BUILD_TYPE}")
+ endif()
+ foreach(flag_variable ${flag_variables})
+ message("${flag_variable}:${${flag_variable}}")
+ endforeach()
+endmacro()
+
+# Dumps the variables tracked in $libgav1_configuration_variables via CMake's
+# message command.
+macro(libgav1_dump_tracked_configuration_variables)
+ foreach(config_variable ${libgav1_configuration_variables})
+ message("${config_variable}:${${config_variable}}")
+ endforeach()
+endmacro()
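A usage sketch pairing these helpers; LIBGAV1_ANDROID_NDK_PATH is used only as
an example of a directory-valued variable.

    libgav1_variable_must_be_directory(LIBGAV1_ANDROID_NDK_PATH)
    libgav1_track_configuration_variable(LIBGAV1_ANDROID_NDK_PATH)
    # Later, at the end of configuration:
    libgav1_dump_tracked_configuration_variables()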
diff --git a/cmake/toolchains/aarch64-linux-gnu.cmake b/cmake/toolchains/aarch64-linux-gnu.cmake
new file mode 100644
index 0000000..7ffe397
--- /dev/null
+++ b/cmake/toolchains/aarch64-linux-gnu.cmake
@@ -0,0 +1,28 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_
+set(LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+ set(CROSS aarch64-linux-gnu-)
+endif()
+
+set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(CMAKE_CXX_FLAGS_INIT "-march=armv8-a")
+set(CMAKE_SYSTEM_PROCESSOR "aarch64")
diff --git a/cmake/toolchains/android.cmake b/cmake/toolchains/android.cmake
new file mode 100644
index 0000000..492957b
--- /dev/null
+++ b/cmake/toolchains/android.cmake
@@ -0,0 +1,53 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_TOOLCHAINS_ANDROID_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_TOOLCHAINS_ANDROID_CMAKE_
+
+# Additional ANDROID_* settings are available, see:
+# https://developer.android.com/ndk/guides/cmake#variables
+
+if(NOT ANDROID_PLATFORM)
+ set(ANDROID_PLATFORM android-21)
+endif()
+
+# Choose target architecture with:
+#
+# -DANDROID_ABI={armeabi-v7a,armeabi-v7a with NEON,arm64-v8a,x86,x86_64}
+if(NOT ANDROID_ABI)
+ set(ANDROID_ABI arm64-v8a)
+endif()
+
+# Force arm mode for 32-bit targets (instead of the default thumb) to improve
+# performance.
+if(NOT ANDROID_ARM_MODE)
+ set(ANDROID_ARM_MODE arm)
+endif()
+
+# Toolchain files don't have access to cached variables:
+# https://gitlab.kitware.com/cmake/cmake/issues/16170. Set an intermediate
+# environment variable the first time this file is loaded.
+if(LIBGAV1_ANDROID_NDK_PATH)
+ set(ENV{LIBGAV1_ANDROID_NDK_PATH} "${LIBGAV1_ANDROID_NDK_PATH}")
+else()
+ set(LIBGAV1_ANDROID_NDK_PATH "$ENV{LIBGAV1_ANDROID_NDK_PATH}")
+endif()
+
+if(NOT LIBGAV1_ANDROID_NDK_PATH)
+ message(FATAL_ERROR "LIBGAV1_ANDROID_NDK_PATH not set.")
+ return()
+endif()
+
+include("${LIBGAV1_ANDROID_NDK_PATH}/build/cmake/android.toolchain.cmake")
diff --git a/cmake/toolchains/arm-linux-gnueabihf.cmake b/cmake/toolchains/arm-linux-gnueabihf.cmake
new file mode 100644
index 0000000..8051f0d
--- /dev/null
+++ b/cmake/toolchains/arm-linux-gnueabihf.cmake
@@ -0,0 +1,29 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_
+set(LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+ set(CROSS arm-linux-gnueabihf-)
+endif()
+
+set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(CMAKE_CXX_FLAGS_INIT "-march=armv7-a -marm")
+set(CMAKE_SYSTEM_PROCESSOR "armv7")
+set(LIBGAV1_NEON_INTRINSICS_FLAG "-mfpu=neon")
diff --git a/codereview.settings b/codereview.settings
new file mode 100644
index 0000000..ccba2ee
--- /dev/null
+++ b/codereview.settings
@@ -0,0 +1,4 @@
+# This file is used by git cl to get repository specific information.
+GERRIT_HOST: True
+CODE_REVIEW_SERVER: chromium-review.googlesource.com
+GERRIT_SQUASH_UPLOADS: False
diff --git a/examples/file_reader.cc b/examples/file_reader.cc
new file mode 100644
index 0000000..b096722
--- /dev/null
+++ b/examples/file_reader.cc
@@ -0,0 +1,186 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <new>
+#include <string>
+#include <vector>
+
+#if defined(_WIN32)
+#include <fcntl.h>
+#include <io.h>
+#endif
+
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_factory.h"
+#include "examples/file_reader_interface.h"
+#include "examples/ivf_parser.h"
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+FILE* SetBinaryMode(FILE* stream) {
+#if defined(_WIN32)
+ _setmode(_fileno(stream), _O_BINARY);
+#endif
+ return stream;
+}
+
+} // namespace
+
+bool FileReader::registered_in_factory_ =
+ FileReaderFactory::RegisterReader(FileReader::Open);
+
+FileReader::~FileReader() {
+ if (owns_file_) fclose(file_);
+}
+
+std::unique_ptr<FileReaderInterface> FileReader::Open(
+ const std::string& file_name, const bool error_tolerant) {
+ if (file_name.empty()) return nullptr;
+
+ FILE* raw_file_ptr;
+
+ bool owns_file = true;
+ if (file_name == "-") {
+ raw_file_ptr = SetBinaryMode(stdin);
+ owns_file = false; // stdin is owned by the Standard C Library.
+ } else {
+ raw_file_ptr = fopen(file_name.c_str(), "rb");
+ }
+
+ if (raw_file_ptr == nullptr) {
+ return nullptr;
+ }
+
+ std::unique_ptr<FileReader> file(
+ new (std::nothrow) FileReader(raw_file_ptr, owns_file, error_tolerant));
+ if (file == nullptr) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory");
+ if (owns_file) fclose(raw_file_ptr);
+ return nullptr;
+ }
+
+ if (!file->ReadIvfFileHeader()) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Unsupported file type");
+ return nullptr;
+ }
+
+ return file;
+}
+
+// IVF Frame Header format, from https://wiki.multimedia.cx/index.php/IVF
+// bytes 0-3 size of frame in bytes (not including the 12-byte header)
+// bytes 4-11 64-bit presentation timestamp
+// bytes 12.. frame data
+bool FileReader::ReadTemporalUnit(std::vector<uint8_t>* const tu_data,
+ int64_t* const timestamp) {
+ if (tu_data == nullptr) return false;
+ tu_data->clear();
+
+ uint8_t header_buffer[kIvfFrameHeaderSize];
+ const size_t num_read = fread(header_buffer, 1, kIvfFrameHeaderSize, file_);
+
+ if (IsEndOfFile()) {
+ if (num_read != 0) {
+ LIBGAV1_EXAMPLES_LOG_ERROR(
+ "Cannot read IVF frame header: Not enough data available");
+ return false;
+ }
+
+ return true;
+ }
+
+ IvfFrameHeader ivf_frame_header;
+ if (!ParseIvfFrameHeader(header_buffer, &ivf_frame_header)) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Could not parse IVF frame header");
+ if (error_tolerant_) {
+ ivf_frame_header.frame_size =
+ std::min(ivf_frame_header.frame_size, size_t{kMaxTemporalUnitSize});
+ } else {
+ return false;
+ }
+ }
+
+ if (timestamp != nullptr) *timestamp = ivf_frame_header.timestamp;
+
+ tu_data->resize(ivf_frame_header.frame_size);
+ const size_t size_read =
+ fread(tu_data->data(), 1, ivf_frame_header.frame_size, file_);
+ if (size_read != ivf_frame_header.frame_size) {
+ LIBGAV1_EXAMPLES_LOG_ERROR(
+ "Unexpected EOF or I/O error reading frame data");
+ if (error_tolerant_) {
+ tu_data->resize(size_read);
+ } else {
+ return false;
+ }
+ }
+ return true;
+}
+
+// Attempt to read an IVF file header. Returns true for success, and false for
+// failure.
+//
+// IVF File Header format, from https://wiki.multimedia.cx/index.php/IVF
+// bytes 0-3 signature: 'DKIF'
+// bytes 4-5 version (should be 0)
+// bytes 6-7 length of header in bytes
+// bytes 8-11 codec FourCC (e.g., 'VP80')
+// bytes 12-13 width in pixels
+// bytes 14-15 height in pixels
+// bytes 16-19 frame rate
+// bytes 20-23 time scale
+// bytes 24-27 number of frames in file
+// bytes 28-31 unused
+//
+// Note: The rate and scale fields correspond to the numerator and denominator
+// of frame rate (fps) or time base (the reciprocal of frame rate) as follows:
+//
+// bytes 16-19 frame rate timebase.den framerate.numerator
+// bytes 20-23 time scale timebase.num framerate.denominator
+bool FileReader::ReadIvfFileHeader() {
+ uint8_t header_buffer[kIvfFileHeaderSize];
+ const size_t num_read = fread(header_buffer, 1, kIvfFileHeaderSize, file_);
+ if (num_read != kIvfFileHeaderSize) {
+ LIBGAV1_EXAMPLES_LOG_ERROR(
+ "Cannot read IVF header: Not enough data available");
+ return false;
+ }
+
+ IvfFileHeader ivf_file_header;
+ if (!ParseIvfFileHeader(header_buffer, &ivf_file_header)) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Could not parse IVF file header");
+ if (error_tolerant_) {
+ ivf_file_header = {};
+ } else {
+ return false;
+ }
+ }
+
+ width_ = ivf_file_header.width;
+ height_ = ivf_file_header.height;
+ frame_rate_ = ivf_file_header.frame_rate_numerator;
+ time_scale_ = ivf_file_header.frame_rate_denominator;
+ type_ = kFileTypeIvf;
+
+ return true;
+}
+
+} // namespace libgav1
diff --git a/examples/file_reader.h b/examples/file_reader.h
new file mode 100644
index 0000000..c342a20
--- /dev/null
+++ b/examples/file_reader.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "examples/file_reader_interface.h"
+
+namespace libgav1 {
+
+// Temporal Unit based file reader class. Currently supports only IVF files.
+class FileReader : public FileReaderInterface {
+ public:
+ enum FileType {
+ kFileTypeUnknown,
+ kFileTypeIvf,
+ };
+
+ // Creates and returns a FileReader that reads from |file_name|.
+  // If |error_tolerant| is true, format and read errors are ignored and
+  // ReadTemporalUnit() may return truncated data.
+ // Returns nullptr when the file does not exist, cannot be read, or is not an
+ // IVF file.
+ static std::unique_ptr<FileReaderInterface> Open(const std::string& file_name,
+ bool error_tolerant = false);
+
+ FileReader() = delete;
+ FileReader(const FileReader&) = delete;
+ FileReader& operator=(const FileReader&) = delete;
+
+ // Closes |file_|.
+ ~FileReader() override;
+
+ // Reads a temporal unit from |file_| and writes the data to |tu_data|.
+ // Returns true when:
+ // - A temporal unit is read successfully, or
+ // - At end of file.
+ // When ReadTemporalUnit() is called at the end of the file, it will return
+ // true without writing any data to |tu_data|.
+ //
+ // The |timestamp| pointer is optional: callers not interested in timestamps
+ // can pass nullptr. When |timestamp| is not a nullptr, this function returns
+ // the presentation timestamp from the IVF frame header.
+ /*LIBGAV1_MUST_USE_RESULT*/ bool ReadTemporalUnit(
+ std::vector<uint8_t>* tu_data, int64_t* timestamp) override;
+
+ /*LIBGAV1_MUST_USE_RESULT*/ bool IsEndOfFile() const override {
+ return feof(file_) != 0;
+ }
+
+ // The values returned by these accessors are strictly informative. No
+ // validation is performed when they are read from the IVF file header.
+ size_t width() const override { return width_; }
+ size_t height() const override { return height_; }
+ size_t frame_rate() const override { return frame_rate_; }
+ size_t time_scale() const override { return time_scale_; }
+
+ private:
+ FileReader(FILE* file, bool owns_file, bool error_tolerant)
+ : file_(file), owns_file_(owns_file), error_tolerant_(error_tolerant) {}
+
+ bool ReadIvfFileHeader();
+
+ FILE* file_ = nullptr;
+ size_t width_ = 0;
+ size_t height_ = 0;
+ size_t frame_rate_ = 0;
+ size_t time_scale_ = 0;
+ FileType type_ = kFileTypeUnknown;
+ // True if this object owns file_ and is responsible for closing it when
+ // done.
+ const bool owns_file_;
+ const bool error_tolerant_;
+
+ static bool registered_in_factory_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_FILE_READER_H_
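A minimal sketch of driving this reader from a caller; the input file name is a
placeholder and error handling is reduced to early returns.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    #include "examples/file_reader.h"

    int main() {
      auto reader = libgav1::FileReader::Open("input.ivf");
      if (reader == nullptr) return 1;
      std::vector<uint8_t> temporal_unit;
      int64_t timestamp = 0;
      while (!reader->IsEndOfFile()) {
        if (!reader->ReadTemporalUnit(&temporal_unit, &timestamp)) return 1;
        if (temporal_unit.empty()) break;  // Reached the end of the file.
        std::printf("Read %zu bytes, timestamp %lld\n", temporal_unit.size(),
                    static_cast<long long>(timestamp));
      }
      return 0;
    }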
diff --git a/examples/file_reader_constants.cc b/examples/file_reader_constants.cc
new file mode 100644
index 0000000..8439071
--- /dev/null
+++ b/examples/file_reader_constants.cc
@@ -0,0 +1,23 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader_constants.h"
+
+namespace libgav1 {
+
+const char kIvfSignature[4] = {'D', 'K', 'I', 'F'};
+const char kAv1FourCcUpper[4] = {'A', 'V', '0', '1'};
+const char kAv1FourCcLower[4] = {'a', 'v', '0', '1'};
+
+} // namespace libgav1
diff --git a/examples/file_reader_constants.h b/examples/file_reader_constants.h
new file mode 100644
index 0000000..00922b4
--- /dev/null
+++ b/examples/file_reader_constants.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
+
+namespace libgav1 {
+
+enum {
+ kIvfHeaderVersion = 0,
+ kIvfFrameHeaderSize = 12,
+ kIvfFileHeaderSize = 32,
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ kMaxTemporalUnitSize = 512 * 1024,
+#else
+ kMaxTemporalUnitSize = 256 * 1024 * 1024,
+#endif
+};
+
+extern const char kIvfSignature[4];
+extern const char kAv1FourCcUpper[4];
+extern const char kAv1FourCcLower[4];
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
diff --git a/examples/file_reader_factory.cc b/examples/file_reader_factory.cc
new file mode 100644
index 0000000..d5260eb
--- /dev/null
+++ b/examples/file_reader_factory.cc
@@ -0,0 +1,51 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader_factory.h"
+
+#include <new>
+
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+std::vector<FileReaderFactory::OpenFunction>* GetFileReaderOpenFunctions() {
+ static auto* open_functions =
+ new (std::nothrow) std::vector<FileReaderFactory::OpenFunction>();
+ return open_functions;
+}
+
+} // namespace
+
+bool FileReaderFactory::RegisterReader(OpenFunction open_function) {
+ if (open_function == nullptr) return false;
+ auto* open_functions = GetFileReaderOpenFunctions();
+ const size_t num_readers = open_functions->size();
+ open_functions->push_back(open_function);
+ return open_functions->size() == num_readers + 1;
+}
+
+std::unique_ptr<FileReaderInterface> FileReaderFactory::OpenReader(
+ const std::string& file_name, const bool error_tolerant /*= false*/) {
+ for (auto* open_function : *GetFileReaderOpenFunctions()) {
+ auto reader = open_function(file_name, error_tolerant);
+ if (reader == nullptr) continue;
+ return reader;
+ }
+ LIBGAV1_EXAMPLES_LOG_ERROR("No file reader able to open input");
+ return nullptr;
+}
+
+} // namespace libgav1
diff --git a/examples/file_reader_factory.h b/examples/file_reader_factory.h
new file mode 100644
index 0000000..0f53484
--- /dev/null
+++ b/examples/file_reader_factory.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
+
+#include <memory>
+#include <string>
+
+#include "examples/file_reader_interface.h"
+
+namespace libgav1 {
+
+class FileReaderFactory {
+ public:
+ using OpenFunction = std::unique_ptr<FileReaderInterface> (*)(
+ const std::string& file_name, bool error_tolerant);
+
+ FileReaderFactory() = delete;
+ FileReaderFactory(const FileReaderFactory&) = delete;
+ FileReaderFactory& operator=(const FileReaderFactory&) = delete;
+ ~FileReaderFactory() = default;
+
+ // Registers the OpenFunction for a FileReaderInterface and returns true when
+ // registration succeeds.
+ static bool RegisterReader(OpenFunction open_function);
+
+ // Passes |file_name| to each OpenFunction until one succeeds. Returns nullptr
+ // when no reader is found for |file_name|. Otherwise a FileReaderInterface is
+ // returned. If |error_tolerant| is true and the reader supports it, some
+ // format and read errors may be ignored and partial data returned.
+ static std::unique_ptr<FileReaderInterface> OpenReader(
+ const std::string& file_name, bool error_tolerant = false);
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
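A sketch of how an additional reader could hook into the factory; the "AnnexB"
name is hypothetical, and the open function below deliberately returns nullptr
so the factory falls through to the next registered reader.

    #include <memory>
    #include <string>

    #include "examples/file_reader_factory.h"
    #include "examples/file_reader_interface.h"

    namespace {

    std::unique_ptr<libgav1::FileReaderInterface> OpenExampleAnnexBReader(
        const std::string& file_name, bool error_tolerant) {
      // A real reader would probe |file_name| here and return an instance of a
      // FileReaderInterface subclass on success.
      static_cast<void>(file_name);
      static_cast<void>(error_tolerant);
      return nullptr;
    }

    // Registered at static-initialization time, mirroring
    // FileReader::registered_in_factory_ in examples/file_reader.h.
    const bool example_reader_registered =
        libgav1::FileReaderFactory::RegisterReader(OpenExampleAnnexBReader);

    }  // namespace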
diff --git a/examples/file_reader_interface.h b/examples/file_reader_interface.h
new file mode 100644
index 0000000..d8f7030
--- /dev/null
+++ b/examples/file_reader_interface.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+namespace libgav1 {
+
+class FileReaderInterface {
+ public:
+ FileReaderInterface() = default;
+ FileReaderInterface(const FileReaderInterface&) = delete;
+ FileReaderInterface& operator=(const FileReaderInterface&) = delete;
+
+ FileReaderInterface(FileReaderInterface&&) = default;
+ FileReaderInterface& operator=(FileReaderInterface&&) = default;
+
+ // Closes the file.
+ virtual ~FileReaderInterface() = default;
+
+ // Reads a temporal unit from the file and writes the data to |tu_data|.
+ // Returns true when:
+ // - A temporal unit is read successfully, or
+ // - At end of file.
+ // When ReadTemporalUnit() is called at the end of the file, it will return
+ // true without writing any data to |tu_data|.
+ //
+ // The |timestamp| pointer is optional: callers not interested in timestamps
+ // can pass nullptr. When |timestamp| is not a nullptr, this function returns
+ // the presentation timestamp of the temporal unit.
+ /*LIBGAV1_MUST_USE_RESULT*/ virtual bool ReadTemporalUnit(
+ std::vector<uint8_t>* tu_data, int64_t* timestamp) = 0;
+
+ /*LIBGAV1_MUST_USE_RESULT*/ virtual bool IsEndOfFile() const = 0;
+
+ // The values returned by these accessors are strictly informative. No
+ // validation is performed when they are read from file.
+ virtual size_t width() const = 0;
+ virtual size_t height() const = 0;
+ virtual size_t frame_rate() const = 0;
+ virtual size_t time_scale() const = 0;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
diff --git a/examples/file_writer.cc b/examples/file_writer.cc
new file mode 100644
index 0000000..54afe14
--- /dev/null
+++ b/examples/file_writer.cc
@@ -0,0 +1,183 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_writer.h"
+
+#include <cerrno>
+#include <cstdio>
+#include <cstring>
+#include <new>
+#include <string>
+
+#if defined(_WIN32)
+#include <fcntl.h>
+#include <io.h>
+#endif
+
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+FILE* SetBinaryMode(FILE* stream) {
+#if defined(_WIN32)
+ _setmode(_fileno(stream), _O_BINARY);
+#endif
+ return stream;
+}
+
+std::string GetY4mColorSpaceString(
+ const FileWriter::Y4mParameters& y4m_parameters) {
+ std::string color_space_string;
+ switch (y4m_parameters.image_format) {
+ case kImageFormatMonochrome400:
+ color_space_string = "mono";
+ break;
+ case kImageFormatYuv420:
+ if (y4m_parameters.bitdepth == 8) {
+ if (y4m_parameters.chroma_sample_position ==
+ kChromaSamplePositionVertical) {
+ color_space_string = "420mpeg2";
+ } else if (y4m_parameters.chroma_sample_position ==
+ kChromaSamplePositionColocated) {
+ color_space_string = "420";
+ } else {
+ color_space_string = "420jpeg";
+ }
+ } else {
+ color_space_string = "420";
+ }
+ break;
+ case kImageFormatYuv422:
+ color_space_string = "422";
+ break;
+ case kImageFormatYuv444:
+ color_space_string = "444";
+ break;
+ }
+
+ if (y4m_parameters.bitdepth > 8) {
+ const bool monochrome =
+ y4m_parameters.image_format == kImageFormatMonochrome400;
+ if (!monochrome) color_space_string += "p";
+ color_space_string += std::to_string(y4m_parameters.bitdepth);
+ }
+
+ return color_space_string;
+}
+
+} // namespace
+
+FileWriter::~FileWriter() { fclose(file_); }
+
+std::unique_ptr<FileWriter> FileWriter::Open(
+ const std::string& file_name, FileType file_type,
+ const Y4mParameters* const y4m_parameters) {
+ if (file_name.empty() ||
+ (file_type == kFileTypeY4m && y4m_parameters == nullptr) ||
+ (file_type != kFileTypeRaw && file_type != kFileTypeY4m)) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Invalid parameters");
+ return nullptr;
+ }
+
+ FILE* raw_file_ptr;
+
+ if (file_name == "-") {
+ raw_file_ptr = SetBinaryMode(stdout);
+ } else {
+ raw_file_ptr = fopen(file_name.c_str(), "wb");
+ }
+
+ if (raw_file_ptr == nullptr) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Unable to open output file");
+ return nullptr;
+ }
+
+ std::unique_ptr<FileWriter> file(new (std::nothrow) FileWriter(raw_file_ptr));
+ if (file == nullptr) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory");
+ fclose(raw_file_ptr);
+ return nullptr;
+ }
+
+ if (file_type == kFileTypeY4m && !file->WriteY4mFileHeader(*y4m_parameters)) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Error writing Y4M file header");
+ return nullptr;
+ }
+
+ file->file_type_ = file_type;
+ return file;
+}
+
+bool FileWriter::WriteFrame(const DecoderBuffer& frame_buffer) {
+ if (file_type_ == kFileTypeY4m) {
+ const char kY4mFrameHeader[] = "FRAME\n";
+ if (fwrite(kY4mFrameHeader, 1, strlen(kY4mFrameHeader), file_) !=
+ strlen(kY4mFrameHeader)) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Error writing Y4M frame header");
+ return false;
+ }
+ }
+
+ const size_t pixel_size =
+ (frame_buffer.bitdepth == 8) ? sizeof(uint8_t) : sizeof(uint16_t);
+ for (int plane_index = 0; plane_index < frame_buffer.NumPlanes();
+ ++plane_index) {
+ const int height = frame_buffer.displayed_height[plane_index];
+ const int width = frame_buffer.displayed_width[plane_index];
+ const int stride = frame_buffer.stride[plane_index];
+ const uint8_t* const plane_pointer = frame_buffer.plane[plane_index];
+ for (int row = 0; row < height; ++row) {
+ const uint8_t* const row_pointer = &plane_pointer[row * stride];
+ if (fwrite(row_pointer, pixel_size, width, file_) !=
+ static_cast<size_t>(width)) {
+ char error_string[256];
+ snprintf(error_string, sizeof(error_string),
+ "File write failed: %s (errno=%d)", strerror(errno), errno);
+ LIBGAV1_EXAMPLES_LOG_ERROR(error_string);
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+// Writes Y4M file header to |file_| and returns true when successful.
+//
+// A Y4M file begins with a plaintext file signature of 'YUV4MPEG2 '.
+//
+// Following the signature is any number of optional parameters preceded by a
+// space. We always write:
+//
+// Width: 'W' followed by image width in pixels.
+// Height: 'H' followed by image height in pixels.
+// Frame Rate: 'F' followed by frames/second in the form numerator:denominator.
+// Interlacing: 'I' followed by 'p' for progressive.
+// Color space: 'C' followed by a string representation of the color space.
+//
+// More info here: https://wiki.multimedia.cx/index.php/YUV4MPEG2
+bool FileWriter::WriteY4mFileHeader(const Y4mParameters& y4m_parameters) {
+ std::string y4m_header = "YUV4MPEG2";
+ y4m_header += " W" + std::to_string(y4m_parameters.width);
+ y4m_header += " H" + std::to_string(y4m_parameters.height);
+ y4m_header += " F" + std::to_string(y4m_parameters.frame_rate_numerator) +
+ ":" + std::to_string(y4m_parameters.frame_rate_denominator);
+ y4m_header += " Ip C" + GetY4mColorSpaceString(y4m_parameters);
+ y4m_header += "\n";
+ return fwrite(y4m_header.c_str(), 1, y4m_header.length(), file_) ==
+ y4m_header.length();
+}
+
+} // namespace libgav1
diff --git a/examples/file_writer.h b/examples/file_writer.h
new file mode 100644
index 0000000..00f6cc3
--- /dev/null
+++ b/examples/file_writer.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_WRITER_H_
+#define LIBGAV1_EXAMPLES_FILE_WRITER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+
+#include "gav1/decoder_buffer.h"
+
+namespace libgav1 {
+
+// Frame based file writer class. Supports only Y4M (YUV4MPEG2) and RAW output.
+class FileWriter {
+ public:
+ enum FileType : uint8_t {
+ kFileTypeRaw,
+ kFileTypeY4m,
+ };
+
+ struct Y4mParameters {
+ Y4mParameters() = default;
+ Y4mParameters(size_t width, size_t height, size_t frame_rate_numerator,
+ size_t frame_rate_denominator,
+ ChromaSamplePosition chroma_sample_position,
+ ImageFormat image_format, size_t bitdepth)
+ : width(width),
+ height(height),
+ frame_rate_numerator(frame_rate_numerator),
+ frame_rate_denominator(frame_rate_denominator),
+ chroma_sample_position(chroma_sample_position),
+ image_format(image_format),
+ bitdepth(bitdepth) {}
+
+ Y4mParameters(const Y4mParameters& rhs) = default;
+ Y4mParameters& operator=(const Y4mParameters& rhs) = default;
+ Y4mParameters(Y4mParameters&& rhs) = default;
+ Y4mParameters& operator=(Y4mParameters&& rhs) = default;
+
+ size_t width = 0;
+ size_t height = 0;
+ size_t frame_rate_numerator = 30;
+ size_t frame_rate_denominator = 1;
+ ChromaSamplePosition chroma_sample_position = kChromaSamplePositionUnknown;
+ ImageFormat image_format = kImageFormatYuv420;
+ size_t bitdepth = 8;
+ };
+
+ // Opens |file_name|. When |file_type| is kFileTypeY4m the Y4M file header is
+ // written out to |file_| before this method returns.
+ //
+ // Returns a FileWriter instance after the file is opened successfully for
+ // kFileTypeRaw files, and after the Y4M file header bytes are written for
+ // kFileTypeY4m files. Returns nullptr upon failure.
+ static std::unique_ptr<FileWriter> Open(const std::string& file_name,
+ FileType type,
+ const Y4mParameters* y4m_parameters);
+
+ FileWriter() = delete;
+ FileWriter(const FileWriter&) = delete;
+ FileWriter& operator=(const FileWriter&) = delete;
+
+ FileWriter(FileWriter&&) = default;
+ FileWriter& operator=(FileWriter&&) = default;
+
+ // Closes |file_|.
+ ~FileWriter();
+
+ // Writes the frame data in |frame_buffer| to |file_|. Returns true after
+ // successful write of |frame_buffer| data.
+ /*LIBGAV1_MUST_USE_RESULT*/ bool WriteFrame(
+ const DecoderBuffer& frame_buffer);
+
+ private:
+ explicit FileWriter(FILE* file) : file_(file) {}
+
+ bool WriteY4mFileHeader(const Y4mParameters& y4m_parameters);
+
+ FILE* file_ = nullptr;
+ FileType file_type_ = kFileTypeRaw;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_FILE_WRITER_H_
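A minimal sketch of driving the interface declared above, assuming frames arrive from a hypothetical GetNextDecodedFrame() helper; the real decode loop is in gav1_decode.cc below.

    // Sketch: writing decoded frames as Y4M through FileWriter.
    #include <memory>
    #include <string>

    #include "examples/file_writer.h"
    #include "gav1/decoder_buffer.h"

    const libgav1::DecoderBuffer* GetNextDecodedFrame();  // hypothetical source

    bool WriteAllFramesAsY4m(const std::string& path) {
      libgav1::FileWriter::Y4mParameters y4m;  // defaults: 30:1 fps, 4:2:0, 8-bit
      y4m.width = 640;
      y4m.height = 360;
      auto writer = libgav1::FileWriter::Open(
          path, libgav1::FileWriter::kFileTypeY4m, &y4m);
      if (writer == nullptr) return false;  // open or Y4M header write failed
      const libgav1::DecoderBuffer* frame;
      while ((frame = GetNextDecodedFrame()) != nullptr) {
        if (!writer->WriteFrame(*frame)) return false;
      }
      return true;  // the underlying FILE* is closed when |writer| is destroyed
    }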
diff --git a/examples/gav1_decode.cc b/examples/gav1_decode.cc
new file mode 100644
index 0000000..4de0ba2
--- /dev/null
+++ b/examples/gav1_decode.cc
@@ -0,0 +1,452 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <deque>
+#include <memory>
+#include <new>
+#include <vector>
+
+#include "absl/strings/numbers.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "examples/file_reader_factory.h"
+#include "examples/file_reader_interface.h"
+#include "examples/file_writer.h"
+#include "gav1/decoder.h"
+
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+#include "examples/gav1_decode_cv_pixel_buffer_pool.h"
+#endif
+
+namespace {
+
+struct Options {
+ const char* input_file_name = nullptr;
+ const char* output_file_name = nullptr;
+ const char* frame_timing_file_name = nullptr;
+ libgav1::FileWriter::FileType output_file_type =
+ libgav1::FileWriter::kFileTypeRaw;
+ uint8_t post_filter_mask = 0x1f;
+ int threads = 1;
+ bool frame_parallel = false;
+ bool output_all_layers = false;
+ int operating_point = 0;
+ int limit = 0;
+ int skip = 0;
+ int verbose = 0;
+};
+
+struct Timing {
+ absl::Duration input;
+ absl::Duration dequeue;
+};
+
+struct FrameTiming {
+ absl::Time enqueue;
+ absl::Time dequeue;
+};
+
+void PrintHelp(FILE* const fout) {
+ fprintf(fout,
+ "Usage: gav1_decode [options] <input file>"
+ " [-o <output file>]\n");
+ fprintf(fout, "\n");
+ fprintf(fout, "Options:\n");
+ fprintf(fout, " -h, --help This help message.\n");
+ fprintf(fout, " --threads <positive integer> (Default 1).\n");
+ fprintf(fout, " --frame_parallel.\n");
+ fprintf(fout,
+ " --limit <integer> Stop decoding after N frames (0 = all).\n");
+ fprintf(fout, " --skip <integer> Skip initial N frames (Default 0).\n");
+ fprintf(fout, " --version.\n");
+ fprintf(fout, " --y4m (Default false).\n");
+ fprintf(fout, " --raw (Default true).\n");
+ fprintf(fout, " -v logging verbosity, can be used multiple times.\n");
+ fprintf(fout, " --all_layers.\n");
+ fprintf(fout,
+ " --operating_point <integer between 0 and 31> (Default 0).\n");
+ fprintf(fout,
+ " --frame_timing <file> Output per-frame timing to <file> in tsv"
+ " format.\n Yields meaningful results only when frame parallel is"
+ " off.\n");
+ fprintf(fout, "\nAdvanced settings:\n");
+ fprintf(fout, " --post_filter_mask <integer> (Default 0x1f).\n");
+ fprintf(fout,
+ " Mask indicating which post filters should be applied to the"
+ " reconstructed\n frame. This may be given as octal, decimal or"
+ " hexadecimal. From LSB:\n");
+ fprintf(fout, " Bit 0: Loop filter (deblocking filter)\n");
+ fprintf(fout, " Bit 1: Cdef\n");
+ fprintf(fout, " Bit 2: SuperRes\n");
+ fprintf(fout, " Bit 3: Loop Restoration\n");
+ fprintf(fout, " Bit 4: Film Grain Synthesis\n");
+}
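For clarity, the bit positions listed in the help text combine as plain bit flags. The enum names in this sketch are illustrative only; gav1_decode passes the raw integer straight through to DecoderSettings::post_filter_mask.

    #include <cstdint>

    // Illustrative names for the --post_filter_mask bits described above.
    enum PostFilterBits : uint8_t {
      kApplyLoopFilter = 1 << 0,       // deblocking filter
      kApplyCdef = 1 << 1,
      kApplySuperRes = 1 << 2,
      kApplyLoopRestoration = 1 << 3,
      kApplyFilmGrain = 1 << 4,
    };

    // --post_filter_mask 0x3 keeps only deblocking and CDEF; the default 0x1f
    // enables all five post filters.
    constexpr uint8_t kDeblockAndCdef = kApplyLoopFilter | kApplyCdef;
    static_assert(kDeblockAndCdef == 0x03, "");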
+
+void ParseOptions(int argc, char* argv[], Options* const options) {
+ for (int i = 1; i < argc; ++i) {
+ int32_t value;
+ if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
+ PrintHelp(stdout);
+ exit(EXIT_SUCCESS);
+ } else if (strcmp(argv[i], "-o") == 0) {
+ if (++i >= argc) {
+ fprintf(stderr, "Missing argument for '-o'\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->output_file_name = argv[i];
+ } else if (strcmp(argv[i], "--frame_timing") == 0) {
+ if (++i >= argc) {
+ fprintf(stderr, "Missing argument for '--frame_timing'\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->frame_timing_file_name = argv[i];
+ } else if (strcmp(argv[i], "--version") == 0) {
+ printf("gav1_decode, a libgav1 based AV1 decoder\n");
+ printf("libgav1 %s\n", libgav1::GetVersionString());
+ printf("max bitdepth: %d\n", libgav1::Decoder::GetMaxBitdepth());
+ printf("build configuration: %s\n", libgav1::GetBuildConfiguration());
+ exit(EXIT_SUCCESS);
+ } else if (strcmp(argv[i], "-v") == 0) {
+ ++options->verbose;
+ } else if (strcmp(argv[i], "--raw") == 0) {
+ options->output_file_type = libgav1::FileWriter::kFileTypeRaw;
+ } else if (strcmp(argv[i], "--y4m") == 0) {
+ options->output_file_type = libgav1::FileWriter::kFileTypeY4m;
+ } else if (strcmp(argv[i], "--threads") == 0) {
+ if (++i >= argc || !absl::SimpleAtoi(argv[i], &value)) {
+ fprintf(stderr, "Missing/Invalid value for --threads.\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->threads = value;
+ } else if (strcmp(argv[i], "--frame_parallel") == 0) {
+ options->frame_parallel = true;
+ } else if (strcmp(argv[i], "--all_layers") == 0) {
+ options->output_all_layers = true;
+ } else if (strcmp(argv[i], "--operating_point") == 0) {
+ if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0 ||
+ value >= 32) {
+ fprintf(stderr, "Missing/Invalid value for --operating_point.\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->operating_point = value;
+ } else if (strcmp(argv[i], "--limit") == 0) {
+ if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0) {
+ fprintf(stderr, "Missing/Invalid value for --limit.\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->limit = value;
+ } else if (strcmp(argv[i], "--skip") == 0) {
+ if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0) {
+ fprintf(stderr, "Missing/Invalid value for --skip.\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->skip = value;
+ } else if (strcmp(argv[i], "--post_filter_mask") == 0) {
+ errno = 0;
+ char* endptr = nullptr;
+ value = (++i >= argc) ? -1
+ // NOLINTNEXTLINE(runtime/deprecated_fn)
+ : static_cast<int32_t>(strtol(argv[i], &endptr, 0));
+ // Only the last 5 bits of the mask can be set.
+ if ((value & ~31) != 0 || errno != 0 || endptr == argv[i]) {
+ fprintf(stderr, "Invalid value for --post_filter_mask.\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->post_filter_mask = value;
+ } else if (strlen(argv[i]) > 1 && argv[i][0] == '-') {
+ fprintf(stderr, "Unknown option '%s'!\n", argv[i]);
+ exit(EXIT_FAILURE);
+ } else {
+ if (options->input_file_name == nullptr) {
+ options->input_file_name = argv[i];
+ } else {
+ fprintf(stderr, "Found invalid parameter: \"%s\".\n", argv[i]);
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ }
+ }
+
+ if (argc < 2 || options->input_file_name == nullptr) {
+ fprintf(stderr, "Input file is required!\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+}
+
+using InputBuffer = std::vector<uint8_t>;
+
+class InputBuffers {
+ public:
+ ~InputBuffers() {
+ for (auto buffer : free_buffers_) {
+ delete buffer;
+ }
+ }
+ InputBuffer* GetFreeBuffer() {
+ if (free_buffers_.empty()) {
+ auto* const buffer = new (std::nothrow) InputBuffer();
+ if (buffer == nullptr) {
+ fprintf(stderr, "Failed to create input buffer.\n");
+ return nullptr;
+ }
+ free_buffers_.push_back(buffer);
+ }
+ InputBuffer* const buffer = free_buffers_.front();
+ free_buffers_.pop_front();
+ return buffer;
+ }
+
+ void ReleaseInputBuffer(InputBuffer* buffer) {
+ free_buffers_.push_back(buffer);
+ }
+
+ private:
+ std::deque<InputBuffer*> free_buffers_;
+};
+
+void ReleaseInputBuffer(void* callback_private_data,
+ void* buffer_private_data) {
+ auto* const input_buffers = static_cast<InputBuffers*>(callback_private_data);
+ input_buffers->ReleaseInputBuffer(
+ static_cast<InputBuffer*>(buffer_private_data));
+}
+
+int CloseFile(FILE* stream) { return (stream == nullptr) ? 0 : fclose(stream); }
+
+} // namespace
+
+int main(int argc, char* argv[]) {
+ Options options;
+ ParseOptions(argc, argv, &options);
+
+ auto file_reader =
+ libgav1::FileReaderFactory::OpenReader(options.input_file_name);
+ if (file_reader == nullptr) {
+ fprintf(stderr, "Cannot open input file!\n");
+ return EXIT_FAILURE;
+ }
+
+ std::unique_ptr<FILE, decltype(&CloseFile)> frame_timing_file(nullptr,
+ &CloseFile);
+ if (options.frame_timing_file_name != nullptr) {
+ frame_timing_file.reset(fopen(options.frame_timing_file_name, "wb"));
+ if (frame_timing_file == nullptr) {
+ fprintf(stderr, "Cannot open frame timing file '%s'!\n",
+ options.frame_timing_file_name);
+ return EXIT_FAILURE;
+ }
+ }
+
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+ // Reference frames + 1 scratch frame (for either the current frame or the
+ // film grain frame).
+ constexpr int kNumBuffers = 8 + 1;
+ std::unique_ptr<Gav1DecodeCVPixelBufferPool> cv_pixel_buffers =
+ Gav1DecodeCVPixelBufferPool::Create(kNumBuffers);
+ if (cv_pixel_buffers == nullptr) {
+ fprintf(stderr, "Cannot create Gav1DecodeCVPixelBufferPool!\n");
+ return EXIT_FAILURE;
+ }
+#endif
+
+ InputBuffers input_buffers;
+ libgav1::Decoder decoder;
+ libgav1::DecoderSettings settings;
+ settings.post_filter_mask = options.post_filter_mask;
+ settings.threads = options.threads;
+ settings.frame_parallel = options.frame_parallel;
+ settings.output_all_layers = options.output_all_layers;
+ settings.operating_point = options.operating_point;
+ settings.blocking_dequeue = true;
+ settings.callback_private_data = &input_buffers;
+ settings.release_input_buffer = ReleaseInputBuffer;
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+ settings.on_frame_buffer_size_changed = Gav1DecodeOnCVPixelBufferSizeChanged;
+ settings.get_frame_buffer = Gav1DecodeGetCVPixelBuffer;
+ settings.release_frame_buffer = Gav1DecodeReleaseCVPixelBuffer;
+ settings.callback_private_data = cv_pixel_buffers.get();
+ settings.release_input_buffer = nullptr;
+  // TODO(vigneshv): Support using frame parallel mode with
+  // CVPixelBufferPool.
+ settings.frame_parallel = false;
+#endif
+ libgav1::StatusCode status = decoder.Init(&settings);
+ if (status != libgav1::kStatusOk) {
+ fprintf(stderr, "Error initializing decoder: %s\n",
+ libgav1::GetErrorString(status));
+ return EXIT_FAILURE;
+ }
+
+ fprintf(stderr, "decoding '%s'\n", options.input_file_name);
+ if (options.verbose > 0 && options.skip > 0) {
+ fprintf(stderr, "skipping %d frame(s).\n", options.skip);
+ }
+
+ int input_frames = 0;
+ int decoded_frames = 0;
+ Timing timing = {};
+ std::vector<FrameTiming> frame_timing;
+ const bool record_frame_timing = frame_timing_file != nullptr;
+ std::unique_ptr<libgav1::FileWriter> file_writer;
+ InputBuffer* input_buffer = nullptr;
+ bool limit_reached = false;
+ bool dequeue_finished = false;
+ const absl::Time decode_loop_start = absl::Now();
+ do {
+ if (input_buffer == nullptr && !file_reader->IsEndOfFile() &&
+ !limit_reached) {
+ input_buffer = input_buffers.GetFreeBuffer();
+ if (input_buffer == nullptr) return EXIT_FAILURE;
+ const absl::Time read_start = absl::Now();
+ if (!file_reader->ReadTemporalUnit(input_buffer,
+ /*timestamp=*/nullptr)) {
+ fprintf(stderr, "Error reading input file.\n");
+ return EXIT_FAILURE;
+ }
+ timing.input += absl::Now() - read_start;
+ }
+
+ if (++input_frames <= options.skip) {
+ input_buffers.ReleaseInputBuffer(input_buffer);
+ input_buffer = nullptr;
+ continue;
+ }
+
+ if (input_buffer != nullptr) {
+ if (input_buffer->empty()) {
+ input_buffers.ReleaseInputBuffer(input_buffer);
+ input_buffer = nullptr;
+ continue;
+ }
+
+ const absl::Time enqueue_start = absl::Now();
+ status = decoder.EnqueueFrame(input_buffer->data(), input_buffer->size(),
+ static_cast<int64_t>(frame_timing.size()),
+ /*buffer_private_data=*/input_buffer);
+ if (status == libgav1::kStatusOk) {
+ if (options.verbose > 1) {
+ fprintf(stderr, "enqueue frame (length %zu)\n", input_buffer->size());
+ }
+ if (record_frame_timing) {
+ FrameTiming enqueue_time = {enqueue_start, absl::UnixEpoch()};
+ frame_timing.emplace_back(enqueue_time);
+ }
+
+ input_buffer = nullptr;
+ // Continue to enqueue frames until we get a kStatusTryAgain status.
+ continue;
+ }
+ if (status != libgav1::kStatusTryAgain) {
+ fprintf(stderr, "Unable to enqueue frame: %s\n",
+ libgav1::GetErrorString(status));
+ return EXIT_FAILURE;
+ }
+ }
+
+ const libgav1::DecoderBuffer* buffer;
+ status = decoder.DequeueFrame(&buffer);
+ if (status == libgav1::kStatusNothingToDequeue) {
+ dequeue_finished = true;
+ continue;
+ }
+ if (status != libgav1::kStatusOk) {
+ fprintf(stderr, "Unable to dequeue frame: %s\n",
+ libgav1::GetErrorString(status));
+ return EXIT_FAILURE;
+ }
+ dequeue_finished = false;
+ if (buffer == nullptr) continue;
+ ++decoded_frames;
+ if (options.verbose > 1) {
+ fprintf(stderr, "buffer dequeued\n");
+ }
+
+ if (record_frame_timing) {
+ frame_timing[static_cast<int>(buffer->user_private_data)].dequeue =
+ absl::Now();
+ }
+
+ if (options.output_file_name != nullptr && file_writer == nullptr) {
+ libgav1::FileWriter::Y4mParameters y4m_parameters;
+ y4m_parameters.width = buffer->displayed_width[0];
+ y4m_parameters.height = buffer->displayed_height[0];
+ y4m_parameters.frame_rate_numerator = file_reader->frame_rate();
+ y4m_parameters.frame_rate_denominator = file_reader->time_scale();
+ y4m_parameters.chroma_sample_position = buffer->chroma_sample_position;
+ y4m_parameters.image_format = buffer->image_format;
+ y4m_parameters.bitdepth = static_cast<size_t>(buffer->bitdepth);
+ file_writer = libgav1::FileWriter::Open(
+ options.output_file_name, options.output_file_type, &y4m_parameters);
+ if (file_writer == nullptr) {
+ fprintf(stderr, "Cannot open output file!\n");
+ return EXIT_FAILURE;
+ }
+ }
+
+ if (!limit_reached && file_writer != nullptr &&
+ !file_writer->WriteFrame(*buffer)) {
+ fprintf(stderr, "Error writing output file.\n");
+ return EXIT_FAILURE;
+ }
+ if (options.limit > 0 && options.limit == decoded_frames) {
+ limit_reached = true;
+ if (input_buffer != nullptr) {
+ input_buffers.ReleaseInputBuffer(input_buffer);
+ }
+ input_buffer = nullptr;
+ }
+ } while (input_buffer != nullptr ||
+ (!file_reader->IsEndOfFile() && !limit_reached) ||
+ !dequeue_finished);
+ timing.dequeue = absl::Now() - decode_loop_start - timing.input;
+
+ if (record_frame_timing) {
+    // Note: in frame parallel mode, the timing below will be skewed by time
+    // spent queueing additional frames and waiting in the output queue for
+    // previous frames, so the values reported won't be that meaningful.
+ fprintf(frame_timing_file.get(), "frame number\tdecode time us\n");
+ for (size_t i = 0; i < frame_timing.size(); ++i) {
+ const int decode_time_us = static_cast<int>(absl::ToInt64Microseconds(
+ frame_timing[i].dequeue - frame_timing[i].enqueue));
+ fprintf(frame_timing_file.get(), "%zu\t%d\n", i, decode_time_us);
+ }
+ }
+
+ if (options.verbose > 0) {
+ fprintf(stderr, "time to read input: %d us\n",
+ static_cast<int>(absl::ToInt64Microseconds(timing.input)));
+ const int decode_time_us =
+ static_cast<int>(absl::ToInt64Microseconds(timing.dequeue));
+ const double decode_fps =
+ (decode_time_us == 0) ? 0.0 : 1.0e6 * decoded_frames / decode_time_us;
+ fprintf(stderr, "time to decode input: %d us (%d frames, %.2f fps)\n",
+ decode_time_us, decoded_frames, decode_fps);
+ }
+
+ return EXIT_SUCCESS;
+}
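Stripped of option parsing, timing, and input-buffer recycling, the enqueue/dequeue protocol used by main() above reduces to roughly the following sketch for a single in-memory temporal unit with default settings; it is an illustration, not a drop-in replacement for the loop above.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    #include "gav1/decoder.h"

    bool DecodeOneTemporalUnit(const std::vector<uint8_t>& data) {
      libgav1::Decoder decoder;
      libgav1::DecoderSettings settings;  // defaults: one thread, not parallel
      if (decoder.Init(&settings) != libgav1::kStatusOk) return false;
      // The third argument is echoed back as DecoderBuffer::user_private_data.
      if (decoder.EnqueueFrame(data.data(), data.size(), 0,
                               /*buffer_private_data=*/nullptr) !=
          libgav1::kStatusOk) {
        return false;
      }
      const libgav1::DecoderBuffer* buffer;
      // Dequeue until the decoder reports that nothing more is available.
      while (decoder.DequeueFrame(&buffer) == libgav1::kStatusOk) {
        if (buffer == nullptr) continue;  // no displayable frame this call
        printf("decoded %dx%d frame\n",
               static_cast<int>(buffer->displayed_width[0]),
               static_cast<int>(buffer->displayed_height[0]));
      }
      return true;
    }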
diff --git a/examples/gav1_decode_cv_pixel_buffer_pool.cc b/examples/gav1_decode_cv_pixel_buffer_pool.cc
new file mode 100644
index 0000000..6aa4e61
--- /dev/null
+++ b/examples/gav1_decode_cv_pixel_buffer_pool.cc
@@ -0,0 +1,278 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/gav1_decode_cv_pixel_buffer_pool.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <new>
+#include <type_traits>
+
+namespace {
+
+struct CFTypeDeleter {
+ void operator()(CFTypeRef cf) const { CFRelease(cf); }
+};
+
+using UniqueCFNumberRef =
+ std::unique_ptr<std::remove_pointer<CFNumberRef>::type, CFTypeDeleter>;
+
+using UniqueCFDictionaryRef =
+ std::unique_ptr<std::remove_pointer<CFDictionaryRef>::type, CFTypeDeleter>;
+
+} // namespace
+
+extern "C" {
+
+libgav1::StatusCode Gav1DecodeOnCVPixelBufferSizeChanged(
+ void* callback_private_data, int bitdepth,
+ libgav1::ImageFormat image_format, int width, int height, int left_border,
+ int right_border, int top_border, int bottom_border, int stride_alignment) {
+ auto* buffer_pool =
+ static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+ return buffer_pool->OnCVPixelBufferSizeChanged(
+ bitdepth, image_format, width, height, left_border, right_border,
+ top_border, bottom_border, stride_alignment);
+}
+
+libgav1::StatusCode Gav1DecodeGetCVPixelBuffer(
+ void* callback_private_data, int bitdepth,
+ libgav1::ImageFormat image_format, int width, int height, int left_border,
+ int right_border, int top_border, int bottom_border, int stride_alignment,
+ libgav1::FrameBuffer* frame_buffer) {
+ auto* buffer_pool =
+ static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+ return buffer_pool->GetCVPixelBuffer(
+ bitdepth, image_format, width, height, left_border, right_border,
+ top_border, bottom_border, stride_alignment, frame_buffer);
+}
+
+void Gav1DecodeReleaseCVPixelBuffer(void* callback_private_data,
+ void* buffer_private_data) {
+ auto* buffer_pool =
+ static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+ buffer_pool->ReleaseCVPixelBuffer(buffer_private_data);
+}
+
+} // extern "C"
+
+// static
+std::unique_ptr<Gav1DecodeCVPixelBufferPool>
+Gav1DecodeCVPixelBufferPool::Create(size_t num_buffers) {
+ std::unique_ptr<Gav1DecodeCVPixelBufferPool> buffer_pool(
+ new (std::nothrow) Gav1DecodeCVPixelBufferPool(num_buffers));
+ return buffer_pool;
+}
+
+Gav1DecodeCVPixelBufferPool::Gav1DecodeCVPixelBufferPool(size_t num_buffers)
+ : num_buffers_(static_cast<int>(num_buffers)) {}
+
+Gav1DecodeCVPixelBufferPool::~Gav1DecodeCVPixelBufferPool() {
+ CVPixelBufferPoolRelease(pool_);
+}
+
+libgav1::StatusCode Gav1DecodeCVPixelBufferPool::OnCVPixelBufferSizeChanged(
+ int bitdepth, libgav1::ImageFormat image_format, int width, int height,
+ int left_border, int right_border, int top_border, int bottom_border,
+ int stride_alignment) {
+ if (bitdepth != 8 || (image_format != libgav1::kImageFormatYuv420 &&
+ image_format != libgav1::kImageFormatMonochrome400)) {
+    fprintf(stderr,
+            "Only bitdepth 8, 4:2:0 or monochrome videos are supported: "
+            "bitdepth %d, image_format: %d.\n",
+            bitdepth, image_format);
+ return libgav1::kStatusUnimplemented;
+ }
+
+ // stride_alignment must be a power of 2.
+ assert((stride_alignment & (stride_alignment - 1)) == 0);
+
+ // The possible keys for CVPixelBufferPool are:
+ // kCVPixelBufferPoolMinimumBufferCountKey
+ // kCVPixelBufferPoolMaximumBufferAgeKey
+ // kCVPixelBufferPoolAllocationThresholdKey
+ const void* pool_keys[] = {kCVPixelBufferPoolMinimumBufferCountKey};
+ const int min_buffer_count = 10;
+ UniqueCFNumberRef cf_min_buffer_count(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &min_buffer_count));
+ if (cf_min_buffer_count == nullptr) {
+ fprintf(stderr, "CFNumberCreate failed.\n");
+ return libgav1::kStatusUnknownError;
+ }
+ const void* pool_values[] = {cf_min_buffer_count.get()};
+ UniqueCFDictionaryRef pool_attributes(CFDictionaryCreate(
+ nullptr, pool_keys, pool_values, 1, &kCFTypeDictionaryKeyCallBacks,
+ &kCFTypeDictionaryValueCallBacks));
+ if (pool_attributes == nullptr) {
+ fprintf(stderr, "CFDictionaryCreate failed.\n");
+ return libgav1::kStatusUnknownError;
+ }
+
+ // The pixelBufferAttributes argument to CVPixelBufferPoolCreate() cannot be
+ // null and must contain the pixel format, width, and height, otherwise
+ // CVPixelBufferPoolCreate() fails with kCVReturnInvalidPixelBufferAttributes
+ // (-6682).
+
+ // I420: kCVPixelFormatType_420YpCbCr8Planar (video range).
+ const int pixel_format = (image_format == libgav1::kImageFormatYuv420)
+ ? kCVPixelFormatType_420YpCbCr8PlanarFullRange
+ : kCVPixelFormatType_OneComponent8;
+ UniqueCFNumberRef cf_pixel_format(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &pixel_format));
+ UniqueCFNumberRef cf_width(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &width));
+ UniqueCFNumberRef cf_height(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &height));
+ UniqueCFNumberRef cf_left_border(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &left_border));
+ UniqueCFNumberRef cf_right_border(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &right_border));
+ UniqueCFNumberRef cf_top_border(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &top_border));
+ UniqueCFNumberRef cf_bottom_border(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &bottom_border));
+ UniqueCFNumberRef cf_stride_alignment(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &stride_alignment));
+
+ const void* buffer_keys[] = {
+ kCVPixelBufferPixelFormatTypeKey,
+ kCVPixelBufferWidthKey,
+ kCVPixelBufferHeightKey,
+ kCVPixelBufferExtendedPixelsLeftKey,
+ kCVPixelBufferExtendedPixelsRightKey,
+ kCVPixelBufferExtendedPixelsTopKey,
+ kCVPixelBufferExtendedPixelsBottomKey,
+ kCVPixelBufferBytesPerRowAlignmentKey,
+ };
+ const void* buffer_values[] = {
+ cf_pixel_format.get(), cf_width.get(),
+ cf_height.get(), cf_left_border.get(),
+ cf_right_border.get(), cf_top_border.get(),
+ cf_bottom_border.get(), cf_stride_alignment.get(),
+ };
+ UniqueCFDictionaryRef buffer_attributes(CFDictionaryCreate(
+ kCFAllocatorDefault, buffer_keys, buffer_values, 8,
+ &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks));
+ if (buffer_attributes == nullptr) {
+ fprintf(stderr, "CFDictionaryCreate of buffer_attributes failed.\n");
+ return libgav1::kStatusUnknownError;
+ }
+ CVPixelBufferPoolRef cv_pool;
+ CVReturn ret = CVPixelBufferPoolCreate(
+ /*allocator=*/nullptr, pool_attributes.get(), buffer_attributes.get(),
+ &cv_pool);
+ if (ret != kCVReturnSuccess) {
+ fprintf(stderr, "CVPixelBufferPoolCreate failed: %d.\n",
+ static_cast<int>(ret));
+ return libgav1::kStatusOutOfMemory;
+ }
+ CVPixelBufferPoolRelease(pool_);
+ pool_ = cv_pool;
+ return libgav1::kStatusOk;
+}
+
+libgav1::StatusCode Gav1DecodeCVPixelBufferPool::GetCVPixelBuffer(
+ int bitdepth, libgav1::ImageFormat image_format, int /*width*/,
+ int /*height*/, int /*left_border*/, int /*right_border*/,
+ int /*top_border*/, int /*bottom_border*/, int /*stride_alignment*/,
+ libgav1::FrameBuffer* frame_buffer) {
+ static_cast<void>(bitdepth);
+ assert(bitdepth == 8 && (image_format == libgav1::kImageFormatYuv420 ||
+ image_format == libgav1::kImageFormatMonochrome400));
+ const bool is_monochrome =
+ (image_format == libgav1::kImageFormatMonochrome400);
+
+ // The dictionary must have kCVPixelBufferPoolAllocationThresholdKey,
+ // otherwise CVPixelBufferPoolCreatePixelBufferWithAuxAttributes() fails with
+ // kCVReturnWouldExceedAllocationThreshold (-6689).
+ UniqueCFNumberRef cf_num_buffers(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &num_buffers_));
+
+ const void* buffer_keys[] = {
+ kCVPixelBufferPoolAllocationThresholdKey,
+ };
+ const void* buffer_values[] = {
+ cf_num_buffers.get(),
+ };
+ UniqueCFDictionaryRef aux_attributes(CFDictionaryCreate(
+ kCFAllocatorDefault, buffer_keys, buffer_values, 1,
+ &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks));
+ if (aux_attributes == nullptr) {
+ fprintf(stderr, "CFDictionaryCreate of aux_attributes failed.\n");
+ return libgav1::kStatusUnknownError;
+ }
+
+ CVPixelBufferRef pixel_buffer;
+ CVReturn ret = CVPixelBufferPoolCreatePixelBufferWithAuxAttributes(
+ /*allocator=*/nullptr, pool_, aux_attributes.get(), &pixel_buffer);
+ if (ret != kCVReturnSuccess) {
+ fprintf(stderr,
+ "CVPixelBufferPoolCreatePixelBufferWithAuxAttributes failed: %d.\n",
+ static_cast<int>(ret));
+ return libgav1::kStatusOutOfMemory;
+ }
+
+ ret = CVPixelBufferLockBaseAddress(pixel_buffer, /*lockFlags=*/0);
+ if (ret != kCVReturnSuccess) {
+ fprintf(stderr, "CVPixelBufferLockBaseAddress failed: %d.\n",
+ static_cast<int>(ret));
+ CFRelease(pixel_buffer);
+ return libgav1::kStatusUnknownError;
+ }
+
+ // If the pixel format type is kCVPixelFormatType_OneComponent8, the pixel
+ // buffer is nonplanar (CVPixelBufferIsPlanar returns false and
+ // CVPixelBufferGetPlaneCount returns 0), but
+ // CVPixelBufferGetBytesPerRowOfPlane and CVPixelBufferGetBaseAddressOfPlane
+ // still work for plane index 0, even though the documentation says they
+ // return NULL for nonplanar pixel buffers.
+ frame_buffer->stride[0] =
+ static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 0));
+ frame_buffer->plane[0] = static_cast<uint8_t*>(
+ CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 0));
+ if (is_monochrome) {
+ frame_buffer->stride[1] = 0;
+ frame_buffer->stride[2] = 0;
+ frame_buffer->plane[1] = nullptr;
+ frame_buffer->plane[2] = nullptr;
+ } else {
+ frame_buffer->stride[1] =
+ static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 1));
+ frame_buffer->stride[2] =
+ static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 2));
+ frame_buffer->plane[1] = static_cast<uint8_t*>(
+ CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 1));
+ frame_buffer->plane[2] = static_cast<uint8_t*>(
+ CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 2));
+ }
+ frame_buffer->private_data = pixel_buffer;
+
+ return libgav1::kStatusOk;
+}
+
+void Gav1DecodeCVPixelBufferPool::ReleaseCVPixelBuffer(
+ void* buffer_private_data) {
+ auto const pixel_buffer = static_cast<CVPixelBufferRef>(buffer_private_data);
+ CVReturn ret =
+ CVPixelBufferUnlockBaseAddress(pixel_buffer, /*unlockFlags=*/0);
+ if (ret != kCVReturnSuccess) {
+ fprintf(stderr, "%s:%d: CVPixelBufferUnlockBaseAddress failed: %d.\n",
+ __FILE__, __LINE__, static_cast<int>(ret));
+ abort();
+ }
+ CFRelease(pixel_buffer);
+}
diff --git a/examples/gav1_decode_cv_pixel_buffer_pool.h b/examples/gav1_decode_cv_pixel_buffer_pool.h
new file mode 100644
index 0000000..7aee324
--- /dev/null
+++ b/examples/gav1_decode_cv_pixel_buffer_pool.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
+#define LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
+
+#include <CoreVideo/CoreVideo.h>
+
+#include <cstddef>
+#include <memory>
+
+#include "gav1/frame_buffer.h"
+
+extern "C" libgav1::StatusCode Gav1DecodeOnCVPixelBufferSizeChanged(
+ void* callback_private_data, int bitdepth,
+ libgav1::ImageFormat image_format, int width, int height, int left_border,
+ int right_border, int top_border, int bottom_border, int stride_alignment);
+
+extern "C" libgav1::StatusCode Gav1DecodeGetCVPixelBuffer(
+ void* callback_private_data, int bitdepth,
+ libgav1::ImageFormat image_format, int width, int height, int left_border,
+ int right_border, int top_border, int bottom_border, int stride_alignment,
+ libgav1::FrameBuffer* frame_buffer);
+
+extern "C" void Gav1DecodeReleaseCVPixelBuffer(void* callback_private_data,
+ void* buffer_private_data);
+
+class Gav1DecodeCVPixelBufferPool {
+ public:
+ static std::unique_ptr<Gav1DecodeCVPixelBufferPool> Create(
+ size_t num_buffers);
+
+ // Not copyable or movable.
+ Gav1DecodeCVPixelBufferPool(const Gav1DecodeCVPixelBufferPool&) = delete;
+ Gav1DecodeCVPixelBufferPool& operator=(const Gav1DecodeCVPixelBufferPool&) =
+ delete;
+
+ ~Gav1DecodeCVPixelBufferPool();
+
+ libgav1::StatusCode OnCVPixelBufferSizeChanged(
+ int bitdepth, libgav1::ImageFormat image_format, int width, int height,
+ int left_border, int right_border, int top_border, int bottom_border,
+ int stride_alignment);
+
+ libgav1::StatusCode GetCVPixelBuffer(int bitdepth,
+ libgav1::ImageFormat image_format,
+ int width, int height, int left_border,
+ int right_border, int top_border,
+ int bottom_border, int stride_alignment,
+ libgav1::FrameBuffer* frame_buffer);
+ void ReleaseCVPixelBuffer(void* buffer_private_data);
+
+ private:
+ Gav1DecodeCVPixelBufferPool(size_t num_buffers);
+
+ CVPixelBufferPoolRef pool_ = nullptr;
+ const int num_buffers_;
+};
+
+#endif // LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
diff --git a/examples/ivf_parser.cc b/examples/ivf_parser.cc
new file mode 100644
index 0000000..f8adb14
--- /dev/null
+++ b/examples/ivf_parser.cc
@@ -0,0 +1,96 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/ivf_parser.h"
+
+#include <cstdio>
+#include <cstring>
+
+#include "examples/file_reader_constants.h"
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+size_t ReadLittleEndian16(const uint8_t* const buffer) {
+ size_t value = buffer[1] << 8;
+ value |= buffer[0];
+ return value;
+}
+
+size_t ReadLittleEndian32(const uint8_t* const buffer) {
+ size_t value = buffer[3] << 24;
+ value |= buffer[2] << 16;
+ value |= buffer[1] << 8;
+ value |= buffer[0];
+ return value;
+}
+
+} // namespace
+
+bool ParseIvfFileHeader(const uint8_t* const header_buffer,
+ IvfFileHeader* const ivf_file_header) {
+ if (header_buffer == nullptr || ivf_file_header == nullptr) return false;
+
+ if (memcmp(kIvfSignature, header_buffer, 4) != 0) {
+ return false;
+ }
+
+ // Verify header version and length.
+ const size_t ivf_header_version = ReadLittleEndian16(&header_buffer[4]);
+ if (ivf_header_version != kIvfHeaderVersion) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Unexpected IVF version");
+ }
+
+ const size_t ivf_header_size = ReadLittleEndian16(&header_buffer[6]);
+ if (ivf_header_size != kIvfFileHeaderSize) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Invalid IVF file header size");
+ return false;
+ }
+
+ if (memcmp(kAv1FourCcLower, &header_buffer[8], 4) != 0 &&
+ memcmp(kAv1FourCcUpper, &header_buffer[8], 4) != 0) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Unsupported codec 4CC");
+ return false;
+ }
+
+ ivf_file_header->width = ReadLittleEndian16(&header_buffer[12]);
+ ivf_file_header->height = ReadLittleEndian16(&header_buffer[14]);
+ ivf_file_header->frame_rate_numerator =
+ ReadLittleEndian32(&header_buffer[16]);
+ ivf_file_header->frame_rate_denominator =
+ ReadLittleEndian32(&header_buffer[20]);
+
+ return true;
+}
+
+bool ParseIvfFrameHeader(const uint8_t* const header_buffer,
+ IvfFrameHeader* const ivf_frame_header) {
+ if (header_buffer == nullptr || ivf_frame_header == nullptr) return false;
+
+ ivf_frame_header->frame_size = ReadLittleEndian32(header_buffer);
+ if (ivf_frame_header->frame_size > kMaxTemporalUnitSize) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Temporal Unit size exceeds maximum");
+ return false;
+ }
+
+ ivf_frame_header->timestamp = ReadLittleEndian32(&header_buffer[4]);
+ const uint64_t timestamp_hi =
+ static_cast<uint64_t>(ReadLittleEndian32(&header_buffer[8])) << 32;
+ ivf_frame_header->timestamp |= timestamp_hi;
+
+ return true;
+}
+
+} // namespace libgav1
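To make the byte offsets used above concrete, this sketch hand-builds a file header and runs it through ParseIvfFileHeader(). The signature, FourCC, and 32-byte header size follow the usual IVF convention and are assumptions here; the authoritative constants live in examples/file_reader_constants.cc earlier in this patch.

    #include <cassert>
    #include <cstdint>

    #include "examples/ivf_parser.h"

    void IvfHeaderLayoutExample() {
      uint8_t header[32] = {};
      header[0] = 'D'; header[1] = 'K'; header[2] = 'I'; header[3] = 'F';
      // Bytes 4-5: version (0). Bytes 6-7: header size (32), little endian.
      header[6] = 32;
      header[8] = 'A'; header[9] = 'V'; header[10] = '0'; header[11] = '1';
      header[12] = 0x80; header[13] = 0x07;  // width  = 1920
      header[14] = 0x38; header[15] = 0x04;  // height = 1080
      header[16] = 30;                       // frame rate numerator = 30
      header[20] = 1;                        // frame rate denominator = 1
      libgav1::IvfFileHeader parsed;
      const bool ok = libgav1::ParseIvfFileHeader(header, &parsed);
      assert(ok && parsed.width == 1920 && parsed.height == 1080);
      static_cast<void>(ok);
    }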
diff --git a/examples/ivf_parser.h b/examples/ivf_parser.h
new file mode 100644
index 0000000..b6bbc59
--- /dev/null
+++ b/examples/ivf_parser.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_IVF_PARSER_H_
+#define LIBGAV1_EXAMPLES_IVF_PARSER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+namespace libgav1 {
+
+struct IvfFileHeader {
+ IvfFileHeader() = default;
+ IvfFileHeader(const IvfFileHeader& rhs) = default;
+ IvfFileHeader& operator=(const IvfFileHeader& rhs) = default;
+ IvfFileHeader(IvfFileHeader&& rhs) = default;
+ IvfFileHeader& operator=(IvfFileHeader&& rhs) = default;
+
+ size_t width = 0;
+ size_t height = 0;
+ size_t frame_rate_numerator = 0;
+ size_t frame_rate_denominator = 0;
+};
+
+struct IvfFrameHeader {
+ IvfFrameHeader() = default;
+ IvfFrameHeader(const IvfFrameHeader& rhs) = default;
+ IvfFrameHeader& operator=(const IvfFrameHeader& rhs) = default;
+ IvfFrameHeader(IvfFrameHeader&& rhs) = default;
+ IvfFrameHeader& operator=(IvfFrameHeader&& rhs) = default;
+
+ size_t frame_size = 0;
+ int64_t timestamp = 0;
+};
+
+bool ParseIvfFileHeader(const uint8_t* header_buffer,
+ IvfFileHeader* ivf_file_header);
+
+bool ParseIvfFrameHeader(const uint8_t* header_buffer,
+ IvfFrameHeader* ivf_frame_header);
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_IVF_PARSER_H_
diff --git a/examples/libgav1_examples.cmake b/examples/libgav1_examples.cmake
new file mode 100644
index 0000000..1f949f3
--- /dev/null
+++ b/examples/libgav1_examples.cmake
@@ -0,0 +1,63 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_)
+ return()
+endif() # LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_
+set(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_ 1)
+
+set(libgav1_file_reader_sources "${libgav1_examples}/file_reader.cc"
+ "${libgav1_examples}/file_reader.h"
+ "${libgav1_examples}/file_reader_constants.cc"
+ "${libgav1_examples}/file_reader_constants.h"
+ "${libgav1_examples}/file_reader_factory.cc"
+ "${libgav1_examples}/file_reader_factory.h"
+ "${libgav1_examples}/file_reader_interface.h"
+ "${libgav1_examples}/ivf_parser.cc"
+ "${libgav1_examples}/ivf_parser.h"
+ "${libgav1_examples}/logging.h")
+
+set(libgav1_file_writer_sources "${libgav1_examples}/file_writer.cc"
+ "${libgav1_examples}/file_writer.h"
+ "${libgav1_examples}/logging.h")
+
+set(libgav1_decode_sources "${libgav1_examples}/gav1_decode.cc")
+
+macro(libgav1_add_examples_targets)
+ libgav1_add_library(NAME libgav1_file_reader TYPE OBJECT SOURCES
+ ${libgav1_file_reader_sources} DEFINES ${libgav1_defines}
+ INCLUDES ${libgav1_include_paths})
+
+ libgav1_add_library(NAME libgav1_file_writer TYPE OBJECT SOURCES
+ ${libgav1_file_writer_sources} DEFINES ${libgav1_defines}
+ INCLUDES ${libgav1_include_paths})
+
+ libgav1_add_executable(NAME
+ gav1_decode
+ SOURCES
+ ${libgav1_decode_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_include_paths}
+ ${libgav1_gtest_include_paths}
+ OBJLIB_DEPS
+ libgav1_file_reader
+ libgav1_file_writer
+ LIB_DEPS
+ absl::strings
+ absl::str_format_internal
+ absl::time
+ ${libgav1_dependency})
+endmacro()
diff --git a/examples/logging.h b/examples/logging.h
new file mode 100644
index 0000000..c0bcad7
--- /dev/null
+++ b/examples/logging.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_LOGGING_H_
+#define LIBGAV1_EXAMPLES_LOGGING_H_
+
+#include <cstddef>
+#include <cstdio>
+
+namespace libgav1 {
+namespace examples {
+
+#if !defined(LIBGAV1_EXAMPLES_ENABLE_LOGGING)
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#define LIBGAV1_EXAMPLES_ENABLE_LOGGING 0
+#else
+#define LIBGAV1_EXAMPLES_ENABLE_LOGGING 1
+#endif
+#endif
+
+#if LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+// Compile-time function to get the 'base' file_name, that is, the part of
+// a file_name after the last '/' or '\' path separator. The search starts at
+// the end of the string; the second parameter is the length of the string.
+constexpr const char* Basename(const char* file_name, size_t offset) {
+ return (offset == 0 || file_name[offset - 1] == '/' ||
+ file_name[offset - 1] == '\\')
+ ? file_name + offset
+ : Basename(file_name, offset - 1);
+}
+
+#define LIBGAV1_EXAMPLES_LOG_ERROR(error_string) \
+ do { \
+ constexpr const char* libgav1_examples_basename = \
+ ::libgav1::examples::Basename(__FILE__, sizeof(__FILE__) - 1); \
+ fprintf(stderr, "%s:%d (%s): %s.\n", libgav1_examples_basename, __LINE__, \
+ __func__, error_string); \
+ } while (false)
+
+#else // !LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+#define LIBGAV1_EXAMPLES_LOG_ERROR(error_string) \
+ do { \
+ } while (false)
+
+#endif // LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+} // namespace examples
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_LOGGING_H_
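A small compile-time check (an annotation, not part of the header) showing that Basename() strips everything up to the last path separator, which is what lets the macro above embed only the base file name in each message.

    #include "examples/logging.h"

    #if LIBGAV1_EXAMPLES_ENABLE_LOGGING
    static_assert(
        *::libgav1::examples::Basename("a/b/logging.h",
                                       sizeof("a/b/logging.h") - 1) == 'l',
        "Basename should return the part after the last path separator");
    #endif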
diff --git a/src/buffer_pool.cc b/src/buffer_pool.cc
new file mode 100644
index 0000000..c1a5606
--- /dev/null
+++ b/src/buffer_pool.cc
@@ -0,0 +1,218 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/buffer_pool.h"
+
+#include <cassert>
+#include <cstring>
+
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+
+namespace {
+
+// Copies the feature_enabled, feature_data, segment_id_pre_skip, and
+// last_active_segment_id fields of Segmentation.
+void CopySegmentationParameters(const Segmentation& from, Segmentation* to) {
+ memcpy(to->feature_enabled, from.feature_enabled,
+ sizeof(to->feature_enabled));
+ memcpy(to->feature_data, from.feature_data, sizeof(to->feature_data));
+ to->segment_id_pre_skip = from.segment_id_pre_skip;
+ to->last_active_segment_id = from.last_active_segment_id;
+}
+
+} // namespace
+
+RefCountedBuffer::RefCountedBuffer() = default;
+
+RefCountedBuffer::~RefCountedBuffer() = default;
+
+bool RefCountedBuffer::Realloc(int bitdepth, bool is_monochrome, int width,
+ int height, int subsampling_x, int subsampling_y,
+ int left_border, int right_border,
+ int top_border, int bottom_border) {
+  // YuvBuffer::Realloc() may call the get frame buffer callback, which needs
+  // to be thread safe, so we ensure that only one Realloc() call happens at
+  // any given time.
+ std::lock_guard<std::mutex> lock(pool_->mutex_);
+ assert(!buffer_private_data_valid_);
+ if (!yuv_buffer_.Realloc(
+ bitdepth, is_monochrome, width, height, subsampling_x, subsampling_y,
+ left_border, right_border, top_border, bottom_border,
+ pool_->get_frame_buffer_, pool_->callback_private_data_,
+ &buffer_private_data_)) {
+ return false;
+ }
+ buffer_private_data_valid_ = true;
+ return true;
+}
+
+bool RefCountedBuffer::SetFrameDimensions(const ObuFrameHeader& frame_header) {
+ upscaled_width_ = frame_header.upscaled_width;
+ frame_width_ = frame_header.width;
+ frame_height_ = frame_header.height;
+ render_width_ = frame_header.render_width;
+ render_height_ = frame_header.render_height;
+ rows4x4_ = frame_header.rows4x4;
+ columns4x4_ = frame_header.columns4x4;
+ if (frame_header.refresh_frame_flags != 0 &&
+ !IsIntraFrame(frame_header.frame_type)) {
+ const int rows4x4_half = DivideBy2(rows4x4_);
+ const int columns4x4_half = DivideBy2(columns4x4_);
+ if (!reference_info_.Reset(rows4x4_half, columns4x4_half)) {
+ return false;
+ }
+ }
+ return segmentation_map_.Allocate(rows4x4_, columns4x4_);
+}
+
+void RefCountedBuffer::SetGlobalMotions(
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motions) {
+ for (int ref = kReferenceFrameLast; ref <= kReferenceFrameAlternate; ++ref) {
+ static_assert(sizeof(global_motion_[ref].params) ==
+ sizeof(global_motions[ref].params),
+ "");
+ memcpy(global_motion_[ref].params, global_motions[ref].params,
+ sizeof(global_motion_[ref].params));
+ }
+}
+
+void RefCountedBuffer::SetFrameContext(const SymbolDecoderContext& context) {
+ frame_context_ = context;
+ frame_context_.ResetIntraFrameYModeCdf();
+ frame_context_.ResetCounters();
+}
+
+void RefCountedBuffer::GetSegmentationParameters(
+ Segmentation* segmentation) const {
+ CopySegmentationParameters(/*from=*/segmentation_, /*to=*/segmentation);
+}
+
+void RefCountedBuffer::SetSegmentationParameters(
+ const Segmentation& segmentation) {
+ CopySegmentationParameters(/*from=*/segmentation, /*to=*/&segmentation_);
+}
+
+void RefCountedBuffer::SetBufferPool(BufferPool* pool) { pool_ = pool; }
+
+void RefCountedBuffer::ReturnToBufferPool(RefCountedBuffer* ptr) {
+ ptr->pool_->ReturnUnusedBuffer(ptr);
+}
+
+BufferPool::BufferPool(
+ FrameBufferSizeChangedCallback on_frame_buffer_size_changed,
+ GetFrameBufferCallback get_frame_buffer,
+ ReleaseFrameBufferCallback release_frame_buffer,
+ void* callback_private_data) {
+ if (get_frame_buffer != nullptr) {
+ // on_frame_buffer_size_changed may be null.
+ assert(release_frame_buffer != nullptr);
+ on_frame_buffer_size_changed_ = on_frame_buffer_size_changed;
+ get_frame_buffer_ = get_frame_buffer;
+ release_frame_buffer_ = release_frame_buffer;
+ callback_private_data_ = callback_private_data;
+ } else {
+ on_frame_buffer_size_changed_ = OnInternalFrameBufferSizeChanged;
+ get_frame_buffer_ = GetInternalFrameBuffer;
+ release_frame_buffer_ = ReleaseInternalFrameBuffer;
+ callback_private_data_ = &internal_frame_buffers_;
+ }
+}
+
+BufferPool::~BufferPool() {
+ for (const auto* buffer : buffers_) {
+ if (buffer->in_use_) {
+ assert(false && "RefCountedBuffer still in use at destruction time.");
+ LIBGAV1_DLOG(ERROR, "RefCountedBuffer still in use at destruction time.");
+ }
+ delete buffer;
+ }
+}
+
+bool BufferPool::OnFrameBufferSizeChanged(int bitdepth,
+ Libgav1ImageFormat image_format,
+ int width, int height,
+ int left_border, int right_border,
+ int top_border, int bottom_border) {
+ if (on_frame_buffer_size_changed_ == nullptr) return true;
+ return on_frame_buffer_size_changed_(callback_private_data_, bitdepth,
+ image_format, width, height, left_border,
+ right_border, top_border, bottom_border,
+ /*stride_alignment=*/16) == kStatusOk;
+}
+
+RefCountedBufferPtr BufferPool::GetFreeBuffer() {
+ // In frame parallel mode, the GetFreeBuffer() calls from ObuParser all happen
+ // from the same thread serially, but the GetFreeBuffer() call in
+ // DecoderImpl::ApplyFilmGrain can happen from multiple threads at the same
+ // time. So this function has to be thread safe.
+ // TODO(b/142583029): Investigate if the GetFreeBuffer() call in
+ // DecoderImpl::ApplyFilmGrain() call can be serialized so that this function
+ // need not be thread safe.
+ std::unique_lock<std::mutex> lock(mutex_);
+ for (auto buffer : buffers_) {
+ if (!buffer->in_use_) {
+ buffer->in_use_ = true;
+ buffer->progress_row_ = -1;
+ buffer->frame_state_ = kFrameStateUnknown;
+ lock.unlock();
+ return RefCountedBufferPtr(buffer, RefCountedBuffer::ReturnToBufferPool);
+ }
+ }
+ lock.unlock();
+ auto* const buffer = new (std::nothrow) RefCountedBuffer();
+ if (buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate a new reference counted buffer.");
+ return RefCountedBufferPtr();
+ }
+ buffer->SetBufferPool(this);
+ buffer->in_use_ = true;
+ buffer->progress_row_ = -1;
+ buffer->frame_state_ = kFrameStateUnknown;
+ lock.lock();
+ const bool ok = buffers_.push_back(buffer);
+ lock.unlock();
+ if (!ok) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Failed to push the new reference counted buffer into the vector.");
+ delete buffer;
+ return RefCountedBufferPtr();
+ }
+ return RefCountedBufferPtr(buffer, RefCountedBuffer::ReturnToBufferPool);
+}
+
+void BufferPool::Abort() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ for (auto buffer : buffers_) {
+ if (buffer->in_use_) {
+ buffer->Abort();
+ }
+ }
+}
+
+void BufferPool::ReturnUnusedBuffer(RefCountedBuffer* buffer) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ assert(buffer->in_use_);
+ buffer->in_use_ = false;
+ if (buffer->buffer_private_data_valid_) {
+ release_frame_buffer_(callback_private_data_, buffer->buffer_private_data_);
+ buffer->buffer_private_data_valid_ = false;
+ }
+}
+
+} // namespace libgav1
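The two-argument RefCountedBufferPtr construction above installs RefCountedBuffer::ReturnToBufferPool as the deleter. Assuming RefCountedBufferPtr is the shared_ptr alias declared in buffer_pool.h (outside the lines shown here), a caller's side of that ownership model looks roughly like this sketch:

    #include "src/buffer_pool.h"

    void UseOneBuffer(libgav1::BufferPool* pool) {
      libgav1::RefCountedBufferPtr frame = pool->GetFreeBuffer();
      if (frame == nullptr) return;  // allocation failure was already logged
      // ... decode into frame->buffer() ...
      // When the last RefCountedBufferPtr copy goes out of scope, the deleter
      // returns the buffer to the pool (ReturnUnusedBuffer) instead of deleting
      // it, so the allocation is reused by later GetFreeBuffer() calls.
    }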
diff --git a/src/buffer_pool.h b/src/buffer_pool.h
new file mode 100644
index 0000000..f35a633
--- /dev/null
+++ b/src/buffer_pool.h
@@ -0,0 +1,399 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_BUFFER_POOL_H_
+#define LIBGAV1_SRC_BUFFER_POOL_H_
+
+#include <array>
+#include <cassert>
+#include <climits>
+#include <condition_variable> // NOLINT (unapproved c++11 header)
+#include <cstdint>
+#include <cstring>
+#include <mutex> // NOLINT (unapproved c++11 header)
+
+#include "src/dsp/common.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/frame_buffer.h"
+#include "src/internal_frame_buffer_list.h"
+#include "src/symbol_decoder_context.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/reference_info.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/types.h"
+#include "src/utils/vector.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+class BufferPool;
+
+enum FrameState : uint8_t {
+ kFrameStateUnknown,
+ kFrameStateStarted,
+ kFrameStateParsed,
+ kFrameStateDecoded
+};
+
+// A reference-counted frame buffer. Clients should access it via
+// RefCountedBufferPtr, which manages reference counting transparently.
+class RefCountedBuffer {
+ public:
+ // Not copyable or movable.
+ RefCountedBuffer(const RefCountedBuffer&) = delete;
+ RefCountedBuffer& operator=(const RefCountedBuffer&) = delete;
+
+  // Allocates the YUV buffer. Returns true on success and false on failure.
+  // This function ensures the thread safety of the |get_frame_buffer_| call,
+  // i.e., only one |get_frame_buffer_| call will happen at a given time.
+ // TODO(b/142583029): In frame parallel mode, we can require the callbacks to
+ // be thread safe so that we can remove the thread safety of this function and
+ // applications can have fine grained locks.
+ //
+ // * |width| and |height| are the image dimensions in pixels.
+ // * |subsampling_x| and |subsampling_y| (either 0 or 1) specify the
+ // subsampling of the width and height of the chroma planes, respectively.
+ // * |left_border|, |right_border|, |top_border|, and |bottom_border| are
+ // the sizes (in pixels) of the borders on the left, right, top, and
+ // bottom sides, respectively.
+ //
+ // NOTE: The strides are a multiple of 16. Since the first row in each plane
+ // is 16-byte aligned, subsequent rows are also 16-byte aligned.
+ bool Realloc(int bitdepth, bool is_monochrome, int width, int height,
+ int subsampling_x, int subsampling_y, int left_border,
+ int right_border, int top_border, int bottom_border);
+
+ YuvBuffer* buffer() { return &yuv_buffer_; }
+
+ // Returns the buffer private data set by the get frame buffer callback when
+ // it allocated the YUV buffer.
+ void* buffer_private_data() const {
+ assert(buffer_private_data_valid_);
+ return buffer_private_data_;
+ }
+
+ // NOTE: In the current frame, this is the frame_type syntax element in the
+ // frame header. In a reference frame, this implements the RefFrameType array
+ // in the spec.
+ FrameType frame_type() const { return frame_type_; }
+ void set_frame_type(FrameType frame_type) { frame_type_ = frame_type; }
+
+ // The sample position for subsampled streams. This is the
+ // chroma_sample_position syntax element in the sequence header.
+ //
+ // NOTE: The decoder does not use chroma_sample_position, but it needs to be
+ // passed on to the client in DecoderBuffer.
+ ChromaSamplePosition chroma_sample_position() const {
+ return chroma_sample_position_;
+ }
+ void set_chroma_sample_position(ChromaSamplePosition chroma_sample_position) {
+ chroma_sample_position_ = chroma_sample_position;
+ }
+
+  // Whether the frame can be used as a show_existing_frame in the future.
+ bool showable_frame() const { return showable_frame_; }
+ void set_showable_frame(bool value) { showable_frame_ = value; }
+
+ // Sets upscaled_width_, frame_width_, frame_height_, render_width_,
+ // render_height_, rows4x4_ and columns4x4_ from the corresponding fields
+ // in frame_header. Allocates reference_info_.motion_field_reference_frame,
+ // reference_info_.motion_field_mv_, and segmentation_map_. Returns true on
+ // success, false on failure.
+ bool SetFrameDimensions(const ObuFrameHeader& frame_header);
+
+ int32_t upscaled_width() const { return upscaled_width_; }
+ int32_t frame_width() const { return frame_width_; }
+ int32_t frame_height() const { return frame_height_; }
+ // RenderWidth() and RenderHeight() return the render size, which is a hint
+ // to the application about the desired display size.
+ int32_t render_width() const { return render_width_; }
+ int32_t render_height() const { return render_height_; }
+ int32_t rows4x4() const { return rows4x4_; }
+ int32_t columns4x4() const { return columns4x4_; }
+
+ int spatial_id() const { return spatial_id_; }
+ void set_spatial_id(int value) { spatial_id_ = value; }
+ int temporal_id() const { return temporal_id_; }
+ void set_temporal_id(int value) { temporal_id_ = value; }
+
+ SegmentationMap* segmentation_map() { return &segmentation_map_; }
+ const SegmentationMap* segmentation_map() const { return &segmentation_map_; }
+
+ // Only the |params| field of each GlobalMotion struct should be used.
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>& GlobalMotions()
+ const {
+ return global_motion_;
+ }
+ // Saves the GlobalMotion array. Only the |params| field of each GlobalMotion
+ // struct is saved.
+ void SetGlobalMotions(
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motions);
+
+ // Returns the saved CDF tables.
+ const SymbolDecoderContext& FrameContext() const { return frame_context_; }
+ // Saves the CDF tables. The intra_frame_y_mode_cdf table is reset to the
+ // default. The last entry in each table, representing the symbol count for
+ // that context, is set to 0.
+ void SetFrameContext(const SymbolDecoderContext& context);
+
+ const std::array<int8_t, kNumReferenceFrameTypes>& loop_filter_ref_deltas()
+ const {
+ return loop_filter_ref_deltas_;
+ }
+ const std::array<int8_t, kLoopFilterMaxModeDeltas>& loop_filter_mode_deltas()
+ const {
+ return loop_filter_mode_deltas_;
+ }
+ // Saves the ref_deltas and mode_deltas arrays in loop_filter.
+ void SetLoopFilterDeltas(const LoopFilter& loop_filter) {
+ loop_filter_ref_deltas_ = loop_filter.ref_deltas;
+ loop_filter_mode_deltas_ = loop_filter.mode_deltas;
+ }
+
+ // Copies the saved values of the following fields to the Segmentation
+ // struct: feature_enabled, feature_data, segment_id_pre_skip, and
+ // last_active_segment_id. The other fields are left unchanged.
+ void GetSegmentationParameters(Segmentation* segmentation) const;
+ // Saves the feature_enabled, feature_data, segment_id_pre_skip, and
+ // last_active_segment_id fields of the Segmentation struct.
+ void SetSegmentationParameters(const Segmentation& segmentation);
+
+ const FilmGrainParams& film_grain_params() const {
+ return film_grain_params_;
+ }
+ void set_film_grain_params(const FilmGrainParams& params) {
+ film_grain_params_ = params;
+ }
+
+ const ReferenceInfo* reference_info() const { return &reference_info_; }
+ ReferenceInfo* reference_info() { return &reference_info_; }
+
+ // This will wake up the WaitUntil*() functions and make them return false.
+ void Abort() {
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ abort_ = true;
+ }
+ parsed_condvar_.notify_all();
+ decoded_condvar_.notify_all();
+ progress_row_condvar_.notify_all();
+ }
+
+ void SetFrameState(FrameState frame_state) {
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ frame_state_ = frame_state;
+ }
+ if (frame_state == kFrameStateParsed) {
+ parsed_condvar_.notify_all();
+ } else if (frame_state == kFrameStateDecoded) {
+ decoded_condvar_.notify_all();
+ progress_row_condvar_.notify_all();
+ }
+ }
+
+ // Sets the progress of this frame to |progress_row| and notifies any threads
+ // that may be waiting on rows <= |progress_row|.
+ void SetProgress(int progress_row) {
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (progress_row_ >= progress_row) return;
+ progress_row_ = progress_row;
+ }
+ progress_row_condvar_.notify_all();
+ }
+
+ void MarkFrameAsStarted() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (frame_state_ != kFrameStateUnknown) return;
+ frame_state_ = kFrameStateStarted;
+ }
+
+ // All the WaitUntil* functions will return true if the desired wait state was
+ // reached successfully. If the return value is false, then the caller must
+ // assume that the wait was not successful and try to stop whatever they are
+ // doing as early as possible.
+
+ // Waits until the frame has been parsed.
+ bool WaitUntilParsed() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ while (frame_state_ < kFrameStateParsed && !abort_) {
+ parsed_condvar_.wait(lock);
+ }
+ return !abort_;
+ }
+
+ // Waits until the |progress_row| has been decoded (as indicated either by
+ // |progress_row_| or |frame_state_|). |progress_row_cache| must not be
+ // nullptr and will be populated with the value of |progress_row_| after the
+ // wait.
+ //
+ // Typical usage of |progress_row_cache| is as follows:
+ // * Initialize |*progress_row_cache| to INT_MIN.
+ // * Call WaitUntil only if |*progress_row_cache| < |progress_row|.
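+ //
+ // A minimal caller sketch of that pattern (illustrative only;
+ // |reference_frame| and |required_row| are hypothetical names, not members
+ // of this class):
+ //   int cached_row = INT_MIN;
+ //   if (cached_row < required_row &&
+ //       !reference_frame->WaitUntil(required_row, &cached_row)) {
+ //     return false;  // Aborted; stop decoding as early as possible.
+ //   }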
+ bool WaitUntil(int progress_row, int* progress_row_cache) {
+ // If |progress_row| is negative, it means that the wait is on the top
+ // border to be available. The top border will be available when row 0 has
+ // been decoded. So we can simply wait on row 0 instead.
+ progress_row = std::max(progress_row, 0);
+ std::unique_lock<std::mutex> lock(mutex_);
+ while (progress_row_ < progress_row && frame_state_ != kFrameStateDecoded &&
+ !abort_) {
+ progress_row_condvar_.wait(lock);
+ }
+ // Once |frame_state_| reaches kFrameStateDecoded, |progress_row_| may no
+ // longer be updated. So we set |*progress_row_cache| to INT_MAX in that
+ // case.
+ *progress_row_cache =
+ (frame_state_ != kFrameStateDecoded) ? progress_row_ : INT_MAX;
+ return !abort_;
+ }
+
+ // Waits until the entire frame has been decoded.
+ bool WaitUntilDecoded() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ while (frame_state_ != kFrameStateDecoded && !abort_) {
+ decoded_condvar_.wait(lock);
+ }
+ return !abort_;
+ }
+
+ private:
+ friend class BufferPool;
+
+ // Methods for BufferPool:
+ RefCountedBuffer();
+ ~RefCountedBuffer();
+ void SetBufferPool(BufferPool* pool);
+ static void ReturnToBufferPool(RefCountedBuffer* ptr);
+
+ BufferPool* pool_ = nullptr;
+ bool buffer_private_data_valid_ = false;
+ void* buffer_private_data_ = nullptr;
+ YuvBuffer yuv_buffer_;
+ bool in_use_ = false; // Only used by BufferPool.
+
+ std::mutex mutex_;
+ FrameState frame_state_ LIBGAV1_GUARDED_BY(mutex_) = kFrameStateUnknown;
+ int progress_row_ LIBGAV1_GUARDED_BY(mutex_) = -1;
+ // Signaled when progress_row_ is updated or when frame_state_ is set to
+ // kFrameStateDecoded.
+ std::condition_variable progress_row_condvar_;
+ // Signaled when the frame state is set to kFrameStateParsed.
+ std::condition_variable parsed_condvar_;
+ // Signaled when the frame state is set to kFrameStateDecoded.
+ std::condition_variable decoded_condvar_;
+ bool abort_ LIBGAV1_GUARDED_BY(mutex_) = false;
+
+ FrameType frame_type_ = kFrameKey;
+ ChromaSamplePosition chroma_sample_position_ = kChromaSamplePositionUnknown;
+ bool showable_frame_ = false;
+
+ int32_t upscaled_width_ = 0;
+ int32_t frame_width_ = 0;
+ int32_t frame_height_ = 0;
+ int32_t render_width_ = 0;
+ int32_t render_height_ = 0;
+ int32_t columns4x4_ = 0;
+ int32_t rows4x4_ = 0;
+ int spatial_id_ = 0;
+ int temporal_id_ = 0;
+
+ // segmentation_map_ contains a rows4x4_ by columns4x4_ 2D array.
+ SegmentationMap segmentation_map_;
+
+ // Only the |params| field of each GlobalMotion struct is used.
+ // global_motion_[0] (for kReferenceFrameIntra) is not used.
+ std::array<GlobalMotion, kNumReferenceFrameTypes> global_motion_ = {};
+ SymbolDecoderContext frame_context_;
+ std::array<int8_t, kNumReferenceFrameTypes> loop_filter_ref_deltas_;
+ std::array<int8_t, kLoopFilterMaxModeDeltas> loop_filter_mode_deltas_;
+ // Only the feature_enabled, feature_data, segment_id_pre_skip, and
+ // last_active_segment_id fields of the Segmentation struct are used.
+ //
+ // Note: The spec only requires that we save feature_enabled and
+ // feature_data. Since segment_id_pre_skip and last_active_segment_id depend
+ // on feature_enabled only, we also save their values as an optimization.
+ Segmentation segmentation_ = {};
+ FilmGrainParams film_grain_params_ = {};
+ ReferenceInfo reference_info_;
+};
+
+// RefCountedBufferPtr contains a reference to a RefCountedBuffer.
+//
+// Note: For simplicity, RefCountedBufferPtr is implemented as a
+// std::shared_ptr<RefCountedBuffer>. This requires a heap allocation of the
+// control block for std::shared_ptr. To avoid that heap allocation, we can
+// add a |ref_count_| field to RefCountedBuffer and implement a custom
+// RefCountedBufferPtr class.
+using RefCountedBufferPtr = std::shared_ptr<RefCountedBuffer>;
+
+// BufferPool maintains a pool of RefCountedBuffers.
+class BufferPool {
+ public:
+ BufferPool(FrameBufferSizeChangedCallback on_frame_buffer_size_changed,
+ GetFrameBufferCallback get_frame_buffer,
+ ReleaseFrameBufferCallback release_frame_buffer,
+ void* callback_private_data);
+
+ // Not copyable or movable.
+ BufferPool(const BufferPool&) = delete;
+ BufferPool& operator=(const BufferPool&) = delete;
+
+ ~BufferPool();
+
+ LIBGAV1_MUST_USE_RESULT bool OnFrameBufferSizeChanged(
+ int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+ int left_border, int right_border, int top_border, int bottom_border);
+
+ // Finds a free buffer in the buffer pool and returns a reference to the free
+ // buffer. If there is no free buffer, returns a null pointer. This function
+ // is thread safe.
+ RefCountedBufferPtr GetFreeBuffer();
+
+ // Aborts all the buffers that are in use.
+ void Abort();
+
+ private:
+ friend class RefCountedBuffer;
+
+ // Returns an unused buffer to the buffer pool. Called by RefCountedBuffer
+ // only. This function is thread safe.
+ void ReturnUnusedBuffer(RefCountedBuffer* buffer);
+
+ // Used to make the following functions thread safe: GetFreeBuffer(),
+ // ReturnUnusedBuffer(), RefCountedBuffer::Realloc().
+ std::mutex mutex_;
+
+ // Storing a RefCountedBuffer object in a Vector is complicated because of the
+ // copy/move semantics. So the simplest way around that is to store a list of
+ // pointers in the vector.
+ Vector<RefCountedBuffer*> buffers_ LIBGAV1_GUARDED_BY(mutex_);
+ InternalFrameBufferList internal_frame_buffers_;
+
+ // Frame buffer callbacks.
+ FrameBufferSizeChangedCallback on_frame_buffer_size_changed_;
+ GetFrameBufferCallback get_frame_buffer_;
+ ReleaseFrameBufferCallback release_frame_buffer_;
+ // Private data associated with the frame buffer callbacks.
+ void* callback_private_data_;
+};
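+
+// An illustrative usage sketch (an example under assumptions, not additional
+// API; the frame buffer callbacks are whatever the application supplies):
+//   BufferPool pool(on_frame_buffer_size_changed, get_frame_buffer,
+//                   release_frame_buffer, callback_private_data);
+//   RefCountedBufferPtr frame = pool.GetFreeBuffer();
+//   if (frame == nullptr) { /* No free buffer; try again later. */ }
+//   // The buffer is handed back to the pool once the last RefCountedBufferPtr
+//   // referencing it is destroyed.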
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_BUFFER_POOL_H_
diff --git a/src/decoder.cc b/src/decoder.cc
new file mode 100644
index 0000000..b9e43e0
--- /dev/null
+++ b/src/decoder.cc
@@ -0,0 +1,119 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/decoder.h"
+
+#include <memory>
+#include <new>
+
+#include "src/decoder_impl.h"
+
+extern "C" {
+
+Libgav1StatusCode Libgav1DecoderCreate(const Libgav1DecoderSettings* settings,
+ Libgav1Decoder** decoder_out) {
+ std::unique_ptr<libgav1::Decoder> cxx_decoder(new (std::nothrow)
+ libgav1::Decoder());
+ if (cxx_decoder == nullptr) return kLibgav1StatusOutOfMemory;
+
+ libgav1::DecoderSettings cxx_settings;
+ cxx_settings.threads = settings->threads;
+ cxx_settings.frame_parallel = settings->frame_parallel != 0;
+ cxx_settings.blocking_dequeue = settings->blocking_dequeue != 0;
+ cxx_settings.on_frame_buffer_size_changed =
+ settings->on_frame_buffer_size_changed;
+ cxx_settings.get_frame_buffer = settings->get_frame_buffer;
+ cxx_settings.release_frame_buffer = settings->release_frame_buffer;
+ cxx_settings.release_input_buffer = settings->release_input_buffer;
+ cxx_settings.callback_private_data = settings->callback_private_data;
+ cxx_settings.output_all_layers = settings->output_all_layers != 0;
+ cxx_settings.operating_point = settings->operating_point;
+ cxx_settings.post_filter_mask = settings->post_filter_mask;
+
+ const Libgav1StatusCode status = cxx_decoder->Init(&cxx_settings);
+ if (status == kLibgav1StatusOk) {
+ *decoder_out = reinterpret_cast<Libgav1Decoder*>(cxx_decoder.release());
+ }
+ return status;
+}
+
+void Libgav1DecoderDestroy(Libgav1Decoder* decoder) {
+ auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+ delete cxx_decoder;
+}
+
+Libgav1StatusCode Libgav1DecoderEnqueueFrame(Libgav1Decoder* decoder,
+ const uint8_t* data, size_t size,
+ int64_t user_private_data,
+ void* buffer_private_data) {
+ auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+ return cxx_decoder->EnqueueFrame(data, size, user_private_data,
+ buffer_private_data);
+}
+
+Libgav1StatusCode Libgav1DecoderDequeueFrame(
+ Libgav1Decoder* decoder, const Libgav1DecoderBuffer** out_ptr) {
+ auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+ return cxx_decoder->DequeueFrame(out_ptr);
+}
+
+Libgav1StatusCode Libgav1DecoderSignalEOS(Libgav1Decoder* decoder) {
+ auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+ return cxx_decoder->SignalEOS();
+}
+
+int Libgav1DecoderGetMaxBitdepth() {
+ return libgav1::Decoder::GetMaxBitdepth();
+}
+
+} // extern "C"
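+
+// An illustrative usage sketch of the C API above (a hedged example, not part
+// of the implementation; |settings|, |data| and |size| are assumed to be
+// prepared by the caller):
+//   Libgav1Decoder* decoder = nullptr;
+//   if (Libgav1DecoderCreate(&settings, &decoder) != kLibgav1StatusOk) return;
+//   Libgav1DecoderEnqueueFrame(decoder, data, size, /*user_private_data=*/0,
+//                              /*buffer_private_data=*/nullptr);
+//   const Libgav1DecoderBuffer* buffer = nullptr;
+//   Libgav1DecoderDequeueFrame(decoder, &buffer);
+//   Libgav1DecoderDestroy(decoder);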
+
+namespace libgav1 {
+
+Decoder::Decoder() = default;
+
+Decoder::~Decoder() = default;
+
+StatusCode Decoder::Init(const DecoderSettings* const settings) {
+ if (impl_ != nullptr) return kStatusAlready;
+ if (settings != nullptr) settings_ = *settings;
+ return DecoderImpl::Create(&settings_, &impl_);
+}
+
+StatusCode Decoder::EnqueueFrame(const uint8_t* data, const size_t size,
+ int64_t user_private_data,
+ void* buffer_private_data) {
+ if (impl_ == nullptr) return kStatusNotInitialized;
+ return impl_->EnqueueFrame(data, size, user_private_data,
+ buffer_private_data);
+}
+
+StatusCode Decoder::DequeueFrame(const DecoderBuffer** out_ptr) {
+ if (impl_ == nullptr) return kStatusNotInitialized;
+ return impl_->DequeueFrame(out_ptr);
+}
+
+StatusCode Decoder::SignalEOS() {
+ if (impl_ == nullptr) return kStatusNotInitialized;
+ // In non-frame-parallel mode, we have to release all the references. This
+ // simply means replacing the |impl_| with a new instance so that all the
+ // existing references are released and the state is cleared.
+ impl_ = nullptr;
+ return DecoderImpl::Create(&settings_, &impl_);
+}
+
+// static.
+int Decoder::GetMaxBitdepth() { return DecoderImpl::GetMaxBitdepth(); }
+
+} // namespace libgav1
diff --git a/src/decoder_impl.cc b/src/decoder_impl.cc
new file mode 100644
index 0000000..751671d
--- /dev/null
+++ b/src/decoder_impl.cc
@@ -0,0 +1,1661 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/decoder_impl.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <cstring>
+#include <iterator>
+#include <new>
+#include <utility>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/film_grain.h"
+#include "src/frame_buffer_utils.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/loop_restoration_info.h"
+#include "src/obu_parser.h"
+#include "src/post_filter.h"
+#include "src/prediction_mask.h"
+#include "src/threading_strategy.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/parameter_tree.h"
+#include "src/utils/raw_bit_reader.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/threadpool.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kMaxBlockWidth4x4 = 32;
+constexpr int kMaxBlockHeight4x4 = 32;
+
+// Computes the bottom border size in pixels. If CDEF, loop restoration or
+// SuperRes is enabled, adds extra border pixels to facilitate those steps to
+// happen nearly in-place (a few extra rows instead of an entire frame buffer).
+// The logic in this function should match the corresponding logic for
+// |vertical_shift| in the PostFilter constructor.
+int GetBottomBorderPixels(const bool do_cdef, const bool do_restoration,
+ const bool do_superres, const int subsampling_y) {
+ int extra_border = 0;
+ if (do_cdef) extra_border += kCdefBorder;
+ if (do_restoration) extra_border += kRestorationVerticalBorder;
+ if (do_superres) extra_border += kSuperResVerticalBorder;
+ // Double the number of extra bottom border pixels if the bottom border will
+ // be subsampled.
+ extra_border <<= subsampling_y;
+ return Align(kBorderPixels + extra_border, 2); // Must be a multiple of 2.
+}
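+
+// A worked instance of the computation above (illustration only): with CDEF,
+// loop restoration and SuperRes all enabled and subsampling_y == 1, the
+// function returns
+//   Align(kBorderPixels + ((kCdefBorder + kRestorationVerticalBorder +
+//                           kSuperResVerticalBorder) << 1), 2).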
+
+// Sets |frame_scratch_buffer->tile_decoding_failed| to true (while holding on
+// to |frame_scratch_buffer->superblock_row_mutex|) and notifies the first
+// |count| condition variables in
+// |frame_scratch_buffer->superblock_row_progress_condvar|.
+void SetFailureAndNotifyAll(FrameScratchBuffer* const frame_scratch_buffer,
+ int count) {
+ {
+ std::lock_guard<std::mutex> lock(
+ frame_scratch_buffer->superblock_row_mutex);
+ frame_scratch_buffer->tile_decoding_failed = true;
+ }
+ std::condition_variable* const condvars =
+ frame_scratch_buffer->superblock_row_progress_condvar.get();
+ for (int i = 0; i < count; ++i) {
+ condvars[i].notify_one();
+ }
+}
+
+// Helper class that releases the frame scratch buffer in the destructor.
+class FrameScratchBufferReleaser {
+ public:
+ FrameScratchBufferReleaser(
+ FrameScratchBufferPool* frame_scratch_buffer_pool,
+ std::unique_ptr<FrameScratchBuffer>* frame_scratch_buffer)
+ : frame_scratch_buffer_pool_(frame_scratch_buffer_pool),
+ frame_scratch_buffer_(frame_scratch_buffer) {}
+ ~FrameScratchBufferReleaser() {
+ frame_scratch_buffer_pool_->Release(std::move(*frame_scratch_buffer_));
+ }
+
+ private:
+ FrameScratchBufferPool* const frame_scratch_buffer_pool_;
+ std::unique_ptr<FrameScratchBuffer>* const frame_scratch_buffer_;
+};
+
+// Sets the |frame|'s segmentation map for two cases. The third case is handled
+// in Tile::DecodeBlock().
+void SetSegmentationMap(const ObuFrameHeader& frame_header,
+ const SegmentationMap* prev_segment_ids,
+ RefCountedBuffer* const frame) {
+ if (!frame_header.segmentation.enabled) {
+ // All segment_id's are 0.
+ frame->segmentation_map()->Clear();
+ } else if (!frame_header.segmentation.update_map) {
+ // Copy from prev_segment_ids.
+ if (prev_segment_ids == nullptr) {
+ // Treat a null prev_segment_ids pointer as if it pointed to a
+ // segmentation map containing all 0s.
+ frame->segmentation_map()->Clear();
+ } else {
+ frame->segmentation_map()->CopyFrom(*prev_segment_ids);
+ }
+ }
+}
+
+StatusCode DecodeTilesNonFrameParallel(
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const Vector<std::unique_ptr<Tile>>& tiles,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter) {
+ // Decode in superblock row order.
+ const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+ std::unique_ptr<TileScratchBuffer> tile_scratch_buffer =
+ frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+ if (tile_scratch_buffer == nullptr) return kStatusOutOfMemory;
+ for (int row4x4 = 0; row4x4 < frame_header.rows4x4;
+ row4x4 += block_width4x4) {
+ for (const auto& tile_ptr : tiles) {
+ if (!tile_ptr->ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+ row4x4, tile_scratch_buffer.get())) {
+ return kStatusUnknownError;
+ }
+ }
+ post_filter->ApplyFilteringForOneSuperBlockRow(
+ row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+ /*do_deblock=*/true);
+ }
+ frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+ std::move(tile_scratch_buffer));
+ return kStatusOk;
+}
+
+StatusCode DecodeTilesThreadedNonFrameParallel(
+ const Vector<std::unique_ptr<Tile>>& tiles,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter,
+ BlockingCounterWithStatus* const pending_tiles) {
+ ThreadingStrategy& threading_strategy =
+ frame_scratch_buffer->threading_strategy;
+ const int num_workers = threading_strategy.tile_thread_count();
+ BlockingCounterWithStatus pending_workers(num_workers);
+ std::atomic<int> tile_counter(0);
+ const int tile_count = static_cast<int>(tiles.size());
+ bool tile_decoding_failed = false;
+ // Submit tile decoding jobs to the thread pool.
+ for (int i = 0; i < num_workers; ++i) {
+ threading_strategy.tile_thread_pool()->Schedule([&tiles, tile_count,
+ &tile_counter,
+ &pending_workers,
+ &pending_tiles]() {
+ bool failed = false;
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (!failed) {
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->ParseAndDecode()) {
+ LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+ failed = true;
+ }
+ } else {
+ pending_tiles->Decrement(false);
+ }
+ }
+ pending_workers.Decrement(!failed);
+ });
+ }
+ // Have the current thread partake in tile decoding.
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (!tile_decoding_failed) {
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->ParseAndDecode()) {
+ LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+ tile_decoding_failed = true;
+ }
+ } else {
+ pending_tiles->Decrement(false);
+ }
+ }
+ // Wait until all the workers are done. This ensures that all the tiles have
+ // been parsed.
+ tile_decoding_failed |= !pending_workers.Wait();
+ // Wait until all the tiles have been decoded.
+ tile_decoding_failed |= !pending_tiles->Wait();
+ if (tile_decoding_failed) return kStatusUnknownError;
+ assert(threading_strategy.post_filter_thread_pool() != nullptr);
+ post_filter->ApplyFilteringThreaded();
+ return kStatusOk;
+}
+
+StatusCode DecodeTilesFrameParallel(
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const Vector<std::unique_ptr<Tile>>& tiles,
+ const SymbolDecoderContext& saved_symbol_decoder_context,
+ const SegmentationMap* const prev_segment_ids,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter, RefCountedBuffer* const current_frame) {
+ // Parse the frame.
+ for (const auto& tile : tiles) {
+ if (!tile->Parse()) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse tile number: %d\n", tile->number());
+ return kStatusUnknownError;
+ }
+ }
+ if (frame_header.enable_frame_end_update_cdf) {
+ frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
+ }
+ current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+ SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
+ // Mark frame as parsed.
+ current_frame->SetFrameState(kFrameStateParsed);
+ std::unique_ptr<TileScratchBuffer> tile_scratch_buffer =
+ frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+ if (tile_scratch_buffer == nullptr) {
+ return kStatusOutOfMemory;
+ }
+ const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+ // Decode in superblock row order (inter prediction in the Tile class will
+ // block until the required superblocks in the reference frame are decoded).
+ for (int row4x4 = 0; row4x4 < frame_header.rows4x4;
+ row4x4 += block_width4x4) {
+ for (const auto& tile_ptr : tiles) {
+ if (!tile_ptr->ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+ row4x4, tile_scratch_buffer.get())) {
+ LIBGAV1_DLOG(ERROR, "Failed to decode tile number: %d\n",
+ tile_ptr->number());
+ return kStatusUnknownError;
+ }
+ }
+ const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow(
+ row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+ /*do_deblock=*/true);
+ if (progress_row >= 0) {
+ current_frame->SetProgress(progress_row);
+ }
+ }
+ // Mark frame as decoded (we no longer care about row-level progress since the
+ // entire frame has been decoded).
+ current_frame->SetFrameState(kFrameStateDecoded);
+ frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+ std::move(tile_scratch_buffer));
+ return kStatusOk;
+}
+
+// Helper function used by DecodeTilesThreadedFrameParallel. Applies the
+// deblocking filter for tile boundaries for the superblock row at |row4x4|.
+void ApplyDeblockingFilterForTileBoundaries(
+ PostFilter* const post_filter, const std::unique_ptr<Tile>* tile_row_base,
+ const ObuFrameHeader& frame_header, int row4x4, int block_width4x4,
+ int tile_columns, bool decode_entire_tiles_in_worker_threads) {
+ // Apply vertical deblock filtering for the first 64 columns of each tile.
+ for (int tile_column = 0; tile_column < tile_columns; ++tile_column) {
+ const Tile& tile = *tile_row_base[tile_column];
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeVertical, row4x4, tile.column4x4_start(),
+ tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+ }
+ if (decode_entire_tiles_in_worker_threads &&
+ row4x4 == tile_row_base[0]->row4x4_start()) {
+ // This is the first superblock row of a tile row. In this case, apply
+ // horizontal deblock filtering for the entire superblock row.
+ post_filter->ApplyDeblockFilter(kLoopFilterTypeHorizontal, row4x4, 0,
+ frame_header.columns4x4, block_width4x4);
+ } else {
+ // Apply horizontal deblock filtering for the first 64 columns of the
+ // first tile.
+ const Tile& first_tile = *tile_row_base[0];
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4, first_tile.column4x4_start(),
+ first_tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+ // Apply horizontal deblock filtering for the last 64 columns of the
+ // previous tile and the first 64 columns of the current tile.
+ for (int tile_column = 1; tile_column < tile_columns; ++tile_column) {
+ const Tile& tile = *tile_row_base[tile_column];
+ // If the previous tile has more than 64 columns, then include those
+ // for the horizontal deblock.
+ const Tile& previous_tile = *tile_row_base[tile_column - 1];
+ const int column4x4_start =
+ tile.column4x4_start() -
+ ((tile.column4x4_start() - kNum4x4InLoopFilterUnit !=
+ previous_tile.column4x4_start())
+ ? kNum4x4InLoopFilterUnit
+ : 0);
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4, column4x4_start,
+ tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+ }
+ // Apply horizontal deblock filtering for the last 64 columns of the
+ // last tile.
+ const Tile& last_tile = *tile_row_base[tile_columns - 1];
+ // Identify the last column4x4 value and do horizontal filtering for
+ // that column4x4. The value of last column4x4 is the nearest multiple
+ // of 16 that is before tile.column4x4_end().
+ const int column4x4_start = (last_tile.column4x4_end() - 1) & ~15;
+ // If column4x4_start is the same as tile.column4x4_start() then it
+ // means that the last tile has <= 64 columns. So there is nothing left
+ // to deblock (since it was already deblocked in the loop above).
+ if (column4x4_start != last_tile.column4x4_start()) {
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4, column4x4_start,
+ last_tile.column4x4_end(), block_width4x4);
+ }
+ }
+}
+
+// Helper function used by DecodeTilesThreadedFrameParallel. Decodes the
+// superblock row starting at |row4x4| for tile at index |tile_index| in the
+// list of tiles |tiles|. If the decoding is successful, then it does the
+// following:
+// * Schedule the next superblock row in the current tile column for decoding
+// (the next superblock row may be in a different tile than the current
+// one).
+// * If an entire superblock row of the frame has been decoded, it notifies
+// the waiters (if there are any).
+void DecodeSuperBlockRowInTile(
+ const Vector<std::unique_ptr<Tile>>& tiles, size_t tile_index, int row4x4,
+ const int superblock_size4x4, const int tile_columns,
+ const int superblock_rows, FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter, BlockingCounter* const pending_jobs) {
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+ if (scratch_buffer == nullptr) {
+ SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+ return;
+ }
+ Tile& tile = *tiles[tile_index];
+ const bool ok = tile.ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+ row4x4, scratch_buffer.get());
+ frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+ std::move(scratch_buffer));
+ if (!ok) {
+ SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+ return;
+ }
+ if (post_filter->DoDeblock()) {
+ // Apply vertical deblock filtering for all the columns in this tile except
+ // for the first 64 columns.
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeVertical, row4x4,
+ tile.column4x4_start() + kNum4x4InLoopFilterUnit, tile.column4x4_end(),
+ superblock_size4x4);
+ // Apply horizontal deblock filtering for all the columns in this tile
+ // except for the first and the last 64 columns.
+ // Note about the last tile of each row: For the last tile, column4x4_end
+ // may not be a multiple of 16. In that case it is still okay to simply
+ // subtract 16 since ApplyDeblockFilter() will only do the filters in
+ // increments of 64 columns (or 32 columns for chroma with subsampling).
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4,
+ tile.column4x4_start() + kNum4x4InLoopFilterUnit,
+ tile.column4x4_end() - kNum4x4InLoopFilterUnit, superblock_size4x4);
+ }
+ const int superblock_size4x4_log2 = FloorLog2(superblock_size4x4);
+ const int index = row4x4 >> superblock_size4x4_log2;
+ int* const superblock_row_progress =
+ frame_scratch_buffer->superblock_row_progress.get();
+ std::condition_variable* const superblock_row_progress_condvar =
+ frame_scratch_buffer->superblock_row_progress_condvar.get();
+ bool notify;
+ {
+ std::lock_guard<std::mutex> lock(
+ frame_scratch_buffer->superblock_row_mutex);
+ notify = ++superblock_row_progress[index] == tile_columns;
+ }
+ if (notify) {
+ // We are done decoding this superblock row. Notify the post filtering
+ // thread.
+ superblock_row_progress_condvar[index].notify_one();
+ }
+ // Schedule the next superblock row (if one exists).
+ ThreadPool& thread_pool =
+ *frame_scratch_buffer->threading_strategy.thread_pool();
+ const int next_row4x4 = row4x4 + superblock_size4x4;
+ if (!tile.IsRow4x4Inside(next_row4x4)) {
+ tile_index += tile_columns;
+ }
+ if (tile_index >= tiles.size()) return;
+ pending_jobs->IncrementBy(1);
+ thread_pool.Schedule([&tiles, tile_index, next_row4x4, superblock_size4x4,
+ tile_columns, superblock_rows, frame_scratch_buffer,
+ post_filter, pending_jobs]() {
+ DecodeSuperBlockRowInTile(tiles, tile_index, next_row4x4,
+ superblock_size4x4, tile_columns, superblock_rows,
+ frame_scratch_buffer, post_filter, pending_jobs);
+ pending_jobs->Decrement();
+ });
+}
+
+StatusCode DecodeTilesThreadedFrameParallel(
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const Vector<std::unique_ptr<Tile>>& tiles,
+ const SymbolDecoderContext& saved_symbol_decoder_context,
+ const SegmentationMap* const prev_segment_ids,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter, RefCountedBuffer* const current_frame) {
+ // Parse the frame.
+ ThreadPool& thread_pool =
+ *frame_scratch_buffer->threading_strategy.thread_pool();
+ std::atomic<int> tile_counter(0);
+ const int tile_count = static_cast<int>(tiles.size());
+ const int num_workers = thread_pool.num_threads();
+ BlockingCounterWithStatus parse_workers(num_workers);
+ // Submit tile parsing jobs to the thread pool.
+ for (int i = 0; i < num_workers; ++i) {
+ thread_pool.Schedule([&tiles, tile_count, &tile_counter, &parse_workers]() {
+ bool failed = false;
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (!failed) {
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->Parse()) {
+ LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number());
+ failed = true;
+ }
+ }
+ }
+ parse_workers.Decrement(!failed);
+ });
+ }
+
+ // Have the current thread participate in parsing.
+ bool failed = false;
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (!failed) {
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->Parse()) {
+ LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number());
+ failed = true;
+ }
+ }
+ }
+
+ // Wait until all the parse workers are done. This ensures that all the tiles
+ // have been parsed.
+ if (!parse_workers.Wait() || failed) {
+ return kStatusUnknownError;
+ }
+ if (frame_header.enable_frame_end_update_cdf) {
+ frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
+ }
+ current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+ SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
+ current_frame->SetFrameState(kFrameStateParsed);
+
+ // Decode the frame.
+ const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+ const int block_width4x4_log2 =
+ sequence_header.use_128x128_superblock ? 5 : 4;
+ const int superblock_rows =
+ (frame_header.rows4x4 + block_width4x4 - 1) >> block_width4x4_log2;
+ if (!frame_scratch_buffer->superblock_row_progress.Resize(superblock_rows) ||
+ !frame_scratch_buffer->superblock_row_progress_condvar.Resize(
+ superblock_rows)) {
+ return kStatusOutOfMemory;
+ }
+ int* const superblock_row_progress =
+ frame_scratch_buffer->superblock_row_progress.get();
+ memset(superblock_row_progress, 0,
+ superblock_rows * sizeof(superblock_row_progress[0]));
+ frame_scratch_buffer->tile_decoding_failed = false;
+ const int tile_columns = frame_header.tile_info.tile_columns;
+ const bool decode_entire_tiles_in_worker_threads =
+ num_workers >= tile_columns;
+ BlockingCounter pending_jobs(
+ decode_entire_tiles_in_worker_threads ? num_workers : tile_columns);
+ if (decode_entire_tiles_in_worker_threads) {
+ // Submit tile decoding jobs to the thread pool.
+ tile_counter = 0;
+ for (int i = 0; i < num_workers; ++i) {
+ thread_pool.Schedule([&tiles, tile_count, &tile_counter, &pending_jobs,
+ frame_scratch_buffer, superblock_rows]() {
+ bool failed = false;
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (failed) continue;
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->Decode(
+ &frame_scratch_buffer->superblock_row_mutex,
+ frame_scratch_buffer->superblock_row_progress.get(),
+ frame_scratch_buffer->superblock_row_progress_condvar
+ .get())) {
+ LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+ failed = true;
+ SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+ }
+ }
+ pending_jobs.Decrement();
+ });
+ }
+ } else {
+ // Schedule the jobs for first tile row.
+ for (int tile_index = 0; tile_index < tile_columns; ++tile_index) {
+ thread_pool.Schedule([&tiles, tile_index, block_width4x4, tile_columns,
+ superblock_rows, frame_scratch_buffer, post_filter,
+ &pending_jobs]() {
+ DecodeSuperBlockRowInTile(
+ tiles, tile_index, 0, block_width4x4, tile_columns, superblock_rows,
+ frame_scratch_buffer, post_filter, &pending_jobs);
+ pending_jobs.Decrement();
+ });
+ }
+ }
+
+ // Current thread will do the post filters.
+ std::condition_variable* const superblock_row_progress_condvar =
+ frame_scratch_buffer->superblock_row_progress_condvar.get();
+ const std::unique_ptr<Tile>* tile_row_base = &tiles[0];
+ for (int row4x4 = 0, index = 0; row4x4 < frame_header.rows4x4;
+ row4x4 += block_width4x4, ++index) {
+ if (!tile_row_base[0]->IsRow4x4Inside(row4x4)) {
+ tile_row_base += tile_columns;
+ }
+ {
+ std::unique_lock<std::mutex> lock(
+ frame_scratch_buffer->superblock_row_mutex);
+ while (superblock_row_progress[index] != tile_columns &&
+ !frame_scratch_buffer->tile_decoding_failed) {
+ superblock_row_progress_condvar[index].wait(lock);
+ }
+ if (frame_scratch_buffer->tile_decoding_failed) break;
+ }
+ if (post_filter->DoDeblock()) {
+ // Apply deblocking filter for the tile boundaries of this superblock row.
+ // The deblocking filter for the internal blocks will be applied in the
+ // tile worker threads. In this thread, we will only have to apply
+ // deblocking filter for the tile boundaries.
+ ApplyDeblockingFilterForTileBoundaries(
+ post_filter, tile_row_base, frame_header, row4x4, block_width4x4,
+ tile_columns, decode_entire_tiles_in_worker_threads);
+ }
+ // Apply all the post filters other than deblocking.
+ const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow(
+ row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+ /*do_deblock=*/false);
+ if (progress_row >= 0) {
+ current_frame->SetProgress(progress_row);
+ }
+ }
+ // Wait until all the pending jobs are done. This ensures that all the tiles
+ // have been decoded and wrapped up.
+ pending_jobs.Wait();
+ {
+ std::lock_guard<std::mutex> lock(
+ frame_scratch_buffer->superblock_row_mutex);
+ if (frame_scratch_buffer->tile_decoding_failed) {
+ return kStatusUnknownError;
+ }
+ }
+
+ current_frame->SetFrameState(kFrameStateDecoded);
+ return kStatusOk;
+}
+
+} // namespace
+
+// static
+StatusCode DecoderImpl::Create(const DecoderSettings* settings,
+ std::unique_ptr<DecoderImpl>* output) {
+ if (settings->threads <= 0) {
+ LIBGAV1_DLOG(ERROR, "Invalid settings->threads: %d.", settings->threads);
+ return kStatusInvalidArgument;
+ }
+ if (settings->frame_parallel) {
+ if (settings->release_input_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR,
+ "release_input_buffer callback must not be null when "
+ "frame_parallel is true.");
+ return kStatusInvalidArgument;
+ }
+ }
+ std::unique_ptr<DecoderImpl> impl(new (std::nothrow) DecoderImpl(settings));
+ if (impl == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate DecoderImpl.");
+ return kStatusOutOfMemory;
+ }
+ const StatusCode status = impl->Init();
+ if (status != kStatusOk) return status;
+ *output = std::move(impl);
+ return kStatusOk;
+}
+
+DecoderImpl::DecoderImpl(const DecoderSettings* settings)
+ : buffer_pool_(settings->on_frame_buffer_size_changed,
+ settings->get_frame_buffer, settings->release_frame_buffer,
+ settings->callback_private_data),
+ settings_(*settings) {
+ dsp::DspInit();
+}
+
+DecoderImpl::~DecoderImpl() {
+ // Clean up and wait until all the threads have stopped. We just have to pass
+ // in a dummy status that is not kStatusOk or kStatusTryAgain to trigger the
+ // path that clears all the threads and structs.
+ SignalFailure(kStatusUnknownError);
+ // Release any other frame buffer references that we may be holding on to.
+ ReleaseOutputFrame();
+ output_frame_queue_.Clear();
+ for (auto& reference_frame : state_.reference_frame) {
+ reference_frame = nullptr;
+ }
+}
+
+StatusCode DecoderImpl::Init() {
+ if (!GenerateWedgeMask(&wedge_masks_)) {
+ LIBGAV1_DLOG(ERROR, "GenerateWedgeMask() failed.");
+ return kStatusOutOfMemory;
+ }
+ if (!output_frame_queue_.Init(kMaxLayers)) {
+ LIBGAV1_DLOG(ERROR, "output_frame_queue_.Init() failed.");
+ return kStatusOutOfMemory;
+ }
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::InitializeFrameThreadPoolAndTemporalUnitQueue(
+ const uint8_t* data, size_t size) {
+ is_frame_parallel_ = false;
+ if (settings_.frame_parallel) {
+ DecoderState state;
+ std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
+ data, size, settings_.operating_point, &buffer_pool_, &state));
+ if (obu == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+ return kStatusOutOfMemory;
+ }
+ RefCountedBufferPtr current_frame;
+ const StatusCode status = obu->ParseOneFrame(&current_frame);
+ if (status != kStatusOk) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
+ return status;
+ }
+ current_frame = nullptr;
+ // We assume that the first frame that was parsed will contain the frame
+ // header. This assumption is usually true in practice. So we will simply
+ // not use frame parallel mode if this is not the case.
+ if (settings_.threads > 1 &&
+ !InitializeThreadPoolsForFrameParallel(
+ settings_.threads, obu->frame_header().tile_info.tile_count,
+ obu->frame_header().tile_info.tile_columns, &frame_thread_pool_,
+ &frame_scratch_buffer_pool_)) {
+ return kStatusOutOfMemory;
+ }
+ }
+ const int max_allowed_frames =
+ (frame_thread_pool_ != nullptr) ? frame_thread_pool_->num_threads() : 1;
+ assert(max_allowed_frames > 0);
+ if (!temporal_units_.Init(max_allowed_frames)) {
+ LIBGAV1_DLOG(ERROR, "temporal_units_.Init() failed.");
+ return kStatusOutOfMemory;
+ }
+ is_frame_parallel_ = frame_thread_pool_ != nullptr;
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::EnqueueFrame(const uint8_t* data, size_t size,
+ int64_t user_private_data,
+ void* buffer_private_data) {
+ if (data == nullptr || size == 0) return kStatusInvalidArgument;
+ if (HasFailure()) return kStatusUnknownError;
+ if (!seen_first_frame_) {
+ seen_first_frame_ = true;
+ const StatusCode status =
+ InitializeFrameThreadPoolAndTemporalUnitQueue(data, size);
+ if (status != kStatusOk) {
+ return SignalFailure(status);
+ }
+ }
+ if (temporal_units_.Full()) {
+ return kStatusTryAgain;
+ }
+ if (is_frame_parallel_) {
+ return ParseAndSchedule(data, size, user_private_data, buffer_private_data);
+ }
+ TemporalUnit temporal_unit(data, size, user_private_data,
+ buffer_private_data);
+ temporal_units_.Push(std::move(temporal_unit));
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::SignalFailure(StatusCode status) {
+ if (status == kStatusOk || status == kStatusTryAgain) return status;
+ // Set the |failure_status_| first so that any pending jobs in
+ // |frame_thread_pool_| will exit right away when the thread pool is being
+ // released below.
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ failure_status_ = status;
+ }
+ // Make sure all waiting threads exit.
+ buffer_pool_.Abort();
+ frame_thread_pool_ = nullptr;
+ while (!temporal_units_.Empty()) {
+ if (settings_.release_input_buffer != nullptr) {
+ settings_.release_input_buffer(
+ settings_.callback_private_data,
+ temporal_units_.Front().buffer_private_data);
+ }
+ temporal_units_.Pop();
+ }
+ return status;
+}
+
+// DequeueFrame() follows this policy to avoid holding unnecessary frame
+// buffer references in output_frame_: output_frame_ must be null whenever
+// DequeueFrame() does not output a frame.
+StatusCode DecoderImpl::DequeueFrame(const DecoderBuffer** out_ptr) {
+ if (out_ptr == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Invalid argument: out_ptr == nullptr.");
+ return kStatusInvalidArgument;
+ }
+ // We assume a call to DequeueFrame() indicates that the caller is no longer
+ // using the previous output frame, so we can release it.
+ ReleaseOutputFrame();
+ if (temporal_units_.Empty()) {
+ // No input frames to decode.
+ *out_ptr = nullptr;
+ return kStatusNothingToDequeue;
+ }
+ TemporalUnit& temporal_unit = temporal_units_.Front();
+ if (!is_frame_parallel_) {
+ // If |output_frame_queue_| is not empty, then return the first frame from
+ // that queue.
+ if (!output_frame_queue_.Empty()) {
+ RefCountedBufferPtr frame = std::move(output_frame_queue_.Front());
+ output_frame_queue_.Pop();
+ buffer_.user_private_data = temporal_unit.user_private_data;
+ if (output_frame_queue_.Empty()) {
+ temporal_units_.Pop();
+ }
+ const StatusCode status = CopyFrameToOutputBuffer(frame);
+ if (status != kStatusOk) {
+ return status;
+ }
+ *out_ptr = &buffer_;
+ return kStatusOk;
+ }
+ // Decode the next available temporal unit and return.
+ const StatusCode status = DecodeTemporalUnit(temporal_unit, out_ptr);
+ if (status != kStatusOk) {
+ // In case of failure, discard all the output frames that we may be
+ // holding references to.
+ output_frame_queue_.Clear();
+ }
+ if (settings_.release_input_buffer != nullptr) {
+ settings_.release_input_buffer(settings_.callback_private_data,
+ temporal_unit.buffer_private_data);
+ }
+ if (output_frame_queue_.Empty()) {
+ temporal_units_.Pop();
+ }
+ return status;
+ }
+ {
+ std::unique_lock<std::mutex> lock(mutex_);
+ if (settings_.blocking_dequeue) {
+ while (!temporal_unit.decoded && failure_status_ == kStatusOk) {
+ decoded_condvar_.wait(lock);
+ }
+ } else {
+ if (!temporal_unit.decoded && failure_status_ == kStatusOk) {
+ return kStatusTryAgain;
+ }
+ }
+ if (failure_status_ != kStatusOk) {
+ const StatusCode failure_status = failure_status_;
+ lock.unlock();
+ return SignalFailure(failure_status);
+ }
+ }
+ if (settings_.release_input_buffer != nullptr &&
+ !temporal_unit.released_input_buffer) {
+ temporal_unit.released_input_buffer = true;
+ settings_.release_input_buffer(settings_.callback_private_data,
+ temporal_unit.buffer_private_data);
+ }
+ if (temporal_unit.status != kStatusOk) {
+ temporal_units_.Pop();
+ return SignalFailure(temporal_unit.status);
+ }
+ if (!temporal_unit.has_displayable_frame) {
+ *out_ptr = nullptr;
+ temporal_units_.Pop();
+ return kStatusOk;
+ }
+ assert(temporal_unit.output_layer_count > 0);
+ StatusCode status = CopyFrameToOutputBuffer(
+ temporal_unit.output_layers[temporal_unit.output_layer_count - 1].frame);
+ temporal_unit.output_layers[temporal_unit.output_layer_count - 1].frame =
+ nullptr;
+ if (status != kStatusOk) {
+ temporal_units_.Pop();
+ return SignalFailure(status);
+ }
+ buffer_.user_private_data = temporal_unit.user_private_data;
+ *out_ptr = &buffer_;
+ if (--temporal_unit.output_layer_count == 0) {
+ temporal_units_.Pop();
+ }
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::ParseAndSchedule(const uint8_t* data, size_t size,
+ int64_t user_private_data,
+ void* buffer_private_data) {
+ TemporalUnit temporal_unit(data, size, user_private_data,
+ buffer_private_data);
+ std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
+ temporal_unit.data, temporal_unit.size, settings_.operating_point,
+ &buffer_pool_, &state_));
+ if (obu == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+ return kStatusOutOfMemory;
+ }
+ if (has_sequence_header_) {
+ obu->set_sequence_header(sequence_header_);
+ }
+ StatusCode status;
+ int position_in_temporal_unit = 0;
+ while (obu->HasData()) {
+ RefCountedBufferPtr current_frame;
+ status = obu->ParseOneFrame(&current_frame);
+ if (status != kStatusOk) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
+ return status;
+ }
+ if (!MaybeInitializeQuantizerMatrix(obu->frame_header())) {
+ LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed.");
+ return kStatusOutOfMemory;
+ }
+ if (IsNewSequenceHeader(*obu)) {
+ const ObuSequenceHeader& sequence_header = obu->sequence_header();
+ const Libgav1ImageFormat image_format =
+ ComposeImageFormat(sequence_header.color_config.is_monochrome,
+ sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y);
+ const int max_bottom_border = GetBottomBorderPixels(
+ /*do_cdef=*/true, /*do_restoration=*/true,
+ /*do_superres=*/true, sequence_header.color_config.subsampling_y);
+ // TODO(vigneshv): This may not be the right place to call this callback
+ // for the frame parallel case. Investigate and fix it.
+ if (!buffer_pool_.OnFrameBufferSizeChanged(
+ sequence_header.color_config.bitdepth, image_format,
+ sequence_header.max_frame_width, sequence_header.max_frame_height,
+ kBorderPixels, kBorderPixels, kBorderPixels, max_bottom_border)) {
+ LIBGAV1_DLOG(ERROR, "buffer_pool_.OnFrameBufferSizeChanged failed.");
+ return kStatusUnknownError;
+ }
+ }
+ // This can happen when there are multiple spatial/temporal layers and if
+ // all the layers are outside the current operating point.
+ if (current_frame == nullptr) {
+ continue;
+ }
+ // Note that we cannot set EncodedFrame.temporal_unit here. It will be set
+ // in the code below after |temporal_unit| is std::move'd into the
+ // |temporal_units_| queue.
+ if (!temporal_unit.frames.emplace_back(obu.get(), state_, current_frame,
+ position_in_temporal_unit++)) {
+ LIBGAV1_DLOG(ERROR, "temporal_unit.frames.emplace_back failed.");
+ return kStatusOutOfMemory;
+ }
+ state_.UpdateReferenceFrames(current_frame,
+ obu->frame_header().refresh_frame_flags);
+ }
+ // This function cannot fail after this point. So it is okay to move the
+ // |temporal_unit| into |temporal_units_| queue.
+ temporal_units_.Push(std::move(temporal_unit));
+ if (temporal_units_.Back().frames.empty()) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ temporal_units_.Back().has_displayable_frame = false;
+ temporal_units_.Back().decoded = true;
+ return kStatusOk;
+ }
+ for (auto& frame : temporal_units_.Back().frames) {
+ EncodedFrame* const encoded_frame = &frame;
+ encoded_frame->temporal_unit = &temporal_units_.Back();
+ frame_thread_pool_->Schedule([this, encoded_frame]() {
+ if (HasFailure()) return;
+ const StatusCode status = DecodeFrame(encoded_frame);
+ encoded_frame->state = {};
+ encoded_frame->frame = nullptr;
+ TemporalUnit& temporal_unit = *encoded_frame->temporal_unit;
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (failure_status_ != kStatusOk) return;
+ // temporal_unit's status defaults to kStatusOk. So we need to set it only
+ // on error. If |failure_status_| is not kStatusOk at this point, it means
+ // that there has already been a failure. So we don't care about this
+ // subsequent failure. We will simply return the error code of the first
+ // failure.
+ if (status != kStatusOk) {
+ temporal_unit.status = status;
+ if (failure_status_ == kStatusOk) {
+ failure_status_ = status;
+ }
+ }
+ temporal_unit.decoded =
+ ++temporal_unit.decoded_count == temporal_unit.frames.size();
+ if (temporal_unit.decoded && settings_.output_all_layers &&
+ temporal_unit.output_layer_count > 1) {
+ std::sort(
+ temporal_unit.output_layers,
+ temporal_unit.output_layers + temporal_unit.output_layer_count);
+ }
+ if (temporal_unit.decoded || failure_status_ != kStatusOk) {
+ decoded_condvar_.notify_one();
+ }
+ });
+ }
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::DecodeFrame(EncodedFrame* const encoded_frame) {
+ const ObuSequenceHeader& sequence_header = encoded_frame->sequence_header;
+ const ObuFrameHeader& frame_header = encoded_frame->frame_header;
+ RefCountedBufferPtr current_frame = std::move(encoded_frame->frame);
+
+ std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+ frame_scratch_buffer_pool_.Get();
+ if (frame_scratch_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer.");
+ return kStatusOutOfMemory;
+ }
+ // |frame_scratch_buffer| will be released when this local variable goes out
+ // of scope, i.e., on any return path in this function.
+ FrameScratchBufferReleaser frame_scratch_buffer_releaser(
+ &frame_scratch_buffer_pool_, &frame_scratch_buffer);
+
+ StatusCode status;
+ if (!frame_header.show_existing_frame) {
+ if (encoded_frame->tile_buffers.empty()) {
+ // This means that the last call to ParseOneFrame() did not actually
+ // have any tile groups. This could happen in rare cases (for example,
+ // if there is a Metadata OBU after the TileGroup OBU). We currently do
+ // not have a reason to handle those cases, so we simply continue.
+ return kStatusOk;
+ }
+ status = DecodeTiles(sequence_header, frame_header,
+ encoded_frame->tile_buffers, encoded_frame->state,
+ frame_scratch_buffer.get(), current_frame.get());
+ if (status != kStatusOk) {
+ return status;
+ }
+ } else {
+ if (!current_frame->WaitUntilDecoded()) {
+ return kStatusUnknownError;
+ }
+ }
+ if (!frame_header.show_frame && !frame_header.show_existing_frame) {
+ // This frame is not displayable. Not an error.
+ return kStatusOk;
+ }
+ RefCountedBufferPtr film_grain_frame;
+ status = ApplyFilmGrain(
+ sequence_header, frame_header, current_frame, &film_grain_frame,
+ frame_scratch_buffer->threading_strategy.thread_pool());
+ if (status != kStatusOk) {
+ return status;
+ }
+
+ TemporalUnit& temporal_unit = *encoded_frame->temporal_unit;
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (temporal_unit.has_displayable_frame && !settings_.output_all_layers) {
+ assert(temporal_unit.output_frame_position >= 0);
+ // A displayable frame was already found in this temporal unit. This can
+ // happen if there are multiple spatial/temporal layers. Since
+ // |settings_.output_all_layers| is false, we will output only the last
+ // displayable frame.
+ if (temporal_unit.output_frame_position >
+ encoded_frame->position_in_temporal_unit) {
+ return kStatusOk;
+ }
+ // Replace any output frame that we may have seen before with the current
+ // frame.
+ assert(temporal_unit.output_layer_count == 1);
+ --temporal_unit.output_layer_count;
+ }
+ temporal_unit.has_displayable_frame = true;
+ temporal_unit.output_layers[temporal_unit.output_layer_count].frame =
+ std::move(film_grain_frame);
+ temporal_unit.output_layers[temporal_unit.output_layer_count]
+ .position_in_temporal_unit = encoded_frame->position_in_temporal_unit;
+ ++temporal_unit.output_layer_count;
+ temporal_unit.output_frame_position =
+ encoded_frame->position_in_temporal_unit;
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::DecodeTemporalUnit(const TemporalUnit& temporal_unit,
+ const DecoderBuffer** out_ptr) {
+ std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
+ temporal_unit.data, temporal_unit.size, settings_.operating_point,
+ &buffer_pool_, &state_));
+ if (obu == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+ return kStatusOutOfMemory;
+ }
+ if (has_sequence_header_) {
+ obu->set_sequence_header(sequence_header_);
+ }
+ StatusCode status;
+ std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+ frame_scratch_buffer_pool_.Get();
+ if (frame_scratch_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer.");
+ return kStatusOutOfMemory;
+ }
+ // |frame_scratch_buffer| will be released when this local variable goes out
+ // of scope, i.e., on any return path in this function.
+ FrameScratchBufferReleaser frame_scratch_buffer_releaser(
+ &frame_scratch_buffer_pool_, &frame_scratch_buffer);
+
+ while (obu->HasData()) {
+ RefCountedBufferPtr current_frame;
+ status = obu->ParseOneFrame(&current_frame);
+ if (status != kStatusOk) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
+ return status;
+ }
+ if (!MaybeInitializeQuantizerMatrix(obu->frame_header())) {
+ LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed.");
+ return kStatusOutOfMemory;
+ }
+ if (IsNewSequenceHeader(*obu)) {
+ const ObuSequenceHeader& sequence_header = obu->sequence_header();
+ const Libgav1ImageFormat image_format =
+ ComposeImageFormat(sequence_header.color_config.is_monochrome,
+ sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y);
+ const int max_bottom_border = GetBottomBorderPixels(
+ /*do_cdef=*/true, /*do_restoration=*/true,
+ /*do_superres=*/true, sequence_header.color_config.subsampling_y);
+ if (!buffer_pool_.OnFrameBufferSizeChanged(
+ sequence_header.color_config.bitdepth, image_format,
+ sequence_header.max_frame_width, sequence_header.max_frame_height,
+ kBorderPixels, kBorderPixels, kBorderPixels, max_bottom_border)) {
+ LIBGAV1_DLOG(ERROR, "buffer_pool_.OnFrameBufferSizeChanged failed.");
+ return kStatusUnknownError;
+ }
+ }
+ if (!obu->frame_header().show_existing_frame) {
+ if (obu->tile_buffers().empty()) {
+ // This means that the last call to ParseOneFrame() did not actually
+ // have any tile groups. This could happen in rare cases (for example,
+ // if there is a Metadata OBU after the TileGroup OBU). We currently do
+ // not have a reason to handle those cases, so we simply continue.
+ continue;
+ }
+ status = DecodeTiles(obu->sequence_header(), obu->frame_header(),
+ obu->tile_buffers(), state_,
+ frame_scratch_buffer.get(), current_frame.get());
+ if (status != kStatusOk) {
+ return status;
+ }
+ }
+ state_.UpdateReferenceFrames(current_frame,
+ obu->frame_header().refresh_frame_flags);
+ if (obu->frame_header().show_frame ||
+ obu->frame_header().show_existing_frame) {
+ if (!output_frame_queue_.Empty() && !settings_.output_all_layers) {
+ // There is more than one displayable frame in the current operating
+ // point and |settings_.output_all_layers| is false. In this case, we
+ // simply return the last displayable frame as the output frame and
+ // ignore the rest.
+ assert(output_frame_queue_.Size() == 1);
+ output_frame_queue_.Pop();
+ }
+ RefCountedBufferPtr film_grain_frame;
+ status = ApplyFilmGrain(
+ obu->sequence_header(), obu->frame_header(), current_frame,
+ &film_grain_frame,
+ frame_scratch_buffer->threading_strategy.film_grain_thread_pool());
+ if (status != kStatusOk) return status;
+ output_frame_queue_.Push(std::move(film_grain_frame));
+ }
+ }
+ if (output_frame_queue_.Empty()) {
+ // No displayable frame in the temporal unit. Not an error.
+ *out_ptr = nullptr;
+ return kStatusOk;
+ }
+ status = CopyFrameToOutputBuffer(output_frame_queue_.Front());
+ output_frame_queue_.Pop();
+ if (status != kStatusOk) {
+ return status;
+ }
+ buffer_.user_private_data = temporal_unit.user_private_data;
+ *out_ptr = &buffer_;
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::CopyFrameToOutputBuffer(
+ const RefCountedBufferPtr& frame) {
+ YuvBuffer* yuv_buffer = frame->buffer();
+
+ buffer_.chroma_sample_position = frame->chroma_sample_position();
+
+ if (yuv_buffer->is_monochrome()) {
+ buffer_.image_format = kImageFormatMonochrome400;
+ } else {
+ if (yuv_buffer->subsampling_x() == 0 && yuv_buffer->subsampling_y() == 0) {
+ buffer_.image_format = kImageFormatYuv444;
+ } else if (yuv_buffer->subsampling_x() == 1 &&
+ yuv_buffer->subsampling_y() == 0) {
+ buffer_.image_format = kImageFormatYuv422;
+ } else if (yuv_buffer->subsampling_x() == 1 &&
+ yuv_buffer->subsampling_y() == 1) {
+ buffer_.image_format = kImageFormatYuv420;
+ } else {
+ LIBGAV1_DLOG(ERROR,
+ "Invalid chroma subsampling values: cannot determine buffer "
+ "image format.");
+ return kStatusInvalidArgument;
+ }
+ }
+ buffer_.color_range = sequence_header_.color_config.color_range;
+ buffer_.color_primary = sequence_header_.color_config.color_primary;
+ buffer_.transfer_characteristics =
+ sequence_header_.color_config.transfer_characteristics;
+ buffer_.matrix_coefficients =
+ sequence_header_.color_config.matrix_coefficients;
+
+ buffer_.bitdepth = yuv_buffer->bitdepth();
+ const int num_planes =
+ yuv_buffer->is_monochrome() ? kMaxPlanesMonochrome : kMaxPlanes;
+ int plane = kPlaneY;
+ for (; plane < num_planes; ++plane) {
+ buffer_.stride[plane] = yuv_buffer->stride(plane);
+ buffer_.plane[plane] = yuv_buffer->data(plane);
+ buffer_.displayed_width[plane] = yuv_buffer->width(plane);
+ buffer_.displayed_height[plane] = yuv_buffer->height(plane);
+ }
+ for (; plane < kMaxPlanes; ++plane) {
+ buffer_.stride[plane] = 0;
+ buffer_.plane[plane] = nullptr;
+ buffer_.displayed_width[plane] = 0;
+ buffer_.displayed_height[plane] = 0;
+ }
+ buffer_.spatial_id = frame->spatial_id();
+ buffer_.temporal_id = frame->temporal_id();
+ buffer_.buffer_private_data = frame->buffer_private_data();
+ output_frame_ = frame;
+ return kStatusOk;
+}
+
+void DecoderImpl::ReleaseOutputFrame() {
+ for (auto& plane : buffer_.plane) {
+ plane = nullptr;
+ }
+ output_frame_ = nullptr;
+}
+
+StatusCode DecoderImpl::DecodeTiles(
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header, const Vector<TileBuffer>& tile_buffers,
+ const DecoderState& state, FrameScratchBuffer* const frame_scratch_buffer,
+ RefCountedBuffer* const current_frame) {
+ frame_scratch_buffer->tile_scratch_buffer_pool.Reset(
+ sequence_header.color_config.bitdepth);
+ if (!frame_scratch_buffer->loop_restoration_info.Reset(
+ &frame_header.loop_restoration, frame_header.upscaled_width,
+ frame_header.height, sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y,
+ sequence_header.color_config.is_monochrome)) {
+ LIBGAV1_DLOG(ERROR,
+ "Failed to allocate memory for loop restoration info units.");
+ return kStatusOutOfMemory;
+ }
+ ThreadingStrategy& threading_strategy =
+ frame_scratch_buffer->threading_strategy;
+ if (!is_frame_parallel_ &&
+ !threading_strategy.Reset(frame_header, settings_.threads)) {
+ return kStatusOutOfMemory;
+ }
+ const bool do_cdef =
+ PostFilter::DoCdef(frame_header, settings_.post_filter_mask);
+ const int num_planes = sequence_header.color_config.is_monochrome
+ ? kMaxPlanesMonochrome
+ : kMaxPlanes;
+ const bool do_restoration = PostFilter::DoRestoration(
+ frame_header.loop_restoration, settings_.post_filter_mask, num_planes);
+ const bool do_superres =
+ PostFilter::DoSuperRes(frame_header, settings_.post_filter_mask);
+ // Use kBorderPixels for the left, right, and top borders. Only the bottom
+ // border may need to be bigger. Cdef border is needed only if we apply Cdef
+ // without multithreading.
+ const int bottom_border = GetBottomBorderPixels(
+ do_cdef && threading_strategy.post_filter_thread_pool() == nullptr,
+ do_restoration, do_superres, sequence_header.color_config.subsampling_y);
+ current_frame->set_chroma_sample_position(
+ sequence_header.color_config.chroma_sample_position);
+ if (!current_frame->Realloc(sequence_header.color_config.bitdepth,
+ sequence_header.color_config.is_monochrome,
+ frame_header.upscaled_width, frame_header.height,
+ sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y,
+ /*left_border=*/kBorderPixels,
+ /*right_border=*/kBorderPixels,
+ /*top_border=*/kBorderPixels, bottom_border)) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate memory for the decoder buffer.");
+ return kStatusOutOfMemory;
+ }
+ if (sequence_header.enable_cdef) {
+ if (!frame_scratch_buffer->cdef_index.Reset(
+ DivideBy16(frame_header.rows4x4 + kMaxBlockHeight4x4),
+ DivideBy16(frame_header.columns4x4 + kMaxBlockWidth4x4),
+ /*zero_initialize=*/false)) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate memory for cdef index.");
+ return kStatusOutOfMemory;
+ }
+ }
+ if (!frame_scratch_buffer->inter_transform_sizes.Reset(
+ frame_header.rows4x4 + kMaxBlockHeight4x4,
+ frame_header.columns4x4 + kMaxBlockWidth4x4,
+ /*zero_initialize=*/false)) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate memory for inter_transform_sizes.");
+ return kStatusOutOfMemory;
+ }
+ if (frame_header.use_ref_frame_mvs) {
+ if (!frame_scratch_buffer->motion_field.mv.Reset(
+ DivideBy2(frame_header.rows4x4), DivideBy2(frame_header.columns4x4),
+ /*zero_initialize=*/false) ||
+ !frame_scratch_buffer->motion_field.reference_offset.Reset(
+ DivideBy2(frame_header.rows4x4), DivideBy2(frame_header.columns4x4),
+ /*zero_initialize=*/false)) {
+ LIBGAV1_DLOG(ERROR,
+ "Failed to allocate memory for temporal motion vectors.");
+ return kStatusOutOfMemory;
+ }
+
+    // For each motion vector, only mv[0] needs to be initialized to
+    // kInvalidMvValue; mv[1] does not need to be initialized and can be set
+    // to an arbitrary value. For simplicity, mv[1] is set to 0.
+    // The following initialization of contiguous memory is very fast. It is
+    // not recommended to make the initialization multi-threaded unless the
+    // memory initialized by each thread is itself contiguous.
+ MotionVector invalid_mv;
+ invalid_mv.mv[0] = kInvalidMvValue;
+ invalid_mv.mv[1] = 0;
+ MotionVector* const motion_field_mv =
+ &frame_scratch_buffer->motion_field.mv[0][0];
+ std::fill(motion_field_mv,
+ motion_field_mv + frame_scratch_buffer->motion_field.mv.size(),
+ invalid_mv);
+ }
+
+ // The addition of kMaxBlockHeight4x4 and kMaxBlockWidth4x4 is necessary so
+ // that the block parameters cache can be filled in for the last row/column
+ // without having to check for boundary conditions.
+ if (!frame_scratch_buffer->block_parameters_holder.Reset(
+ frame_header.rows4x4 + kMaxBlockHeight4x4,
+ frame_header.columns4x4 + kMaxBlockWidth4x4,
+ sequence_header.use_128x128_superblock)) {
+ return kStatusOutOfMemory;
+ }
+ const dsp::Dsp* const dsp =
+ dsp::GetDspTable(sequence_header.color_config.bitdepth);
+ if (dsp == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to get the dsp table for bitdepth %d.",
+ sequence_header.color_config.bitdepth);
+ return kStatusInternalError;
+ }
+
+ const int tile_count = frame_header.tile_info.tile_count;
+ assert(tile_count >= 1);
+ Vector<std::unique_ptr<Tile>> tiles;
+ if (!tiles.reserve(tile_count)) {
+ LIBGAV1_DLOG(ERROR, "tiles.reserve(%d) failed.\n", tile_count);
+ return kStatusOutOfMemory;
+ }
+
+ if (threading_strategy.row_thread_pool(0) != nullptr || is_frame_parallel_) {
+ if (frame_scratch_buffer->residual_buffer_pool == nullptr) {
+ frame_scratch_buffer->residual_buffer_pool.reset(
+ new (std::nothrow) ResidualBufferPool(
+ sequence_header.use_128x128_superblock,
+ sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y,
+ sequence_header.color_config.bitdepth == 8 ? sizeof(int16_t)
+ : sizeof(int32_t)));
+ if (frame_scratch_buffer->residual_buffer_pool == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate residual buffer.\n");
+ return kStatusOutOfMemory;
+ }
+ } else {
+ frame_scratch_buffer->residual_buffer_pool->Reset(
+ sequence_header.use_128x128_superblock,
+ sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y,
+ sequence_header.color_config.bitdepth == 8 ? sizeof(int16_t)
+ : sizeof(int32_t));
+ }
+ }
+
+ if (threading_strategy.post_filter_thread_pool() != nullptr && do_cdef) {
+ // We need to store 4 rows per 64x64 unit.
+ const int num_units =
+ MultiplyBy4(RightShiftWithCeiling(frame_header.rows4x4, 4));
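+    // Illustrative arithmetic only: for a 1920x1080 frame, rows4x4 is 270, so
+    // num_units is 4 * Ceil(270 / 16) = 4 * 17 = 68 rows.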
+    // subsampling_y is set to zero irrespective of the actual frame's
+    // subsampling since we need to store exactly |num_units| rows of the CDEF
+    // border pixels.
+ if (!frame_scratch_buffer->cdef_border.Realloc(
+ sequence_header.color_config.bitdepth,
+ sequence_header.color_config.is_monochrome,
+ MultiplyBy4(frame_header.columns4x4), num_units,
+ sequence_header.color_config.subsampling_x,
+ /*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels,
+ kBorderPixels, nullptr, nullptr, nullptr)) {
+ return kStatusOutOfMemory;
+ }
+ }
+
+ if (do_restoration &&
+ (do_cdef || threading_strategy.post_filter_thread_pool() != nullptr)) {
+ // We need to store 4 rows per 64x64 unit.
+ const int num_units =
+ MultiplyBy4(RightShiftWithCeiling(frame_header.rows4x4, 4));
+ // subsampling_y is set to zero irrespective of the actual frame's
+ // subsampling since we need to store exactly |num_units| rows of the loop
+ // restoration border pixels.
+ if (!frame_scratch_buffer->loop_restoration_border.Realloc(
+ sequence_header.color_config.bitdepth,
+ sequence_header.color_config.is_monochrome,
+ frame_header.upscaled_width, num_units,
+ sequence_header.color_config.subsampling_x,
+ /*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels,
+ kBorderPixels, nullptr, nullptr, nullptr)) {
+ return kStatusOutOfMemory;
+ }
+ }
+
+ if (do_superres) {
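+    // Rough sizing sketch (assumes kSuperResFilterTaps == 8): for an 8-bit
+    // frame with upscaled_width == 1920, the Y plane Resize() below requests
+    // 8 * Align(1920, 16) * 1 = 15360 bytes of coefficient storage.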
+ const int pixel_size = sequence_header.color_config.bitdepth == 8
+ ? sizeof(uint8_t)
+ : sizeof(uint16_t);
+ if (!frame_scratch_buffer->superres_coefficients[kPlaneTypeY].Resize(
+ kSuperResFilterTaps * Align(frame_header.upscaled_width, 16) *
+ pixel_size)) {
+ LIBGAV1_DLOG(ERROR,
+ "Failed to Resize superres_coefficients[kPlaneTypeY].");
+ return kStatusOutOfMemory;
+ }
+ if (!sequence_header.color_config.is_monochrome &&
+ sequence_header.color_config.subsampling_x != 0 &&
+ !frame_scratch_buffer->superres_coefficients[kPlaneTypeUV].Resize(
+ kSuperResFilterTaps *
+ Align(SubsampledValue(frame_header.upscaled_width, 1), 16) *
+ pixel_size)) {
+ LIBGAV1_DLOG(ERROR,
+ "Failed to Resize superres_coefficients[kPlaneTypeUV].");
+ return kStatusOutOfMemory;
+ }
+ }
+
+ if (do_superres && threading_strategy.post_filter_thread_pool() != nullptr) {
+ const int num_threads =
+ threading_strategy.post_filter_thread_pool()->num_threads() + 1;
+ // subsampling_y is set to zero irrespective of the actual frame's
+ // subsampling since we need to store exactly |num_threads| rows of the
+ // down-scaled pixels.
+ // Left and right borders are for line extension. They are doubled for the Y
+ // plane to make sure the U and V planes have enough space after possible
+ // subsampling.
+ if (!frame_scratch_buffer->superres_line_buffer.Realloc(
+ sequence_header.color_config.bitdepth,
+ sequence_header.color_config.is_monochrome,
+ MultiplyBy4(frame_header.columns4x4), num_threads,
+ sequence_header.color_config.subsampling_x,
+ /*subsampling_y=*/0, 2 * kSuperResHorizontalBorder,
+ 2 * (kSuperResHorizontalBorder + kSuperResHorizontalPadding), 0, 0,
+ nullptr, nullptr, nullptr)) {
+ LIBGAV1_DLOG(ERROR, "Failed to resize superres line buffer.\n");
+ return kStatusOutOfMemory;
+ }
+ }
+
+ PostFilter post_filter(frame_header, sequence_header, frame_scratch_buffer,
+ current_frame->buffer(), dsp,
+ settings_.post_filter_mask);
+
+ if (is_frame_parallel_ && !IsIntraFrame(frame_header.frame_type)) {
+ // We can parse the current frame if all the reference frames have been
+ // parsed.
+ for (const int index : frame_header.reference_frame_index) {
+ if (!state.reference_frame[index]->WaitUntilParsed()) {
+ return kStatusUnknownError;
+ }
+ }
+ }
+
+ // If prev_segment_ids is a null pointer, it is treated as if it pointed to
+ // a segmentation map containing all 0s.
+ const SegmentationMap* prev_segment_ids = nullptr;
+ if (frame_header.primary_reference_frame == kPrimaryReferenceNone) {
+ frame_scratch_buffer->symbol_decoder_context.Initialize(
+ frame_header.quantizer.base_index);
+ } else {
+ const int index =
+ frame_header
+ .reference_frame_index[frame_header.primary_reference_frame];
+ assert(index != -1);
+ const RefCountedBuffer* prev_frame = state.reference_frame[index].get();
+ frame_scratch_buffer->symbol_decoder_context = prev_frame->FrameContext();
+ if (frame_header.segmentation.enabled &&
+ prev_frame->columns4x4() == frame_header.columns4x4 &&
+ prev_frame->rows4x4() == frame_header.rows4x4) {
+ prev_segment_ids = prev_frame->segmentation_map();
+ }
+ }
+
+ // The Tile class must make use of a separate buffer to store the unfiltered
+ // pixels for the intra prediction of the next superblock row. This is done
+  // only when one of the following conditions is true:
+ // * is_frame_parallel_ is true.
+ // * settings_.threads == 1.
+ // In the non-frame-parallel multi-threaded case, we do not run the post
+ // filters in the decode loop. So this buffer need not be used.
+ const bool use_intra_prediction_buffer =
+ is_frame_parallel_ || settings_.threads == 1;
+ if (use_intra_prediction_buffer) {
+ if (!frame_scratch_buffer->intra_prediction_buffers.Resize(
+ frame_header.tile_info.tile_rows)) {
+ LIBGAV1_DLOG(ERROR, "Failed to Resize intra_prediction_buffers.");
+ return kStatusOutOfMemory;
+ }
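+    // Worked example (illustrative): for an 8-bit 1920-wide 4:2:0 frame,
+    // columns4x4 is 480, so each per-tile-row buffer allocated below is
+    // 4 * 480 = 1920 bytes for the Y plane and 1920 >> 1 = 960 bytes for each
+    // chroma plane.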
+ IntraPredictionBuffer* const intra_prediction_buffers =
+ frame_scratch_buffer->intra_prediction_buffers.get();
+ for (int plane = kPlaneY; plane < num_planes; ++plane) {
+ const int subsampling =
+ (plane == kPlaneY) ? 0 : sequence_header.color_config.subsampling_x;
+ const size_t intra_prediction_buffer_size =
+ ((MultiplyBy4(frame_header.columns4x4) >> subsampling) *
+ (sequence_header.color_config.bitdepth == 8 ? sizeof(uint8_t)
+ : sizeof(uint16_t)));
+ for (int tile_row = 0; tile_row < frame_header.tile_info.tile_rows;
+ ++tile_row) {
+ if (!intra_prediction_buffers[tile_row][plane].Resize(
+ intra_prediction_buffer_size)) {
+ LIBGAV1_DLOG(ERROR,
+ "Failed to allocate intra prediction buffer for tile "
+ "row %d plane %d.\n",
+ tile_row, plane);
+ return kStatusOutOfMemory;
+ }
+ }
+ }
+ }
+
+ SymbolDecoderContext saved_symbol_decoder_context;
+ BlockingCounterWithStatus pending_tiles(tile_count);
+ for (int tile_number = 0; tile_number < tile_count; ++tile_number) {
+ std::unique_ptr<Tile> tile = Tile::Create(
+ tile_number, tile_buffers[tile_number].data,
+ tile_buffers[tile_number].size, sequence_header, frame_header,
+ current_frame, state, frame_scratch_buffer, wedge_masks_,
+ quantizer_matrix_, &saved_symbol_decoder_context, prev_segment_ids,
+ &post_filter, dsp, threading_strategy.row_thread_pool(tile_number),
+ &pending_tiles, is_frame_parallel_, use_intra_prediction_buffer);
+ if (tile == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to create tile.");
+ return kStatusOutOfMemory;
+ }
+ tiles.push_back_unchecked(std::move(tile));
+ }
+ assert(tiles.size() == static_cast<size_t>(tile_count));
+ if (is_frame_parallel_) {
+ if (frame_scratch_buffer->threading_strategy.thread_pool() == nullptr) {
+ return DecodeTilesFrameParallel(
+ sequence_header, frame_header, tiles, saved_symbol_decoder_context,
+ prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame);
+ }
+ return DecodeTilesThreadedFrameParallel(
+ sequence_header, frame_header, tiles, saved_symbol_decoder_context,
+ prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame);
+ }
+ StatusCode status;
+ if (settings_.threads == 1) {
+ status = DecodeTilesNonFrameParallel(sequence_header, frame_header, tiles,
+ frame_scratch_buffer, &post_filter);
+ } else {
+ status = DecodeTilesThreadedNonFrameParallel(tiles, frame_scratch_buffer,
+ &post_filter, &pending_tiles);
+ }
+ if (status != kStatusOk) return status;
+ if (frame_header.enable_frame_end_update_cdf) {
+ frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
+ }
+ current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+ SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::ApplyFilmGrain(
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const RefCountedBufferPtr& displayable_frame,
+ RefCountedBufferPtr* film_grain_frame, ThreadPool* thread_pool) {
+ if (!sequence_header.film_grain_params_present ||
+ !displayable_frame->film_grain_params().apply_grain ||
+ (settings_.post_filter_mask & 0x10) == 0) {
+ *film_grain_frame = displayable_frame;
+ return kStatusOk;
+ }
+ if (!frame_header.show_existing_frame &&
+ frame_header.refresh_frame_flags == 0) {
+ // If show_existing_frame is true, then the current frame is a previously
+ // saved reference frame. If refresh_frame_flags is nonzero, then the
+ // state_.UpdateReferenceFrames() call above has saved the current frame as
+ // a reference frame. Therefore, if both of these conditions are false, then
+ // the current frame is not saved as a reference frame. displayable_frame
+ // should hold the only reference to the current frame.
+ assert(displayable_frame.use_count() == 1);
+ // Add film grain noise in place.
+ *film_grain_frame = displayable_frame;
+ } else {
+ *film_grain_frame = buffer_pool_.GetFreeBuffer();
+ if (*film_grain_frame == nullptr) {
+ LIBGAV1_DLOG(ERROR,
+ "Could not get film_grain_frame from the buffer pool.");
+ return kStatusResourceExhausted;
+ }
+ if (!(*film_grain_frame)
+ ->Realloc(displayable_frame->buffer()->bitdepth(),
+ displayable_frame->buffer()->is_monochrome(),
+ displayable_frame->upscaled_width(),
+ displayable_frame->frame_height(),
+ displayable_frame->buffer()->subsampling_x(),
+ displayable_frame->buffer()->subsampling_y(),
+ kBorderPixelsFilmGrain, kBorderPixelsFilmGrain,
+ kBorderPixelsFilmGrain, kBorderPixelsFilmGrain)) {
+ LIBGAV1_DLOG(ERROR, "film_grain_frame->Realloc() failed.");
+ return kStatusOutOfMemory;
+ }
+ (*film_grain_frame)
+ ->set_chroma_sample_position(
+ displayable_frame->chroma_sample_position());
+ (*film_grain_frame)->set_spatial_id(displayable_frame->spatial_id());
+ (*film_grain_frame)->set_temporal_id(displayable_frame->temporal_id());
+ }
+ const bool color_matrix_is_identity =
+ sequence_header.color_config.matrix_coefficients ==
+ kMatrixCoefficientsIdentity;
+ assert(displayable_frame->buffer()->stride(kPlaneU) ==
+ displayable_frame->buffer()->stride(kPlaneV));
+ const int input_stride_uv = displayable_frame->buffer()->stride(kPlaneU);
+ assert((*film_grain_frame)->buffer()->stride(kPlaneU) ==
+ (*film_grain_frame)->buffer()->stride(kPlaneV));
+ const int output_stride_uv = (*film_grain_frame)->buffer()->stride(kPlaneU);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (displayable_frame->buffer()->bitdepth() > 8) {
+ FilmGrain<10> film_grain(displayable_frame->film_grain_params(),
+ displayable_frame->buffer()->is_monochrome(),
+ color_matrix_is_identity,
+ displayable_frame->buffer()->subsampling_x(),
+ displayable_frame->buffer()->subsampling_y(),
+ displayable_frame->upscaled_width(),
+ displayable_frame->frame_height(), thread_pool);
+ if (!film_grain.AddNoise(
+ displayable_frame->buffer()->data(kPlaneY),
+ displayable_frame->buffer()->stride(kPlaneY),
+ displayable_frame->buffer()->data(kPlaneU),
+ displayable_frame->buffer()->data(kPlaneV), input_stride_uv,
+ (*film_grain_frame)->buffer()->data(kPlaneY),
+ (*film_grain_frame)->buffer()->stride(kPlaneY),
+ (*film_grain_frame)->buffer()->data(kPlaneU),
+ (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) {
+ LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed.");
+ return kStatusOutOfMemory;
+ }
+ return kStatusOk;
+ }
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ FilmGrain<8> film_grain(displayable_frame->film_grain_params(),
+ displayable_frame->buffer()->is_monochrome(),
+ color_matrix_is_identity,
+ displayable_frame->buffer()->subsampling_x(),
+ displayable_frame->buffer()->subsampling_y(),
+ displayable_frame->upscaled_width(),
+ displayable_frame->frame_height(), thread_pool);
+ if (!film_grain.AddNoise(
+ displayable_frame->buffer()->data(kPlaneY),
+ displayable_frame->buffer()->stride(kPlaneY),
+ displayable_frame->buffer()->data(kPlaneU),
+ displayable_frame->buffer()->data(kPlaneV), input_stride_uv,
+ (*film_grain_frame)->buffer()->data(kPlaneY),
+ (*film_grain_frame)->buffer()->stride(kPlaneY),
+ (*film_grain_frame)->buffer()->data(kPlaneU),
+ (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) {
+ LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed.");
+ return kStatusOutOfMemory;
+ }
+ return kStatusOk;
+}
+
+bool DecoderImpl::IsNewSequenceHeader(const ObuParser& obu) {
+ if (std::find_if(obu.obu_headers().begin(), obu.obu_headers().end(),
+ [](const ObuHeader& obu_header) {
+ return obu_header.type == kObuSequenceHeader;
+ }) == obu.obu_headers().end()) {
+ return false;
+ }
+ const ObuSequenceHeader sequence_header = obu.sequence_header();
+ const bool sequence_header_changed =
+ !has_sequence_header_ ||
+ sequence_header_.color_config.bitdepth !=
+ sequence_header.color_config.bitdepth ||
+ sequence_header_.color_config.is_monochrome !=
+ sequence_header.color_config.is_monochrome ||
+ sequence_header_.color_config.subsampling_x !=
+ sequence_header.color_config.subsampling_x ||
+ sequence_header_.color_config.subsampling_y !=
+ sequence_header.color_config.subsampling_y ||
+ sequence_header_.max_frame_width != sequence_header.max_frame_width ||
+ sequence_header_.max_frame_height != sequence_header.max_frame_height;
+ sequence_header_ = sequence_header;
+ has_sequence_header_ = true;
+ return sequence_header_changed;
+}
+
+bool DecoderImpl::MaybeInitializeQuantizerMatrix(
+ const ObuFrameHeader& frame_header) {
+ if (quantizer_matrix_initialized_ || !frame_header.quantizer.use_matrix) {
+ return true;
+ }
+ if (!InitializeQuantizerMatrix(&quantizer_matrix_)) {
+ return false;
+ }
+ quantizer_matrix_initialized_ = true;
+ return true;
+}
+
+} // namespace libgav1
diff --git a/src/decoder_impl.h b/src/decoder_impl.h
new file mode 100644
index 0000000..721b666
--- /dev/null
+++ b/src/decoder_impl.h
@@ -0,0 +1,266 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DECODER_IMPL_H_
+#define LIBGAV1_SRC_DECODER_IMPL_H_
+
+#include <array>
+#include <condition_variable> // NOLINT (unapproved c++11 header)
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex> // NOLINT (unapproved c++11 header)
+
+#include "src/buffer_pool.h"
+#include "src/decoder_state.h"
+#include "src/dsp/constants.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/decoder_settings.h"
+#include "src/gav1/status_code.h"
+#include "src/obu_parser.h"
+#include "src/quantizer.h"
+#include "src/residual_buffer_pool.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/queue.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+struct TemporalUnit;
+
+struct EncodedFrame {
+ EncodedFrame(ObuParser* const obu, const DecoderState& state,
+ const RefCountedBufferPtr& frame, int position_in_temporal_unit)
+ : sequence_header(obu->sequence_header()),
+ frame_header(obu->frame_header()),
+ state(state),
+ temporal_unit(nullptr),
+ frame(frame),
+ position_in_temporal_unit(position_in_temporal_unit) {
+ obu->MoveTileBuffers(&tile_buffers);
+ frame->MarkFrameAsStarted();
+ }
+
+ const ObuSequenceHeader sequence_header;
+ const ObuFrameHeader frame_header;
+ Vector<TileBuffer> tile_buffers;
+ DecoderState state;
+ TemporalUnit* temporal_unit;
+ RefCountedBufferPtr frame;
+ const int position_in_temporal_unit;
+};
+
+struct TemporalUnit : public Allocable {
+ // The default constructor is invoked by the Queue<TemporalUnit>::Init()
+ // method. Queue<> does not use the default-constructed elements, so it is
+ // safe for the default constructor to not initialize the members.
+ TemporalUnit() = default;
+ TemporalUnit(const uint8_t* data, size_t size, int64_t user_private_data,
+ void* buffer_private_data)
+ : data(data),
+ size(size),
+ user_private_data(user_private_data),
+ buffer_private_data(buffer_private_data),
+ decoded(false),
+ status(kStatusOk),
+ has_displayable_frame(false),
+ output_frame_position(-1),
+ decoded_count(0),
+ output_layer_count(0),
+ released_input_buffer(false) {}
+
+ const uint8_t* data;
+ size_t size;
+ int64_t user_private_data;
+ void* buffer_private_data;
+
+ // The following members are used only in frame parallel mode.
+ bool decoded;
+ StatusCode status;
+ bool has_displayable_frame;
+ int output_frame_position;
+
+ Vector<EncodedFrame> frames;
+ size_t decoded_count;
+
+ // The struct (and the counter) is used to support output of multiple layers
+ // within a single temporal unit. The decoding process will store the output
+ // frames in |output_layers| in the order they are finished decoding. At the
+ // end of the decoding process, this array will be sorted in reverse order of
+ // |position_in_temporal_unit|. DequeueFrame() will then return the frames in
+ // reverse order (so that the entire process can run with a single counter
+ // variable).
+ struct OutputLayer {
+ // Used by std::sort to sort |output_layers| in reverse order of
+ // |position_in_temporal_unit|.
+ bool operator<(const OutputLayer& rhs) const {
+ return position_in_temporal_unit > rhs.position_in_temporal_unit;
+ }
+
+ RefCountedBufferPtr frame;
+ int position_in_temporal_unit = 0;
+ } output_layers[kMaxLayers];
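+  // Ordering sketch (illustrative): sorting this array with std::sort uses
+  // the operator< above, leaving the entry with the largest
+  // |position_in_temporal_unit| at index 0, so frames can then be handed out
+  // from the back of the array with a single decrementing counter.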
+ // Number of entries in |output_layers|.
+ int output_layer_count;
+ // Flag to ensure that we release the input buffer only once if there are
+ // multiple output layers.
+ bool released_input_buffer;
+};
+
+class DecoderImpl : public Allocable {
+ public:
+ // The constructor saves a const reference to |*settings|. Therefore
+ // |*settings| must outlive the DecoderImpl object. On success, |*output|
+ // contains a pointer to the newly-created DecoderImpl object. On failure,
+ // |*output| is not modified.
+ static StatusCode Create(const DecoderSettings* settings,
+ std::unique_ptr<DecoderImpl>* output);
+ ~DecoderImpl();
+ StatusCode EnqueueFrame(const uint8_t* data, size_t size,
+ int64_t user_private_data, void* buffer_private_data);
+ StatusCode DequeueFrame(const DecoderBuffer** out_ptr);
+ static constexpr int GetMaxBitdepth() {
+ static_assert(LIBGAV1_MAX_BITDEPTH == 8 || LIBGAV1_MAX_BITDEPTH == 10,
+ "LIBGAV1_MAX_BITDEPTH must be 8 or 10.");
+ return LIBGAV1_MAX_BITDEPTH;
+ }
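+
+  // A minimal usage sketch of the public API above (assumes a DecoderSettings
+  // object |settings| that outlives the decoder):
+  //   std::unique_ptr<DecoderImpl> impl;
+  //   if (DecoderImpl::Create(&settings, &impl) == kStatusOk) {
+  //     // |impl| is ready for EnqueueFrame()/DequeueFrame().
+  //   }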
+
+ private:
+ explicit DecoderImpl(const DecoderSettings* settings);
+ StatusCode Init();
+ // Called when the first frame is enqueued. It does the OBU parsing for one
+ // temporal unit to retrieve the tile configuration and sets up the frame
+ // threading if frame parallel mode is allowed. It also initializes the
+ // |temporal_units_| queue based on the number of frame threads.
+ //
+ // The following are the limitations of the current implementation:
+ // * It assumes that all frames in the video have the same tile
+ // configuration. The frame parallel threading model will not be updated
+ // based on tile configuration changes mid-stream.
+  // * The above assumption holds even when there is a new coded video
+  //   sequence (i.e., a new sequence header).
+ StatusCode InitializeFrameThreadPoolAndTemporalUnitQueue(const uint8_t* data,
+ size_t size);
+ // Used only in frame parallel mode. Signals failure and waits until the
+ // worker threads are aborted if |status| is a failure status. If |status| is
+ // equal to kStatusOk or kStatusTryAgain, this function does not do anything.
+ // Always returns the input parameter |status| as the return value.
+ //
+ // This function is called only from the application thread (from
+ // EnqueueFrame() and DequeueFrame()).
+ StatusCode SignalFailure(StatusCode status);
+
+ void ReleaseOutputFrame();
+
+ // Decodes all the frames contained in the given temporal unit. Used only in
+ // non frame parallel mode.
+ StatusCode DecodeTemporalUnit(const TemporalUnit& temporal_unit,
+ const DecoderBuffer** out_ptr);
+ // Used only in frame parallel mode. Does the OBU parsing for |data| and
+ // schedules the individual frames for decoding in the |frame_thread_pool_|.
+ StatusCode ParseAndSchedule(const uint8_t* data, size_t size,
+ int64_t user_private_data,
+ void* buffer_private_data);
+ // Decodes the |encoded_frame| and updates the
+ // |encoded_frame->temporal_unit|'s parameters if the decoded frame is a
+ // displayable frame. Used only in frame parallel mode.
+ StatusCode DecodeFrame(EncodedFrame* encoded_frame);
+
+ // Populates |buffer_| with values from |frame|. Adds a reference to |frame|
+ // in |output_frame_|.
+ StatusCode CopyFrameToOutputBuffer(const RefCountedBufferPtr& frame);
+ StatusCode DecodeTiles(const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const Vector<TileBuffer>& tile_buffers,
+ const DecoderState& state,
+ FrameScratchBuffer* frame_scratch_buffer,
+ RefCountedBuffer* current_frame);
+  // Applies film grain synthesis to |displayable_frame| and stores the frame
+  // with film grain applied in |film_grain_frame|. Returns kStatusOk on
+  // success.
+ StatusCode ApplyFilmGrain(const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const RefCountedBufferPtr& displayable_frame,
+ RefCountedBufferPtr* film_grain_frame,
+ ThreadPool* thread_pool);
+
+ bool IsNewSequenceHeader(const ObuParser& obu);
+
+ bool HasFailure() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ return failure_status_ != kStatusOk;
+ }
+
+ // Initializes the |quantizer_matrix_| if necessary and sets
+ // |quantizer_matrix_initialized_| to true.
+ bool MaybeInitializeQuantizerMatrix(const ObuFrameHeader& frame_header);
+
+ // Elements in this queue cannot be moved with std::move since the
+ // |EncodedFrame.temporal_unit| stores a pointer to elements in this queue.
+ Queue<TemporalUnit> temporal_units_;
+ DecoderState state_;
+
+ DecoderBuffer buffer_ = {};
+ // |output_frame_| holds a reference to the output frame on behalf of
+ // |buffer_|.
+ RefCountedBufferPtr output_frame_;
+
+ // Queue of output frames that are to be returned in the DequeueFrame() calls.
+ // If |settings_.output_all_layers| is false, this queue will never contain
+ // more than 1 element. This queue is used only when |is_frame_parallel_| is
+ // false.
+ Queue<RefCountedBufferPtr> output_frame_queue_;
+
+ BufferPool buffer_pool_;
+ WedgeMaskArray wedge_masks_;
+ QuantizerMatrix quantizer_matrix_;
+ bool quantizer_matrix_initialized_ = false;
+ FrameScratchBufferPool frame_scratch_buffer_pool_;
+
+ // Used to synchronize the accesses into |temporal_units_| in order to update
+ // the "decoded" state of an temporal unit.
+ std::mutex mutex_;
+ std::condition_variable decoded_condvar_;
+ bool is_frame_parallel_;
+ std::unique_ptr<ThreadPool> frame_thread_pool_;
+
+ // In frame parallel mode, there are two primary points of failure:
+ // 1) ParseAndSchedule()
+ // 2) DecodeTiles()
+ // Both of these functions have to respond to the other one failing by
+ // aborting whatever they are doing. This variable is used to accomplish that.
+ // If |failure_status_| is not kStatusOk, then the two functions will try to
+ // abort as early as they can.
+  StatusCode failure_status_ LIBGAV1_GUARDED_BY(mutex_) = kStatusOk;
+
+ ObuSequenceHeader sequence_header_ = {};
+  // If true, |sequence_header_| is valid.
+ bool has_sequence_header_ = false;
+
+ const DecoderSettings& settings_;
+ bool seen_first_frame_ = false;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DECODER_IMPL_H_
diff --git a/src/decoder_settings.cc b/src/decoder_settings.cc
new file mode 100644
index 0000000..9399073
--- /dev/null
+++ b/src/decoder_settings.cc
@@ -0,0 +1,33 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/decoder_settings.h"
+
+extern "C" {
+
+void Libgav1DecoderSettingsInitDefault(Libgav1DecoderSettings* settings) {
+ settings->threads = 1;
+ settings->frame_parallel = 0; // false
+ settings->blocking_dequeue = 0; // false
+ settings->on_frame_buffer_size_changed = nullptr;
+ settings->get_frame_buffer = nullptr;
+ settings->release_frame_buffer = nullptr;
+ settings->release_input_buffer = nullptr;
+ settings->callback_private_data = nullptr;
+ settings->output_all_layers = 0; // false
+ settings->operating_point = 0;
+ settings->post_filter_mask = 0x1f;
+}
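+
+// Caller-side usage sketch (the non-default value below is illustrative):
+//   Libgav1DecoderSettings settings;
+//   Libgav1DecoderSettingsInitDefault(&settings);
+//   settings.threads = 4;  // Override any defaults after initialization.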
+
+} // extern "C"
diff --git a/src/decoder_state.h b/src/decoder_state.h
new file mode 100644
index 0000000..897c99f
--- /dev/null
+++ b/src/decoder_state.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DECODER_STATE_H_
+#define LIBGAV1_SRC_DECODER_STATE_H_
+
+#include <array>
+#include <cstdint>
+
+#include "src/buffer_pool.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+struct DecoderState {
+ // Section 7.20. Updates frames in the reference_frame array with
+ // |current_frame|, based on the |refresh_frame_flags| bitmask.
+ void UpdateReferenceFrames(const RefCountedBufferPtr& current_frame,
+ int refresh_frame_flags) {
+ for (int ref_index = 0, mask = refresh_frame_flags; mask != 0;
+ ++ref_index, mask >>= 1) {
+ if ((mask & 1) != 0) {
+ reference_valid[ref_index] = true;
+ reference_frame_id[ref_index] = current_frame_id;
+ reference_frame[ref_index] = current_frame;
+ reference_order_hint[ref_index] = order_hint;
+ }
+ }
+ }
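+  // For example (illustrative value): refresh_frame_flags == 0x05 has bits 0
+  // and 2 set, so slots 0 and 2 are updated with |current_frame| and the
+  // remaining slots are left untouched.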
+
+ // Clears all the reference frames.
+ void ClearReferenceFrames() {
+ reference_valid = {};
+ reference_frame_id = {};
+ reference_order_hint = {};
+ for (int ref_index = 0; ref_index < kNumReferenceFrameTypes; ++ref_index) {
+ reference_frame[ref_index] = nullptr;
+ }
+ }
+
+ // reference_valid and reference_frame_id are used only if
+ // sequence_header_.frame_id_numbers_present is true.
+ // The reference_valid array is indexed by a reference picture slot number.
+ // A value (boolean) in the array signifies whether the corresponding
+ // reference picture slot is valid for use as a reference picture.
+ std::array<bool, kNumReferenceFrameTypes> reference_valid = {};
+ std::array<uint16_t, kNumReferenceFrameTypes> reference_frame_id = {};
+ // A valid value of current_frame_id is an unsigned integer of at most 16
+ // bits. -1 indicates current_frame_id is not initialized.
+ int current_frame_id = -1;
+ // The RefOrderHint array variable in the spec.
+ std::array<uint8_t, kNumReferenceFrameTypes> reference_order_hint = {};
+ // The OrderHint variable in the spec. Its value comes from either the
+ // order_hint syntax element in the uncompressed header (if
+ // show_existing_frame is false) or RefOrderHint[ frame_to_show_map_idx ]
+ // (if show_existing_frame is true and frame_type is KEY_FRAME). See Section
+ // 5.9.2 and Section 7.4.
+ //
+ // NOTE: When show_existing_frame is false, it is often more convenient to
+ // just use the order_hint field of the frame header as OrderHint. So this
+ // field is mainly used to update the reference_order_hint array in
+ // UpdateReferenceFrames().
+ uint8_t order_hint = 0;
+ // reference_frame_sign_bias[i] (a boolean) specifies the intended direction
+ // of the motion vector in time for each reference frame.
+ // * |false| indicates that the reference frame is a forwards reference (i.e.
+ // the reference frame is expected to be output before the current frame);
+ // * |true| indicates that the reference frame is a backwards reference.
+ // Note: reference_frame_sign_bias[0] (for kReferenceFrameIntra) is not used.
+ std::array<bool, kNumReferenceFrameTypes> reference_frame_sign_bias = {};
+ std::array<RefCountedBufferPtr, kNumReferenceFrameTypes> reference_frame;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DECODER_STATE_H_
diff --git a/src/dsp/arm/average_blend_neon.cc b/src/dsp/arm/average_blend_neon.cc
new file mode 100644
index 0000000..834e8b4
--- /dev/null
+++ b/src/dsp/arm/average_blend_neon.cc
@@ -0,0 +1,146 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/average_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kInterPostRoundBit =
+ kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
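+
+// A small worked note (assuming the 8bpp round constants are 11 and 7, giving
+// kInterPostRoundBit == 4): averaging the two compound predictions also needs
+// a divide by 2, which is why the narrowing shifts below use
+// kInterPostRoundBit + 1.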
+
+inline uint8x8_t AverageBlend8Row(const int16_t* prediction_0,
+ const int16_t* prediction_1) {
+ const int16x8_t pred0 = vld1q_s16(prediction_0);
+ const int16x8_t pred1 = vld1q_s16(prediction_1);
+ const int16x8_t res = vaddq_s16(pred0, pred1);
+ return vqrshrun_n_s16(res, kInterPostRoundBit + 1);
+}
+
+inline void AverageBlendLargeRow(const int16_t* prediction_0,
+ const int16_t* prediction_1, const int width,
+ uint8_t* dest) {
+ int x = width;
+ do {
+ const int16x8_t pred_00 = vld1q_s16(prediction_0);
+ const int16x8_t pred_01 = vld1q_s16(prediction_1);
+ prediction_0 += 8;
+ prediction_1 += 8;
+ const int16x8_t res0 = vaddq_s16(pred_00, pred_01);
+ const uint8x8_t res_out0 = vqrshrun_n_s16(res0, kInterPostRoundBit + 1);
+ const int16x8_t pred_10 = vld1q_s16(prediction_0);
+ const int16x8_t pred_11 = vld1q_s16(prediction_1);
+ prediction_0 += 8;
+ prediction_1 += 8;
+ const int16x8_t res1 = vaddq_s16(pred_10, pred_11);
+ const uint8x8_t res_out1 = vqrshrun_n_s16(res1, kInterPostRoundBit + 1);
+ vst1q_u8(dest, vcombine_u8(res_out0, res_out1));
+ dest += 16;
+ x -= 16;
+ } while (x != 0);
+}
+
+void AverageBlend_NEON(const void* prediction_0, const void* prediction_1,
+ const int width, const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y = height;
+
+ if (width == 4) {
+ do {
+ const uint8x8_t result = AverageBlend8Row(pred_0, pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+
+ StoreLo4(dst, result);
+ dst += dest_stride;
+ StoreHi4(dst, result);
+ dst += dest_stride;
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ if (width == 8) {
+ do {
+ vst1_u8(dst, AverageBlend8Row(pred_0, pred_1));
+ dst += dest_stride;
+ pred_0 += 8;
+ pred_1 += 8;
+
+ vst1_u8(dst, AverageBlend8Row(pred_0, pred_1));
+ dst += dest_stride;
+ pred_0 += 8;
+ pred_1 += 8;
+
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ do {
+ AverageBlendLargeRow(pred_0, pred_1, width, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ AverageBlendLargeRow(pred_0, pred_1, width, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ y -= 2;
+ } while (y != 0);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->average_blend = AverageBlend_NEON;
+}
+
+} // namespace
+
+void AverageBlendInit_NEON() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void AverageBlendInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/average_blend_neon.h b/src/dsp/arm/average_blend_neon.h
new file mode 100644
index 0000000..d13bcd6
--- /dev/null
+++ b/src/dsp/arm/average_blend_neon.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::average_blend. This function is not thread-safe.
+void AverageBlendInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_
diff --git a/src/dsp/arm/cdef_neon.cc b/src/dsp/arm/cdef_neon.cc
new file mode 100644
index 0000000..4d0e76f
--- /dev/null
+++ b/src/dsp/arm/cdef_neon.cc
@@ -0,0 +1,697 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// ----------------------------------------------------------------------------
+// Refer to CdefDirection_C().
+//
+// int32_t partial[8][15] = {};
+// for (int i = 0; i < 8; ++i) {
+// for (int j = 0; j < 8; ++j) {
+// const int x = 1;
+// partial[0][i + j] += x;
+// partial[1][i + j / 2] += x;
+// partial[2][i] += x;
+// partial[3][3 + i - j / 2] += x;
+// partial[4][7 + i - j] += x;
+// partial[5][3 - i / 2 + j] += x;
+// partial[6][j] += x;
+// partial[7][i / 2 + j] += x;
+// }
+// }
+//
+// Using the code above, generate the position count for partial[8][15].
+//
+// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+//
+// The SIMD code shifts the input horizontally, then adds vertically to get the
+// correct partial value for the given position.
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// partial[0][i + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 00 10 11 12 13 14 15 16 17 00 00 00 00 00 00
+// 00 00 20 21 22 23 24 25 26 27 00 00 00 00 00
+// 00 00 00 30 31 32 33 34 35 36 37 00 00 00 00
+// 00 00 00 00 40 41 42 43 44 45 46 47 00 00 00
+// 00 00 00 00 00 50 51 52 53 54 55 56 57 00 00
+// 00 00 00 00 00 00 60 61 62 63 64 65 66 67 00
+// 00 00 00 00 00 00 00 70 71 72 73 74 75 76 77
+//
+// partial[4] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(uint8x8_t* v_src,
+ uint16x8_t* partial_lo,
+ uint16x8_t* partial_hi) {
+ const uint8x8_t v_zero = vdup_n_u8(0);
+ // 00 01 02 03 04 05 06 07
+ // 00 10 11 12 13 14 15 16
+ *partial_lo = vaddl_u8(v_src[0], vext_u8(v_zero, v_src[1], 7));
+
+ // 00 00 20 21 22 23 24 25
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[2], 6));
+ // 17 00 00 00 00 00 00 00
+ // 26 27 00 00 00 00 00 00
+ *partial_hi =
+ vaddl_u8(vext_u8(v_src[1], v_zero, 7), vext_u8(v_src[2], v_zero, 6));
+
+ // 00 00 00 30 31 32 33 34
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[3], 5));
+ // 35 36 37 00 00 00 00 00
+ *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[3], v_zero, 5));
+
+ // 00 00 00 00 40 41 42 43
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[4], 4));
+ // 44 45 46 47 00 00 00 00
+ *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[4], v_zero, 4));
+
+ // 00 00 00 00 00 50 51 52
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[5], 3));
+ // 53 54 55 56 57 00 00 00
+ *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[5], v_zero, 3));
+
+ // 00 00 00 00 00 00 60 61
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[6], 2));
+ // 62 63 64 65 66 67 00 00
+ *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[6], v_zero, 2));
+
+ // 00 00 00 00 00 00 00 70
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[7], 1));
+ // 71 72 73 74 75 76 77 00
+ *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[7], v_zero, 1));
+}
+
+// ----------------------------------------------------------------------------
+// partial[1][i + j / 2] += x;
+//
+// A0 = src[0] + src[1], A1 = src[2] + src[3], ...
+//
+// A0 A1 A2 A3 00 00 00 00 00 00 00 00 00 00 00
+// 00 B0 B1 B2 B3 00 00 00 00 00 00 00 00 00 00
+// 00 00 C0 C1 C2 C3 00 00 00 00 00 00 00 00 00
+// 00 00 00 D0 D1 D2 D3 00 00 00 00 00 00 00 00
+// 00 00 00 00 E0 E1 E2 E3 00 00 00 00 00 00 00
+// 00 00 00 00 00 F0 F1 F2 F3 00 00 00 00 00 00
+// 00 00 00 00 00 00 G0 G1 G2 G3 00 00 00 00 00
+// 00 00 00 00 00 00 00 H0 H1 H2 H3 00 00 00 00
+//
+// partial[3] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(uint8x8_t* v_src,
+ uint16x8_t* partial_lo,
+ uint16x8_t* partial_hi) {
+ uint8x16_t v_d1_temp[8];
+ const uint8x8_t v_zero = vdup_n_u8(0);
+ const uint8x16_t v_zero_16 = vdupq_n_u8(0);
+
+ for (int i = 0; i < 8; ++i) {
+ v_d1_temp[i] = vcombine_u8(v_src[i], v_zero);
+ }
+
+ *partial_lo = *partial_hi = vdupq_n_u16(0);
+ // A0 A1 A2 A3 00 00 00 00
+ *partial_lo = vpadalq_u8(*partial_lo, v_d1_temp[0]);
+
+ // 00 B0 B1 B2 B3 00 00 00
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[1], 14));
+
+ // 00 00 C0 C1 C2 C3 00 00
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[2], 12));
+ // 00 00 00 D0 D1 D2 D3 00
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[3], 10));
+ // 00 00 00 00 E0 E1 E2 E3
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[4], 8));
+
+ // 00 00 00 00 00 F0 F1 F2
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[5], 6));
+ // F3 00 00 00 00 00 00 00
+ *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[5], v_zero_16, 6));
+
+ // 00 00 00 00 00 00 G0 G1
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[6], 4));
+ // G2 G3 00 00 00 00 00 00
+ *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[6], v_zero_16, 4));
+
+ // 00 00 00 00 00 00 00 H0
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[7], 2));
+ // H1 H2 H3 00 00 00 00 00
+ *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[7], v_zero_16, 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[7][i / 2 + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00
+// 00 20 21 22 23 24 25 26 27 00 00 00 00 00 00
+// 00 30 31 32 33 34 35 36 37 00 00 00 00 00 00
+// 00 00 40 41 42 43 44 45 46 47 00 00 00 00 00
+// 00 00 50 51 52 53 54 55 56 57 00 00 00 00 00
+// 00 00 00 60 61 62 63 64 65 66 67 00 00 00 00
+// 00 00 00 70 71 72 73 74 75 76 77 00 00 00 00
+//
+// partial[5] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(uint8x8_t* v_src,
+ uint16x8_t* partial_lo,
+ uint16x8_t* partial_hi) {
+ const uint16x8_t v_zero = vdupq_n_u16(0);
+ uint16x8_t v_pair_add[4];
+ // Add vertical source pairs.
+ v_pair_add[0] = vaddl_u8(v_src[0], v_src[1]);
+ v_pair_add[1] = vaddl_u8(v_src[2], v_src[3]);
+ v_pair_add[2] = vaddl_u8(v_src[4], v_src[5]);
+ v_pair_add[3] = vaddl_u8(v_src[6], v_src[7]);
+
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ *partial_lo = v_pair_add[0];
+ // 00 00 00 00 00 00 00 00
+ // 00 00 00 00 00 00 00 00
+ *partial_hi = vdupq_n_u16(0);
+
+ // 00 20 21 22 23 24 25 26
+ // 00 30 31 32 33 34 35 36
+ *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[1], 7));
+ // 27 00 00 00 00 00 00 00
+ // 37 00 00 00 00 00 00 00
+ *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[1], v_zero, 7));
+
+ // 00 00 40 41 42 43 44 45
+ // 00 00 50 51 52 53 54 55
+ *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[2], 6));
+ // 46 47 00 00 00 00 00 00
+ // 56 57 00 00 00 00 00 00
+ *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[2], v_zero, 6));
+
+ // 00 00 00 60 61 62 63 64
+ // 00 00 00 70 71 72 73 74
+ *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[3], 5));
+ // 65 66 67 00 00 00 00 00
+ // 75 76 77 00 00 00 00 00
+ *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[3], v_zero, 5));
+}
+
+LIBGAV1_ALWAYS_INLINE void AddPartial(const void* const source,
+ ptrdiff_t stride, uint16x8_t* partial_lo,
+ uint16x8_t* partial_hi) {
+ const auto* src = static_cast<const uint8_t*>(source);
+
+ // 8x8 input
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ uint8x8_t v_src[8];
+ for (int i = 0; i < 8; ++i) {
+ v_src[i] = vld1_u8(src);
+ src += stride;
+ }
+
+ // partial for direction 2
+ // --------------------------------------------------------------------------
+ // partial[2][i] += x;
+ // 00 10 20 30 40 50 60 70 00 00 00 00 00 00 00 00
+  // 01 11 21 31 41 51 61 71 00 00 00 00 00 00 00 00
+  // 02 12 22 32 42 52 62 72 00 00 00 00 00 00 00 00
+ // 03 13 23 33 43 53 63 73 00 00 00 00 00 00 00 00
+ // 04 14 24 34 44 54 64 74 00 00 00 00 00 00 00 00
+ // 05 15 25 35 45 55 65 75 00 00 00 00 00 00 00 00
+ // 06 16 26 36 46 56 66 76 00 00 00 00 00 00 00 00
+ // 07 17 27 37 47 57 67 77 00 00 00 00 00 00 00 00
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), partial_lo[2], 0);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[1]), partial_lo[2], 1);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[2]), partial_lo[2], 2);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[3]), partial_lo[2], 3);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[4]), partial_lo[2], 4);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[5]), partial_lo[2], 5);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[6]), partial_lo[2], 6);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[7]), partial_lo[2], 7);
+
+ // partial for direction 6
+ // --------------------------------------------------------------------------
+ // partial[6][j] += x;
+ // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 00
+ // 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00 00
+ // 20 21 22 23 24 25 26 27 00 00 00 00 00 00 00 00
+ // 30 31 32 33 34 35 36 37 00 00 00 00 00 00 00 00
+ // 40 41 42 43 44 45 46 47 00 00 00 00 00 00 00 00
+ // 50 51 52 53 54 55 56 57 00 00 00 00 00 00 00 00
+ // 60 61 62 63 64 65 66 67 00 00 00 00 00 00 00 00
+ // 70 71 72 73 74 75 76 77 00 00 00 00 00 00 00 00
+ const uint8x8_t v_zero = vdup_n_u8(0);
+ partial_lo[6] = vaddl_u8(v_zero, v_src[0]);
+ for (int i = 1; i < 8; ++i) {
+ partial_lo[6] = vaddw_u8(partial_lo[6], v_src[i]);
+ }
+
+ // partial for direction 0
+ AddPartial_D0_D4(v_src, &partial_lo[0], &partial_hi[0]);
+
+ // partial for direction 1
+ AddPartial_D1_D3(v_src, &partial_lo[1], &partial_hi[1]);
+
+ // partial for direction 7
+ AddPartial_D5_D7(v_src, &partial_lo[7], &partial_hi[7]);
+
+ uint8x8_t v_src_reverse[8];
+ for (int i = 0; i < 8; ++i) {
+ v_src_reverse[i] = vrev64_u8(v_src[i]);
+ }
+
+ // partial for direction 4
+ AddPartial_D0_D4(v_src_reverse, &partial_lo[4], &partial_hi[4]);
+
+ // partial for direction 3
+ AddPartial_D1_D3(v_src_reverse, &partial_lo[3], &partial_hi[3]);
+
+ // partial for direction 5
+ AddPartial_D5_D7(v_src_reverse, &partial_lo[5], &partial_hi[5]);
+}
+
+uint32x4_t Square(uint16x4_t a) { return vmull_u16(a, a); }
+
+uint32x4_t SquareAccumulate(uint32x4_t a, uint16x4_t b) {
+ return vmlal_u16(a, b, b);
+}
+
+// |cost[0]| and |cost[4]| square the input, sum each element with the
+// corresponding element from the other end of the vector, and multiply by the
+// matching |kCdefDivisionTable[]| element:
+// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+// kCdefDivisionTable[i + 1];
+// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
+// Because everything is being summed into a single value the distributive
+// property allows us to mirror the division table and accumulate once.
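+// Sketch of that algebra: summing
+//   (Square(p[i]) + Square(p[14 - i])) * kCdefDivisionTable[i + 1]
+// over i equals summing Square(p[k]) * d[k] over all k, where p[] is the
+// partial row and d[] is the division table mirrored about its center, so
+// each squared partial is multiplied and accumulated exactly once.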
+uint32_t Cost0Or4(const uint16x8_t a, const uint16x8_t b,
+ const uint32x4_t division_table[4]) {
+ uint32x4_t c = vmulq_u32(Square(vget_low_u16(a)), division_table[0]);
+ c = vmlaq_u32(c, Square(vget_high_u16(a)), division_table[1]);
+ c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[2]);
+ c = vmlaq_u32(c, Square(vget_high_u16(b)), division_table[3]);
+ return SumVector(c);
+}
+
+// |cost[2]| and |cost[6]| square the input and accumulate:
+// cost[2] += Square(partial[2][i])
+uint32_t SquareAccumulate(const uint16x8_t a) {
+ uint32x4_t c = Square(vget_low_u16(a));
+ c = SquareAccumulate(c, vget_high_u16(a));
+ c = vmulq_n_u32(c, kCdefDivisionTable[7]);
+ return SumVector(c);
+}
+
+uint32_t CostOdd(const uint16x8_t a, const uint16x8_t b, const uint32x4_t mask,
+ const uint32x4_t division_table[2]) {
+ // Remove elements 0-2.
+ uint32x4_t c = vandq_u32(mask, Square(vget_low_u16(a)));
+ c = vaddq_u32(c, Square(vget_high_u16(a)));
+ c = vmulq_n_u32(c, kCdefDivisionTable[7]);
+
+ c = vmlaq_u32(c, Square(vget_low_u16(a)), division_table[0]);
+ c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[1]);
+ return SumVector(c);
+}
+
+void CdefDirection_NEON(const void* const source, ptrdiff_t stride,
+ uint8_t* const direction, int* const variance) {
+ assert(direction != nullptr);
+ assert(variance != nullptr);
+ const auto* src = static_cast<const uint8_t*>(source);
+ uint32_t cost[8];
+ uint16x8_t partial_lo[8], partial_hi[8];
+
+ AddPartial(src, stride, partial_lo, partial_hi);
+
+ cost[2] = SquareAccumulate(partial_lo[2]);
+ cost[6] = SquareAccumulate(partial_lo[6]);
+
+ const uint32x4_t division_table[4] = {
+ vld1q_u32(kCdefDivisionTable), vld1q_u32(kCdefDivisionTable + 4),
+ vld1q_u32(kCdefDivisionTable + 8), vld1q_u32(kCdefDivisionTable + 12)};
+
+ cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table);
+ cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table);
+
+ const uint32x4_t division_table_odd[2] = {
+ vld1q_u32(kCdefDivisionTableOdd), vld1q_u32(kCdefDivisionTableOdd + 4)};
+
+ const uint32x4_t element_3_mask = {0, 0, 0, static_cast<uint32_t>(-1)};
+
+ cost[1] =
+ CostOdd(partial_lo[1], partial_hi[1], element_3_mask, division_table_odd);
+ cost[3] =
+ CostOdd(partial_lo[3], partial_hi[3], element_3_mask, division_table_odd);
+ cost[5] =
+ CostOdd(partial_lo[5], partial_hi[5], element_3_mask, division_table_odd);
+ cost[7] =
+ CostOdd(partial_lo[7], partial_hi[7], element_3_mask, division_table_odd);
+
+ uint32_t best_cost = 0;
+ *direction = 0;
+ for (int i = 0; i < 8; ++i) {
+ if (cost[i] > best_cost) {
+ best_cost = cost[i];
+ *direction = i;
+ }
+ }
+ *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+
+// -------------------------------------------------------------------------
+// CdefFilter
+
+// Load 4 vectors based on the given |direction|.
+void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
+ uint16x8_t* output, const int direction) {
+  // Each |direction| describes a different set of source values. Expand this
+  // set by negating each offset pair (loading both src + offset and
+  // src - offset). For |direction| == 0 this gives a diagonal line from top
+  // right to bottom left. The first value is y, the second is x. Negative y
+  // values move up.
+ // a b c d
+ // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
+ // c
+ // a
+ // 0
+ // b
+ // d
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = vld1q_u16(src + y_0 * stride + x_0);
+ output[1] = vld1q_u16(src - y_0 * stride - x_0);
+ output[2] = vld1q_u16(src + y_1 * stride + x_1);
+ output[3] = vld1q_u16(src - y_1 * stride - x_1);
+}
+
+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
+// do 2 rows at a time.
+void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
+ uint16x8_t* output, const int direction) {
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = vcombine_u16(vld1_u16(src + y_0 * stride + x_0),
+ vld1_u16(src + y_0 * stride + stride + x_0));
+ output[1] = vcombine_u16(vld1_u16(src - y_0 * stride - x_0),
+ vld1_u16(src - y_0 * stride + stride - x_0));
+ output[2] = vcombine_u16(vld1_u16(src + y_1 * stride + x_1),
+ vld1_u16(src + y_1 * stride + stride + x_1));
+ output[3] = vcombine_u16(vld1_u16(src - y_1 * stride - x_1),
+ vld1_u16(src - y_1 * stride + stride - x_1));
+}
+
+int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference,
+ const uint16x8_t threshold, const int16x8_t damping) {
+  // If reference > pixel, the difference will be negative, so convert the
+  // comparison result to a mask of 0 or -1.
+ const uint16x8_t sign = vcgtq_u16(reference, pixel);
+ const uint16x8_t abs_diff = vabdq_u16(pixel, reference);
+ const uint16x8_t shifted_diff = vshlq_u16(abs_diff, damping);
+  // For bitdepth == 8, the threshold range is [0, 15] and the damping range is
+  // [3, 6]. If pixel == kCdefLargeValue (0x4000), shifted_diff will always be
+  // larger than threshold, so the saturating subtraction below returns 0 when
+  // pixel == kCdefLargeValue.
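+  // Illustrative example (not from the original source): if |pixel| ==
+  // kCdefLargeValue and |reference| holds an ordinary 8-bit value, abs_diff is
+  // at least 0x4000 - 0xff = 0x3f01. Even with the largest damping shift of 6,
+  // shifted_diff >= 0xfc, which exceeds any threshold in [0, 15], so the
+  // clamped difference is 0 and this tap contributes nothing.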
+ static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
+ const uint16x8_t thresh_minus_shifted_diff =
+ vqsubq_u16(threshold, shifted_diff);
+ const uint16x8_t clamp_abs_diff =
+ vminq_u16(thresh_minus_shifted_diff, abs_diff);
+ // Restore the sign.
+ return vreinterpretq_s16_u16(
+ vsubq_u16(veorq_u16(clamp_abs_diff, sign), sign));
+}
+
+template <int width, bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_NEON(const uint16_t* src, const ptrdiff_t src_stride,
+ const int height, const int primary_strength,
+ const int secondary_strength, const int damping,
+ const int direction, void* dest,
+ const ptrdiff_t dst_stride) {
+ static_assert(width == 8 || width == 4, "");
+ static_assert(enable_primary || enable_secondary, "");
+ constexpr bool clipping_required = enable_primary && enable_secondary;
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint16x8_t cdef_large_value_mask =
+ vdupq_n_u16(static_cast<uint16_t>(~kCdefLargeValue));
+ const uint16x8_t primary_threshold = vdupq_n_u16(primary_strength);
+ const uint16x8_t secondary_threshold = vdupq_n_u16(secondary_strength);
+
+ int16x8_t primary_damping_shift, secondary_damping_shift;
+
+ // FloorLog2() requires input to be > 0.
+ // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+ if (enable_primary) {
+ // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
+ // for UV filtering.
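+    // For example (illustrative only), UV filtering with damping == 2 and
+    // primary_strength == 15 gives 2 - FloorLog2(15) = 2 - 3 = -1, which
+    // std::max() clamps to 0 below.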
+ primary_damping_shift =
+ vdupq_n_s16(-std::max(0, damping - FloorLog2(primary_strength)));
+ }
+ if (enable_secondary) {
+ // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+ // necessary.
+ assert(damping - FloorLog2(secondary_strength) >= 0);
+ secondary_damping_shift =
+ vdupq_n_s16(-(damping - FloorLog2(secondary_strength)));
+ }
+
+ const int primary_tap_0 = kCdefPrimaryTaps[primary_strength & 1][0];
+ const int primary_tap_1 = kCdefPrimaryTaps[primary_strength & 1][1];
+
+ int y = height;
+ do {
+ uint16x8_t pixel;
+ if (width == 8) {
+ pixel = vld1q_u16(src);
+ } else {
+ pixel = vcombine_u16(vld1_u16(src), vld1_u16(src + src_stride));
+ }
+
+ uint16x8_t min = pixel;
+ uint16x8_t max = pixel;
+ int16x8_t sum;
+
+ if (enable_primary) {
+ // Primary |direction|.
+ uint16x8_t primary_val[4];
+ if (width == 8) {
+ LoadDirection(src, src_stride, primary_val, direction);
+ } else {
+ LoadDirection4(src, src_stride, primary_val, direction);
+ }
+
+ if (clipping_required) {
+ min = vminq_u16(min, primary_val[0]);
+ min = vminq_u16(min, primary_val[1]);
+ min = vminq_u16(min, primary_val[2]);
+ min = vminq_u16(min, primary_val[3]);
+
+        // The source is 16 bits; however, only the lower 8 bits matter. The
+        // upper 8 bits contain the "large" flag. After the final primary max
+        // has been calculated, zero out the upper 8 bits and use the result to
+        // find the "16 bit" max.
+ const uint8x16_t max_p01 =
+ vmaxq_u8(vreinterpretq_u8_u16(primary_val[0]),
+ vreinterpretq_u8_u16(primary_val[1]));
+ const uint8x16_t max_p23 =
+ vmaxq_u8(vreinterpretq_u8_u16(primary_val[2]),
+ vreinterpretq_u8_u16(primary_val[3]));
+ const uint16x8_t max_p =
+ vreinterpretq_u16_u8(vmaxq_u8(max_p01, max_p23));
+ max = vmaxq_u16(max, vandq_u16(max_p, cdef_large_value_mask));
+ }
+
+ sum = Constrain(primary_val[0], pixel, primary_threshold,
+ primary_damping_shift);
+ sum = vmulq_n_s16(sum, primary_tap_0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(primary_val[1], pixel, primary_threshold,
+ primary_damping_shift),
+ primary_tap_0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(primary_val[2], pixel, primary_threshold,
+ primary_damping_shift),
+ primary_tap_1);
+ sum = vmlaq_n_s16(sum,
+ Constrain(primary_val[3], pixel, primary_threshold,
+ primary_damping_shift),
+ primary_tap_1);
+ } else {
+ sum = vdupq_n_s16(0);
+ }
+
+ if (enable_secondary) {
+ // Secondary |direction| values (+/- 2). Clamp |direction|.
+ uint16x8_t secondary_val[8];
+ if (width == 8) {
+ LoadDirection(src, src_stride, secondary_val, direction + 2);
+ LoadDirection(src, src_stride, secondary_val + 4, direction - 2);
+ } else {
+ LoadDirection4(src, src_stride, secondary_val, direction + 2);
+ LoadDirection4(src, src_stride, secondary_val + 4, direction - 2);
+ }
+
+ if (clipping_required) {
+ min = vminq_u16(min, secondary_val[0]);
+ min = vminq_u16(min, secondary_val[1]);
+ min = vminq_u16(min, secondary_val[2]);
+ min = vminq_u16(min, secondary_val[3]);
+ min = vminq_u16(min, secondary_val[4]);
+ min = vminq_u16(min, secondary_val[5]);
+ min = vminq_u16(min, secondary_val[6]);
+ min = vminq_u16(min, secondary_val[7]);
+
+ const uint8x16_t max_s01 =
+ vmaxq_u8(vreinterpretq_u8_u16(secondary_val[0]),
+ vreinterpretq_u8_u16(secondary_val[1]));
+ const uint8x16_t max_s23 =
+ vmaxq_u8(vreinterpretq_u8_u16(secondary_val[2]),
+ vreinterpretq_u8_u16(secondary_val[3]));
+ const uint8x16_t max_s45 =
+ vmaxq_u8(vreinterpretq_u8_u16(secondary_val[4]),
+ vreinterpretq_u8_u16(secondary_val[5]));
+ const uint8x16_t max_s67 =
+ vmaxq_u8(vreinterpretq_u8_u16(secondary_val[6]),
+ vreinterpretq_u8_u16(secondary_val[7]));
+ const uint16x8_t max_s = vreinterpretq_u16_u8(
+ vmaxq_u8(vmaxq_u8(max_s01, max_s23), vmaxq_u8(max_s45, max_s67)));
+ max = vmaxq_u16(max, vandq_u16(max_s, cdef_large_value_mask));
+ }
+
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[0], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[1], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[2], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap1);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[3], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap1);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[4], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[5], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[6], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap1);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[7], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap1);
+ }
+    // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)
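+    // vshrq_n_s16(sum, 15) is 0 when sum >= 0 and -1 otherwise, i.e.
+    // -(sum < 0), and vrsraq_n_s16() adds the rounded, right-shifted sum
+    // ((adjusted_sum + 8) >> 4) to |pixel|.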
+ const int16x8_t sum_lt_0 = vshrq_n_s16(sum, 15);
+ sum = vaddq_s16(sum, sum_lt_0);
+ int16x8_t result = vrsraq_n_s16(vreinterpretq_s16_u16(pixel), sum, 4);
+ if (clipping_required) {
+ result = vminq_s16(result, vreinterpretq_s16_u16(max));
+ result = vmaxq_s16(result, vreinterpretq_s16_u16(min));
+ }
+
+ const uint8x8_t dst_pixel = vqmovun_s16(result);
+ if (width == 8) {
+ src += src_stride;
+ vst1_u8(dst, dst_pixel);
+ dst += dst_stride;
+ --y;
+ } else {
+ src += src_stride << 1;
+ StoreLo4(dst, dst_pixel);
+ dst += dst_stride;
+ StoreHi4(dst, dst_pixel);
+ dst += dst_stride;
+ y -= 2;
+ }
+ } while (y != 0);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->cdef_direction = CdefDirection_NEON;
+ dsp->cdef_filters[0][0] = CdefFilter_NEON<4>;
+ dsp->cdef_filters[0][1] =
+ CdefFilter_NEON<4, /*enable_primary=*/true, /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] = CdefFilter_NEON<4, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_NEON<8>;
+ dsp->cdef_filters[1][1] =
+ CdefFilter_NEON<8, /*enable_primary=*/true, /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] = CdefFilter_NEON<8, /*enable_primary=*/false>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void CdefInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void CdefInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/cdef_neon.h b/src/dsp/arm/cdef_neon.h
new file mode 100644
index 0000000..53d5f86
--- /dev/null
+++ b/src/dsp/arm/cdef_neon.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
+void CdefInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
diff --git a/src/dsp/arm/common_neon.h b/src/dsp/arm/common_neon.h
new file mode 100644
index 0000000..dcb7567
--- /dev/null
+++ b/src/dsp/arm/common_neon.h
@@ -0,0 +1,777 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
+
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cstdint>
+#include <cstring>
+
+#if 0
+#include <cstdio>
+
+#include "absl/strings/str_cat.h"
+
+constexpr bool kEnablePrintRegs = true;
+
+union DebugRegister {
+ int8_t i8[8];
+ int16_t i16[4];
+ int32_t i32[2];
+ uint8_t u8[8];
+ uint16_t u16[4];
+ uint32_t u32[2];
+};
+
+union DebugRegisterQ {
+ int8_t i8[16];
+ int16_t i16[8];
+ int32_t i32[4];
+ uint8_t u8[16];
+ uint16_t u16[8];
+ uint32_t u32[4];
+};
+
+// Quite useful helper for debugging. Left here for convenience.
+inline void PrintVect(const DebugRegister r, const char* const name, int size) {
+ int n;
+ if (kEnablePrintRegs) {
+ fprintf(stderr, "%s\t: ", name);
+ if (size == 8) {
+ for (n = 0; n < 8; ++n) fprintf(stderr, "%.2x ", r.u8[n]);
+ } else if (size == 16) {
+ for (n = 0; n < 4; ++n) fprintf(stderr, "%.4x ", r.u16[n]);
+ } else if (size == 32) {
+ for (n = 0; n < 2; ++n) fprintf(stderr, "%.8x ", r.u32[n]);
+ }
+ fprintf(stderr, "\n");
+ }
+}
+
+// Debugging macro for 128-bit types.
+inline void PrintVectQ(const DebugRegisterQ r, const char* const name,
+ int size) {
+ int n;
+ if (kEnablePrintRegs) {
+ fprintf(stderr, "%s\t: ", name);
+ if (size == 8) {
+ for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", r.u8[n]);
+ } else if (size == 16) {
+ for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", r.u16[n]);
+ } else if (size == 32) {
+ for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", r.u32[n]);
+ }
+ fprintf(stderr, "\n");
+ }
+}
+
+inline void PrintReg(const int32x4x2_t val, const std::string& name) {
+ DebugRegisterQ r;
+ vst1q_u32(r.u32, val.val[0]);
+  const std::string name0 = absl::StrCat(name, ".val[0]");
+ PrintVectQ(r, name0.c_str(), 32);
+ vst1q_u32(r.u32, val.val[1]);
+  const std::string name1 = absl::StrCat(name, ".val[1]");
+ PrintVectQ(r, name1.c_str(), 32);
+}
+
+inline void PrintReg(const uint32x4_t val, const char* name) {
+ DebugRegisterQ r;
+ vst1q_u32(r.u32, val);
+ PrintVectQ(r, name, 32);
+}
+
+inline void PrintReg(const uint32x2_t val, const char* name) {
+ DebugRegister r;
+ vst1_u32(r.u32, val);
+ PrintVect(r, name, 32);
+}
+
+inline void PrintReg(const uint16x8_t val, const char* name) {
+ DebugRegisterQ r;
+ vst1q_u16(r.u16, val);
+ PrintVectQ(r, name, 16);
+}
+
+inline void PrintReg(const uint16x4_t val, const char* name) {
+ DebugRegister r;
+ vst1_u16(r.u16, val);
+ PrintVect(r, name, 16);
+}
+
+inline void PrintReg(const uint8x16_t val, const char* name) {
+ DebugRegisterQ r;
+ vst1q_u8(r.u8, val);
+ PrintVectQ(r, name, 8);
+}
+
+inline void PrintReg(const uint8x8_t val, const char* name) {
+ DebugRegister r;
+ vst1_u8(r.u8, val);
+ PrintVect(r, name, 8);
+}
+
+inline void PrintReg(const int32x4_t val, const char* name) {
+ DebugRegisterQ r;
+ vst1q_s32(r.i32, val);
+ PrintVectQ(r, name, 32);
+}
+
+inline void PrintReg(const int32x2_t val, const char* name) {
+ DebugRegister r;
+ vst1_s32(r.i32, val);
+ PrintVect(r, name, 32);
+}
+
+inline void PrintReg(const int16x8_t val, const char* name) {
+ DebugRegisterQ r;
+ vst1q_s16(r.i16, val);
+ PrintVectQ(r, name, 16);
+}
+
+inline void PrintReg(const int16x4_t val, const char* name) {
+ DebugRegister r;
+ vst1_s16(r.i16, val);
+ PrintVect(r, name, 16);
+}
+
+inline void PrintReg(const int8x16_t val, const char* name) {
+ DebugRegisterQ r;
+ vst1q_s8(r.i8, val);
+ PrintVectQ(r, name, 8);
+}
+
+inline void PrintReg(const int8x8_t val, const char* name) {
+ DebugRegister r;
+ vst1_s8(r.i8, val);
+ PrintVect(r, name, 8);
+}
+
+// Print an individual (non-vector) value in decimal format.
+inline void PrintReg(const int x, const char* name) {
+ if (kEnablePrintRegs) {
+ printf("%s: %d\n", name, x);
+ }
+}
+
+// Print an individual (non-vector) value in hexadecimal format.
+inline void PrintHex(const int x, const char* name) {
+ if (kEnablePrintRegs) {
+ printf("%s: %x\n", name, x);
+ }
+}
+
+#define PR(x) PrintReg(x, #x)
+#define PD(x) PrintReg(x, #x)
+#define PX(x) PrintHex(x, #x)
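+
+// Example usage (illustrative only): PR(sum) prints the contents of the
+// vector |sum| labeled with its own name.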
+
+#endif // 0
+
+namespace libgav1 {
+namespace dsp {
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+// Load 2 uint8_t values into lanes 0 and 1. Zeros the register before loading
+// the values. Use caution when using this in loops because it will re-zero the
+// register before loading on every iteration.
+inline uint8x8_t Load2(const void* const buf) {
+ const uint16x4_t zero = vdup_n_u16(0);
+ uint16_t temp;
+ memcpy(&temp, buf, 2);
+ return vreinterpret_u8_u16(vld1_lane_u16(&temp, zero, 0));
+}
+
+// Load 2 uint8_t values into lanes |lane| * 2 and |lane| * 2 + 1.
+template <int lane>
+inline uint8x8_t Load2(const void* const buf, uint8x8_t val) {
+ uint16_t temp;
+ memcpy(&temp, buf, 2);
+ return vreinterpret_u8_u16(
+ vld1_lane_u16(&temp, vreinterpret_u16_u8(val), lane));
+}
+
+// Load 4 uint8_t values into the low half of a uint8x8_t register. Zeros the
+// register before loading the values. Use caution when using this in loops
+// because it will re-zero the register before loading on every iteration.
+inline uint8x8_t Load4(const void* const buf) {
+ const uint32x2_t zero = vdup_n_u32(0);
+ uint32_t temp;
+ memcpy(&temp, buf, 4);
+ return vreinterpret_u8_u32(vld1_lane_u32(&temp, zero, 0));
+}
+
+// Load 4 uint8_t values into 4 lanes starting with |lane| * 4.
+template <int lane>
+inline uint8x8_t Load4(const void* const buf, uint8x8_t val) {
+ uint32_t temp;
+ memcpy(&temp, buf, 4);
+ return vreinterpret_u8_u32(
+ vld1_lane_u32(&temp, vreinterpret_u32_u8(val), lane));
+}
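+
+// Illustrative usage sketch (not part of the original header): two 4-byte rows
+// can be packed into a single uint8x8_t by chaining the lane loads:
+//   uint8x8_t rows = Load4(src);          // lanes 0-3
+//   rows = Load4<1>(src + stride, rows);  // lanes 4-7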
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+// Propagate type information to the compiler. Without this the compiler may
+// assume the required alignment of the type (4 bytes in the case of uint32_t)
+// and add alignment hints to the memory access.
+template <typename T>
+inline void ValueToMem(void* const buf, T val) {
+ memcpy(buf, &val, sizeof(val));
+}
+
+// Store 4 int8_t values from the low half of an int8x8_t register.
+inline void StoreLo4(void* const buf, const int8x8_t val) {
+ ValueToMem<int32_t>(buf, vget_lane_s32(vreinterpret_s32_s8(val), 0));
+}
+
+// Store 4 uint8_t values from the low half of a uint8x8_t register.
+inline void StoreLo4(void* const buf, const uint8x8_t val) {
+ ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u8(val), 0));
+}
+
+// Store 4 uint8_t values from the high half of a uint8x8_t register.
+inline void StoreHi4(void* const buf, const uint8x8_t val) {
+ ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u8(val), 1));
+}
+
+// Store 2 uint8_t values from |lane| * 2 and |lane| * 2 + 1 of a uint8x8_t
+// register.
+template <int lane>
+inline void Store2(void* const buf, const uint8x8_t val) {
+ ValueToMem<uint16_t>(buf, vget_lane_u16(vreinterpret_u16_u8(val), lane));
+}
+
+// Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x8_t
+// register.
+template <int lane>
+inline void Store2(void* const buf, const uint16x8_t val) {
+ ValueToMem<uint32_t>(buf, vgetq_lane_u32(vreinterpretq_u32_u16(val), lane));
+}
+
+// Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x4_t
+// register.
+template <int lane>
+inline void Store2(uint16_t* const buf, const uint16x4_t val) {
+ ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u16(val), lane));
+}
+
+//------------------------------------------------------------------------------
+// Bit manipulation.
+
+// vshXX_n_XX() requires an immediate.
+template <int shift>
+inline uint8x8_t LeftShift(const uint8x8_t vector) {
+ return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vector), shift));
+}
+
+template <int shift>
+inline uint8x8_t RightShift(const uint8x8_t vector) {
+ return vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(vector), shift));
+}
+
+template <int shift>
+inline int8x8_t RightShift(const int8x8_t vector) {
+ return vreinterpret_s8_u64(vshr_n_u64(vreinterpret_u64_s8(vector), shift));
+}
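+
+// Note (descriptive): the shift amount is specified in bits across the whole
+// 64-bit register, so e.g. RightShift<2 * 8>(v) discards the two lowest-indexed
+// bytes and shifts the remaining bytes down, zero-filling the upper lanes.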
+
+// Shim vqtbl1_u8 for armv7.
+inline uint8x8_t VQTbl1U8(const uint8x16_t a, const uint8x8_t index) {
+#if defined(__aarch64__)
+ return vqtbl1_u8(a, index);
+#else
+ const uint8x8x2_t b = {vget_low_u8(a), vget_high_u8(a)};
+ return vtbl2_u8(b, index);
+#endif
+}
+
+// Shim vqtbl1_s8 for armv7.
+inline int8x8_t VQTbl1S8(const int8x16_t a, const uint8x8_t index) {
+#if defined(__aarch64__)
+ return vqtbl1_s8(a, index);
+#else
+ const int8x8x2_t b = {vget_low_s8(a), vget_high_s8(a)};
+ return vtbl2_s8(b, vreinterpret_s8_u8(index));
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Interleave.
+
+// vzipN is exclusive to A64.
+inline uint8x8_t InterleaveLow8(const uint8x8_t a, const uint8x8_t b) {
+#if defined(__aarch64__)
+ return vzip1_u8(a, b);
+#else
+ // Discard |.val[1]|
+ return vzip_u8(a, b).val[0];
+#endif
+}
+
+inline uint8x8_t InterleaveLow32(const uint8x8_t a, const uint8x8_t b) {
+#if defined(__aarch64__)
+ return vreinterpret_u8_u32(
+ vzip1_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)));
+#else
+ // Discard |.val[1]|
+ return vreinterpret_u8_u32(
+ vzip_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)).val[0]);
+#endif
+}
+
+inline int8x8_t InterleaveLow32(const int8x8_t a, const int8x8_t b) {
+#if defined(__aarch64__)
+ return vreinterpret_s8_u32(
+ vzip1_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)));
+#else
+ // Discard |.val[1]|
+ return vreinterpret_s8_u32(
+ vzip_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)).val[0]);
+#endif
+}
+
+inline uint8x8_t InterleaveHigh32(const uint8x8_t a, const uint8x8_t b) {
+#if defined(__aarch64__)
+ return vreinterpret_u8_u32(
+ vzip2_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)));
+#else
+ // Discard |.val[0]|
+ return vreinterpret_u8_u32(
+ vzip_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)).val[1]);
+#endif
+}
+
+inline int8x8_t InterleaveHigh32(const int8x8_t a, const int8x8_t b) {
+#if defined(__aarch64__)
+ return vreinterpret_s8_u32(
+ vzip2_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)));
+#else
+ // Discard |.val[0]|
+ return vreinterpret_s8_u32(
+ vzip_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)).val[1]);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Sum.
+
+inline uint16_t SumVector(const uint8x8_t a) {
+#if defined(__aarch64__)
+ return vaddlv_u8(a);
+#else
+ const uint16x4_t c = vpaddl_u8(a);
+ const uint32x2_t d = vpaddl_u16(c);
+ const uint64x1_t e = vpaddl_u32(d);
+ return static_cast<uint16_t>(vget_lane_u64(e, 0));
+#endif // defined(__aarch64__)
+}
+
+inline uint32_t SumVector(const uint32x4_t a) {
+#if defined(__aarch64__)
+ return vaddvq_u32(a);
+#else
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
+ return static_cast<uint32_t>(vget_lane_u64(c, 0));
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Transpose.
+
+// Transpose 32 bit elements such that:
+// a: 00 01
+// b: 02 03
+// returns
+// val[0]: 00 02
+// val[1]: 01 03
+inline uint8x8x2_t Interleave32(const uint8x8_t a, const uint8x8_t b) {
+ const uint32x2_t a_32 = vreinterpret_u32_u8(a);
+ const uint32x2_t b_32 = vreinterpret_u32_u8(b);
+ const uint32x2x2_t c = vtrn_u32(a_32, b_32);
+ const uint8x8x2_t d = {vreinterpret_u8_u32(c.val[0]),
+ vreinterpret_u8_u32(c.val[1])};
+ return d;
+}
+
+// Swap high and low 32 bit elements.
+inline uint8x8_t Transpose32(const uint8x8_t a) {
+ const uint32x2_t b = vrev64_u32(vreinterpret_u32_u8(a));
+ return vreinterpret_u8_u32(b);
+}
+
+// Implement vtrnq_s64().
+// Input:
+// a0: 00 01 02 03 04 05 06 07
+// a1: 16 17 18 19 20 21 22 23
+// Output:
+// b0.val[0]: 00 01 02 03 16 17 18 19
+// b0.val[1]: 04 05 06 07 20 21 22 23
+inline int16x8x2_t VtrnqS64(int32x4_t a0, int32x4_t a1) {
+ int16x8x2_t b0;
+ b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
+ vreinterpret_s16_s32(vget_low_s32(a1)));
+ b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
+ vreinterpret_s16_s32(vget_high_s32(a1)));
+ return b0;
+}
+
+inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) {
+ uint16x8x2_t b0;
+ b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
+ vreinterpret_u16_u32(vget_low_u32(a1)));
+ b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
+ vreinterpret_u16_u32(vget_high_u32(a1)));
+ return b0;
+}
+
+// Input:
+// a: 00 01 02 03 10 11 12 13
+// b: 20 21 22 23 30 31 32 33
+// Output:
+// Note that columns [1] and [2] are transposed.
+// a: 00 10 20 30 02 12 22 32
+// b: 01 11 21 31 03 13 23 33
+inline void Transpose4x4(uint8x8_t* a, uint8x8_t* b) {
+ const uint16x4x2_t c =
+ vtrn_u16(vreinterpret_u16_u8(*a), vreinterpret_u16_u8(*b));
+ const uint32x2x2_t d =
+ vtrn_u32(vreinterpret_u32_u16(c.val[0]), vreinterpret_u32_u16(c.val[1]));
+ const uint8x8x2_t e =
+ vtrn_u8(vreinterpret_u8_u32(d.val[0]), vreinterpret_u8_u32(d.val[1]));
+ *a = e.val[0];
+ *b = e.val[1];
+}
+
+// Reversible if the x4 values are packed next to each other.
+// x4 input / x8 output:
+// a0: 00 01 02 03 40 41 42 43
+// a1: 10 11 12 13 50 51 52 53
+// a2: 20 21 22 23 60 61 62 63
+// a3: 30 31 32 33 70 71 72 73
+// x8 input / x4 output:
+// a0: 00 10 20 30 40 50 60 70
+// a1: 01 11 21 31 41 51 61 71
+// a2: 02 12 22 32 42 52 62 72
+// a3: 03 13 23 33 43 53 63 73
+inline void Transpose8x4(uint8x8_t* a0, uint8x8_t* a1, uint8x8_t* a2,
+ uint8x8_t* a3) {
+ const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
+ const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);
+
+ const uint16x4x2_t c0 =
+ vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
+ const uint16x4x2_t c1 =
+ vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));
+
+ *a0 = vreinterpret_u8_u16(c0.val[0]);
+ *a1 = vreinterpret_u8_u16(c1.val[0]);
+ *a2 = vreinterpret_u8_u16(c0.val[1]);
+ *a3 = vreinterpret_u8_u16(c1.val[1]);
+}
+
+// Input:
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// a[4]: 40 41 42 43 44 45 46 47
+// a[5]: 50 51 52 53 54 55 56 57
+// a[6]: 60 61 62 63 64 65 66 67
+// a[7]: 70 71 72 73 74 75 76 77
+
+// Output:
+// a[0]: 00 10 20 30 40 50 60 70
+// a[1]: 01 11 21 31 41 51 61 71
+// a[2]: 02 12 22 32 42 52 62 72
+// a[3]: 03 13 23 33 43 53 63 73
+// a[4]: 04 14 24 34 44 54 64 74
+// a[5]: 05 15 25 35 45 55 65 75
+// a[6]: 06 16 26 36 46 56 66 76
+// a[7]: 07 17 27 37 47 57 67 77
+inline void Transpose8x8(int8x8_t a[8]) {
+ // Swap 8 bit elements. Goes from:
+ // a[0]: 00 01 02 03 04 05 06 07
+ // a[1]: 10 11 12 13 14 15 16 17
+ // a[2]: 20 21 22 23 24 25 26 27
+ // a[3]: 30 31 32 33 34 35 36 37
+ // a[4]: 40 41 42 43 44 45 46 47
+ // a[5]: 50 51 52 53 54 55 56 57
+ // a[6]: 60 61 62 63 64 65 66 67
+ // a[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
+ // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
+ // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
+ // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77
+ const int8x16x2_t b0 =
+ vtrnq_s8(vcombine_s8(a[0], a[4]), vcombine_s8(a[1], a[5]));
+ const int8x16x2_t b1 =
+ vtrnq_s8(vcombine_s8(a[2], a[6]), vcombine_s8(a[3], a[7]));
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
+ // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
+ // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
+ // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77
+ const int16x8x2_t c0 = vtrnq_s16(vreinterpretq_s16_s8(b0.val[0]),
+ vreinterpretq_s16_s8(b1.val[0]));
+ const int16x8x2_t c1 = vtrnq_s16(vreinterpretq_s16_s8(b0.val[1]),
+ vreinterpretq_s16_s8(b1.val[1]));
+
+ // Unzip 32 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ const int32x4x2_t d0 = vuzpq_s32(vreinterpretq_s32_s16(c0.val[0]),
+ vreinterpretq_s32_s16(c1.val[0]));
+ const int32x4x2_t d1 = vuzpq_s32(vreinterpretq_s32_s16(c0.val[1]),
+ vreinterpretq_s32_s16(c1.val[1]));
+
+ a[0] = vreinterpret_s8_s32(vget_low_s32(d0.val[0]));
+ a[1] = vreinterpret_s8_s32(vget_high_s32(d0.val[0]));
+ a[2] = vreinterpret_s8_s32(vget_low_s32(d1.val[0]));
+ a[3] = vreinterpret_s8_s32(vget_high_s32(d1.val[0]));
+ a[4] = vreinterpret_s8_s32(vget_low_s32(d0.val[1]));
+ a[5] = vreinterpret_s8_s32(vget_high_s32(d0.val[1]));
+ a[6] = vreinterpret_s8_s32(vget_low_s32(d1.val[1]));
+ a[7] = vreinterpret_s8_s32(vget_high_s32(d1.val[1]));
+}
+
+// Unsigned.
+inline void Transpose8x8(uint8x8_t a[8]) {
+ const uint8x16x2_t b0 =
+ vtrnq_u8(vcombine_u8(a[0], a[4]), vcombine_u8(a[1], a[5]));
+ const uint8x16x2_t b1 =
+ vtrnq_u8(vcombine_u8(a[2], a[6]), vcombine_u8(a[3], a[7]));
+
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+
+ const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c1.val[0]));
+ const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c1.val[1]));
+
+ a[0] = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
+ a[1] = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
+ a[2] = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
+ a[3] = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
+ a[4] = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
+ a[5] = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+ a[6] = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
+ a[7] = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
+}
+
+inline void Transpose8x8(uint8x8_t in[8], uint8x16_t out[4]) {
+ const uint8x16x2_t a0 =
+ vtrnq_u8(vcombine_u8(in[0], in[4]), vcombine_u8(in[1], in[5]));
+ const uint8x16x2_t a1 =
+ vtrnq_u8(vcombine_u8(in[2], in[6]), vcombine_u8(in[3], in[7]));
+
+ const uint16x8x2_t b0 = vtrnq_u16(vreinterpretq_u16_u8(a0.val[0]),
+ vreinterpretq_u16_u8(a1.val[0]));
+ const uint16x8x2_t b1 = vtrnq_u16(vreinterpretq_u16_u8(a0.val[1]),
+ vreinterpretq_u16_u8(a1.val[1]));
+
+ const uint32x4x2_t c0 = vuzpq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vuzpq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+
+ out[0] = vreinterpretq_u8_u32(c0.val[0]);
+ out[1] = vreinterpretq_u8_u32(c1.val[0]);
+ out[2] = vreinterpretq_u8_u32(c0.val[1]);
+ out[3] = vreinterpretq_u8_u32(c1.val[1]);
+}
+
+// Input:
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// a[4]: 40 41 42 43 44 45 46 47
+// a[5]: 50 51 52 53 54 55 56 57
+// a[6]: 60 61 62 63 64 65 66 67
+// a[7]: 70 71 72 73 74 75 76 77
+
+// Output:
+// a[0]: 00 10 20 30 40 50 60 70
+// a[1]: 01 11 21 31 41 51 61 71
+// a[2]: 02 12 22 32 42 52 62 72
+// a[3]: 03 13 23 33 43 53 63 73
+// a[4]: 04 14 24 34 44 54 64 74
+// a[5]: 05 15 25 35 45 55 65 75
+// a[6]: 06 16 26 36 46 56 66 76
+// a[7]: 07 17 27 37 47 57 67 77
+inline void Transpose8x8(int16x8_t a[8]) {
+ const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
+ const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
+ const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
+ const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ const int16x8x2_t d0 = VtrnqS64(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = VtrnqS64(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = VtrnqS64(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = VtrnqS64(c1.val[1], c3.val[1]);
+
+ a[0] = d0.val[0];
+ a[1] = d1.val[0];
+ a[2] = d2.val[0];
+ a[3] = d3.val[0];
+ a[4] = d0.val[1];
+ a[5] = d1.val[1];
+ a[6] = d2.val[1];
+ a[7] = d3.val[1];
+}
+
+// Unsigned.
+inline void Transpose8x8(uint16x8_t a[8]) {
+ const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
+ const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
+ const uint16x8x2_t b2 = vtrnq_u16(a[4], a[5]);
+ const uint16x8x2_t b3 = vtrnq_u16(a[6], a[7]);
+
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+ const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
+ vreinterpretq_u32_u16(b3.val[0]));
+ const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
+ vreinterpretq_u32_u16(b3.val[1]));
+
+ const uint16x8x2_t d0 = VtrnqU64(c0.val[0], c2.val[0]);
+ const uint16x8x2_t d1 = VtrnqU64(c1.val[0], c3.val[0]);
+ const uint16x8x2_t d2 = VtrnqU64(c0.val[1], c2.val[1]);
+ const uint16x8x2_t d3 = VtrnqU64(c1.val[1], c3.val[1]);
+
+ a[0] = d0.val[0];
+ a[1] = d1.val[0];
+ a[2] = d2.val[0];
+ a[3] = d3.val[0];
+ a[4] = d0.val[1];
+ a[5] = d1.val[1];
+ a[6] = d2.val[1];
+ a[7] = d3.val[1];
+}
+
+// Input:
+// a[0]: 00 01 02 03 04 05 06 07 80 81 82 83 84 85 86 87
+// a[1]: 10 11 12 13 14 15 16 17 90 91 92 93 94 95 96 97
+// a[2]: 20 21 22 23 24 25 26 27 a0 a1 a2 a3 a4 a5 a6 a7
+// a[3]: 30 31 32 33 34 35 36 37 b0 b1 b2 b3 b4 b5 b6 b7
+// a[4]: 40 41 42 43 44 45 46 47 c0 c1 c2 c3 c4 c5 c6 c7
+// a[5]: 50 51 52 53 54 55 56 57 d0 d1 d2 d3 d4 d5 d6 d7
+// a[6]: 60 61 62 63 64 65 66 67 e0 e1 e2 e3 e4 e5 e6 e7
+// a[7]: 70 71 72 73 74 75 76 77 f0 f1 f2 f3 f4 f5 f6 f7
+
+// Output:
+// a[0]: 00 10 20 30 40 50 60 70 80 90 a0 b0 c0 d0 e0 f0
+// a[1]: 01 11 21 31 41 51 61 71 81 91 a1 b1 c1 d1 e1 f1
+// a[2]: 02 12 22 32 42 52 62 72 82 92 a2 b2 c2 d2 e2 f2
+// a[3]: 03 13 23 33 43 53 63 73 83 93 a3 b3 c3 d3 e3 f3
+// a[4]: 04 14 24 34 44 54 64 74 84 94 a4 b4 c4 d4 e4 f4
+// a[5]: 05 15 25 35 45 55 65 75 85 95 a5 b5 c5 d5 e5 f5
+// a[6]: 06 16 26 36 46 56 66 76 86 96 a6 b6 c6 d6 e6 f6
+// a[7]: 07 17 27 37 47 57 67 77 87 97 a7 b7 c7 d7 e7 f7
+inline void Transpose8x16(uint8x16_t a[8]) {
+ // b0.val[0]: 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96
+ // b0.val[1]: 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97
+ // b1.val[0]: 20 30 22 32 24 34 26 36 a0 b0 a2 b2 a4 b4 a6 b6
+ // b1.val[1]: 21 31 23 33 25 35 27 37 a1 b1 a3 b3 a5 b5 a7 b7
+ // b2.val[0]: 40 50 42 52 44 54 46 56 c0 d0 c2 d2 c4 d4 c6 d6
+ // b2.val[1]: 41 51 43 53 45 55 47 57 c1 d1 c3 d3 c5 d5 c7 d7
+ // b3.val[0]: 60 70 62 72 64 74 66 76 e0 f0 e2 f2 e4 f4 e6 f6
+ // b3.val[1]: 61 71 63 73 65 75 67 77 e1 f1 e3 f3 e5 f5 e7 f7
+ const uint8x16x2_t b0 = vtrnq_u8(a[0], a[1]);
+ const uint8x16x2_t b1 = vtrnq_u8(a[2], a[3]);
+ const uint8x16x2_t b2 = vtrnq_u8(a[4], a[5]);
+ const uint8x16x2_t b3 = vtrnq_u8(a[6], a[7]);
+
+ // c0.val[0]: 00 10 20 30 04 14 24 34 80 90 a0 b0 84 94 a4 b4
+ // c0.val[1]: 02 12 22 32 06 16 26 36 82 92 a2 b2 86 96 a6 b6
+ // c1.val[0]: 01 11 21 31 05 15 25 35 81 91 a1 b1 85 95 a5 b5
+ // c1.val[1]: 03 13 23 33 07 17 27 37 83 93 a3 b3 87 97 a7 b7
+ // c2.val[0]: 40 50 60 70 44 54 64 74 c0 d0 e0 f0 c4 d4 e4 f4
+ // c2.val[1]: 42 52 62 72 46 56 66 76 c2 d2 e2 f2 c6 d6 e6 f6
+ // c3.val[0]: 41 51 61 71 45 55 65 75 c1 d1 e1 f1 c5 d5 e5 f5
+ // c3.val[1]: 43 53 63 73 47 57 67 77 c3 d3 e3 f3 c7 d7 e7 f7
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+ const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
+ vreinterpretq_u16_u8(b3.val[0]));
+ const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
+ vreinterpretq_u16_u8(b3.val[1]));
+
+ // d0.val[0]: 00 10 20 30 40 50 60 70 80 90 a0 b0 c0 d0 e0 f0
+ // d0.val[1]: 04 14 24 34 44 54 64 74 84 94 a4 b4 c4 d4 e4 f4
+ // d1.val[0]: 01 11 21 31 41 51 61 71 81 91 a1 b1 c1 d1 e1 f1
+ // d1.val[1]: 05 15 25 35 45 55 65 75 85 95 a5 b5 c5 d5 e5 f5
+ // d2.val[0]: 02 12 22 32 42 52 62 72 82 92 a2 b2 c2 d2 e2 f2
+ // d2.val[1]: 06 16 26 36 46 56 66 76 86 96 a6 b6 c6 d6 e6 f6
+ // d3.val[0]: 03 13 23 33 43 53 63 73 83 93 a3 b3 c3 d3 e3 f3
+ // d3.val[1]: 07 17 27 37 47 57 67 77 87 97 a7 b7 c7 d7 e7 f7
+ const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c2.val[0]));
+ const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
+ vreinterpretq_u32_u16(c3.val[0]));
+ const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c2.val[1]));
+ const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
+ vreinterpretq_u32_u16(c3.val[1]));
+
+ a[0] = vreinterpretq_u8_u32(d0.val[0]);
+ a[1] = vreinterpretq_u8_u32(d1.val[0]);
+ a[2] = vreinterpretq_u8_u32(d2.val[0]);
+ a[3] = vreinterpretq_u8_u32(d3.val[0]);
+ a[4] = vreinterpretq_u8_u32(d0.val[1]);
+ a[5] = vreinterpretq_u8_u32(d1.val[1]);
+ a[6] = vreinterpretq_u8_u32(d2.val[1]);
+ a[7] = vreinterpretq_u8_u32(d3.val[1]);
+}
+
+inline int16x8_t ZeroExtend(const uint8x8_t in) {
+ return vreinterpretq_s16_u16(vmovl_u8(in));
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_ENABLE_NEON
+#endif // LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
diff --git a/src/dsp/arm/convolve_neon.cc b/src/dsp/arm/convolve_neon.cc
new file mode 100644
index 0000000..fd9b912
--- /dev/null
+++ b/src/dsp/arm/convolve_neon.cc
@@ -0,0 +1,3105 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/convolve.inc"
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
+// sum from exceeding the range of int16_t.
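+// Note (descriptive): the accumulation is done in uint16_t lanes, so a
+// vmlsl_u8() step may wrap; because the true signed result fits in int16_t,
+// reinterpreting the final value as int16_t recovers it (e.g. a true value of
+// -5 wraps to 0xfffb, which reads back as -5).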
+template <int filter_index, bool negative_outside_taps = false>
+int16x8_t SumOnePassTaps(const uint8x8_t* const src,
+ const uint8x8_t* const taps) {
+ uint16x8_t sum;
+ if (filter_index == 0) {
+ // 6 taps. + - + + - +
+ sum = vmull_u8(src[0], taps[0]);
+ // Unsigned overflow will result in a valid int16_t value.
+ sum = vmlsl_u8(sum, src[1], taps[1]);
+ sum = vmlal_u8(sum, src[2], taps[2]);
+ sum = vmlal_u8(sum, src[3], taps[3]);
+ sum = vmlsl_u8(sum, src[4], taps[4]);
+ sum = vmlal_u8(sum, src[5], taps[5]);
+ } else if (filter_index == 1 && negative_outside_taps) {
+ // 6 taps. - + + + + -
+ // Set a base we can subtract from.
+ sum = vmull_u8(src[1], taps[1]);
+ sum = vmlsl_u8(sum, src[0], taps[0]);
+ sum = vmlal_u8(sum, src[2], taps[2]);
+ sum = vmlal_u8(sum, src[3], taps[3]);
+ sum = vmlal_u8(sum, src[4], taps[4]);
+ sum = vmlsl_u8(sum, src[5], taps[5]);
+ } else if (filter_index == 1) {
+ // 6 taps. All are positive.
+ sum = vmull_u8(src[0], taps[0]);
+ sum = vmlal_u8(sum, src[1], taps[1]);
+ sum = vmlal_u8(sum, src[2], taps[2]);
+ sum = vmlal_u8(sum, src[3], taps[3]);
+ sum = vmlal_u8(sum, src[4], taps[4]);
+ sum = vmlal_u8(sum, src[5], taps[5]);
+ } else if (filter_index == 2) {
+ // 8 taps. - + - + + - + -
+ sum = vmull_u8(src[1], taps[1]);
+ sum = vmlsl_u8(sum, src[0], taps[0]);
+ sum = vmlsl_u8(sum, src[2], taps[2]);
+ sum = vmlal_u8(sum, src[3], taps[3]);
+ sum = vmlal_u8(sum, src[4], taps[4]);
+ sum = vmlsl_u8(sum, src[5], taps[5]);
+ sum = vmlal_u8(sum, src[6], taps[6]);
+ sum = vmlsl_u8(sum, src[7], taps[7]);
+ } else if (filter_index == 3) {
+ // 2 taps. All are positive.
+ sum = vmull_u8(src[0], taps[0]);
+ sum = vmlal_u8(sum, src[1], taps[1]);
+ } else if (filter_index == 4) {
+ // 4 taps. - + + -
+ sum = vmull_u8(src[1], taps[1]);
+ sum = vmlsl_u8(sum, src[0], taps[0]);
+ sum = vmlal_u8(sum, src[2], taps[2]);
+ sum = vmlsl_u8(sum, src[3], taps[3]);
+ } else if (filter_index == 5) {
+ // 4 taps. All are positive.
+ sum = vmull_u8(src[0], taps[0]);
+ sum = vmlal_u8(sum, src[1], taps[1]);
+ sum = vmlal_u8(sum, src[2], taps[2]);
+ sum = vmlal_u8(sum, src[3], taps[3]);
+ }
+ return vreinterpretq_s16_u16(sum);
+}
+
+template <int filter_index, bool negative_outside_taps>
+int16x8_t SumHorizontalTaps(const uint8_t* const src,
+ const uint8x8_t* const v_tap) {
+ uint8x8_t v_src[8];
+ const uint8x16_t src_long = vld1q_u8(src);
+ int16x8_t sum;
+
+ if (filter_index < 2) {
+ v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 1);
+ } else if (filter_index == 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+ v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap);
+ } else if (filter_index == 3) {
+ v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 3);
+ } else if (filter_index > 3) {
+ v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 2);
+ }
+ return sum;
+}
+
+template <int filter_index, bool negative_outside_taps>
+uint8x8_t SimpleHorizontalTaps(const uint8_t* const src,
+ const uint8x8_t* const v_tap) {
+ int16x8_t sum =
+ SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
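+  // Worked example (assuming kInterRoundBitsHorizontal == 3 and
+  // kFilterBits == 7): the two-pass form (((x + 2) >> 2) + 8) >> 4 equals
+  // (x + 2 + 32) >> 6. vqrshrun_n_s16(sum, 6) supplies the "+ 32" and ">> 6",
+  // so only the "+ 2" (|first_shift_rounding_bit|) is added explicitly here.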
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+ return vqrshrun_n_s16(sum, kFilterBits - 1);
+}
+
+template <int filter_index, bool negative_outside_taps>
+uint16x8_t HorizontalTaps8To16(const uint8_t* const src,
+ const uint8x8_t* const v_tap) {
+ const int16x8_t sum =
+ SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
+
+ return vreinterpretq_u16_s16(
+ vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+}
+
+template <int filter_index>
+int16x8_t SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const uint8x8_t* const v_tap) {
+ uint16x8_t sum;
+ const uint8x8_t input0 = vld1_u8(src);
+ src += src_stride;
+ const uint8x8_t input1 = vld1_u8(src);
+ uint8x8x2_t input = vzip_u8(input0, input1);
+
+ if (filter_index == 3) {
+ // tap signs : + +
+ sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
+ sum = vmlal_u8(sum, input.val[1], v_tap[4]);
+ } else if (filter_index == 4) {
+ // tap signs : - + + -
+ sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
+ sum = vmlsl_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]);
+ sum = vmlal_u8(sum, input.val[1], v_tap[4]);
+ sum = vmlsl_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
+ } else {
+ // tap signs : + + + +
+ sum = vmull_u8(RightShift<4 * 8>(input.val[0]), v_tap[2]);
+ sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
+ sum = vmlal_u8(sum, input.val[1], v_tap[4]);
+ sum = vmlal_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
+ }
+
+ return vreinterpretq_s16_u16(sum);
+}
+
+template <int filter_index>
+uint8x8_t SimpleHorizontalTaps2x2(const uint8_t* src,
+ const ptrdiff_t src_stride,
+ const uint8x8_t* const v_tap) {
+ int16x8_t sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+ return vqrshrun_n_s16(sum, kFilterBits - 1);
+}
+
+template <int filter_index>
+uint16x8_t HorizontalTaps8To16_2x2(const uint8_t* src,
+ const ptrdiff_t src_stride,
+ const uint8x8_t* const v_tap) {
+ const int16x8_t sum =
+ SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ return vreinterpretq_u16_s16(
+ vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+}
+
+template <int num_taps, int step, int filter_index,
+ bool negative_outside_taps = true, bool is_2d = false,
+ bool is_compound = false>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int width, const int height,
+ const uint8x8_t* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+
+ // 4 tap filters are never used when width > 4.
+ if (num_taps != 4 && width > 4) {
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ if (is_2d || is_compound) {
+ const uint16x8_t v_sum =
+ HorizontalTaps8To16<filter_index, negative_outside_taps>(&src[x],
+ v_tap);
+ vst1q_u16(&dest16[x], v_sum);
+ } else {
+ const uint8x8_t result =
+ SimpleHorizontalTaps<filter_index, negative_outside_taps>(&src[x],
+ v_tap);
+ vst1_u8(&dest8[x], result);
+ }
+ x += step;
+ } while (x < width);
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (++y < height);
+ return;
+ }
+
+  // The horizontal pass only needs to account for |num_taps| == 2 and 4 when
+ // |width| <= 4.
+ assert(width <= 4);
+ assert(num_taps <= 4);
+ if (num_taps <= 4) {
+ if (width == 4) {
+ int y = 0;
+ do {
+ if (is_2d || is_compound) {
+ const uint16x8_t v_sum =
+ HorizontalTaps8To16<filter_index, negative_outside_taps>(src,
+ v_tap);
+ vst1_u16(dest16, vget_low_u16(v_sum));
+ } else {
+ const uint8x8_t result =
+ SimpleHorizontalTaps<filter_index, negative_outside_taps>(src,
+ v_tap);
+ StoreLo4(&dest8[0], result);
+ }
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (++y < height);
+ return;
+ }
+
+ if (!is_compound) {
+ int y = 0;
+ do {
+ if (is_2d) {
+ const uint16x8_t sum =
+ HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+ dest16[0] = vgetq_lane_u16(sum, 0);
+ dest16[1] = vgetq_lane_u16(sum, 2);
+ dest16 += pred_stride;
+ dest16[0] = vgetq_lane_u16(sum, 1);
+ dest16[1] = vgetq_lane_u16(sum, 3);
+ dest16 += pred_stride;
+ } else {
+ const uint8x8_t sum =
+ SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ dest8[0] = vget_lane_u8(sum, 0);
+ dest8[1] = vget_lane_u8(sum, 2);
+ dest8 += pred_stride;
+
+ dest8[0] = vget_lane_u8(sum, 1);
+ dest8[1] = vget_lane_u8(sum, 3);
+ dest8 += pred_stride;
+ }
+
+ src += src_stride << 1;
+ y += 2;
+ } while (y < height - 1);
+
+ // The 2d filters have an odd |height| because the horizontal pass
+ // generates context for the vertical pass.
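+      // (Note: |height| here is the intermediate height, presumably the output
+      // height plus the vertical tap count minus one; an even output height
+      // plus an odd value is odd.)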
+ if (is_2d) {
+ assert(height % 2 == 1);
+ uint16x8_t sum;
+ const uint8x8_t input = vld1_u8(src);
+ if (filter_index == 3) { // |num_taps| == 2
+ sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
+ sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
+ } else if (filter_index == 4) {
+ sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
+ sum = vmlsl_u8(sum, RightShift<2 * 8>(input), v_tap[2]);
+ sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
+ sum = vmlsl_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
+ } else {
+ assert(filter_index == 5);
+ sum = vmull_u8(RightShift<2 * 8>(input), v_tap[2]);
+ sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
+ sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
+ sum = vmlal_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
+ }
+ // |sum| contains an int16_t value.
+ sum = vreinterpretq_u16_s16(vrshrq_n_s16(
+ vreinterpretq_s16_u16(sum), kInterRoundBitsHorizontal - 1));
+ Store2<0>(dest16, sum);
+ }
+ }
+ }
+}
+
+// Process 16 bit inputs and output 32 bits.
+template <int num_taps, bool is_compound>
+inline int16x4_t Sum2DVerticalTaps4(const int16x4_t* const src,
+ const int16x8_t taps) {
+ const int16x4_t taps_lo = vget_low_s16(taps);
+ const int16x4_t taps_hi = vget_high_s16(taps);
+ int32x4_t sum;
+ if (num_taps == 8) {
+ sum = vmull_lane_s16(src[0], taps_lo, 0);
+ sum = vmlal_lane_s16(sum, src[1], taps_lo, 1);
+ sum = vmlal_lane_s16(sum, src[2], taps_lo, 2);
+ sum = vmlal_lane_s16(sum, src[3], taps_lo, 3);
+ sum = vmlal_lane_s16(sum, src[4], taps_hi, 0);
+ sum = vmlal_lane_s16(sum, src[5], taps_hi, 1);
+ sum = vmlal_lane_s16(sum, src[6], taps_hi, 2);
+ sum = vmlal_lane_s16(sum, src[7], taps_hi, 3);
+ } else if (num_taps == 6) {
+ sum = vmull_lane_s16(src[0], taps_lo, 1);
+ sum = vmlal_lane_s16(sum, src[1], taps_lo, 2);
+ sum = vmlal_lane_s16(sum, src[2], taps_lo, 3);
+ sum = vmlal_lane_s16(sum, src[3], taps_hi, 0);
+ sum = vmlal_lane_s16(sum, src[4], taps_hi, 1);
+ sum = vmlal_lane_s16(sum, src[5], taps_hi, 2);
+ } else if (num_taps == 4) {
+ sum = vmull_lane_s16(src[0], taps_lo, 2);
+ sum = vmlal_lane_s16(sum, src[1], taps_lo, 3);
+ sum = vmlal_lane_s16(sum, src[2], taps_hi, 0);
+ sum = vmlal_lane_s16(sum, src[3], taps_hi, 1);
+ } else if (num_taps == 2) {
+ sum = vmull_lane_s16(src[0], taps_lo, 3);
+ sum = vmlal_lane_s16(sum, src[1], taps_hi, 0);
+ }
+
+ if (is_compound) {
+ return vqrshrn_n_s32(sum, kInterRoundBitsCompoundVertical - 1);
+ }
+
+ return vqrshrn_n_s32(sum, kInterRoundBitsVertical - 1);
+}
+
+template <int num_taps, bool is_compound>
+int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src,
+ const int16x8_t taps) {
+ const int16x4_t taps_lo = vget_low_s16(taps);
+ const int16x4_t taps_hi = vget_high_s16(taps);
+ int32x4_t sum_lo, sum_hi;
+ if (num_taps == 8) {
+ sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 0);
+ sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_lo, 3);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_lo, 3);
+
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[6]), taps_hi, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[6]), taps_hi, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[7]), taps_hi, 3);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[7]), taps_hi, 3);
+ } else if (num_taps == 6) {
+ sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 1);
+ sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 3);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 3);
+
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 2);
+ } else if (num_taps == 4) {
+ sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 2);
+ sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 3);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 3);
+
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_hi, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_hi, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 1);
+ } else if (num_taps == 2) {
+ sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 3);
+ sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 3);
+
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_hi, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_hi, 0);
+ }
+
+ if (is_compound) {
+ return vcombine_s16(
+ vqrshrn_n_s32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ vqrshrn_n_s32(sum_hi, kInterRoundBitsCompoundVertical - 1));
+ }
+
+ return vcombine_s16(vqrshrn_n_s32(sum_lo, kInterRoundBitsVertical - 1),
+ vqrshrn_n_s32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const int16x8_t taps) {
+ assert(width >= 8);
+ constexpr int next_row = num_taps - 1;
+ // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
+ const ptrdiff_t src_stride = width;
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ int x = 0;
+ do {
+ int16x8_t srcs[8];
+ const uint16_t* src_x = src + x;
+ srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+ srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+ srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+ srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+ }
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+
+ const int16x8_t sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ vst1q_u16(dst16 + x + y * dst_stride, vreinterpretq_u16_s16(sum));
+ } else {
+ vst1_u8(dst8 + x + y * dst_stride, vqmovun_s16(sum));
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (++y < height);
+ x += 8;
+ } while (x < width);
+}
+
+// Take advantage of |src_stride| == |width| to process two rows at a time.
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical4xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const int16x8_t taps) {
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ int16x8_t srcs[9];
+ srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ if (num_taps >= 4) {
+ srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[1] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[2]));
+ if (num_taps >= 6) {
+ srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[3] = vcombine_s16(vget_high_s16(srcs[2]), vget_low_s16(srcs[4]));
+ if (num_taps == 8) {
+ srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[5] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[6]));
+ }
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[num_taps - 1] = vcombine_s16(vget_high_s16(srcs[num_taps - 2]),
+ vget_low_s16(srcs[num_taps]));
+
+ const int16x8_t sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = vreinterpretq_u16_s16(sum);
+ vst1q_u16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const uint8x8_t results = vqmovun_s16(sum);
+
+ StoreLo4(dst8, results);
+ dst8 += dst_stride;
+ StoreHi4(dst8, results);
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ if (num_taps == 8) {
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ }
+ }
+ }
+ y += 2;
+ } while (y < height);
+}
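+// In the 4xH case above each 8-lane vector holds two adjacent 4-wide rows
+// (the intermediate stride is 4), so e.g. srcs[1] is formed from the high
+// half of srcs[0] and the low half of srcs[2] rather than from a new load.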
+
+// Take advantage of |src_stride| == |width| to process four rows at a time.
+template <int num_taps>
+void Filter2DVertical2xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const int16x8_t taps) {
+ constexpr int next_row = (num_taps < 6) ? 4 : 8;
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ int16x8_t srcs[9];
+ srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ if (num_taps >= 6) {
+ srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+ if (num_taps == 8) {
+ srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+ srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ if (num_taps == 2) {
+ srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+ } else if (num_taps == 4) {
+ srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+ srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+ srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+ } else if (num_taps == 6) {
+ srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+ srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+ srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
+ } else if (num_taps == 8) {
+ srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
+ srcs[6] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[8]));
+ srcs[7] = vextq_s16(srcs[4], srcs[8], 6);
+ }
+
+ const int16x8_t sum =
+ SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
+ const uint8x8_t results = vqmovun_s16(sum);
+
+ Store2<0>(dst8, results);
+ dst8 += dst_stride;
+ Store2<1>(dst8, results);
+ // When |height| <= 4 only the 2 and 4 tap variants are used, so this early
+ // return is only needed for them; the 6 and 8 tap variants imply
+ // |height| > 4.
+ if (num_taps <= 4 && height == 2) return;
+ dst8 += dst_stride;
+ Store2<2>(dst8, results);
+ dst8 += dst_stride;
+ Store2<3>(dst8, results);
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ if (num_taps == 6) {
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ } else if (num_taps == 8) {
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ }
+
+ y += 4;
+ } while (y < height);
+}
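+// In the 2xH case above each 8-lane vector holds four adjacent 2-wide rows,
+// so advancing the window by one, two or three rows is done by extracting
+// lanes (vextq_s16 / vcombine_s16) instead of separate loads.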
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+ const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+ const ptrdiff_t dst_stride, const int width, const int height,
+ const int filter_id, const int filter_index) {
+ // Duplicate the absolute value for each tap. Negative taps are corrected
+ // by using the vmlsl_u8 instruction. Positive taps use vmlal_u8.
+ uint8x8_t v_tap[kSubPixelTaps];
+ assert(filter_id != 0);
+
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ v_tap[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
+ }
+
+ if (filter_index == 2) { // 8 tap.
+ FilterHorizontal<8, 8, 2, true, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 1) { // 6 tap.
+ // Check if outside taps are positive.
+ if ((filter_id == 1) | (filter_id == 15)) {
+ FilterHorizontal<6, 8, 1, false, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else {
+ FilterHorizontal<6, 8, 1, true, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ }
+ } else if (filter_index == 0) { // 6 tap.
+ FilterHorizontal<6, 8, 0, true, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 4) { // 4 tap.
+ FilterHorizontal<4, 8, 4, true, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 5) { // 4 tap.
+ FilterHorizontal<4, 8, 5, true, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else { // 2 tap.
+ FilterHorizontal<2, 8, 3, true, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ }
+}
+
+void Convolve2D_NEON(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+
+ // The output of the horizontal filter is guaranteed to fit in 16 bits.
+ uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+ const int intermediate_height = height + vertical_taps - 1;
+
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+
+ DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
+ width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+
+ // Vertical filter.
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ const int16x8_t taps = vmovl_s8(
+ vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
+
+ if (vertical_taps == 8) {
+ if (width == 2) {
+ Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (vertical_taps == 6) {
+ if (width == 2) {
+ Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (vertical_taps == 4) {
+ if (width == 2) {
+ Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else { // |vertical_taps| == 2
+ if (width == 2) {
+ Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ }
+}
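+// The 2D path above is two passes: the horizontal filter writes a 16-bit
+// intermediate buffer with stride |width| and |height| + vertical_taps - 1
+// rows, and the vertical filter then reads it back through the
+// width-specialized kernels.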
+
+// There are many opportunities for overreading in scaled convolve, because the
+// range of starting points for filter windows is anywhere from 0 to 16 for 8
+// destination pixels, and the window sizes range from 2 to 8. To accommodate
+// this range concisely, we use |grade_x| to mean the most steps in src that can
+// be traversed in a single |step_x| increment, i.e. 1 or 2. When grade_x is 2,
+// we are guaranteed to exceed 8 whole steps in src for every 8 |step_x|
+// increments. The first load covers the initial elements of src_x, while the
+// final load covers the taps.
+template <int grade_x>
+inline uint8x8x3_t LoadSrcVals(const uint8_t* src_x) {
+ uint8x8x3_t ret;
+ const uint8x16_t src_val = vld1q_u8(src_x);
+ ret.val[0] = vget_low_u8(src_val);
+ ret.val[1] = vget_high_u8(src_val);
+ if (grade_x > 1) {
+ ret.val[2] = vld1_u8(src_x + 16);
+ }
+ return ret;
+}
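+// For example, with an 8 tap filter and step_x == 1536 (1.5 pixels per
+// output), the last of 8 output lanes starts 7 * 1536 >> 10 = 10 pixels in,
+// so its final tap reads byte 10 + 7 = 17 and the extra 8-byte load
+// (grade_x == 2) is needed; a single 16-byte load suffices only while
+// step_x * 7 stays below (16 - num_taps) << kScaleSubPixelBits.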
+
+// Pre-transpose the 2 tap filters in |kAbsHalfSubPixelFilters|[3]
+inline uint8x16_t GetPositive2TapFilter(const int tap_index) {
+ assert(tap_index < 2);
+ alignas(
+ 16) static constexpr uint8_t kAbsHalfSubPixel2TapFilterColumns[2][16] = {
+ {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
+ {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
+
+ return vld1q_u8(kAbsHalfSubPixel2TapFilterColumns[tap_index]);
+}
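+// For example, filter_id 5 selects lane 5 of each column: {44, 20}, the two
+// nonzero taps of kAbsHalfSubPixelFilters[3][5], i.e. half of the bilinear
+// taps {88, 40}.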
+
+template <int grade_x>
+inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
+ const ptrdiff_t src_stride,
+ const int width, const int subpixel_x,
+ const int step_x,
+ const int intermediate_height,
+ int16_t* intermediate) {
+ // Account for the 0-taps that precede the 2 nonzero taps.
+ const int kernel_offset = 3;
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ const uint8x16_t filter_taps0 = GetPositive2TapFilter(0);
+ const uint8x16_t filter_taps1 = GetPositive2TapFilter(1);
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+
+ int p = subpixel_x;
+ if (width <= 4) {
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask);
+ // This is a special case. The 2-tap filter has no negative taps, so we
+ // can use unsigned values.
+ // For each x, a lane of tapsK has
+ // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+ // on x.
+ const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
+ VQTbl1U8(filter_taps1, filter_indices)};
+ int y = 0;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x16_t src_vals = vld1q_u8(src_x);
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+
+ // For each x, a lane of srcK contains src_x[k].
+ const uint8x8_t src[2] = {
+ VQTbl1U8(src_vals, src_indices),
+ VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
+
+ vst1q_s16(intermediate,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate += kIntermediateStride;
+ } while (++y < intermediate_height);
+ return;
+ }
+
+ // |width| >= 8
+ int x = 0;
+ do {
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ int16_t* intermediate_x = intermediate + x;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+ filter_index_mask);
+ // This is a special case. The 2-tap filter has no negative taps, so we
+ // can use unsigned values.
+ // For each x, a lane of tapsK has
+ // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+ // on x.
+ const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
+ VQTbl1U8(filter_taps1, filter_indices)};
+ int y = 0;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+
+ // For each x, a lane of srcK contains src_x[k].
+ const uint8x8_t src[2] = {
+ vtbl3_u8(src_vals, src_indices),
+ vtbl3_u8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
+
+ vst1q_s16(intermediate_x,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate_x += kIntermediateStride;
+ } while (++y < intermediate_height);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
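+// For example, with step_x == 1280 and a zero starting fraction, the eight
+// lanes get subpel offsets {0, 1280, 2560, ...}; shifting by
+// kScaleSubPixelBits (10) gives source indices {0, 1, 2, 3, 5, 6, 7, 8} for
+// the table lookups, and (offset >> 6) & 15 gives filter_ids
+// {0, 4, 8, 12, 0, 4, 8, 12}.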
+
+// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[5].
+inline uint8x16_t GetPositive4TapFilter(const int tap_index) {
+ assert(tap_index < 4);
+ alignas(
+ 16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
+ {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
+ {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+ {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+ {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
+
+ return vld1q_u8(kSubPixel4TapPositiveFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width <= 4.
+void ConvolveKernelHorizontalPositive4Tap(
+ const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
+ const int step_x, const int intermediate_height, int16_t* intermediate) {
+ const int kernel_offset = 2;
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const uint8x16_t filter_taps0 = GetPositive4TapFilter(0);
+ const uint8x16_t filter_taps1 = GetPositive4TapFilter(1);
+ const uint8x16_t filter_taps2 = GetPositive4TapFilter(2);
+ const uint8x16_t filter_taps3 = GetPositive4TapFilter(3);
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+ const int p = subpixel_x;
+ // The first filter (filter_id 0) is special: a single full-strength center
+ // tap (128 before halving, 64 here).
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t filter_indices = vand_u8(
+ vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), filter_index_mask);
+ // Note that filter_id depends on x.
+ // For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
+ const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
+ VQTbl1U8(filter_taps1, filter_indices),
+ VQTbl1U8(filter_taps2, filter_indices),
+ VQTbl1U8(filter_taps3, filter_indices)};
+
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+ int y = 0;
+ do {
+ // Load a pool of samples to select from using stepped index vectors.
+ const uint8x16_t src_vals = vld1q_u8(src_x);
+
+ // For each x, src[k] contains src_x[k] for each tap position k.
+ // Whereas taps come from different arrays, src pixels are drawn from the
+ // same contiguous line.
+ const uint8x8_t src[4] = {
+ VQTbl1U8(src_vals, src_indices),
+ VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1))),
+ VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(2))),
+ VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(3)))};
+
+ vst1q_s16(intermediate,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/5>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+
+ src_x += src_stride;
+ intermediate += kIntermediateStride;
+ } while (++y < intermediate_height);
+}
+
+// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4].
+inline uint8x16_t GetSigned4TapFilter(const int tap_index) {
+ assert(tap_index < 4);
+ alignas(16) static constexpr uint8_t
+ kAbsHalfSubPixel4TapSignedFilterColumns[4][16] = {
+ {0, 2, 4, 5, 6, 6, 7, 6, 6, 5, 5, 5, 4, 3, 2, 1},
+ {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+ {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+ {0, 1, 2, 3, 4, 5, 5, 5, 6, 6, 7, 6, 6, 5, 4, 2}};
+
+ return vld1q_u8(kAbsHalfSubPixel4TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width <= 4.
+inline void ConvolveKernelHorizontalSigned4Tap(
+ const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
+ const int step_x, const int intermediate_height, int16_t* intermediate) {
+ const int kernel_offset = 2;
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const uint8x16_t filter_taps0 = GetSigned4TapFilter(0);
+ const uint8x16_t filter_taps1 = GetSigned4TapFilter(1);
+ const uint8x16_t filter_taps2 = GetSigned4TapFilter(2);
+ const uint8x16_t filter_taps3 = GetSigned4TapFilter(3);
+ const uint16x4_t index_steps = vmul_n_u16(vcreate_u16(0x0003000200010000),
+ static_cast<uint16_t>(step_x));
+
+ const int p = subpixel_x;
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x4_t p_fraction = vdup_n_u16(p & 1023);
+ const uint16x4_t subpel_index_offsets = vadd_u16(index_steps, p_fraction);
+ const uint8x8_t filter_index_offsets = vshrn_n_u16(
+ vcombine_u16(subpel_index_offsets, vdup_n_u16(0)), kFilterIndexShift);
+ const uint8x8_t filter_indices =
+ vand_u8(filter_index_offsets, filter_index_mask);
+ // Note that filter_id depends on x.
+ // For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
+ const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
+ VQTbl1U8(filter_taps1, filter_indices),
+ VQTbl1U8(filter_taps2, filter_indices),
+ VQTbl1U8(filter_taps3, filter_indices)};
+
+ const uint8x8_t src_indices_base =
+ vshr_n_u8(filter_index_offsets, kScaleSubPixelBits - kFilterIndexShift);
+
+ const uint8x8_t src_indices[4] = {src_indices_base,
+ vadd_u8(src_indices_base, vdup_n_u8(1)),
+ vadd_u8(src_indices_base, vdup_n_u8(2)),
+ vadd_u8(src_indices_base, vdup_n_u8(3))};
+
+ int y = 0;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x16_t src_vals = vld1q_u8(src_x);
+
+ // For each x, src[k] contains src_x[k] for each tap position k.
+ // Whereas taps come from different arrays, src pixels are drawn from the
+ // same contiguous line.
+ const uint8x8_t src[4] = {
+ VQTbl1U8(src_vals, src_indices[0]), VQTbl1U8(src_vals, src_indices[1]),
+ VQTbl1U8(src_vals, src_indices[2]), VQTbl1U8(src_vals, src_indices[3])};
+
+ vst1q_s16(intermediate,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/4>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate += kIntermediateStride;
+ } while (++y < intermediate_height);
+}
+
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
+inline uint8x16_t GetSigned6TapFilter(const int tap_index) {
+ assert(tap_index < 6);
+ alignas(16) static constexpr uint8_t
+ kAbsHalfSubPixel6TapSignedFilterColumns[6][16] = {
+ {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
+ {0, 3, 5, 6, 7, 7, 8, 7, 7, 6, 6, 6, 5, 4, 2, 1},
+ {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+ {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+ {0, 1, 2, 4, 5, 6, 6, 6, 7, 7, 8, 7, 7, 6, 5, 3},
+ {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+
+ return vld1q_u8(kAbsHalfSubPixel6TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned6Tap(
+ const uint8_t* src, const ptrdiff_t src_stride, const int width,
+ const int subpixel_x, const int step_x, const int intermediate_height,
+ int16_t* intermediate) {
+ const int kernel_offset = 1;
+ const uint8x8_t one = vdup_n_u8(1);
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ uint8x16_t filter_taps[6];
+ for (int i = 0; i < 6; ++i) {
+ filter_taps[i] = GetSigned6TapFilter(i);
+ }
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+ int x = 0;
+ int p = subpixel_x;
+ do {
+ // Avoid over-reading outside the reference boundaries. The stepped-index
+ // load window can cover up to 24 source bytes.
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ int16_t* intermediate_x = intermediate + x;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+ uint8x8_t src_lookup[6];
+ src_lookup[0] = src_indices;
+ for (int i = 1; i < 6; ++i) {
+ src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+ }
+
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+ filter_index_mask);
+ // For each x, a lane of taps[k] has
+ // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+ // on x.
+ uint8x8_t taps[6];
+ for (int i = 0; i < 6; ++i) {
+ taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
+ }
+ int y = 0;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+ const uint8x8_t src[6] = {
+ vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
+ vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
+ vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5])};
+
+ vst1q_s16(intermediate_x,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/0>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate_x += kIntermediateStride;
+ } while (++y < intermediate_height);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[1]. This filter
+// has mixed positive and negative outer taps which are handled in
+// GetMixed6TapFilter().
+inline uint8x16_t GetPositive6TapFilter(const int tap_index) {
+ assert(tap_index < 4);
+ alignas(16) static constexpr uint8_t
+ kAbsHalfSubPixel6TapPositiveFilterColumns[4][16] = {
+ {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
+ {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+ {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+ {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14}};
+
+ return vld1q_u8(kAbsHalfSubPixel6TapPositiveFilterColumns[tap_index]);
+}
+
+inline int8x16_t GetMixed6TapFilter(const int tap_index) {
+ assert(tap_index < 2);
+ alignas(
+ 16) static constexpr int8_t kHalfSubPixel6TapMixedFilterColumns[2][16] = {
+ {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
+ {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
+
+ return vld1q_s8(kHalfSubPixel6TapMixedFilterColumns[tap_index]);
+}
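+// Together these columns reconstruct the halved 6 tap filters of filter
+// index 1, with the signs of the outer taps kept in the mixed columns: e.g.
+// for filter_id 1 the mixed columns give outer taps {1, 0} and the positive
+// columns give inner taps {14, 31, 17, 1}, i.e. {1, 14, 31, 17, 1, 0}, which
+// sums to 64.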
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalMixed6Tap(
+ const uint8_t* src, const ptrdiff_t src_stride, const int width,
+ const int subpixel_x, const int step_x, const int intermediate_height,
+ int16_t* intermediate) {
+ const int kernel_offset = 1;
+ const uint8x8_t one = vdup_n_u8(1);
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ uint8x8_t taps[4];
+ int16x8_t mixed_taps[2];
+ uint8x16_t positive_filter_taps[4];
+ for (int i = 0; i < 4; ++i) {
+ positive_filter_taps[i] = GetPositive6TapFilter(i);
+ }
+ int8x16_t mixed_filter_taps[2];
+ mixed_filter_taps[0] = GetMixed6TapFilter(0);
+ mixed_filter_taps[1] = GetMixed6TapFilter(1);
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+ int x = 0;
+ int p = subpixel_x;
+ do {
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ int16_t* intermediate_x = intermediate + x;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+ uint8x8_t src_lookup[6];
+ src_lookup[0] = src_indices;
+ for (int i = 1; i < 6; ++i) {
+ src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+ }
+
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+ filter_index_mask);
+ // For each x, a lane of taps[k] has
+ // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+ // on x.
+ for (int i = 0; i < 4; ++i) {
+ taps[i] = VQTbl1U8(positive_filter_taps[i], filter_indices);
+ }
+ mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices));
+ mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices));
+
+ int y = 0;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+ int16x8_t sum_mixed = vmulq_s16(
+ mixed_taps[0], ZeroExtend(vtbl3_u8(src_vals, src_lookup[0])));
+ sum_mixed = vmlaq_s16(sum_mixed, mixed_taps[1],
+ ZeroExtend(vtbl3_u8(src_vals, src_lookup[5])));
+ uint16x8_t sum = vreinterpretq_u16_s16(sum_mixed);
+ sum = vmlal_u8(sum, taps[0], vtbl3_u8(src_vals, src_lookup[1]));
+ sum = vmlal_u8(sum, taps[1], vtbl3_u8(src_vals, src_lookup[2]));
+ sum = vmlal_u8(sum, taps[2], vtbl3_u8(src_vals, src_lookup[3]));
+ sum = vmlal_u8(sum, taps[3], vtbl3_u8(src_vals, src_lookup[4]));
+
+ vst1q_s16(intermediate_x, vrshrq_n_s16(vreinterpretq_s16_u16(sum),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate_x += kIntermediateStride;
+ } while (++y < intermediate_height);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+// Pre-transpose the 8 tap filters in |kAbsHalfSubPixelFilters|[2].
+inline uint8x16_t GetSigned8TapFilter(const int tap_index) {
+ assert(tap_index < 8);
+ alignas(16) static constexpr uint8_t
+ kAbsHalfSubPixel8TapSignedFilterColumns[8][16] = {
+ {0, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0},
+ {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
+ {0, 3, 6, 9, 11, 11, 12, 12, 12, 11, 10, 9, 7, 5, 3, 1},
+ {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
+ {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
+ {0, 1, 3, 5, 7, 9, 10, 11, 12, 12, 12, 11, 11, 9, 6, 3},
+ {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
+ {0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1}};
+
+ return vld1q_u8(kAbsHalfSubPixel8TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned8Tap(
+ const uint8_t* src, const ptrdiff_t src_stride, const int width,
+ const int subpixel_x, const int step_x, const int intermediate_height,
+ int16_t* intermediate) {
+ const uint8x8_t one = vdup_n_u8(1);
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ uint8x8_t taps[8];
+ uint8x16_t filter_taps[8];
+ for (int i = 0; i < 8; ++i) {
+ filter_taps[i] = GetSigned8TapFilter(i);
+ }
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+ int x = 0;
+ int p = subpixel_x;
+ do {
+ const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
+ int16_t* intermediate_x = intermediate + x;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+ uint8x8_t src_lookup[8];
+ src_lookup[0] = src_indices;
+ for (int i = 1; i < 8; ++i) {
+ src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+ }
+
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+ filter_index_mask);
+ // For each x, a lane of taps[k] has
+ // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+ // on x.
+ for (int i = 0; i < 8; ++i) {
+ taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
+ }
+
+ int y = 0;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+ const uint8x8_t src[8] = {
+ vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
+ vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
+ vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5]),
+ vtbl3_u8(src_vals, src_lookup[6]), vtbl3_u8(src_vals, src_lookup[7])};
+
+ vst1q_s16(intermediate_x,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/2>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate_x += kIntermediateStride;
+ } while (++y < intermediate_height);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+// This function handles blocks of width 2 or 4.
+template <int num_taps, int grade_y, int width, bool is_compound>
+void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
+ const int filter_index, const int step_y,
+ const int height, void* dest,
+ const ptrdiff_t dest_stride) {
+ constexpr ptrdiff_t src_stride = kIntermediateStride;
+ const int16_t* src_y = src;
+ // |dest| is 16-bit in compound mode, Pixel otherwise.
+ uint16_t* dest16_y = static_cast<uint16_t*>(dest);
+ uint8_t* dest_y = static_cast<uint8_t*>(dest);
+ int16x4_t s[num_taps + grade_y];
+
+ int p = subpixel_y & 1023;
+ int prev_p = p;
+ int y = 0;
+ do { // y < height
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = vld1_s16(src_y + i * src_stride);
+ }
+ int filter_id = (p >> 6) & kSubPixelMask;
+ int16x8_t filter =
+ vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+ int16x4_t sums = Sum2DVerticalTaps4<num_taps, is_compound>(s, filter);
+ if (is_compound) {
+ assert(width != 2);
+ const uint16x4_t result = vreinterpret_u16_s16(sums);
+ vst1_u16(dest16_y, result);
+ } else {
+ const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
+ if (width == 2) {
+ Store2<0>(dest_y, result);
+ } else {
+ StoreLo4(dest_y, result);
+ }
+ }
+ p += step_y;
+ const int p_diff =
+ (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+ prev_p = p;
+ // Here we load extra source in case it is needed. If |p_diff| == 0, these
+ // values will be unused, but it's faster to load than to branch.
+ s[num_taps] = vld1_s16(src_y + num_taps * src_stride);
+ if (grade_y > 1) {
+ s[num_taps + 1] = vld1_s16(src_y + (num_taps + 1) * src_stride);
+ }
+ dest16_y += dest_stride;
+ dest_y += dest_stride;
+
+ filter_id = (p >> 6) & kSubPixelMask;
+ filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+ sums = Sum2DVerticalTaps4<num_taps, is_compound>(&s[p_diff], filter);
+ if (is_compound) {
+ assert(width != 2);
+ const uint16x4_t result = vreinterpret_u16_s16(sums);
+ vst1_u16(dest16_y, result);
+ } else {
+ const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
+ if (width == 2) {
+ Store2<0>(dest_y, result);
+ } else {
+ StoreLo4(dest_y, result);
+ }
+ }
+ p += step_y;
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+ prev_p = p;
+ dest16_y += dest_stride;
+ dest_y += dest_stride;
+
+ y += 2;
+ } while (y < height);
+}
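+// For example, with step_y == 1536 and a zero starting fraction, successive
+// output rows read source rows 0, 1, 3, 4, 6, ... with filter_ids
+// (p >> 6) & 15 = 0, 8, 0, 8, ...; the second row of each pair reuses the
+// vectors already in |s|, offset by |p_diff|.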
+
+template <int num_taps, int grade_y, bool is_compound>
+inline void ConvolveVerticalScale(const int16_t* src, const int width,
+ const int subpixel_y, const int filter_index,
+ const int step_y, const int height,
+ void* dest, const ptrdiff_t dest_stride) {
+ constexpr ptrdiff_t src_stride = kIntermediateStride;
+ // A possible improvement is to use arithmetic to decide how many times to
+ // apply filters to the same source rows before checking whether to load new
+ // ones.
+ // However, this will only improve performance with very small step sizes.
+ int16x8_t s[num_taps + grade_y];
+ // |dest| is 16-bit in compound mode, Pixel otherwise.
+ uint16_t* dest16_y;
+ uint8_t* dest_y;
+
+ int x = 0;
+ do { // x < width
+ const int16_t* src_x = src + x;
+ const int16_t* src_y = src_x;
+ dest16_y = static_cast<uint16_t*>(dest) + x;
+ dest_y = static_cast<uint8_t*>(dest) + x;
+ int p = subpixel_y & 1023;
+ int prev_p = p;
+ int y = 0;
+ do { // y < height
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = vld1q_s16(src_y + i * src_stride);
+ }
+ int filter_id = (p >> 6) & kSubPixelMask;
+ int16x8_t filter =
+ vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+ int16x8_t sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(s, filter);
+ if (is_compound) {
+ vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
+ } else {
+ vst1_u8(dest_y, vqmovun_s16(sum));
+ }
+ p += step_y;
+ const int p_diff =
+ (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+ // |grade_y| > 1 always means p_diff > 0, so load vectors that may be
+ // needed. Otherwise, we only need to load one vector because |p_diff|
+ // can't exceed 1.
+ s[num_taps] = vld1q_s16(src_y + num_taps * src_stride);
+ if (grade_y > 1) {
+ s[num_taps + 1] = vld1q_s16(src_y + (num_taps + 1) * src_stride);
+ }
+ dest16_y += dest_stride;
+ dest_y += dest_stride;
+
+ filter_id = (p >> 6) & kSubPixelMask;
+ filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+ sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(&s[p_diff], filter);
+ if (is_compound) {
+ vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
+ } else {
+ vst1_u8(dest_y, vqmovun_s16(sum));
+ }
+ p += step_y;
+ src_y = src_x + (p >> kScaleSubPixelBits) * src_stride;
+ prev_p = p;
+ dest16_y += dest_stride;
+ dest_y += dest_stride;
+
+ y += 2;
+ } while (y < height);
+ x += 8;
+ } while (x < width);
+}
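+// |grade_y| mirrors |grade_x| for the vertical pass: step_y <= 1024 advances
+// at most one source row per output row (grade_y == 1), while larger steps
+// may advance two (grade_y == 2), matching the dispatch in
+// ConvolveScale2D_NEON below.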
+
+template <bool is_compound>
+void ConvolveScale2D_NEON(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index, const int subpixel_x,
+ const int subpixel_y, const int step_x,
+ const int step_y, const int width, const int height,
+ void* prediction, const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ assert(step_x <= 2048);
+ const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
+ const int intermediate_height =
+ (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+ kScaleSubPixelBits) +
+ num_vert_taps;
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+ (2 * kMaxSuperBlockSizeInPixels + 8)];
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [3, 5].
+ // Similarly for height.
+ int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ int16_t* intermediate = intermediate_result;
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference);
+ const int vert_kernel_offset = (8 - num_vert_taps) / 2;
+ src += vert_kernel_offset * src_stride;
+
+ // Derive the maximum value of |step_x| at which all source values fit in one
+ // 16-byte load. Final index is src_x + |num_taps| - 1 < 16
+ // step_x*7 is the final base subpel index for the shuffle mask for filter
+ // inputs in each iteration on large blocks. When step_x is large, we need a
+ // larger structure and use a larger table lookup in order to gather all
+ // filter inputs.
+ // |num_taps| - 1 is the shuffle index of the final filter input.
+ const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+ const int kernel_start_ceiling = 16 - num_horiz_taps;
+ // This truncated quotient |grade_x_threshold| selects |step_x| such that:
+ // (step_x * 7) >> kScaleSubPixelBits < single load limit
+ const int grade_x_threshold =
+ (kernel_start_ceiling << kScaleSubPixelBits) / 7;
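+ // For example, an 8 tap filter gives kernel_start_ceiling = 8 and
+ // grade_x_threshold = (8 << 10) / 7 = 1170, while the 2 tap filter gives
+ // (14 << 10) / 7 = 2048; combined with the assert above, the 2 tap kernels
+ // below therefore always take the grade_x == 1 path.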
+ switch (filter_index) {
+ case 0:
+ if (step_x > grade_x_threshold) {
+ ConvolveKernelHorizontalSigned6Tap<2>(
+ src, src_stride, width, subpixel_x, step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveKernelHorizontalSigned6Tap<1>(
+ src, src_stride, width, subpixel_x, step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 1:
+ if (step_x > grade_x_threshold) {
+ ConvolveKernelHorizontalMixed6Tap<2>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+
+ } else {
+ ConvolveKernelHorizontalMixed6Tap<1>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 2:
+ if (step_x > grade_x_threshold) {
+ ConvolveKernelHorizontalSigned8Tap<2>(
+ src, src_stride, width, subpixel_x, step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveKernelHorizontalSigned8Tap<1>(
+ src, src_stride, width, subpixel_x, step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 3:
+ if (step_x > grade_x_threshold) {
+ ConvolveKernelHorizontal2Tap<2>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveKernelHorizontal2Tap<1>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 4:
+ assert(width <= 4);
+ ConvolveKernelHorizontalSigned4Tap(src, src_stride, subpixel_x, step_x,
+ intermediate_height, intermediate);
+ break;
+ default:
+ assert(filter_index == 5);
+ ConvolveKernelHorizontalPositive4Tap(src, src_stride, subpixel_x, step_x,
+ intermediate_height, intermediate);
+ }
+ // Vertical filter.
+ filter_index = GetFilterIndex(vertical_filter_index, height);
+ intermediate = intermediate_result;
+
+ switch (filter_index) {
+ case 0:
+ case 1:
+ if (step_y <= 1024) {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<6, 1, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<6, 1, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<6, 1, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ } else {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<6, 2, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<6, 2, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<6, 2, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ }
+ break;
+ case 2:
+ if (step_y <= 1024) {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<8, 1, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<8, 1, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<8, 1, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ } else {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<8, 2, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<8, 2, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<8, 2, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ }
+ break;
+ case 3:
+ if (step_y <= 1024) {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<2, 1, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<2, 1, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<2, 1, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ } else {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<2, 2, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<2, 2, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<2, 2, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ }
+ break;
+ case 4:
+ default:
+ assert(filter_index == 4 || filter_index == 5);
+ assert(height <= 4);
+ if (step_y <= 1024) {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<4, 1, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<4, 1, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<4, 1, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ } else {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<4, 2, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<4, 2, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<4, 2, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ }
+ }
+}
+
+void ConvolveHorizontal_NEON(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int /*vertical_filter_index*/,
+ const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ // Set |src| to the outermost tap.
+ const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+ horizontal_filter_id, filter_index);
+}
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// Vertical calculations.
+uint16x8_t Compound1DShift(const int16x8_t sum) {
+ return vreinterpretq_u16_s16(
+ vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+}
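+// For example, a flat region of value 100: full-precision taps sum to 128, so
+// 12800 >> kInterRoundBitsHorizontal (3) = 1600; with the halved taps the sum
+// is 6400 and the (kInterRoundBitsHorizontal - 1) shift gives 6400 >> 2 =
+// 1600, the same compound value.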
+
+template <int filter_index, bool is_compound = false,
+ bool negative_outside_taps = false>
+void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int width, const int height,
+ const uint8x8_t* const taps) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ const int next_row = num_taps - 1;
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+ assert(width >= 8);
+
+ int x = 0;
+ do {
+ const uint8_t* src_x = src + x;
+ uint8x8_t srcs[8];
+ srcs[0] = vld1_u8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = vld1_u8(src_x);
+ src_x += src_stride;
+ srcs[2] = vld1_u8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = vld1_u8(src_x);
+ src_x += src_stride;
+ srcs[4] = vld1_u8(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = vld1_u8(src_x);
+ src_x += src_stride;
+ srcs[6] = vld1_u8(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[next_row] = vld1_u8(src_x);
+ src_x += src_stride;
+
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = Compound1DShift(sums);
+ vst1q_u16(dst16 + x + y * dst_stride, results);
+ } else {
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+ vst1_u8(dst8 + x + y * dst_stride, results);
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (++y < height);
+ x += 8;
+ } while (x < width);
+}
+
+template <int filter_index, bool is_compound = false,
+ bool negative_outside_taps = false>
+void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int height, const uint8x8_t* const taps) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ uint8x8_t srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = vdup_n_u8(0);
+
+ srcs[0] = Load4(src);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ srcs[0] = Load4<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[2] = Load4<0>(src, srcs[2]);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = Compound1DShift(sums);
+
+ vst1q_u16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ StoreLo4(dst8, results);
+ dst8 += dst_stride;
+ StoreHi4(dst8, results);
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ y += 2;
+ } while (y < height);
+ } else if (num_taps == 4) {
+ srcs[4] = vdup_n_u8(0);
+
+ srcs[0] = Load4(src);
+ src += src_stride;
+ srcs[0] = Load4<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[2] = Load4(src);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+
+ int y = 0;
+ do {
+ srcs[2] = Load4<1>(src, srcs[2]);
+ src += src_stride;
+ srcs[4] = Load4<0>(src, srcs[4]);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = Compound1DShift(sums);
+
+ vst1q_u16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ StoreLo4(dst8, results);
+ dst8 += dst_stride;
+ StoreHi4(dst8, results);
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ y += 2;
+ } while (y < height);
+ } else if (num_taps == 6) {
+ srcs[6] = vdup_n_u8(0);
+
+ srcs[0] = Load4(src);
+ src += src_stride;
+ srcs[0] = Load4<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[2] = Load4(src);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+ srcs[2] = Load4<1>(src, srcs[2]);
+ src += src_stride;
+ srcs[4] = Load4(src);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+
+ int y = 0;
+ do {
+ srcs[4] = Load4<1>(src, srcs[4]);
+ src += src_stride;
+ srcs[6] = Load4<0>(src, srcs[6]);
+ src += src_stride;
+ srcs[5] = vext_u8(srcs[4], srcs[6], 4);
+
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = Compound1DShift(sums);
+
+ vst1q_u16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ StoreLo4(dst8, results);
+ dst8 += dst_stride;
+ StoreHi4(dst8, results);
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ y += 2;
+ } while (y < height);
+ } else if (num_taps == 8) {
+ srcs[8] = vdup_n_u8(0);
+
+ srcs[0] = Load4(src);
+ src += src_stride;
+ srcs[0] = Load4<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[2] = Load4(src);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+ srcs[2] = Load4<1>(src, srcs[2]);
+ src += src_stride;
+ srcs[4] = Load4(src);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+ srcs[4] = Load4<1>(src, srcs[4]);
+ src += src_stride;
+ srcs[6] = Load4(src);
+ src += src_stride;
+ srcs[5] = vext_u8(srcs[4], srcs[6], 4);
+
+ int y = 0;
+ do {
+ srcs[6] = Load4<1>(src, srcs[6]);
+ src += src_stride;
+ srcs[8] = Load4<0>(src, srcs[8]);
+ src += src_stride;
+ srcs[7] = vext_u8(srcs[6], srcs[8], 4);
+
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = Compound1DShift(sums);
+
+ vst1q_u16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ StoreLo4(dst8, results);
+ dst8 += dst_stride;
+ StoreHi4(dst8, results);
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ y += 2;
+ } while (y < height);
+ }
+}
+
+template <int filter_index, bool negative_outside_taps = false>
+void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int height, const uint8x8_t* const taps) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ uint8x8_t srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = vdup_n_u8(0);
+
+ srcs[0] = Load2(src);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ srcs[2] = Load2<0>(src, srcs[2]);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[2], 2);
+
+ // This uses srcs[0]..srcs[1].
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ Store2<0>(dst8, results);
+ dst8 += dst_stride;
+ Store2<1>(dst8, results);
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2<2>(dst8, results);
+ dst8 += dst_stride;
+ Store2<3>(dst8, results);
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[2];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 4) {
+ srcs[4] = vdup_n_u8(0);
+
+ srcs[0] = Load2(src);
+ src += src_stride;
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ srcs[4] = Load2<0>(src, srcs[4]);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+
+ // This uses srcs[0]..srcs[3].
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ Store2<0>(dst8, results);
+ dst8 += dst_stride;
+ Store2<1>(dst8, results);
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2<2>(dst8, results);
+ dst8 += dst_stride;
+ Store2<3>(dst8, results);
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 6) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = vdup_n_u8(0);
+
+ srcs[0] = Load2(src);
+ src += src_stride;
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ srcs[4] = Load2(src);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+
+ int y = 0;
+ do {
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+ srcs[5] = vext_u8(srcs[4], srcs[8], 2);
+
+ // This uses srcs[0]..srcs[5].
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ Store2<0>(dst8, results);
+ dst8 += dst_stride;
+ Store2<1>(dst8, results);
+ dst8 += dst_stride;
+ Store2<2>(dst8, results);
+ dst8 += dst_stride;
+ Store2<3>(dst8, results);
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 8) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = vdup_n_u8(0);
+
+ srcs[0] = Load2(src);
+ src += src_stride;
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ srcs[4] = Load2(src);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+
+ int y = 0;
+ do {
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+ srcs[5] = vext_u8(srcs[4], srcs[8], 2);
+ srcs[8] = Load2<1>(src, srcs[8]);
+ src += src_stride;
+ srcs[6] = vext_u8(srcs[4], srcs[8], 4);
+ srcs[8] = Load2<2>(src, srcs[8]);
+ src += src_stride;
+ srcs[7] = vext_u8(srcs[4], srcs[8], 6);
+
+ // This uses srcs[0]..srcs[7].
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ Store2<0>(dst8, results);
+ dst8 += dst_stride;
+ Store2<1>(dst8, results);
+ dst8 += dst_stride;
+ Store2<2>(dst8, results);
+ dst8 += dst_stride;
+ Store2<3>(dst8, results);
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ y += 4;
+ } while (y < height);
+ }
+}
+
+// This function is a simplified version of Convolve2D_C.
+// It is called in single prediction mode, where only vertical filtering is
+// required.
+// The output is the single prediction of the block, clipped to valid pixel
+// range.
+void ConvolveVertical_NEON(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int vertical_filter_index,
+ const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ uint8x8_t taps[8];
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ taps[k] =
+ vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
+ }
+
+ if (filter_index == 0) { // 6 tap.
+ if (width == 2) {
+ FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height,
+ taps + 1);
+ } else if (width == 4) {
+ FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height,
+ taps + 1);
+ } else {
+ FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+ taps + 1);
+ }
+ } else if ((filter_index == 1) & ((vertical_filter_id == 1) |
+ (vertical_filter_id == 15))) { // 5 tap.
+ if (width == 2) {
+ FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height,
+ taps + 1);
+ } else if (width == 4) {
+ FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height,
+ taps + 1);
+ } else {
+ FilterVertical<1>(src, src_stride, dest, dest_stride, width, height,
+ taps + 1);
+ }
+ } else if ((filter_index == 1) &
+ ((vertical_filter_id == 7) | (vertical_filter_id == 8) |
+ (vertical_filter_id == 9))) { // 6 tap with weird negative taps.
+ if (width == 2) {
+ FilterVertical2xH<1,
+ /*negative_outside_taps=*/true>(
+ src, src_stride, dest, dest_stride, height, taps + 1);
+ } else if (width == 4) {
+ FilterVertical4xH<1, /*is_compound=*/false,
+ /*negative_outside_taps=*/true>(
+ src, src_stride, dest, dest_stride, height, taps + 1);
+ } else {
+ FilterVertical<1, /*is_compound=*/false, /*negative_outside_taps=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps + 1);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ if (width == 2) {
+ FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ if (width == 2) {
+ FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height,
+ taps + 3);
+ } else if (width == 4) {
+ FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height,
+ taps + 3);
+ } else {
+ FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+ taps + 3);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ // Outside taps are negative.
+ if (width == 2) {
+ FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height,
+ taps + 2);
+ } else if (width == 4) {
+ FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height,
+ taps + 2);
+ } else {
+ FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
+ taps + 2);
+ }
+ } else {
+ // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
+ // below map to 4 tap filters.
+ assert(filter_index == 5 ||
+ (filter_index == 1 &&
+ (vertical_filter_id == 2 || vertical_filter_id == 3 ||
+ vertical_filter_id == 4 || vertical_filter_id == 5 ||
+ vertical_filter_id == 6 || vertical_filter_id == 10 ||
+ vertical_filter_id == 11 || vertical_filter_id == 12 ||
+ vertical_filter_id == 13 || vertical_filter_id == 14)));
+ // According to GetNumTapsInFilter() this has 6 taps but here we are
+ // treating it as though it has 4.
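+    // |src| was offset above assuming 6 taps (two rows of context above the
+    // block); a 4 tap filter only needs one row, so advance |src| to
+    // compensate.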
+ if (filter_index == 1) src += src_stride;
+ if (width == 2) {
+ FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height,
+ taps + 2);
+ } else if (width == 4) {
+ FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height,
+ taps + 2);
+ } else {
+ FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
+ taps + 2);
+ }
+ }
+}
+
+void ConvolveCompoundCopy_NEON(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ const ptrdiff_t src_stride = reference_stride;
+ auto* dest = static_cast<uint16_t*>(prediction);
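+  // The compound path stores predictions at intermediate precision, so the
+  // 8-bit source pixels are widened and left-shifted by |final_shift|.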
+ constexpr int final_shift =
+ kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
+
+ if (width >= 16) {
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const uint8x16_t v_src = vld1q_u8(&src[x]);
+ const uint16x8_t v_dest_lo =
+ vshll_n_u8(vget_low_u8(v_src), final_shift);
+ const uint16x8_t v_dest_hi =
+ vshll_n_u8(vget_high_u8(v_src), final_shift);
+ vst1q_u16(&dest[x], v_dest_lo);
+ x += 8;
+ vst1q_u16(&dest[x], v_dest_hi);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ dest += width;
+ } while (++y < height);
+ } else if (width == 8) {
+ int y = 0;
+ do {
+ const uint8x8_t v_src = vld1_u8(&src[0]);
+ const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
+ vst1q_u16(&dest[0], v_dest);
+ src += src_stride;
+ dest += width;
+ } while (++y < height);
+ } else { /* width == 4 */
+ uint8x8_t v_src = vdup_n_u8(0);
+
+ int y = 0;
+ do {
+ v_src = Load4<0>(&src[0], v_src);
+ src += src_stride;
+ v_src = Load4<1>(&src[0], v_src);
+ src += src_stride;
+ const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
+ vst1q_u16(&dest[0], v_dest);
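+      // |v_dest| holds two rows of 4 values, so advance |dest| by 8.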
+ dest += 4 << 1;
+ y += 2;
+ } while (y < height);
+ }
+}
+
+void ConvolveCompoundVertical_NEON(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int vertical_filter_index,
+ const int /*horizontal_filter_id*/, const int vertical_filter_id,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* dest = static_cast<uint16_t*>(prediction);
+ assert(vertical_filter_id != 0);
+
+ uint8x8_t taps[8];
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ taps[k] =
+ vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
+ }
+
+ if (filter_index == 0) { // 6 tap.
+ if (width == 4) {
+ FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 1);
+ } else {
+ FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 1);
+ }
+ } else if ((filter_index == 1) & ((vertical_filter_id == 1) |
+ (vertical_filter_id == 15))) { // 5 tap.
+ if (width == 4) {
+ FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 1);
+ } else {
+ FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 1);
+ }
+ } else if ((filter_index == 1) &
+ ((vertical_filter_id == 7) | (vertical_filter_id == 8) |
+ (vertical_filter_id == 9))) { // 6 tap with weird negative taps.
+ if (width == 4) {
+ FilterVertical4xH<1, /*is_compound=*/true,
+ /*negative_outside_taps=*/true>(src, src_stride, dest,
+ 4, height, taps + 1);
+ } else {
+ FilterVertical<1, /*is_compound=*/true, /*negative_outside_taps=*/true>(
+ src, src_stride, dest, width, width, height, taps + 1);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ if (width == 4) {
+ FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ if (width == 4) {
+ FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 3);
+ } else {
+ FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 3);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ if (width == 4) {
+ FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 2);
+ } else {
+ FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 2);
+ }
+ } else {
+    // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
+    // below map to 4 tap filters.
+ assert(filter_index == 5 ||
+ (filter_index == 1 &&
+ (vertical_filter_id == 2 || vertical_filter_id == 3 ||
+ vertical_filter_id == 4 || vertical_filter_id == 5 ||
+ vertical_filter_id == 6 || vertical_filter_id == 10 ||
+ vertical_filter_id == 11 || vertical_filter_id == 12 ||
+ vertical_filter_id == 13 || vertical_filter_id == 14)));
+ // According to GetNumTapsInFilter() this has 6 taps but here we are
+ // treating it as though it has 4.
+ if (filter_index == 1) src += src_stride;
+ if (width == 4) {
+ FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 2);
+ } else {
+ FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 2);
+ }
+ }
+}
+
+void ConvolveCompoundHorizontal_NEON(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int horizontal_filter_index, const int /*vertical_filter_index*/,
+ const int horizontal_filter_id, const int /*vertical_filter_id*/,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* dest = static_cast<uint16_t*>(prediction);
+
+ DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
+ src, reference_stride, dest, width, width, height, horizontal_filter_id,
+ filter_index);
+}
+
+void ConvolveCompound2D_NEON(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [4, 5].
+ // Similarly for height.
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
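+  // The horizontal pass must also produce the rows of context that the
+  // vertical pass reads above and below the block, hence the extra
+  // |vertical_taps| - 1 rows.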
+ const int intermediate_height = height + vertical_taps - 1;
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* const src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride -
+ kHorizontalOffset;
+
+ DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+ src, src_stride, intermediate_result, width, width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+
+ // Vertical filter.
+ auto* dest = static_cast<uint16_t*>(prediction);
+ assert(vertical_filter_id != 0);
+
+ const ptrdiff_t dest_stride = width;
+ const int16x8_t taps = vmovl_s8(
+ vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
+
+ if (vertical_taps == 8) {
+ if (width == 4) {
+ Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<8, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else if (vertical_taps == 6) {
+ if (width == 4) {
+ Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<6, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else if (vertical_taps == 4) {
+ if (width == 4) {
+ Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<4, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else { // |vertical_taps| == 2
+ if (width == 4) {
+ Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<2, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ }
+}
+
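+// Averages each pixel with its right neighbor using a rounding halving add,
+// i.e. (src[x] + src[x + 1] + 1) >> 1, the half-pel horizontal average used
+// by intra block copy.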
+inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
+ const uint8x16_t left = vld1q_u8(src);
+ const uint8x16_t right = vld1q_u8(src + 1);
+ vst1q_u8(dst, vrhaddq_u8(left, right));
+}
+
+template <int width>
+inline void IntraBlockCopyHorizontal(const uint8_t* src,
+ const ptrdiff_t src_stride,
+ const int height, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
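+  // Within a row, |src| and |dst| are advanced 16 bytes per vector after the
+  // first (width - 16 in total), so the remainder strides above step the
+  // pointers to the start of the next row.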
+
+ int y = 0;
+ do {
+ HalfAddHorizontal(src, dst);
+ if (width >= 32) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ if (width >= 64) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ if (width == 128) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ }
+ }
+ }
+ src += src_remainder_stride;
+ dst += dst_remainder_stride;
+ } while (++y < height);
+}
+
+void ConvolveIntraBlockCopyHorizontal_NEON(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+    const int width, const int height, void* const prediction,
+    const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width == 128) {
+ IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 8) {
+ int y = 0;
+ do {
+ const uint8x8_t left = vld1_u8(src);
+ const uint8x8_t right = vld1_u8(src + 1);
+ vst1_u8(dest, vrhadd_u8(left, right));
+
+ src += reference_stride;
+ dest += pred_stride;
+ } while (++y < height);
+ } else if (width == 4) {
+ uint8x8_t left = vdup_n_u8(0);
+ uint8x8_t right = vdup_n_u8(0);
+ int y = 0;
+ do {
+ left = Load4<0>(src, left);
+ right = Load4<0>(src + 1, right);
+ src += reference_stride;
+ left = Load4<1>(src, left);
+ right = Load4<1>(src + 1, right);
+ src += reference_stride;
+
+ const uint8x8_t result = vrhadd_u8(left, right);
+
+ StoreLo4(dest, result);
+ dest += pred_stride;
+ StoreHi4(dest, result);
+ dest += pred_stride;
+ y += 2;
+ } while (y < height);
+ } else {
+ assert(width == 2);
+ uint8x8_t left = vdup_n_u8(0);
+ uint8x8_t right = vdup_n_u8(0);
+ int y = 0;
+ do {
+ left = Load2<0>(src, left);
+ right = Load2<0>(src + 1, right);
+ src += reference_stride;
+ left = Load2<1>(src, left);
+ right = Load2<1>(src + 1, right);
+ src += reference_stride;
+
+ const uint8x8_t result = vrhadd_u8(left, right);
+
+ Store2<0>(dest, result);
+ dest += pred_stride;
+ Store2<1>(dest, result);
+ dest += pred_stride;
+ y += 2;
+ } while (y < height);
+ }
+}
+
+template <int width>
+inline void IntraBlockCopyVertical(const uint8_t* src,
+ const ptrdiff_t src_stride, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+ uint8x16_t row[8], below[8];
+
+ row[0] = vld1q_u8(src);
+ if (width >= 32) {
+ src += 16;
+ row[1] = vld1q_u8(src);
+ if (width >= 64) {
+ src += 16;
+ row[2] = vld1q_u8(src);
+ src += 16;
+ row[3] = vld1q_u8(src);
+ if (width == 128) {
+ src += 16;
+ row[4] = vld1q_u8(src);
+ src += 16;
+ row[5] = vld1q_u8(src);
+ src += 16;
+ row[6] = vld1q_u8(src);
+ src += 16;
+ row[7] = vld1q_u8(src);
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ int y = 0;
+ do {
+ below[0] = vld1q_u8(src);
+ if (width >= 32) {
+ src += 16;
+ below[1] = vld1q_u8(src);
+ if (width >= 64) {
+ src += 16;
+ below[2] = vld1q_u8(src);
+ src += 16;
+ below[3] = vld1q_u8(src);
+ if (width == 128) {
+ src += 16;
+ below[4] = vld1q_u8(src);
+ src += 16;
+ below[5] = vld1q_u8(src);
+ src += 16;
+ below[6] = vld1q_u8(src);
+ src += 16;
+ below[7] = vld1q_u8(src);
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ vst1q_u8(dst, vrhaddq_u8(row[0], below[0]));
+ row[0] = below[0];
+ if (width >= 32) {
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[1], below[1]));
+ row[1] = below[1];
+ if (width >= 64) {
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[2], below[2]));
+ row[2] = below[2];
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[3], below[3]));
+ row[3] = below[3];
+        if (width == 128) {
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[4], below[4]));
+ row[4] = below[4];
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[5], below[5]));
+ row[5] = below[5];
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[6], below[6]));
+ row[6] = below[6];
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[7], below[7]));
+ row[7] = below[7];
+ }
+ }
+ }
+ dst += dst_remainder_stride;
+ } while (++y < height);
+}
+
+void ConvolveIntraBlockCopyVertical_NEON(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+ const int width, const int height, void* const prediction,
+ const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width == 128) {
+ IntraBlockCopyVertical<128>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopyVertical<64>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopyVertical<32>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopyVertical<16>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 8) {
+ uint8x8_t row, below;
+ row = vld1_u8(src);
+ src += reference_stride;
+
+ int y = 0;
+ do {
+ below = vld1_u8(src);
+ src += reference_stride;
+
+ vst1_u8(dest, vrhadd_u8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (++y < height);
+ } else if (width == 4) {
+ uint8x8_t row = Load4(src);
+ uint8x8_t below = vdup_n_u8(0);
+ src += reference_stride;
+
+ int y = 0;
+ do {
+ below = Load4<0>(src, below);
+ src += reference_stride;
+
+ StoreLo4(dest, vrhadd_u8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (++y < height);
+ } else {
+ assert(width == 2);
+ uint8x8_t row = Load2(src);
+ uint8x8_t below = vdup_n_u8(0);
+ src += reference_stride;
+
+ int y = 0;
+ do {
+ below = Load2<0>(src, below);
+ src += reference_stride;
+
+ Store2<0>(dest, vrhadd_u8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (++y < height);
+ }
+}
+
+template <int width>
+inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
+ const int height, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
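+  // Each |row| vector holds the horizontal pair sums src[x] + src[x + 1] of
+  // the current row; adding the next row's pair sums and rounding with
+  // vrshrn_n_u16(..., 2) yields the 2D average (tl + tr + bl + br + 2) >> 2.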
+ uint16x8_t row[16];
+ row[0] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ if (width >= 16) {
+ src += 8;
+ row[1] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ if (width >= 32) {
+ src += 8;
+ row[2] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[3] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ if (width >= 64) {
+ src += 8;
+ row[4] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[5] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[6] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[7] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ if (width == 128) {
+ src += 8;
+ row[8] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[9] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[10] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[11] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[12] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[13] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[14] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[15] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ }
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ int y = 0;
+ do {
+ const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2));
+ row[0] = below_0;
+ if (width >= 16) {
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_1 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[1], below_1), 2));
+ row[1] = below_1;
+ if (width >= 32) {
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_2 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[2], below_2), 2));
+ row[2] = below_2;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_3 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[3], below_3), 2));
+ row[3] = below_3;
+ if (width >= 64) {
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_4 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[4], below_4), 2));
+ row[4] = below_4;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_5 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[5], below_5), 2));
+ row[5] = below_5;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_6 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[6], below_6), 2));
+ row[6] = below_6;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_7 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[7], below_7), 2));
+ row[7] = below_7;
+ if (width == 128) {
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_8 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[8], below_8), 2));
+ row[8] = below_8;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_9 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[9], below_9), 2));
+ row[9] = below_9;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_10 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[10], below_10), 2));
+ row[10] = below_10;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_11 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[11], below_11), 2));
+ row[11] = below_11;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_12 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[12], below_12), 2));
+ row[12] = below_12;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_13 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[13], below_13), 2));
+ row[13] = below_13;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_14 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[14], below_14), 2));
+ row[14] = below_14;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_15 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[15], below_15), 2));
+ row[15] = below_15;
+ }
+ }
+ }
+ }
+ src += src_remainder_stride;
+ dst += dst_remainder_stride;
+ } while (++y < height);
+}
+
+void ConvolveIntraBlockCopy2D_NEON(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+ const int width, const int height, void* const prediction,
+ const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+  // Note: the filter reads one row beyond |height|. Because this function is
+  // only used for the u/v planes of intra block copy, such access is
+  // guaranteed to be within the prediction block.
+
+ if (width == 128) {
+ IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 8) {
+ IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 4) {
+ uint8x8_t left = Load4(src);
+ uint8x8_t right = Load4(src + 1);
+ src += reference_stride;
+
+ uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
+
+ int y = 0;
+ do {
+ left = Load4<0>(src, left);
+ right = Load4<0>(src + 1, right);
+ src += reference_stride;
+ left = Load4<1>(src, left);
+ right = Load4<1>(src + 1, right);
+ src += reference_stride;
+
+ const uint16x8_t below = vaddl_u8(left, right);
+
+ const uint8x8_t result = vrshrn_n_u16(
+ vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2);
+ StoreLo4(dest, result);
+ dest += pred_stride;
+ StoreHi4(dest, result);
+ dest += pred_stride;
+
+ row = vget_high_u16(below);
+ y += 2;
+ } while (y < height);
+ } else {
+ uint8x8_t left = Load2(src);
+ uint8x8_t right = Load2(src + 1);
+ src += reference_stride;
+
+ uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
+
+ int y = 0;
+ do {
+ left = Load2<0>(src, left);
+ right = Load2<0>(src + 1, right);
+ src += reference_stride;
+ left = Load2<2>(src, left);
+ right = Load2<2>(src + 1, right);
+ src += reference_stride;
+
+ const uint16x8_t below = vaddl_u8(left, right);
+
+ const uint8x8_t result = vrshrn_n_u16(
+ vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2);
+ Store2<0>(dest, result);
+ dest += pred_stride;
+ Store2<2>(dest, result);
+ dest += pred_stride;
+
+ row = vget_high_u16(below);
+ y += 2;
+ } while (y < height);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
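+  // The first index selects intra block copy, the second compound prediction,
+  // and the last two the presence of vertical and horizontal filtering.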
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_NEON;
+ dsp->convolve[0][0][1][0] = ConvolveVertical_NEON;
+ dsp->convolve[0][0][1][1] = Convolve2D_NEON;
+
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_NEON;
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_NEON;
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_NEON;
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_NEON;
+
+ dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_NEON;
+ dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_NEON;
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_NEON;
+
+ dsp->convolve_scale[0] = ConvolveScale2D_NEON<false>;
+ dsp->convolve_scale[1] = ConvolveScale2D_NEON<true>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void ConvolveInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/convolve_neon.h b/src/dsp/arm/convolve_neon.h
new file mode 100644
index 0000000..948ef4d
--- /dev/null
+++ b/src/dsp/arm/convolve_neon.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve. This function is not thread-safe.
+void ConvolveInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundCopy LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompound2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_ConvolveScale2D LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
diff --git a/src/dsp/arm/distance_weighted_blend_neon.cc b/src/dsp/arm/distance_weighted_blend_neon.cc
new file mode 100644
index 0000000..04952ab
--- /dev/null
+++ b/src/dsp/arm/distance_weighted_blend_neon.cc
@@ -0,0 +1,203 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/distance_weighted_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kInterPostRoundBit = 4;
+
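+// Computes weights[0] * pred0 + weights[1] * pred1, narrowed with rounding by
+// kInterPostRoundBit + 4 and saturated to int16_t. The extra 4 removes the
+// scale of the distance weights, which sum to 16.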
+inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0,
+ const int16x8_t pred1,
+ const int16x4_t weights[2]) {
+ // TODO(https://issuetracker.google.com/issues/150325685): Investigate range.
+ const int32x4_t wpred0_lo = vmull_s16(weights[0], vget_low_s16(pred0));
+ const int32x4_t wpred0_hi = vmull_s16(weights[0], vget_high_s16(pred0));
+ const int32x4_t blended_lo =
+ vmlal_s16(wpred0_lo, weights[1], vget_low_s16(pred1));
+ const int32x4_t blended_hi =
+ vmlal_s16(wpred0_hi, weights[1], vget_high_s16(pred1));
+
+ return vcombine_s16(vqrshrn_n_s32(blended_lo, kInterPostRoundBit + 4),
+ vqrshrn_n_s32(blended_hi, kInterPostRoundBit + 4));
+}
+
+template <int width, int height>
+inline void DistanceWeightedBlendSmall_NEON(const int16_t* prediction_0,
+ const int16_t* prediction_1,
+ const int16x4_t weights[2],
+ void* const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
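+  // Each iteration handles 16 values: four rows at width 4, two rows at
+  // width 8.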
+ constexpr int step = 16 / width;
+
+ for (int y = 0; y < height; y += step) {
+ const int16x8_t src_00 = vld1q_s16(prediction_0);
+ const int16x8_t src_10 = vld1q_s16(prediction_1);
+ prediction_0 += 8;
+ prediction_1 += 8;
+ const int16x8_t res0 = ComputeWeightedAverage8(src_00, src_10, weights);
+
+ const int16x8_t src_01 = vld1q_s16(prediction_0);
+ const int16x8_t src_11 = vld1q_s16(prediction_1);
+ prediction_0 += 8;
+ prediction_1 += 8;
+ const int16x8_t res1 = ComputeWeightedAverage8(src_01, src_11, weights);
+
+ const uint8x8_t result0 = vqmovun_s16(res0);
+ const uint8x8_t result1 = vqmovun_s16(res1);
+ if (width == 4) {
+ StoreLo4(dst, result0);
+ dst += dest_stride;
+ StoreHi4(dst, result0);
+ dst += dest_stride;
+ StoreLo4(dst, result1);
+ dst += dest_stride;
+ StoreHi4(dst, result1);
+ dst += dest_stride;
+ } else {
+ assert(width == 8);
+ vst1_u8(dst, result0);
+ dst += dest_stride;
+ vst1_u8(dst, result1);
+ dst += dest_stride;
+ }
+ }
+}
+
+inline void DistanceWeightedBlendLarge_NEON(const int16_t* prediction_0,
+ const int16_t* prediction_1,
+ const int16x4_t weights[2],
+ const int width, const int height,
+ void* const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const int16x8_t src0_lo = vld1q_s16(prediction_0 + x);
+ const int16x8_t src1_lo = vld1q_s16(prediction_1 + x);
+ const int16x8_t res_lo =
+ ComputeWeightedAverage8(src0_lo, src1_lo, weights);
+
+ const int16x8_t src0_hi = vld1q_s16(prediction_0 + x + 8);
+ const int16x8_t src1_hi = vld1q_s16(prediction_1 + x + 8);
+ const int16x8_t res_hi =
+ ComputeWeightedAverage8(src0_hi, src1_hi, weights);
+
+ const uint8x16_t result =
+ vcombine_u8(vqmovun_s16(res_lo), vqmovun_s16(res_hi));
+ vst1q_u8(dst + x, result);
+ x += 16;
+ } while (x < width);
+ dst += dest_stride;
+ prediction_0 += width;
+ prediction_1 += width;
+ } while (--y != 0);
+}
+
+inline void DistanceWeightedBlend_NEON(const void* prediction_0,
+ const void* prediction_1,
+ const uint8_t weight_0,
+ const uint8_t weight_1, const int width,
+ const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int16x4_t weights[2] = {vdup_n_s16(weight_0), vdup_n_s16(weight_1)};
+ // TODO(johannkoenig): Investigate the branching. May be fine to call with a
+ // variable height.
+ if (width == 4) {
+ if (height == 4) {
+ DistanceWeightedBlendSmall_NEON<4, 4>(pred_0, pred_1, weights, dest,
+ dest_stride);
+ } else if (height == 8) {
+ DistanceWeightedBlendSmall_NEON<4, 8>(pred_0, pred_1, weights, dest,
+ dest_stride);
+ } else {
+ assert(height == 16);
+ DistanceWeightedBlendSmall_NEON<4, 16>(pred_0, pred_1, weights, dest,
+ dest_stride);
+ }
+ return;
+ }
+
+ if (width == 8) {
+ switch (height) {
+ case 4:
+ DistanceWeightedBlendSmall_NEON<8, 4>(pred_0, pred_1, weights, dest,
+ dest_stride);
+ return;
+ case 8:
+ DistanceWeightedBlendSmall_NEON<8, 8>(pred_0, pred_1, weights, dest,
+ dest_stride);
+ return;
+ case 16:
+ DistanceWeightedBlendSmall_NEON<8, 16>(pred_0, pred_1, weights, dest,
+ dest_stride);
+ return;
+ default:
+ assert(height == 32);
+ DistanceWeightedBlendSmall_NEON<8, 32>(pred_0, pred_1, weights, dest,
+ dest_stride);
+
+ return;
+ }
+ }
+
+ DistanceWeightedBlendLarge_NEON(pred_0, pred_1, weights, width, height, dest,
+ dest_stride);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->distance_weighted_blend = DistanceWeightedBlend_NEON;
+}
+
+} // namespace
+
+void DistanceWeightedBlendInit_NEON() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void DistanceWeightedBlendInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/distance_weighted_blend_neon.h b/src/dsp/arm/distance_weighted_blend_neon.h
new file mode 100644
index 0000000..4d8824c
--- /dev/null
+++ b/src/dsp/arm/distance_weighted_blend_neon.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::distance_weighted_blend. This function is not thread-safe.
+void DistanceWeightedBlendInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If NEON is enabled signal the NEON implementation should be used instead of
+// normal C.
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_
diff --git a/src/dsp/arm/film_grain_neon.cc b/src/dsp/arm/film_grain_neon.cc
new file mode 100644
index 0000000..2612466
--- /dev/null
+++ b/src/dsp/arm/film_grain_neon.cc
@@ -0,0 +1,1188 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/film_grain.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <new>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/arm/film_grain_neon.h"
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace film_grain {
+namespace {
+
+// These functions are overloaded for both possible element sizes (8-bit and
+// 16-bit) in order to simplify loading from and storing to the intermediate
+// value types from within a template function.
+inline int16x8_t GetSignedSource8(const int8_t* src) {
+ return vmovl_s8(vld1_s8(src));
+}
+
+inline int16x8_t GetSignedSource8(const uint8_t* src) {
+ return ZeroExtend(vld1_u8(src));
+}
+
+inline void StoreUnsigned8(uint8_t* dest, const uint16x8_t data) {
+ vst1_u8(dest, vmovn_u16(data));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+inline int16x8_t GetSignedSource8(const int16_t* src) { return vld1q_s16(src); }
+
+inline int16x8_t GetSignedSource8(const uint16_t* src) {
+ return vreinterpretq_s16_u16(vld1q_u16(src));
+}
+
+inline void StoreUnsigned8(uint16_t* dest, const uint16x8_t data) {
+ vst1q_u16(dest, data);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+// Each element in |sum| represents one destination value's running
+// autoregression formula. The fixed source values in |grain_lo| and |grain_hi|
+// allow for a sliding window in successive calls to this function.
+template <int position_offset>
+inline int32x4x2_t AccumulateWeightedGrain(const int16x8_t grain_lo,
+ const int16x8_t grain_hi,
+ int16_t coeff, int32x4x2_t sum) {
+ const int16x8_t grain = vextq_s16(grain_lo, grain_hi, position_offset);
+ sum.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(grain), coeff);
+ sum.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(grain), coeff);
+ return sum;
+}
+
+// Because the autoregressive filter requires the output of each pixel to
+// compute pixels that come after in the row, we have to finish the calculations
+// one at a time.
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegression(int8_t* grain_cursor, int32x4x2_t sum,
+ const int8_t* coeffs, int pos, int shift) {
+ int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
+
+ for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
+ result += grain_cursor[lane + delta_col] * coeffs[pos];
+ ++pos;
+ }
+ grain_cursor[lane] =
+ Clip3(grain_cursor[lane] + RightShiftWithRounding(result, shift),
+ GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegression(int16_t* grain_cursor, int32x4x2_t sum,
+ const int8_t* coeffs, int pos, int shift) {
+ int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
+
+ for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
+ result += grain_cursor[lane + delta_col] * coeffs[pos];
+ ++pos;
+ }
+ grain_cursor[lane] =
+ Clip3(grain_cursor[lane] + RightShiftWithRounding(result, shift),
+ GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+// Because the autoregressive filter requires the output of each pixel to
+// compute pixels that come after in the row, we have to finish the calculations
+// one at a time.
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegressionChroma(int8_t* u_grain_cursor,
+ int8_t* v_grain_cursor,
+ int32x4x2_t sum_u, int32x4x2_t sum_v,
+ const int8_t* coeffs_u,
+ const int8_t* coeffs_v, int pos,
+ int shift) {
+ WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+ u_grain_cursor, sum_u, coeffs_u, pos, shift);
+ WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+ v_grain_cursor, sum_v, coeffs_v, pos, shift);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegressionChroma(int16_t* u_grain_cursor,
+ int16_t* v_grain_cursor,
+ int32x4x2_t sum_u, int32x4x2_t sum_v,
+ const int8_t* coeffs_u,
+ const int8_t* coeffs_v, int pos,
+ int shift) {
+ WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+ u_grain_cursor, sum_u, coeffs_u, pos, shift);
+ WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+ v_grain_cursor, sum_v, coeffs_v, pos, shift);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+inline void SetZero(int32x4x2_t* v) {
+ v->val[0] = vdupq_n_s32(0);
+ v->val[1] = vdupq_n_s32(0);
+}
+
+// Computes subsampled luma for use with chroma, by averaging in the x direction
+// or y direction when applicable.
+int16x8_t GetSubsampledLuma(const int8_t* const luma, int subsampling_x,
+ int subsampling_y, ptrdiff_t stride) {
+ if (subsampling_y != 0) {
+ assert(subsampling_x != 0);
+ const int8x16_t src0 = vld1q_s8(luma);
+ const int8x16_t src1 = vld1q_s8(luma + stride);
+ const int16x8_t ret0 = vcombine_s16(vpaddl_s8(vget_low_s8(src0)),
+ vpaddl_s8(vget_high_s8(src0)));
+ const int16x8_t ret1 = vcombine_s16(vpaddl_s8(vget_low_s8(src1)),
+ vpaddl_s8(vget_high_s8(src1)));
+ return vrshrq_n_s16(vaddq_s16(ret0, ret1), 2);
+ }
+ if (subsampling_x != 0) {
+ const int8x16_t src = vld1q_s8(luma);
+ return vrshrq_n_s16(
+ vcombine_s16(vpaddl_s8(vget_low_s8(src)), vpaddl_s8(vget_high_s8(src))),
+ 1);
+ }
+ return vmovl_s8(vld1_s8(luma));
+}
+
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline uint16x8_t GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
+ if (subsampling_x != 0) {
+ const uint8x16_t src = vld1q_u8(luma);
+ return vrshrq_n_u16(vpaddlq_u8(src), 1);
+ }
+ return vmovl_u8(vld1_u8(luma));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+// Computes subsampled luma for use with chroma, by averaging in the x direction
+// or y direction when applicable.
+int16x8_t GetSubsampledLuma(const int16_t* const luma, int subsampling_x,
+ int subsampling_y, ptrdiff_t stride) {
+ if (subsampling_y != 0) {
+ assert(subsampling_x != 0);
+ int16x8_t src0_lo = vld1q_s16(luma);
+ int16x8_t src0_hi = vld1q_s16(luma + 8);
+ const int16x8_t src1_lo = vld1q_s16(luma + stride);
+ const int16x8_t src1_hi = vld1q_s16(luma + stride + 8);
+ const int16x8_t src0 =
+ vcombine_s16(vpadd_s16(vget_low_s16(src0_lo), vget_high_s16(src0_lo)),
+ vpadd_s16(vget_low_s16(src0_hi), vget_high_s16(src0_hi)));
+ const int16x8_t src1 =
+ vcombine_s16(vpadd_s16(vget_low_s16(src1_lo), vget_high_s16(src1_lo)),
+ vpadd_s16(vget_low_s16(src1_hi), vget_high_s16(src1_hi)));
+ return vrshrq_n_s16(vaddq_s16(src0, src1), 2);
+ }
+ if (subsampling_x != 0) {
+ const int16x8_t src_lo = vld1q_s16(luma);
+ const int16x8_t src_hi = vld1q_s16(luma + 8);
+ const int16x8_t ret =
+ vcombine_s16(vpadd_s16(vget_low_s16(src_lo), vget_high_s16(src_lo)),
+ vpadd_s16(vget_low_s16(src_hi), vget_high_s16(src_hi)));
+ return vrshrq_n_s16(ret, 1);
+ }
+ return vld1q_s16(luma);
+}
+
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline uint16x8_t GetAverageLuma(const uint16_t* const luma,
+ int subsampling_x) {
+ if (subsampling_x != 0) {
+ const uint16x8x2_t src = vld2q_u16(luma);
+ return vrhaddq_u16(src.val[0], src.val[1]);
+ }
+ return vld1q_u16(luma);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+template <int bitdepth, typename GrainType, int auto_regression_coeff_lag,
+ bool use_luma>
+void ApplyAutoRegressiveFilterToChromaGrains_NEON(const FilmGrainParams& params,
+ const void* luma_grain_buffer,
+ int subsampling_x,
+ int subsampling_y,
+ void* u_grain_buffer,
+ void* v_grain_buffer) {
+ static_assert(auto_regression_coeff_lag <= 3, "Invalid autoregression lag.");
+ const auto* luma_grain = static_cast<const GrainType*>(luma_grain_buffer);
+ auto* u_grain = static_cast<GrainType*>(u_grain_buffer);
+ auto* v_grain = static_cast<GrainType*>(v_grain_buffer);
+ const int auto_regression_shift = params.auto_regression_shift;
+ const int chroma_width =
+ (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+ const int chroma_height =
+ (subsampling_y == 0) ? kMaxChromaHeight : kMinChromaHeight;
+ // When |chroma_width| == 44, we write 8 at a time from x in [3, 34],
+ // leaving [35, 40] to write at the end.
+ const int chroma_width_remainder =
+ (chroma_width - 2 * kAutoRegressionBorder) & 7;
+
+ int y = kAutoRegressionBorder;
+ luma_grain += kLumaWidth * y;
+ u_grain += chroma_width * y;
+ v_grain += chroma_width * y;
+ do {
+    // Each row is computed 8 values at a time in the following loop. At the
+    // end of the loop, |chroma_width_remainder| values (4 or 6) remain to
+    // write. They are given a special reduced iteration at the end.
+ int x = kAutoRegressionBorder;
+ int luma_x = kAutoRegressionBorder;
+ do {
+ int pos = 0;
+ int32x4x2_t sum_u;
+ int32x4x2_t sum_v;
+ SetZero(&sum_u);
+ SetZero(&sum_v);
+
+ if (auto_regression_coeff_lag > 0) {
+ for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+ ++delta_row) {
+          // These loads may read into the next row, but they never occur on
+          // the final row of a grain block. Therefore, they will never exceed
+          // the block boundaries.
+ // Note: this could be slightly optimized to a single load in 8bpp,
+ // but requires making a special first iteration and accumulate
+ // function that takes an int8x16_t.
+ const int16x8_t u_grain_lo =
+ GetSignedSource8(u_grain + x + delta_row * chroma_width -
+ auto_regression_coeff_lag);
+ const int16x8_t u_grain_hi =
+ GetSignedSource8(u_grain + x + delta_row * chroma_width -
+ auto_regression_coeff_lag + 8);
+ const int16x8_t v_grain_lo =
+ GetSignedSource8(v_grain + x + delta_row * chroma_width -
+ auto_regression_coeff_lag);
+ const int16x8_t v_grain_hi =
+ GetSignedSource8(v_grain + x + delta_row * chroma_width -
+ auto_regression_coeff_lag + 8);
+#define ACCUMULATE_WEIGHTED_GRAIN(offset) \
+ sum_u = AccumulateWeightedGrain<offset>( \
+ u_grain_lo, u_grain_hi, params.auto_regression_coeff_u[pos], sum_u); \
+ sum_v = AccumulateWeightedGrain<offset>( \
+ v_grain_lo, v_grain_hi, params.auto_regression_coeff_v[pos++], sum_v)
+
+ ACCUMULATE_WEIGHTED_GRAIN(0);
+ ACCUMULATE_WEIGHTED_GRAIN(1);
+ ACCUMULATE_WEIGHTED_GRAIN(2);
+ // The horizontal |auto_regression_coeff_lag| loop is replaced with
+ // if-statements to give vextq_s16 an immediate param.
+ if (auto_regression_coeff_lag > 1) {
+ ACCUMULATE_WEIGHTED_GRAIN(3);
+ ACCUMULATE_WEIGHTED_GRAIN(4);
+ }
+ if (auto_regression_coeff_lag > 2) {
+ assert(auto_regression_coeff_lag == 3);
+ ACCUMULATE_WEIGHTED_GRAIN(5);
+ ACCUMULATE_WEIGHTED_GRAIN(6);
+ }
+ }
+ }
+
+ if (use_luma) {
+ const int16x8_t luma = GetSubsampledLuma(
+ luma_grain + luma_x, subsampling_x, subsampling_y, kLumaWidth);
+
+ // Luma samples get the final coefficient in the formula, but are best
+ // computed all at once before the final row.
+ const int coeff_u =
+ params.auto_regression_coeff_u[pos + auto_regression_coeff_lag];
+ const int coeff_v =
+ params.auto_regression_coeff_v[pos + auto_regression_coeff_lag];
+
+ sum_u.val[0] = vmlal_n_s16(sum_u.val[0], vget_low_s16(luma), coeff_u);
+ sum_u.val[1] = vmlal_n_s16(sum_u.val[1], vget_high_s16(luma), coeff_u);
+ sum_v.val[0] = vmlal_n_s16(sum_v.val[0], vget_low_s16(luma), coeff_v);
+ sum_v.val[1] = vmlal_n_s16(sum_v.val[1], vget_high_s16(luma), coeff_v);
+ }
+ // At this point in the filter, the source addresses and destination
+ // addresses overlap. Because this is an auto-regressive filter, the
+ // higher lanes cannot be computed without the results of the lower lanes.
+ // Each call to WriteFinalAutoRegression incorporates preceding values
+ // on the final row, and writes a single sample. This allows the next
+ // pixel's value to be computed in the next call.
+#define WRITE_AUTO_REGRESSION_RESULT(lane) \
+ WriteFinalAutoRegressionChroma<bitdepth, auto_regression_coeff_lag, lane>( \
+ u_grain + x, v_grain + x, sum_u, sum_v, params.auto_regression_coeff_u, \
+ params.auto_regression_coeff_v, pos, auto_regression_shift)
+
+ WRITE_AUTO_REGRESSION_RESULT(0);
+ WRITE_AUTO_REGRESSION_RESULT(1);
+ WRITE_AUTO_REGRESSION_RESULT(2);
+ WRITE_AUTO_REGRESSION_RESULT(3);
+ WRITE_AUTO_REGRESSION_RESULT(4);
+ WRITE_AUTO_REGRESSION_RESULT(5);
+ WRITE_AUTO_REGRESSION_RESULT(6);
+ WRITE_AUTO_REGRESSION_RESULT(7);
+
+ x += 8;
+ luma_x += 8 << subsampling_x;
+ } while (x < chroma_width - kAutoRegressionBorder - chroma_width_remainder);
+
+ // This is the "final iteration" of the above loop over width. We fill in
+ // the remainder of the width, which is less than 8.
+ int pos = 0;
+ int32x4x2_t sum_u;
+ int32x4x2_t sum_v;
+ SetZero(&sum_u);
+ SetZero(&sum_v);
+
+ for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+ ++delta_row) {
+      // These loads may read into the next row, but they never occur on the
+      // final row of a grain block. Therefore, they will never exceed the
+      // block boundaries.
+ const int16x8_t u_grain_lo = GetSignedSource8(
+ u_grain + x + delta_row * chroma_width - auto_regression_coeff_lag);
+ const int16x8_t u_grain_hi =
+ GetSignedSource8(u_grain + x + delta_row * chroma_width -
+ auto_regression_coeff_lag + 8);
+ const int16x8_t v_grain_lo = GetSignedSource8(
+ v_grain + x + delta_row * chroma_width - auto_regression_coeff_lag);
+ const int16x8_t v_grain_hi =
+ GetSignedSource8(v_grain + x + delta_row * chroma_width -
+ auto_regression_coeff_lag + 8);
+
+ ACCUMULATE_WEIGHTED_GRAIN(0);
+ ACCUMULATE_WEIGHTED_GRAIN(1);
+ ACCUMULATE_WEIGHTED_GRAIN(2);
+ // The horizontal |auto_regression_coeff_lag| loop is replaced with
+ // if-statements to give vextq_s16 an immediate param.
+ if (auto_regression_coeff_lag > 1) {
+ ACCUMULATE_WEIGHTED_GRAIN(3);
+ ACCUMULATE_WEIGHTED_GRAIN(4);
+ }
+ if (auto_regression_coeff_lag > 2) {
+ assert(auto_regression_coeff_lag == 3);
+ ACCUMULATE_WEIGHTED_GRAIN(5);
+ ACCUMULATE_WEIGHTED_GRAIN(6);
+ }
+ }
+
+ if (use_luma) {
+ const int16x8_t luma = GetSubsampledLuma(
+ luma_grain + luma_x, subsampling_x, subsampling_y, kLumaWidth);
+
+ // Luma samples get the final coefficient in the formula, but are best
+ // computed all at once before the final row.
+ const int coeff_u =
+ params.auto_regression_coeff_u[pos + auto_regression_coeff_lag];
+ const int coeff_v =
+ params.auto_regression_coeff_v[pos + auto_regression_coeff_lag];
+
+ sum_u.val[0] = vmlal_n_s16(sum_u.val[0], vget_low_s16(luma), coeff_u);
+ sum_u.val[1] = vmlal_n_s16(sum_u.val[1], vget_high_s16(luma), coeff_u);
+ sum_v.val[0] = vmlal_n_s16(sum_v.val[0], vget_low_s16(luma), coeff_v);
+ sum_v.val[1] = vmlal_n_s16(sum_v.val[1], vget_high_s16(luma), coeff_v);
+ }
+
+ WRITE_AUTO_REGRESSION_RESULT(0);
+ WRITE_AUTO_REGRESSION_RESULT(1);
+ WRITE_AUTO_REGRESSION_RESULT(2);
+ WRITE_AUTO_REGRESSION_RESULT(3);
+ if (chroma_width_remainder == 6) {
+ WRITE_AUTO_REGRESSION_RESULT(4);
+ WRITE_AUTO_REGRESSION_RESULT(5);
+ }
+
+ luma_grain += kLumaWidth << subsampling_y;
+ u_grain += chroma_width;
+ v_grain += chroma_width;
+ } while (++y < chroma_height);
+#undef ACCUMULATE_WEIGHTED_GRAIN
+#undef WRITE_AUTO_REGRESSION_RESULT
+}
+
+// Applies an auto-regressive filter to the white noise in luma_grain.
+template <int bitdepth, typename GrainType, int auto_regression_coeff_lag>
+void ApplyAutoRegressiveFilterToLumaGrain_NEON(const FilmGrainParams& params,
+ void* luma_grain_buffer) {
+ static_assert(auto_regression_coeff_lag > 0, "");
+ const int8_t* const auto_regression_coeff_y = params.auto_regression_coeff_y;
+ const uint8_t auto_regression_shift = params.auto_regression_shift;
+
+ int y = kAutoRegressionBorder;
+ auto* luma_grain =
+ static_cast<GrainType*>(luma_grain_buffer) + kLumaWidth * y;
+ do {
+ // Each row is computed 8 values at a time in the following loop. At the
+ // end of the loop, 4 values remain to write. They are given a special
+ // reduced iteration at the end.
+ int x = kAutoRegressionBorder;
+ do {
+ int pos = 0;
+ int32x4x2_t sum;
+ SetZero(&sum);
+ for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+ ++delta_row) {
+        // These loads may read into the next row, but they never occur on the
+        // final row of a grain block. Therefore, they will never exceed the
+        // block boundaries.
+ const int16x8_t src_grain_lo =
+ GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
+ auto_regression_coeff_lag);
+ const int16x8_t src_grain_hi =
+ GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
+ auto_regression_coeff_lag + 8);
+
+ // A pictorial representation of the auto-regressive filter for
+ // various values of params.auto_regression_coeff_lag. The letter 'O'
+ // represents the current sample. (The filter always operates on the
+ // current sample with filter coefficient 1.) The letters 'X'
+ // represent the neighboring samples that the filter operates on, below
+ // their corresponding "offset" number.
+ //
+ // params.auto_regression_coeff_lag == 3:
+ // 0 1 2 3 4 5 6
+ // X X X X X X X
+ // X X X X X X X
+ // X X X X X X X
+ // X X X O
+ // params.auto_regression_coeff_lag == 2:
+ // 0 1 2 3 4
+ // X X X X X
+ // X X X X X
+ // X X O
+ // params.auto_regression_coeff_lag == 1:
+ // 0 1 2
+ // X X X
+ // X O
+ // params.auto_regression_coeff_lag == 0:
+ // O
+ // The function relies on the caller to skip the call in the 0 lag
+ // case.
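+      // The rows of 'X' samples above 'O' are handled by the vector
+      // accumulation below. The 'X' samples on the same row as 'O' depend on
+      // previously filtered outputs, so WriteFinalAutoRegression folds them in
+      // one lane at a time.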
+
+#define ACCUMULATE_WEIGHTED_GRAIN(offset) \
+ sum = AccumulateWeightedGrain<offset>(src_grain_lo, src_grain_hi, \
+ auto_regression_coeff_y[pos++], sum)
+ ACCUMULATE_WEIGHTED_GRAIN(0);
+ ACCUMULATE_WEIGHTED_GRAIN(1);
+ ACCUMULATE_WEIGHTED_GRAIN(2);
+ // The horizontal |auto_regression_coeff_lag| loop is replaced with
+ // if-statements to give vextq_s16 an immediate param.
+ if (auto_regression_coeff_lag > 1) {
+ ACCUMULATE_WEIGHTED_GRAIN(3);
+ ACCUMULATE_WEIGHTED_GRAIN(4);
+ }
+ if (auto_regression_coeff_lag > 2) {
+ assert(auto_regression_coeff_lag == 3);
+ ACCUMULATE_WEIGHTED_GRAIN(5);
+ ACCUMULATE_WEIGHTED_GRAIN(6);
+ }
+ }
+ // At this point in the filter, the source addresses and destination
+ // addresses overlap. Because this is an auto-regressive filter, the
+ // higher lanes cannot be computed without the results of the lower lanes.
+ // Each call to WriteFinalAutoRegression incorporates preceding values
+ // on the final row, and writes a single sample. This allows the next
+ // pixel's value to be computed in the next call.
+#define WRITE_AUTO_REGRESSION_RESULT(lane) \
+ WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>( \
+ luma_grain + x, sum, auto_regression_coeff_y, pos, \
+ auto_regression_shift)
+
+ WRITE_AUTO_REGRESSION_RESULT(0);
+ WRITE_AUTO_REGRESSION_RESULT(1);
+ WRITE_AUTO_REGRESSION_RESULT(2);
+ WRITE_AUTO_REGRESSION_RESULT(3);
+ WRITE_AUTO_REGRESSION_RESULT(4);
+ WRITE_AUTO_REGRESSION_RESULT(5);
+ WRITE_AUTO_REGRESSION_RESULT(6);
+ WRITE_AUTO_REGRESSION_RESULT(7);
+ x += 8;
+ // Leave the final four pixels for the special iteration below.
+ } while (x < kLumaWidth - kAutoRegressionBorder - 4);
+
+ // Final 4 pixels in the row.
+ int pos = 0;
+ int32x4x2_t sum;
+ SetZero(&sum);
+ for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+ ++delta_row) {
+ const int16x8_t src_grain_lo = GetSignedSource8(
+ luma_grain + x + delta_row * kLumaWidth - auto_regression_coeff_lag);
+ const int16x8_t src_grain_hi =
+ GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
+ auto_regression_coeff_lag + 8);
+
+ ACCUMULATE_WEIGHTED_GRAIN(0);
+ ACCUMULATE_WEIGHTED_GRAIN(1);
+ ACCUMULATE_WEIGHTED_GRAIN(2);
+ // The horizontal |auto_regression_coeff_lag| loop is replaced with
+ // if-statements to give vextq_s16 an immediate param.
+ if (auto_regression_coeff_lag > 1) {
+ ACCUMULATE_WEIGHTED_GRAIN(3);
+ ACCUMULATE_WEIGHTED_GRAIN(4);
+ }
+ if (auto_regression_coeff_lag > 2) {
+ assert(auto_regression_coeff_lag == 3);
+ ACCUMULATE_WEIGHTED_GRAIN(5);
+ ACCUMULATE_WEIGHTED_GRAIN(6);
+ }
+ }
+    // delta_row == 0: fold in the current row's left neighbors and write the
+    // final 4 samples.
+ WRITE_AUTO_REGRESSION_RESULT(0);
+ WRITE_AUTO_REGRESSION_RESULT(1);
+ WRITE_AUTO_REGRESSION_RESULT(2);
+ WRITE_AUTO_REGRESSION_RESULT(3);
+ luma_grain += kLumaWidth;
+ } while (++y < kLumaHeight);
+
+#undef WRITE_AUTO_REGRESSION_RESULT
+#undef ACCUMULATE_WEIGHTED_GRAIN
+}
+
+void InitializeScalingLookupTable_NEON(
+ int num_points, const uint8_t point_value[], const uint8_t point_scaling[],
+ uint8_t scaling_lut[kScalingLookupTableSize]) {
+ if (num_points == 0) {
+ memset(scaling_lut, 0, sizeof(scaling_lut[0]) * kScalingLookupTableSize);
+ return;
+ }
+ static_assert(sizeof(scaling_lut[0]) == 1, "");
+ memset(scaling_lut, point_scaling[0], point_value[0]);
+ const uint32x4_t steps = vmovl_u16(vcreate_u16(0x0003000200010000));
+ const uint32x4_t offset = vdupq_n_u32(32768);
+ for (int i = 0; i < num_points - 1; ++i) {
+ const int delta_y = point_scaling[i + 1] - point_scaling[i];
+ const int delta_x = point_value[i + 1] - point_value[i];
+ const int delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
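+    // |delta| is the segment slope in 16.16 fixed point, rounded to nearest.
+    // As an illustration with hypothetical points, point_value {0, 64} and
+    // point_scaling {0, 40} give delta = 40 * 1024 = 40960, i.e. 0.625. The
+    // 32768 in |offset| rounds the later >> 16 to nearest.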
+ const int delta4 = delta << 2;
+ const uint8x8_t base_point = vdup_n_u8(point_scaling[i]);
+ uint32x4_t upscaled_points0 = vmlaq_n_u32(offset, steps, delta);
+ const uint32x4_t line_increment4 = vdupq_n_u32(delta4);
+ // Get the second set of 4 points by adding 4 steps to the first set.
+ uint32x4_t upscaled_points1 = vaddq_u32(upscaled_points0, line_increment4);
+ // We obtain the next set of 8 points by adding 8 steps to each of the
+ // current 8 points.
+ const uint32x4_t line_increment8 = vshlq_n_u32(line_increment4, 1);
+ int x = 0;
+ do {
+ const uint16x4_t interp_points0 = vshrn_n_u32(upscaled_points0, 16);
+ const uint16x4_t interp_points1 = vshrn_n_u32(upscaled_points1, 16);
+ const uint8x8_t interp_points =
+ vmovn_u16(vcombine_u16(interp_points0, interp_points1));
+      // The spec guarantees that the max value of |point_value[i]| + x is 255.
+      // Writing 8 bytes starting at the final table byte therefore requires 7
+      // bytes of padding past the end of the table.
+ vst1_u8(&scaling_lut[point_value[i] + x],
+ vadd_u8(interp_points, base_point));
+ upscaled_points0 = vaddq_u32(upscaled_points0, line_increment8);
+ upscaled_points1 = vaddq_u32(upscaled_points1, line_increment8);
+ x += 8;
+ } while (x < delta_x);
+ }
+ const uint8_t last_point_value = point_value[num_points - 1];
+ memset(&scaling_lut[last_point_value], point_scaling[num_points - 1],
+ kScalingLookupTableSize - last_point_value);
+}
+
+inline int16x8_t Clip3(const int16x8_t value, const int16x8_t low,
+ const int16x8_t high) {
+ const int16x8_t clipped_to_ceiling = vminq_s16(high, value);
+ return vmaxq_s16(low, clipped_to_ceiling);
+}
+
+template <int bitdepth, typename Pixel>
+inline int16x8_t GetScalingFactors(
+ const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* source) {
+ int16_t start_vals[8];
+ if (bitdepth == 8) {
+ start_vals[0] = scaling_lut[source[0]];
+ start_vals[1] = scaling_lut[source[1]];
+ start_vals[2] = scaling_lut[source[2]];
+ start_vals[3] = scaling_lut[source[3]];
+ start_vals[4] = scaling_lut[source[4]];
+ start_vals[5] = scaling_lut[source[5]];
+ start_vals[6] = scaling_lut[source[6]];
+ start_vals[7] = scaling_lut[source[7]];
+ return vld1q_s16(start_vals);
+ }
+ int16_t end_vals[8];
+ // TODO(petersonab): Precompute this into a larger table for direct lookups.
+ int index = source[0] >> 2;
+ start_vals[0] = scaling_lut[index];
+ end_vals[0] = scaling_lut[index + 1];
+ index = source[1] >> 2;
+ start_vals[1] = scaling_lut[index];
+ end_vals[1] = scaling_lut[index + 1];
+ index = source[2] >> 2;
+ start_vals[2] = scaling_lut[index];
+ end_vals[2] = scaling_lut[index + 1];
+ index = source[3] >> 2;
+ start_vals[3] = scaling_lut[index];
+ end_vals[3] = scaling_lut[index + 1];
+ index = source[4] >> 2;
+ start_vals[4] = scaling_lut[index];
+ end_vals[4] = scaling_lut[index + 1];
+ index = source[5] >> 2;
+ start_vals[5] = scaling_lut[index];
+ end_vals[5] = scaling_lut[index + 1];
+ index = source[6] >> 2;
+ start_vals[6] = scaling_lut[index];
+ end_vals[6] = scaling_lut[index + 1];
+ index = source[7] >> 2;
+ start_vals[7] = scaling_lut[index];
+ end_vals[7] = scaling_lut[index + 1];
+ const int16x8_t start = vld1q_s16(start_vals);
+ const int16x8_t end = vld1q_s16(end_vals);
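+  // The high bitdepth table has one entry per 4 source values, so interpolate
+  // linearly between adjacent entries using the low 2 bits of the source:
+  //   result = start + Round2((end - start) * (source & 3), 2)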
+ int16x8_t remainder = GetSignedSource8(source);
+ remainder = vandq_s16(remainder, vdupq_n_s16(3));
+ const int16x8_t delta = vmulq_s16(vsubq_s16(end, start), remainder);
+ return vaddq_s16(start, vrshrq_n_s16(delta, 2));
+}
+
+inline int16x8_t ScaleNoise(const int16x8_t noise, const int16x8_t scaling,
+ const int16x8_t scaling_shift_vect) {
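+  // |scaling_shift_vect| holds -scaling_shift, so the vrshlq_s16 below is a
+  // rounding right shift by scaling_shift.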
+ const int16x8_t upscaled_noise = vmulq_s16(noise, scaling);
+ return vrshlq_s16(upscaled_noise, scaling_shift_vect);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+inline int16x8_t ScaleNoise(const int16x8_t noise, const int16x8_t scaling,
+ const int32x4_t scaling_shift_vect) {
+ // TODO(petersonab): Try refactoring scaling lookup table to int16_t and
+ // upscaling by 7 bits to permit high half multiply. This would eliminate
+ // the intermediate 32x4 registers. Also write the averaged values directly
+ // into the table so it doesn't have to be done for every pixel in
+ // the frame.
+ const int32x4_t upscaled_noise_lo =
+ vmull_s16(vget_low_s16(noise), vget_low_s16(scaling));
+ const int32x4_t upscaled_noise_hi =
+ vmull_s16(vget_high_s16(noise), vget_high_s16(scaling));
+ const int16x4_t noise_lo =
+ vmovn_s32(vrshlq_s32(upscaled_noise_lo, scaling_shift_vect));
+ const int16x4_t noise_hi =
+ vmovn_s32(vrshlq_s32(upscaled_noise_hi, scaling_shift_vect));
+ return vcombine_s16(noise_lo, noise_hi);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageLuma_NEON(
+ const void* noise_image_ptr, int min_value, int max_luma, int scaling_shift,
+ int width, int height, int start_height,
+ const uint8_t scaling_lut_y[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y,
+ ptrdiff_t dest_stride_y) {
+ const auto* noise_image =
+ static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+ const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
+ source_stride_y /= sizeof(Pixel);
+ auto* out_y_row = static_cast<Pixel*>(dest_plane_y);
+ dest_stride_y /= sizeof(Pixel);
+ const int16x8_t floor = vdupq_n_s16(min_value);
+ const int16x8_t ceiling = vdupq_n_s16(max_luma);
+ // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
+ // for 16 bit signed integers. In higher bitdepths, however, we have to
+ // expand to 32 to protect the sign bit.
+ const int16x8_t scaling_shift_vect16 = vdupq_n_s16(-scaling_shift);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ const int32x4_t scaling_shift_vect32 = vdupq_n_s32(-scaling_shift);
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ // This operation on the unsigned input is safe in 8bpp because the vector
+ // is widened before it is reinterpreted.
+ const int16x8_t orig = GetSignedSource8(&in_y_row[x]);
+ const int16x8_t scaling =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
+ int16x8_t noise =
+ GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
+
+ if (bitdepth == 8) {
+ noise = ScaleNoise(noise, scaling, scaling_shift_vect16);
+ } else {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ noise = ScaleNoise(noise, scaling, scaling_shift_vect32);
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ }
+ const int16x8_t combined = vaddq_s16(orig, noise);
+      // In 8bpp, when params_.clip_to_restricted_range == false, we can replace
+      // clipping with vqmovun_s16, but it's not likely to be worth copying the
+      // function for just that case, since the gain would be very small.
+ StoreUnsigned8(&out_y_row[x],
+ vreinterpretq_u16_s16(Clip3(combined, floor, ceiling)));
+ x += 8;
+ } while (x < width);
+ in_y_row += source_stride_y;
+ out_y_row += dest_stride_y;
+ } while (++y < height);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+inline int16x8_t BlendChromaValsWithCfl(
+ const Pixel* average_luma_buffer,
+ const uint8_t scaling_lut[kScalingLookupTableSize],
+ const Pixel* chroma_cursor, const GrainType* noise_image_cursor,
+ const int16x8_t scaling_shift_vect16,
+ const int32x4_t scaling_shift_vect32) {
+ const int16x8_t scaling =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
+ const int16x8_t orig = GetSignedSource8(chroma_cursor);
+ int16x8_t noise = GetSignedSource8(noise_image_cursor);
+ if (bitdepth == 8) {
+ noise = ScaleNoise(noise, scaling, scaling_shift_vect16);
+ } else {
+ noise = ScaleNoise(noise, scaling, scaling_shift_vect32);
+ }
+ return vaddq_s16(orig, noise);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON(
+ const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, int scaling_shift,
+ const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* in_y_row,
+ ptrdiff_t source_stride_y, const Pixel* in_chroma_row,
+ ptrdiff_t source_stride_chroma, Pixel* out_chroma_row,
+ ptrdiff_t dest_stride) {
+ const int16x8_t floor = vdupq_n_s16(min_value);
+ const int16x8_t ceiling = vdupq_n_s16(max_chroma);
+ Pixel luma_buffer[16];
+ memset(luma_buffer, 0, sizeof(luma_buffer));
+ // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
+ // for 16 bit signed integers. In higher bitdepths, however, we have to
+ // expand to 32 to protect the sign bit.
+ const int16x8_t scaling_shift_vect16 = vdupq_n_s16(-scaling_shift);
+ const int32x4_t scaling_shift_vect32 = vdupq_n_s32(-scaling_shift);
+
+ const int chroma_height = (height + subsampling_y) >> subsampling_y;
+ const int chroma_width = (width + subsampling_x) >> subsampling_x;
+ const int safe_chroma_width = chroma_width & ~7;
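+  // |safe_chroma_width| is the widest multiple of 8 that stays within the
+  // valid chroma row; the remainder is handled below with the padded
+  // |luma_buffer|.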
+
+ // Writing to this buffer avoids the cost of doing 8 lane lookups in a row
+ // in GetScalingFactors.
+ Pixel average_luma_buffer[8];
+ assert(start_height % 2 == 0);
+ start_height >>= subsampling_y;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const int luma_x = x << subsampling_x;
+ // TODO(petersonab): Consider specializing by subsampling_x. In the 444
+ // case &in_y_row[x] can be passed to GetScalingFactors directly.
+ const uint16x8_t average_luma =
+ GetAverageLuma(&in_y_row[luma_x], subsampling_x);
+ StoreUnsigned8(average_luma_buffer, average_luma);
+
+ const int16x8_t blended =
+ BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+ average_luma_buffer, scaling_lut, &in_chroma_row[x],
+ &(noise_image[y + start_height][x]), scaling_shift_vect16,
+ scaling_shift_vect32);
+
+ // In 8bpp, when params_.clip_to_restricted_range == false, we can replace
+ // clipping with vqmovun_s16, but it's not likely to be worth copying the
+ // function for just that case.
+ StoreUnsigned8(&out_chroma_row[x],
+ vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+ x += 8;
+ } while (x < safe_chroma_width);
+
+ if (x < chroma_width) {
+ const int luma_x = x << subsampling_x;
+ const int valid_range = width - luma_x;
+ memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
+ luma_buffer[valid_range] = in_y_row[width - 1];
+ const uint16x8_t average_luma =
+ GetAverageLuma(luma_buffer, subsampling_x);
+ StoreUnsigned8(average_luma_buffer, average_luma);
+
+ const int16x8_t blended =
+ BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+ average_luma_buffer, scaling_lut, &in_chroma_row[x],
+ &(noise_image[y + start_height][x]), scaling_shift_vect16,
+ scaling_shift_vect32);
+ // In 8bpp, when params_.clip_to_restricted_range == false, we can replace
+ // clipping with vqmovun_s16, but it's not likely to be worth copying the
+ // function for just that case.
+ StoreUnsigned8(&out_chroma_row[x],
+ vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+ }
+
+ in_y_row += source_stride_y << subsampling_y;
+ in_chroma_row += source_stride_chroma;
+ out_chroma_row += dest_stride;
+ } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == true.
+// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChromaWithCfl_NEON(
+ Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+ int min_value, int max_chroma, int width, int height, int start_height,
+ int subsampling_x, int subsampling_y,
+ const uint8_t scaling_lut[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+ const auto* noise_image =
+ static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+ const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+ source_stride_y /= sizeof(Pixel);
+
+ const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+ source_stride_uv /= sizeof(Pixel);
+ auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+ dest_stride_uv /= sizeof(Pixel);
+ // Looping over one plane at a time is faster in higher resolutions, despite
+ // re-computing luma.
+ BlendChromaPlaneWithCfl_NEON<bitdepth, GrainType, Pixel>(
+ noise_image[plane], min_value, max_chroma, width, height, start_height,
+ subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y,
+ source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv);
+}
+
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
+inline int16x8_t BlendChromaValsNoCfl(
+ const uint8_t scaling_lut[kScalingLookupTableSize],
+ const uint8_t* chroma_cursor, const int8_t* noise_image_cursor,
+ const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect,
+ const int16x8_t& offset, int luma_multiplier, int chroma_multiplier) {
+ uint8_t merged_buffer[8];
+ const int16x8_t orig = GetSignedSource8(chroma_cursor);
+ const int16x8_t weighted_luma = vmulq_n_s16(average_luma, luma_multiplier);
+ const int16x8_t weighted_chroma = vmulq_n_s16(orig, chroma_multiplier);
+  // Maximum value of |combined| is 127*255 = 0x7E81.
+  const int16x8_t combined = vhaddq_s16(weighted_luma, weighted_chroma);
+  // Maximum value of |offset| is (255 << 5) = 0x1FE0.
+  // 0x7E81 + 0x1FE0 = 0x9E61, therefore another halving add is required.
+ const uint8x8_t merged = vqshrun_n_s16(vhaddq_s16(offset, combined), 4);
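+  // Net effect, up to truncation in the halving adds:
+  //   merged ~= clamp_to_u8(chroma_offset +
+  //                         ((luma_multiplier * average_luma +
+  //                           chroma_multiplier * chroma) >> 6))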
+ vst1_u8(merged_buffer, merged);
+ const int16x8_t scaling =
+ GetScalingFactors<8, uint8_t>(scaling_lut, merged_buffer);
+ int16x8_t noise = GetSignedSource8(noise_image_cursor);
+ noise = ScaleNoise(noise, scaling, scaling_shift_vect);
+ return vaddq_s16(orig, noise);
+}
+
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON(
+ const Array2D<int8_t>& noise_image, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, int scaling_shift, int chroma_offset,
+ int chroma_multiplier, int luma_multiplier,
+ const uint8_t scaling_lut[kScalingLookupTableSize], const uint8_t* in_y_row,
+ ptrdiff_t source_stride_y, const uint8_t* in_chroma_row,
+ ptrdiff_t source_stride_chroma, uint8_t* out_chroma_row,
+ ptrdiff_t dest_stride) {
+ const int16x8_t floor = vdupq_n_s16(min_value);
+ const int16x8_t ceiling = vdupq_n_s16(max_chroma);
+ // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
+ // for 16 bit signed integers. In higher bitdepths, however, we have to
+ // expand to 32 to protect the sign bit.
+ const int16x8_t scaling_shift_vect = vdupq_n_s16(-scaling_shift);
+
+ const int chroma_height = (height + subsampling_y) >> subsampling_y;
+ const int chroma_width = (width + subsampling_x) >> subsampling_x;
+ const int safe_chroma_width = chroma_width & ~7;
+ uint8_t luma_buffer[16];
+ const int16x8_t offset = vdupq_n_s16(chroma_offset << 5);
+
+ start_height >>= subsampling_y;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const int luma_x = x << subsampling_x;
+ const int16x8_t average_luma = vreinterpretq_s16_u16(
+ GetAverageLuma(&in_y_row[luma_x], subsampling_x));
+ const int16x8_t blended = BlendChromaValsNoCfl(
+ scaling_lut, &in_chroma_row[x], &(noise_image[y + start_height][x]),
+ average_luma, scaling_shift_vect, offset, luma_multiplier,
+ chroma_multiplier);
+ // In 8bpp, when params_.clip_to_restricted_range == false, we can
+ // replace clipping with vqmovun_s16, but the gain would be small.
+ StoreUnsigned8(&out_chroma_row[x],
+ vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+
+ x += 8;
+ } while (x < safe_chroma_width);
+
+ if (x < chroma_width) {
+ // Begin right edge iteration. Same as the normal iterations, but the
+ // |average_luma| computation requires a duplicated luma value at the
+ // end.
+ const int luma_x = x << subsampling_x;
+ const int valid_range = width - luma_x;
+ memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
+ luma_buffer[valid_range] = in_y_row[width - 1];
+
+ const int16x8_t average_luma =
+ vreinterpretq_s16_u16(GetAverageLuma(luma_buffer, subsampling_x));
+ const int16x8_t blended = BlendChromaValsNoCfl(
+ scaling_lut, &in_chroma_row[x], &(noise_image[y + start_height][x]),
+ average_luma, scaling_shift_vect, offset, luma_multiplier,
+ chroma_multiplier);
+ StoreUnsigned8(&out_chroma_row[x],
+ vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+ // End of right edge iteration.
+ }
+
+ in_y_row += source_stride_y << subsampling_y;
+ in_chroma_row += source_stride_chroma;
+ out_chroma_row += dest_stride;
+ } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == false.
+void BlendNoiseWithImageChroma8bpp_NEON(
+ Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+ int min_value, int max_chroma, int width, int height, int start_height,
+ int subsampling_x, int subsampling_y,
+ const uint8_t scaling_lut[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+ assert(plane == kPlaneU || plane == kPlaneV);
+ const auto* noise_image =
+ static_cast<const Array2D<int8_t>*>(noise_image_ptr);
+ const auto* in_y = static_cast<const uint8_t*>(source_plane_y);
+ const auto* in_uv = static_cast<const uint8_t*>(source_plane_uv);
+ auto* out_uv = static_cast<uint8_t*>(dest_plane_uv);
+
+ const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
+ const int luma_multiplier =
+ (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
+ const int multiplier =
+ (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
+ BlendChromaPlane8bpp_NEON(noise_image[plane], min_value, max_chroma, width,
+ height, start_height, subsampling_x, subsampling_y,
+ params.chroma_scaling, offset, multiplier,
+ luma_multiplier, scaling_lut, in_y, source_stride_y,
+ in_uv, source_stride_uv, out_uv, dest_stride_uv);
+}
+
+inline void WriteOverlapLine8bpp_NEON(const int8_t* noise_stripe_row,
+ const int8_t* noise_stripe_row_prev,
+ int plane_width,
+ const int8x8_t grain_coeff,
+ const int8x8_t old_coeff,
+ int8_t* noise_image_row) {
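+  // For each x this computes
+  //   noise_image_row[x] = saturate_to_int8(
+  //       Round2(grain_coeff * noise_stripe_row[x] +
+  //              old_coeff * noise_stripe_row_prev[x], 5))
+  // using widening multiplies and a saturating rounding narrow shift.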
+ int x = 0;
+ do {
+ // Note that these reads may exceed noise_stripe_row's width by up to 7
+ // bytes.
+ const int8x8_t source_grain = vld1_s8(noise_stripe_row + x);
+ const int8x8_t source_old = vld1_s8(noise_stripe_row_prev + x);
+ const int16x8_t weighted_grain = vmull_s8(grain_coeff, source_grain);
+ const int16x8_t grain = vmlal_s8(weighted_grain, old_coeff, source_old);
+ // Note that this write may exceed noise_image_row's width by up to 7 bytes.
+ vst1_s8(noise_image_row + x, vqrshrn_n_s16(grain, 5));
+ x += 8;
+ } while (x < plane_width);
+}
+
+void ConstructNoiseImageOverlap8bpp_NEON(const void* noise_stripes_buffer,
+ int width, int height,
+ int subsampling_x, int subsampling_y,
+ void* noise_image_buffer) {
+ const auto* noise_stripes =
+ static_cast<const Array2DView<int8_t>*>(noise_stripes_buffer);
+ auto* noise_image = static_cast<Array2D<int8_t>*>(noise_image_buffer);
+ const int plane_width = (width + subsampling_x) >> subsampling_x;
+ const int plane_height = (height + subsampling_y) >> subsampling_y;
+ const int stripe_height = 32 >> subsampling_y;
+ const int stripe_mask = stripe_height - 1;
+ int y = stripe_height;
+ int luma_num = 1;
+ if (subsampling_y == 0) {
+ const int8x8_t first_row_grain_coeff = vdup_n_s8(17);
+ const int8x8_t first_row_old_coeff = vdup_n_s8(27);
+ const int8x8_t second_row_grain_coeff = first_row_old_coeff;
+ const int8x8_t second_row_old_coeff = first_row_grain_coeff;
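+    // The {17, 27} pair here (and {22, 23} in the subsampled branch below)
+    // satisfies a^2 + b^2 ~= 2^10, so after the Round2(, 5) inside
+    // WriteOverlapLine8bpp_NEON the blend approximately preserves the grain
+    // variance, assuming the overlapping stripes are independent.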
+ for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
+ const int8_t* noise_stripe = (*noise_stripes)[luma_num];
+ const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ WriteOverlapLine8bpp_NEON(
+ noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
+ first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+
+ WriteOverlapLine8bpp_NEON(&noise_stripe[plane_width],
+ &noise_stripe_prev[(32 + 1) * plane_width],
+ plane_width, second_row_grain_coeff,
+ second_row_old_coeff, (*noise_image)[y + 1]);
+ }
+ // Either one partial stripe remains (remaining_height > 0),
+    // OR the image is less than one stripe high (remaining_height < 0),
+ // OR all stripes are completed (remaining_height == 0).
+ const int remaining_height = plane_height - y;
+ if (remaining_height <= 0) {
+ return;
+ }
+ const int8_t* noise_stripe = (*noise_stripes)[luma_num];
+ const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ WriteOverlapLine8bpp_NEON(
+ noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
+ first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+
+ if (remaining_height > 1) {
+ WriteOverlapLine8bpp_NEON(&noise_stripe[plane_width],
+ &noise_stripe_prev[(32 + 1) * plane_width],
+ plane_width, second_row_grain_coeff,
+ second_row_old_coeff, (*noise_image)[y + 1]);
+ }
+ } else { // subsampling_y == 1
+ const int8x8_t first_row_grain_coeff = vdup_n_s8(22);
+ const int8x8_t first_row_old_coeff = vdup_n_s8(23);
+ for (; y < plane_height; ++luma_num, y += stripe_height) {
+ const int8_t* noise_stripe = (*noise_stripes)[luma_num];
+ const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ WriteOverlapLine8bpp_NEON(
+ noise_stripe, &noise_stripe_prev[16 * plane_width], plane_width,
+ first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+ }
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+
+ // LumaAutoRegressionFunc
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<8, int8_t, 1>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<8, int8_t, 2>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<8, int8_t, 3>;
+
+ // ChromaAutoRegressionFunc[use_luma][auto_regression_coeff_lag]
+ // Chroma autoregression should never be called when lag is 0 and use_luma
+ // is false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 1, false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 2, false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 3, false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 0, true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 1, true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 2, true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 3, true>;
+
+ dsp->film_grain.construct_noise_image_overlap =
+ ConstructNoiseImageOverlap8bpp_NEON;
+
+ dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_NEON;
+
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_NEON<8, int8_t, uint8_t>;
+ dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_NEON;
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_NEON<8, int8_t, uint8_t>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+
+ // LumaAutoRegressionFunc
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<10, int16_t, 1>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<10, int16_t, 2>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<10, int16_t, 3>;
+
+  // ChromaAutoRegressionFunc[use_luma][auto_regression_coeff_lag]
+ // Chroma autoregression should never be called when lag is 0 and use_luma
+ // is false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 1, false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 2, false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 3, false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 0, true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 1, true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 2, true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 3, true>;
+
+ dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_NEON;
+
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_NEON<10, int16_t, uint16_t>;
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_NEON<10, int16_t, uint16_t>;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace film_grain
+
+void FilmGrainInit_NEON() {
+ film_grain::low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ film_grain::high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void FilmGrainInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/film_grain_neon.h b/src/dsp/arm/film_grain_neon.h
new file mode 100644
index 0000000..44b3d1d
--- /dev/null
+++ b/src/dsp/arm/film_grain_neon.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_FILM_GRAIN_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_FILM_GRAIN_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initialize members of Dsp::film_grain. This function is not thread-safe.
+void FilmGrainInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainAutoregressionLuma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainAutoregressionLuma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainAutoregressionChroma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainAutoregressionChroma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainConstructNoiseImageOverlap LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_FILM_GRAIN_NEON_H_
diff --git a/src/dsp/arm/intra_edge_neon.cc b/src/dsp/arm/intra_edge_neon.cc
new file mode 100644
index 0000000..00b186a
--- /dev/null
+++ b/src/dsp/arm/intra_edge_neon.cc
@@ -0,0 +1,301 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h" // RightShiftWithRounding()
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Simplified version of intra_edge.cc:kKernels[][]. Only |strength| 1 and 2 are
+// required.
+constexpr int kKernelsNEON[3][2] = {{4, 8}, {5, 6}};
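+// For reference, the full 5-tap kernels are {0, 4, 8, 4, 0}, {0, 5, 6, 5, 0}
+// and {2, 4, 4, 4, 2}; the |strength| 3 kernel is handled separately below
+// with shifts instead of a table lookup.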
+
+void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
+ assert(strength == 1 || strength == 2 || strength == 3);
+ const int kernel_index = strength - 1;
+ auto* const dst_buffer = static_cast<uint8_t*>(buffer);
+
+ // The first element is not written out (but it is input) so the number of
+ // elements written is |size| - 1.
+ if (size == 1) return;
+
+ // |strength| 1 and 2 use a 3 tap filter.
+ if (strength < 3) {
+    // The last value requires extending the buffer (duplicating
+    // |dst_buffer[size - 1]|). Calculate it here to avoid extra processing in
+    // NEON.
+ const uint8_t last_val = RightShiftWithRounding(
+ kKernelsNEON[kernel_index][0] * dst_buffer[size - 2] +
+ kKernelsNEON[kernel_index][1] * dst_buffer[size - 1] +
+ kKernelsNEON[kernel_index][0] * dst_buffer[size - 1],
+ 4);
+
+ const uint8x8_t krn1 = vdup_n_u8(kKernelsNEON[kernel_index][1]);
+
+ // The first value we need gets overwritten by the output from the
+ // previous iteration.
+ uint8x16_t src_0 = vld1q_u8(dst_buffer);
+ int i = 1;
+
+    // Process blocks until there are fewer than 16 values remaining.
+ for (; i < size - 15; i += 16) {
+ // Loading these at the end of the block with |src_0| will read past the
+ // end of |top_row_data[160]|, the source of |buffer|.
+ const uint8x16_t src_1 = vld1q_u8(dst_buffer + i);
+ const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1);
+ uint16x8_t sum_lo = vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_2));
+ sum_lo = vmulq_n_u16(sum_lo, kKernelsNEON[kernel_index][0]);
+ sum_lo = vmlal_u8(sum_lo, vget_low_u8(src_1), krn1);
+ uint16x8_t sum_hi = vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_2));
+ sum_hi = vmulq_n_u16(sum_hi, kKernelsNEON[kernel_index][0]);
+ sum_hi = vmlal_u8(sum_hi, vget_high_u8(src_1), krn1);
+
+ const uint8x16_t result =
+ vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
+
+ // Load the next row before overwriting. This loads an extra 15 values
+ // past |size| on the trailing iteration.
+ src_0 = vld1q_u8(dst_buffer + i + 15);
+
+ vst1q_u8(dst_buffer + i, result);
+ }
+
+    // The last output value |last_val| was already calculated, so if
+    // |remainder| == 1 we don't have to do anything.
+ const int remainder = (size - 1) & 0xf;
+ if (remainder > 1) {
+ uint8_t temp[16];
+ const uint8x16_t src_1 = vld1q_u8(dst_buffer + i);
+ const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1);
+
+ uint16x8_t sum_lo = vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_2));
+ sum_lo = vmulq_n_u16(sum_lo, kKernelsNEON[kernel_index][0]);
+ sum_lo = vmlal_u8(sum_lo, vget_low_u8(src_1), krn1);
+ uint16x8_t sum_hi = vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_2));
+ sum_hi = vmulq_n_u16(sum_hi, kKernelsNEON[kernel_index][0]);
+ sum_hi = vmlal_u8(sum_hi, vget_high_u8(src_1), krn1);
+
+ const uint8x16_t result =
+ vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
+
+ vst1q_u8(temp, result);
+ memcpy(dst_buffer + i, temp, remainder);
+ }
+
+ dst_buffer[size - 1] = last_val;
+ return;
+ }
+
+ assert(strength == 3);
+ // 5 tap filter. The first element requires duplicating |buffer[0]| and the
+ // last two elements require duplicating |buffer[size - 1]|.
+ uint8_t special_vals[3];
+ special_vals[0] = RightShiftWithRounding(
+ (dst_buffer[0] << 1) + (dst_buffer[0] << 2) + (dst_buffer[1] << 2) +
+ (dst_buffer[2] << 2) + (dst_buffer[3] << 1),
+ 4);
+ // Clamp index for very small |size| values.
+ const int first_index_min = std::max(size - 4, 0);
+ const int second_index_min = std::max(size - 3, 0);
+ const int third_index_min = std::max(size - 2, 0);
+ special_vals[1] = RightShiftWithRounding(
+ (dst_buffer[first_index_min] << 1) + (dst_buffer[second_index_min] << 2) +
+ (dst_buffer[third_index_min] << 2) + (dst_buffer[size - 1] << 2) +
+ (dst_buffer[size - 1] << 1),
+ 4);
+ special_vals[2] = RightShiftWithRounding(
+ (dst_buffer[second_index_min] << 1) + (dst_buffer[third_index_min] << 2) +
+      // (x << 2) + (x << 2) == x << 3
+ (dst_buffer[size - 1] << 3) + (dst_buffer[size - 1] << 1),
+ 4);
+
+ // The first two values we need get overwritten by the output from the
+ // previous iteration.
+ uint8x16_t src_0 = vld1q_u8(dst_buffer - 1);
+ uint8x16_t src_1 = vld1q_u8(dst_buffer);
+ int i = 1;
+
+ for (; i < size - 15; i += 16) {
+ // Loading these at the end of the block with |src_[01]| will read past
+ // the end of |top_row_data[160]|, the source of |buffer|.
+ const uint8x16_t src_2 = vld1q_u8(dst_buffer + i);
+ const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1);
+ const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2);
+
+ uint16x8_t sum_lo =
+ vshlq_n_u16(vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_4)), 1);
+ const uint16x8_t sum_123_lo = vaddw_u8(
+ vaddl_u8(vget_low_u8(src_1), vget_low_u8(src_2)), vget_low_u8(src_3));
+ sum_lo = vaddq_u16(sum_lo, vshlq_n_u16(sum_123_lo, 2));
+
+ uint16x8_t sum_hi =
+ vshlq_n_u16(vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_4)), 1);
+ const uint16x8_t sum_123_hi =
+ vaddw_u8(vaddl_u8(vget_high_u8(src_1), vget_high_u8(src_2)),
+ vget_high_u8(src_3));
+ sum_hi = vaddq_u16(sum_hi, vshlq_n_u16(sum_123_hi, 2));
+
+ const uint8x16_t result =
+ vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
+
+ src_0 = vld1q_u8(dst_buffer + i + 14);
+ src_1 = vld1q_u8(dst_buffer + i + 15);
+
+ vst1q_u8(dst_buffer + i, result);
+ }
+
+ const int remainder = (size - 1) & 0xf;
+  // As in the 3 tap case, but if only two values remain they have already
+  // been calculated.
+ if (remainder > 2) {
+ uint8_t temp[16];
+ const uint8x16_t src_2 = vld1q_u8(dst_buffer + i);
+ const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1);
+ const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2);
+
+ uint16x8_t sum_lo =
+ vshlq_n_u16(vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_4)), 1);
+ const uint16x8_t sum_123_lo = vaddw_u8(
+ vaddl_u8(vget_low_u8(src_1), vget_low_u8(src_2)), vget_low_u8(src_3));
+ sum_lo = vaddq_u16(sum_lo, vshlq_n_u16(sum_123_lo, 2));
+
+ uint16x8_t sum_hi =
+ vshlq_n_u16(vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_4)), 1);
+ const uint16x8_t sum_123_hi =
+ vaddw_u8(vaddl_u8(vget_high_u8(src_1), vget_high_u8(src_2)),
+ vget_high_u8(src_3));
+ sum_hi = vaddq_u16(sum_hi, vshlq_n_u16(sum_123_hi, 2));
+
+ const uint8x16_t result =
+ vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
+
+ vst1q_u8(temp, result);
+ memcpy(dst_buffer + i, temp, remainder);
+ }
+
+ dst_buffer[1] = special_vals[0];
+ // Avoid overwriting |dst_buffer[0]|.
+ if (size > 2) dst_buffer[size - 2] = special_vals[1];
+ dst_buffer[size - 1] = special_vals[2];
+}
+
+// (-|src0| + |src1| * 9 + |src2| * 9 - |src3|) >> 4
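+// As a sanity check, a constant edge value v gives (16 * v) >> 4 = v.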
+uint8x8_t Upsample(const uint8x8_t src0, const uint8x8_t src1,
+ const uint8x8_t src2, const uint8x8_t src3) {
+ const uint16x8_t middle = vmulq_n_u16(vaddl_u8(src1, src2), 9);
+ const uint16x8_t ends = vaddl_u8(src0, src3);
+ const int16x8_t sum =
+ vsubq_s16(vreinterpretq_s16_u16(middle), vreinterpretq_s16_u16(ends));
+ return vqrshrun_n_s16(sum, 4);
+}
+
+void IntraEdgeUpsampler_NEON(void* buffer, const int size) {
+ assert(size % 4 == 0 && size <= 16);
+ auto* const pixel_buffer = static_cast<uint8_t*>(buffer);
+  // This is OK because we don't read this value when |size| is 4 or 8, but if
+  // we write |pixel_buffer[size]| and then vld() it, that seems to introduce
+  // some latency.
+ pixel_buffer[-2] = pixel_buffer[-1];
+ if (size == 4) {
+ // This uses one load and two vtbl() which is better than 4x Load{Lo,Hi}4().
+ const uint8x8_t src = vld1_u8(pixel_buffer - 1);
+ // The outside values are negated so put those in the same vector.
+ const uint8x8_t src03 = vtbl1_u8(src, vcreate_u8(0x0404030202010000));
+ // Reverse |src1| and |src2| so we can use |src2| for the interleave at the
+ // end.
+ const uint8x8_t src21 = vtbl1_u8(src, vcreate_u8(0x0302010004030201));
+
+ const uint16x8_t middle = vmull_u8(src21, vdup_n_u8(9));
+ const int16x8_t half_sum = vsubq_s16(
+ vreinterpretq_s16_u16(middle), vreinterpretq_s16_u16(vmovl_u8(src03)));
+ const int16x4_t sum =
+ vadd_s16(vget_low_s16(half_sum), vget_high_s16(half_sum));
+ const uint8x8_t result = vqrshrun_n_s16(vcombine_s16(sum, sum), 4);
+
+ vst1_u8(pixel_buffer - 1, InterleaveLow8(result, src21));
+ return;
+ } else if (size == 8) {
+    // Likewise, one load + multiple vtbls seems preferable to multiple loads.
+ const uint8x16_t src = vld1q_u8(pixel_buffer - 1);
+ const uint8x8_t src0 = VQTbl1U8(src, vcreate_u8(0x0605040302010000));
+ const uint8x8_t src1 = vget_low_u8(src);
+ const uint8x8_t src2 = VQTbl1U8(src, vcreate_u8(0x0807060504030201));
+ const uint8x8_t src3 = VQTbl1U8(src, vcreate_u8(0x0808070605040302));
+
+ const uint8x8x2_t output = {Upsample(src0, src1, src2, src3), src2};
+ vst2_u8(pixel_buffer - 1, output);
+ return;
+ }
+ assert(size == 12 || size == 16);
+ // Extend the input borders to avoid branching later.
+ pixel_buffer[size] = pixel_buffer[size - 1];
+ const uint8x16_t src0 = vld1q_u8(pixel_buffer - 2);
+ const uint8x16_t src1 = vld1q_u8(pixel_buffer - 1);
+ const uint8x16_t src2 = vld1q_u8(pixel_buffer);
+ const uint8x16_t src3 = vld1q_u8(pixel_buffer + 1);
+
+ const uint8x8_t result_lo = Upsample(vget_low_u8(src0), vget_low_u8(src1),
+ vget_low_u8(src2), vget_low_u8(src3));
+
+ const uint8x8x2_t output_lo = {result_lo, vget_low_u8(src2)};
+ vst2_u8(pixel_buffer - 1, output_lo);
+
+ const uint8x8_t result_hi = Upsample(vget_high_u8(src0), vget_high_u8(src1),
+ vget_high_u8(src2), vget_high_u8(src3));
+
+ if (size == 12) {
+ vst1_u8(pixel_buffer + 15, InterleaveLow8(result_hi, vget_high_u8(src2)));
+ } else /* size == 16 */ {
+ const uint8x8x2_t output_hi = {result_hi, vget_high_u8(src2)};
+ vst2_u8(pixel_buffer + 15, output_hi);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->intra_edge_filter = IntraEdgeFilter_NEON;
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_NEON;
+}
+
+} // namespace
+
+void IntraEdgeInit_NEON() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraEdgeInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/intra_edge_neon.h b/src/dsp/arm/intra_edge_neon.h
new file mode 100644
index 0000000..d3bb243
--- /dev/null
+++ b/src/dsp/arm/intra_edge_neon.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_edge_filter and Dsp::intra_edge_upsampler. This
+// function is not thread-safe.
+void IntraEdgeInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_
diff --git a/src/dsp/arm/intrapred_cfl_neon.cc b/src/dsp/arm/intrapred_cfl_neon.cc
new file mode 100644
index 0000000..45fe33b
--- /dev/null
+++ b/src/dsp/arm/intrapred_cfl_neon.cc
@@ -0,0 +1,479 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+uint8x16_t Set2ValuesQ(const uint8_t* a) {
+ uint16_t combined_values = a[0] | a[1] << 8;
+ return vreinterpretq_u8_u16(vdupq_n_u16(combined_values));
+}
+
+uint32_t SumVector(uint32x2_t a) {
+#if defined(__aarch64__)
+ return vaddv_u32(a);
+#else
+ const uint64x1_t b = vpaddl_u32(a);
+ return vget_lane_u32(vreinterpret_u32_u64(b), 0);
+#endif // defined(__aarch64__)
+}
+
+uint32_t SumVector(uint32x4_t a) {
+#if defined(__aarch64__)
+ return vaddvq_u32(a);
+#else
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
+ return vget_lane_u32(vreinterpret_u32_u64(c), 0);
+#endif // defined(__aarch64__)
+}
+
+// Divide by the number of elements.
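+// For example, a 16x8 block uses RightShiftWithRounding(sum, 4 + 3), i.e. a
+// rounded divide by 128.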
+uint32_t Average(const uint32_t sum, const int width, const int height) {
+ return RightShiftWithRounding(sum, FloorLog2(width) + FloorLog2(height));
+}
+
+// Subtract |val| from every element in |a|.
+void BlockSubtract(const uint32_t val,
+ int16_t a[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int width, const int height) {
+ assert(val <= INT16_MAX);
+ const int16x8_t val_v = vdupq_n_s16(static_cast<int16_t>(val));
+
+ for (int y = 0; y < height; ++y) {
+ if (width == 4) {
+ const int16x4_t b = vld1_s16(a[y]);
+ vst1_s16(a[y], vsub_s16(b, vget_low_s16(val_v)));
+ } else if (width == 8) {
+ const int16x8_t b = vld1q_s16(a[y]);
+ vst1q_s16(a[y], vsubq_s16(b, val_v));
+ } else if (width == 16) {
+ const int16x8_t b = vld1q_s16(a[y]);
+ const int16x8_t c = vld1q_s16(a[y] + 8);
+ vst1q_s16(a[y], vsubq_s16(b, val_v));
+ vst1q_s16(a[y] + 8, vsubq_s16(c, val_v));
+ } else /* block_width == 32 */ {
+ const int16x8_t b = vld1q_s16(a[y]);
+ const int16x8_t c = vld1q_s16(a[y] + 8);
+ const int16x8_t d = vld1q_s16(a[y] + 16);
+ const int16x8_t e = vld1q_s16(a[y] + 24);
+ vst1q_s16(a[y], vsubq_s16(b, val_v));
+ vst1q_s16(a[y] + 8, vsubq_s16(c, val_v));
+ vst1q_s16(a[y] + 16, vsubq_s16(d, val_v));
+ vst1q_s16(a[y] + 24, vsubq_s16(e, val_v));
+ }
+ }
+}
+
+template <int block_width, int block_height>
+void CflSubsampler420_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, const ptrdiff_t stride) {
+ const auto* src = static_cast<const uint8_t*>(source);
+ uint32_t sum;
+ if (block_width == 4) {
+ assert(max_luma_width >= 8);
+ uint32x2_t running_sum = vdup_n_u32(0);
+
+ for (int y = 0; y < block_height; ++y) {
+ const uint8x8_t row0 = vld1_u8(src);
+ const uint8x8_t row1 = vld1_u8(src + stride);
+
+ uint16x4_t sum_row = vpadal_u8(vpaddl_u8(row0), row1);
+ sum_row = vshl_n_u16(sum_row, 1);
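+      // Doubling the 2x2 sum stores the subsampled luma scaled by 8 (<< 3),
+      // matching the << 3 used by the 444 subsampler.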
+ running_sum = vpadal_u16(running_sum, sum_row);
+ vst1_s16(luma[y], vreinterpret_s16_u16(sum_row));
+
+ if (y << 1 < max_luma_height - 2) {
+        // Once this threshold is reached, |src| stops advancing and the
+        // remaining rows repeat the last valid pair, so the loop could be
+        // simplified.
+ src += stride << 1;
+ }
+ }
+
+ sum = SumVector(running_sum);
+ } else if (block_width == 8) {
+ const uint8x16_t x_index = {0, 0, 2, 2, 4, 4, 6, 6,
+ 8, 8, 10, 10, 12, 12, 14, 14};
+ const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2);
+ const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
+
+ uint32x4_t running_sum = vdupq_n_u32(0);
+
+ for (int y = 0; y < block_height; ++y) {
+ const uint8x16_t x_max0 = Set2ValuesQ(src + max_luma_width - 2);
+ const uint8x16_t x_max1 = Set2ValuesQ(src + max_luma_width - 2 + stride);
+
+ uint8x16_t row0 = vld1q_u8(src);
+ row0 = vbslq_u8(x_mask, row0, x_max0);
+ uint8x16_t row1 = vld1q_u8(src + stride);
+ row1 = vbslq_u8(x_mask, row1, x_max1);
+
+ uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
+ sum_row = vshlq_n_u16(sum_row, 1);
+ running_sum = vpadalq_u16(running_sum, sum_row);
+ vst1q_s16(luma[y], vreinterpretq_s16_u16(sum_row));
+
+ if (y << 1 < max_luma_height - 2) {
+ src += stride << 1;
+ }
+ }
+
+ sum = SumVector(running_sum);
+ } else /* block_width >= 16 */ {
+ const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2);
+ uint32x4_t running_sum = vdupq_n_u32(0);
+
+ for (int y = 0; y < block_height; ++y) {
+ uint8x16_t x_index = {0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30};
+ const uint8x16_t x_max00 = vdupq_n_u8(src[max_luma_width - 2]);
+ const uint8x16_t x_max01 = vdupq_n_u8(src[max_luma_width - 2 + 1]);
+ const uint8x16_t x_max10 = vdupq_n_u8(src[stride + max_luma_width - 2]);
+ const uint8x16_t x_max11 =
+ vdupq_n_u8(src[stride + max_luma_width - 2 + 1]);
+ for (int x = 0; x < block_width; x += 16) {
+ const ptrdiff_t src_x_offset = x << 1;
+ const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
+ const uint8x16x2_t row0 = vld2q_u8(src + src_x_offset);
+ const uint8x16x2_t row1 = vld2q_u8(src + src_x_offset + stride);
+ const uint8x16_t row_masked_00 = vbslq_u8(x_mask, row0.val[0], x_max00);
+ const uint8x16_t row_masked_01 = vbslq_u8(x_mask, row0.val[1], x_max01);
+ const uint8x16_t row_masked_10 = vbslq_u8(x_mask, row1.val[0], x_max10);
+ const uint8x16_t row_masked_11 = vbslq_u8(x_mask, row1.val[1], x_max11);
+
+ uint16x8_t sum_row_lo =
+ vaddl_u8(vget_low_u8(row_masked_00), vget_low_u8(row_masked_01));
+ sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_10));
+ sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_11));
+ sum_row_lo = vshlq_n_u16(sum_row_lo, 1);
+ running_sum = vpadalq_u16(running_sum, sum_row_lo);
+ vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(sum_row_lo));
+
+ uint16x8_t sum_row_hi =
+ vaddl_u8(vget_high_u8(row_masked_00), vget_high_u8(row_masked_01));
+ sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_10));
+ sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_11));
+ sum_row_hi = vshlq_n_u16(sum_row_hi, 1);
+ running_sum = vpadalq_u16(running_sum, sum_row_hi);
+ vst1q_s16(luma[y] + x + 8, vreinterpretq_s16_u16(sum_row_hi));
+
+ x_index = vaddq_u8(x_index, vdupq_n_u8(32));
+ }
+ if (y << 1 < max_luma_height - 2) {
+ src += stride << 1;
+ }
+ }
+ sum = SumVector(running_sum);
+ }
+
+ const uint32_t average = Average(sum, block_width, block_height);
+ BlockSubtract(average, luma, block_width, block_height);
+}
+
+template <int block_width, int block_height>
+void CflSubsampler444_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, const ptrdiff_t stride) {
+ const auto* src = static_cast<const uint8_t*>(source);
+ uint32_t sum;
+ if (block_width == 4) {
+ assert(max_luma_width >= 4);
+ uint32x4_t running_sum = vdupq_n_u32(0);
+ uint8x8_t row = vdup_n_u8(0);
+
+ for (int y = 0; y < block_height; y += 2) {
+ row = Load4<0>(src, row);
+ row = Load4<1>(src + stride, row);
+ if (y < (max_luma_height - 1)) {
+ src += stride << 1;
+ }
+
+ const uint16x8_t row_shifted = vshll_n_u8(row, 3);
+ running_sum = vpadalq_u16(running_sum, row_shifted);
+ vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted)));
+ vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted)));
+ }
+
+ sum = SumVector(running_sum);
+ } else if (block_width == 8) {
+ const uint8x8_t x_index = {0, 1, 2, 3, 4, 5, 6, 7};
+ const uint8x8_t x_max_index = vdup_n_u8(max_luma_width - 1);
+ const uint8x8_t x_mask = vclt_u8(x_index, x_max_index);
+
+ uint32x4_t running_sum = vdupq_n_u32(0);
+
+ for (int y = 0; y < block_height; ++y) {
+ const uint8x8_t x_max = vdup_n_u8(src[max_luma_width - 1]);
+ const uint8x8_t row = vbsl_u8(x_mask, vld1_u8(src), x_max);
+
+ const uint16x8_t row_shifted = vshll_n_u8(row, 3);
+ running_sum = vpadalq_u16(running_sum, row_shifted);
+ vst1q_s16(luma[y], vreinterpretq_s16_u16(row_shifted));
+
+ if (y < max_luma_height - 1) {
+ src += stride;
+ }
+ }
+
+ sum = SumVector(running_sum);
+ } else /* block_width >= 16 */ {
+ const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 1);
+ uint32x4_t running_sum = vdupq_n_u32(0);
+
+ for (int y = 0; y < block_height; ++y) {
+ uint8x16_t x_index = {0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15};
+ const uint8x16_t x_max = vdupq_n_u8(src[max_luma_width - 1]);
+ for (int x = 0; x < block_width; x += 16) {
+ const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
+ const uint8x16_t row = vbslq_u8(x_mask, vld1q_u8(src + x), x_max);
+
+ const uint16x8_t row_shifted_low = vshll_n_u8(vget_low_u8(row), 3);
+ const uint16x8_t row_shifted_high = vshll_n_u8(vget_high_u8(row), 3);
+ running_sum = vpadalq_u16(running_sum, row_shifted_low);
+ running_sum = vpadalq_u16(running_sum, row_shifted_high);
+ vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(row_shifted_low));
+ vst1q_s16(luma[y] + x + 8, vreinterpretq_s16_u16(row_shifted_high));
+
+ x_index = vaddq_u8(x_index, vdupq_n_u8(16));
+ }
+ if (y < max_luma_height - 1) {
+ src += stride;
+ }
+ }
+ sum = SumVector(running_sum);
+ }
+
+ const uint32_t average = Average(sum, block_width, block_height);
+ BlockSubtract(average, luma, block_width, block_height);
+}
+
+// Saturate |dc + ((alpha * luma) >> 6)| to uint8_t.
+inline uint8x8_t Combine8(const int16x8_t luma, const int alpha,
+ const int16x8_t dc) {
+ const int16x8_t la = vmulq_n_s16(luma, alpha);
+ // Subtract the sign bit to round towards zero.
+ const int16x8_t sub_sign = vsraq_n_s16(la, la, 15);
+ // Shift and accumulate.
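+  // vrsraq_n_s16 computes dc + ((sub_sign + (1 << 5)) >> 6) per lane.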
+ const int16x8_t result = vrsraq_n_s16(dc, sub_sign, 6);
+ return vqmovun_s16(result);
+}
+
+// The exact range of luma/alpha is not important because the result is
+// saturated to uint8_t: even a saturated int16_t shifted right by 6 exceeds
+// the uint8_t range.
+template <int block_height>
+inline void CflIntraPredictor4xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
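+  // |dst[0]| is assumed to already contain the DC prediction for this block
+  // (the DC predictor runs before CfL), so it is broadcast as the base value.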
+ for (int y = 0; y < block_height; y += 2) {
+ const int16x4_t luma_row0 = vld1_s16(luma[y]);
+ const int16x4_t luma_row1 = vld1_s16(luma[y + 1]);
+ const uint8x8_t sum =
+ Combine8(vcombine_s16(luma_row0, luma_row1), alpha, dc);
+ StoreLo4(dst, sum);
+ dst += stride;
+ StoreHi4(dst, sum);
+ dst += stride;
+ }
+}
+
+template <int block_height>
+inline void CflIntraPredictor8xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row = vld1q_s16(luma[y]);
+ const uint8x8_t sum = Combine8(luma_row, alpha, dc);
+ vst1_u8(dst, sum);
+ dst += stride;
+ }
+}
+
+template <int block_height>
+inline void CflIntraPredictor16xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+ const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+ const uint8x8_t sum_0 = Combine8(luma_row_0, alpha, dc);
+ const uint8x8_t sum_1 = Combine8(luma_row_1, alpha, dc);
+ vst1_u8(dst, sum_0);
+ vst1_u8(dst + 8, sum_1);
+ dst += stride;
+ }
+}
+
+template <int block_height>
+inline void CflIntraPredictor32xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+ const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+ const int16x8_t luma_row_2 = vld1q_s16(luma[y] + 16);
+ const int16x8_t luma_row_3 = vld1q_s16(luma[y] + 24);
+ const uint8x8_t sum_0 = Combine8(luma_row_0, alpha, dc);
+ const uint8x8_t sum_1 = Combine8(luma_row_1, alpha, dc);
+ const uint8x8_t sum_2 = Combine8(luma_row_2, alpha, dc);
+ const uint8x8_t sum_3 = Combine8(luma_row_3, alpha, dc);
+ vst1_u8(dst, sum_0);
+ vst1_u8(dst + 8, sum_1);
+ vst1_u8(dst + 16, sum_2);
+ vst1_u8(dst + 24, sum_3);
+ dst += stride;
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler420_NEON<4, 4>;
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler420_NEON<4, 8>;
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler420_NEON<4, 16>;
+
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler420_NEON<8, 4>;
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler420_NEON<8, 8>;
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler420_NEON<8, 16>;
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler420_NEON<8, 32>;
+
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler420_NEON<16, 4>;
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler420_NEON<16, 8>;
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler420_NEON<16, 16>;
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler420_NEON<16, 32>;
+
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler420_NEON<32, 8>;
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler420_NEON<32, 16>;
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler420_NEON<32, 32>;
+
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler444_NEON<4, 4>;
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler444_NEON<4, 8>;
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler444_NEON<4, 16>;
+
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler444_NEON<8, 4>;
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler444_NEON<8, 8>;
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler444_NEON<8, 16>;
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler444_NEON<8, 32>;
+
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler444_NEON<16, 4>;
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler444_NEON<16, 8>;
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler444_NEON<16, 16>;
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler444_NEON<16, 32>;
+
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler444_NEON<32, 8>;
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler444_NEON<32, 16>;
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler444_NEON<32, 32>;
+
+ dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor4xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor4xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize4x16] = CflIntraPredictor4xN_NEON<16>;
+
+ dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor8xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor8xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize8x16] = CflIntraPredictor8xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize8x32] = CflIntraPredictor8xN_NEON<32>;
+
+ dsp->cfl_intra_predictors[kTransformSize16x4] = CflIntraPredictor16xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize16x8] = CflIntraPredictor16xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor16xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor16xN_NEON<32>;
+
+ dsp->cfl_intra_predictors[kTransformSize32x8] = CflIntraPredictor32xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor32xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor32xN_NEON<32>;
+ // Max Cfl predictor size is 32x32.
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void IntraPredCflInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredCflInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc
new file mode 100644
index 0000000..805ba81
--- /dev/null
+++ b/src/dsp/arm/intrapred_directional_neon.cc
@@ -0,0 +1,926 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm> // std::min
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring> // memset
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Blend two values using weights that sum to 32.
+inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
+ const uint8x8_t a_weight,
+ const uint8x8_t b_weight) {
+ const uint16x8_t a_product = vmull_u8(a, a_weight);
+ const uint16x8_t b_product = vmull_u8(b, b_weight);
+
+ return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5);
+}
+
+// For vertical operations both weights are derived from one constant value.
+inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
+ const uint8_t weight) {
+ return WeightedBlend(a, b, vdup_n_u8(32 - weight), vdup_n_u8(weight));
+}
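+
+// For reference, a scalar sketch of the blend above. This helper is an
+// illustrative assumption recorded for documentation and is not called
+// elsewhere: the two weights sum to 32 and vrshrn_n_u16() rounds before the
+// shift by 5.
+inline uint8_t WeightedBlendScalar(const uint8_t a, const uint8_t b,
+                                   const uint8_t weight) {
+  return static_cast<uint8_t>((a * (32 - weight) + b * weight + 16) >> 5);
+}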
+
+// Fill |left| and |right| with the appropriate values for a given |base_step|.
+inline void LoadStepwise(const uint8_t* const source, const uint8x8_t left_step,
+ const uint8x8_t right_step, uint8x8_t* left,
+ uint8x8_t* right) {
+ const uint8x16_t mixed = vld1q_u8(source);
+ *left = VQTbl1U8(mixed, left_step);
+ *right = VQTbl1U8(mixed, right_step);
+}
+
+// Handle signed step arguments by ignoring the sign. Negative values are
+// considered out of range and overwritten later.
+inline void LoadStepwise(const uint8_t* const source, const int8x8_t left_step,
+ const int8x8_t right_step, uint8x8_t* left,
+ uint8x8_t* right) {
+ LoadStepwise(source, vreinterpret_u8_s8(left_step),
+ vreinterpret_u8_s8(right_step), left, right);
+}
+
+// Process 4 or 8 |width| by any |height|.
+template <int width>
+inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride,
+ const int height, const uint8_t* const top,
+ const int xstep, const bool upsampled) {
+ assert(width == 4 || width == 8);
+
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+
+ const int max_base_x = (width + height - 1) << upsample_shift;
+ const int8x8_t max_base = vdup_n_s8(max_base_x);
+ const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
+
+ const int8x8_t all = vcreate_s8(0x0706050403020100);
+ const int8x8_t even = vcreate_s8(0x0e0c0a0806040200);
+ const int8x8_t base_step = upsampled ? even : all;
+ const int8x8_t right_step = vadd_s8(base_step, vdup_n_s8(1));
+
+ int top_x = xstep;
+ int y = 0;
+ do {
+ const int top_base_x = top_x >> scale_bits;
+
+ if (top_base_x >= max_base_x) {
+ for (int i = y; i < height; ++i) {
+ memset(dst, top[max_base_x], 4 /* width */);
+ dst += stride;
+ }
+ return;
+ }
+
+ const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+
+ // Zone2 uses negative values for xstep. Use signed values to compare
+ // |top_base_x| to |max_base_x|.
+ const int8x8_t base_v = vadd_s8(vdup_n_s8(top_base_x), base_step);
+
+ const uint8x8_t max_base_mask = vclt_s8(base_v, max_base);
+
+ // 4 wide subsamples the output. 8 wide subsamples the input.
+ if (width == 4) {
+ const uint8x8_t left_values = vld1_u8(top + top_base_x);
+ const uint8x8_t right_values = RightShift<8>(left_values);
+ const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+
+ // If |upsampled| is true then extract every other value for output.
+ const uint8x8_t value_stepped =
+ vtbl1_u8(value, vreinterpret_u8_s8(base_step));
+ const uint8x8_t masked_value =
+ vbsl_u8(max_base_mask, value_stepped, top_max_base);
+
+ StoreLo4(dst, masked_value);
+ } else /* width == 8 */ {
+ uint8x8_t left_values, right_values;
+ // WeightedBlend() steps up to Q registers. Downsample the input to avoid
+ // doing extra calculations.
+ LoadStepwise(top + top_base_x, base_step, right_step, &left_values,
+ &right_values);
+
+ const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+ const uint8x8_t masked_value =
+ vbsl_u8(max_base_mask, value, top_max_base);
+
+ vst1_u8(dst, masked_value);
+ }
+ dst += stride;
+ top_x += xstep;
+ } while (++y < height);
+}
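+
+// Illustrative scalar model of one output pixel from the loop above. This is
+// an assumption written down for documentation (the clamp to top[max_base_x]
+// is omitted) and is not called by the code in this file.
+inline uint8_t Zone1PixelScalar(const uint8_t* const top, const int top_x,
+                                const int x, const int upsample_shift) {
+  const int scale_bits = 6 - upsample_shift;
+  const int base = (top_x >> scale_bits) + (x << upsample_shift);
+  const int shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+  return static_cast<uint8_t>(
+      (top[base] * (32 - shift) + top[base + 1] * shift + 16) >> 5);
+}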
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride,
+ const int width, const int height,
+ const uint8_t* const top, const int xstep,
+ const bool upsampled) {
+ assert(width % 8 == 0);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+
+ const int max_base_x = (width + height - 1) << upsample_shift;
+ const int8x8_t max_base = vdup_n_s8(max_base_x);
+ const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
+
+ const int8x8_t all = vcreate_s8(0x0706050403020100);
+ const int8x8_t even = vcreate_s8(0x0e0c0a0806040200);
+ const int8x8_t base_step = upsampled ? even : all;
+ const int8x8_t right_step = vadd_s8(base_step, vdup_n_s8(1));
+ const int8x8_t block_step = vdup_n_s8(8 << upsample_shift);
+
+ int top_x = xstep;
+ int y = 0;
+ do {
+ const int top_base_x = top_x >> scale_bits;
+
+ if (top_base_x >= max_base_x) {
+ for (int i = y; i < height; ++i) {
+ memset(dst, top[max_base_x], 4 /* width */);
+ dst += stride;
+ }
+ return;
+ }
+
+ const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+
+ // Zone2 uses negative values for xstep. Use signed values to compare
+ // |top_base_x| to |max_base_x|.
+ int8x8_t base_v = vadd_s8(vdup_n_s8(top_base_x), base_step);
+
+ int x = 0;
+ do {
+ const uint8x8_t max_base_mask = vclt_s8(base_v, max_base);
+
+ // Extract the input values based on |upsampled| here to avoid doing twice
+ // as many calculations.
+ uint8x8_t left_values, right_values;
+ LoadStepwise(top + top_base_x + x, base_step, right_step, &left_values,
+ &right_values);
+
+ const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+ const uint8x8_t masked_value =
+ vbsl_u8(max_base_mask, value, top_max_base);
+
+ vst1_u8(dst + x, masked_value);
+
+ base_v = vadd_s8(base_v, block_step);
+ x += 8;
+ } while (x < width);
+ top_x += xstep;
+ dst += stride;
+ } while (++y < height);
+}
+
+void DirectionalIntraPredictorZone1_NEON(void* const dest,
+ const ptrdiff_t stride,
+ const void* const top_row,
+ const int width, const int height,
+ const int xstep,
+ const bool upsampled_top) {
+ const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ assert(xstep > 0);
+
+ const int upsample_shift = static_cast<int>(upsampled_top);
+
+ const uint8x8_t all = vcreate_u8(0x0706050403020100);
+
+ if (xstep == 64) {
+ assert(!upsampled_top);
+ const uint8_t* top_ptr = top + 1;
+ int y = 0;
+ do {
+ memcpy(dst, top_ptr, width);
+ memcpy(dst + stride, top_ptr + 1, width);
+ memcpy(dst + 2 * stride, top_ptr + 2, width);
+ memcpy(dst + 3 * stride, top_ptr + 3, width);
+ dst += 4 * stride;
+ top_ptr += 4;
+ y += 4;
+ } while (y < height);
+ } else if (width == 4) {
+ DirectionalZone1_WxH<4>(dst, stride, height, top, xstep, upsampled_top);
+ } else if (xstep > 51) {
+ // 7.11.2.10. Intra edge upsample selection process
+ // if ( d <= 0 || d >= 40 ) useUpsample = 0
+    // For |upsampled_top| the delta is measured from vertical, i.e.
+    // |prediction_angle - 90|. In |kDirectionalIntraPredictorDerivative[]|,
+    // angles less than 51 meet this criterion. The |xstep| value for angle 51
+    // happens to be 51 as well. Shallower angles have greater xstep values.
+ assert(!upsampled_top);
+ const int max_base_x = ((width + height) - 1);
+ const uint8x8_t max_base = vdup_n_u8(max_base_x);
+ const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
+ const uint8x8_t block_step = vdup_n_u8(8);
+
+ int top_x = xstep;
+ int y = 0;
+ do {
+ const int top_base_x = top_x >> 6;
+ const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+ uint8x8_t base_v = vadd_u8(vdup_n_u8(top_base_x), all);
+ int x = 0;
+ // Only calculate a block of 8 when at least one of the output values is
+ // within range. Otherwise it can read off the end of |top|.
+ const int must_calculate_width =
+ std::min(width, max_base_x - top_base_x + 7) & ~7;
+ for (; x < must_calculate_width; x += 8) {
+ const uint8x8_t max_base_mask = vclt_u8(base_v, max_base);
+
+        // Since these |xstep| values cannot be upsampled, the load is
+        // simplified.
+ const uint8x8_t left_values = vld1_u8(top + top_base_x + x);
+ const uint8x8_t right_values = vld1_u8(top + top_base_x + x + 1);
+ const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+ const uint8x8_t masked_value =
+ vbsl_u8(max_base_mask, value, top_max_base);
+
+ vst1_u8(dst + x, masked_value);
+ base_v = vadd_u8(base_v, block_step);
+ }
+ memset(dst + x, top[max_base_x], width - x);
+ dst += stride;
+ top_x += xstep;
+ } while (++y < height);
+ } else {
+ DirectionalZone1_WxH(dst, stride, width, height, top, xstep, upsampled_top);
+ }
+}
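+
+// Note on the xstep == 64 special case above: 64 >> 6 advances exactly one
+// |top| pixel per row and the interpolation shift is zero, so each row
+// reduces to a memcpy of the top row advanced by one pixel.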
+
+// Process 4 or 8 |width| by 4 or 8 |height|.
+template <int width>
+inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride,
+ const int height,
+ const uint8_t* const left_column,
+ const int base_left_y, const int ystep,
+ const int upsample_shift) {
+ assert(width == 4 || width == 8);
+ assert(height == 4 || height == 8);
+ const int scale_bits = 6 - upsample_shift;
+
+ // Zone3 never runs out of left_column values.
+ assert((width + height - 1) << upsample_shift > // max_base_y
+ ((ystep * width) >> scale_bits) +
+ (/* base_step */ 1 << upsample_shift) *
+ (height - 1)); // left_base_y
+
+ // Limited improvement for 8x8. ~20% faster for 64x64.
+ const uint8x8_t all = vcreate_u8(0x0706050403020100);
+ const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
+ const uint8x8_t base_step = upsample_shift ? even : all;
+ const uint8x8_t right_step = vadd_u8(base_step, vdup_n_u8(1));
+
+ uint8_t* dst = dest;
+ uint8x8_t left_v[8], right_v[8], value_v[8];
+ const uint8_t* const left = left_column;
+
+ const int index_0 = base_left_y;
+ LoadStepwise(left + (index_0 >> scale_bits), base_step, right_step,
+ &left_v[0], &right_v[0]);
+ value_v[0] = WeightedBlend(left_v[0], right_v[0],
+ ((index_0 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_1 = base_left_y + ystep;
+ LoadStepwise(left + (index_1 >> scale_bits), base_step, right_step,
+ &left_v[1], &right_v[1]);
+ value_v[1] = WeightedBlend(left_v[1], right_v[1],
+ ((index_1 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_2 = base_left_y + ystep * 2;
+ LoadStepwise(left + (index_2 >> scale_bits), base_step, right_step,
+ &left_v[2], &right_v[2]);
+ value_v[2] = WeightedBlend(left_v[2], right_v[2],
+ ((index_2 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_3 = base_left_y + ystep * 3;
+ LoadStepwise(left + (index_3 >> scale_bits), base_step, right_step,
+ &left_v[3], &right_v[3]);
+ value_v[3] = WeightedBlend(left_v[3], right_v[3],
+ ((index_3 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_4 = base_left_y + ystep * 4;
+ LoadStepwise(left + (index_4 >> scale_bits), base_step, right_step,
+ &left_v[4], &right_v[4]);
+ value_v[4] = WeightedBlend(left_v[4], right_v[4],
+ ((index_4 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_5 = base_left_y + ystep * 5;
+ LoadStepwise(left + (index_5 >> scale_bits), base_step, right_step,
+ &left_v[5], &right_v[5]);
+ value_v[5] = WeightedBlend(left_v[5], right_v[5],
+ ((index_5 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_6 = base_left_y + ystep * 6;
+ LoadStepwise(left + (index_6 >> scale_bits), base_step, right_step,
+ &left_v[6], &right_v[6]);
+ value_v[6] = WeightedBlend(left_v[6], right_v[6],
+ ((index_6 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_7 = base_left_y + ystep * 7;
+ LoadStepwise(left + (index_7 >> scale_bits), base_step, right_step,
+ &left_v[7], &right_v[7]);
+ value_v[7] = WeightedBlend(left_v[7], right_v[7],
+ ((index_7 << upsample_shift) & 0x3F) >> 1);
+
+ // 8x8 transpose.
+ const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(value_v[0], value_v[4]),
+ vcombine_u8(value_v[1], value_v[5]));
+ const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(value_v[2], value_v[6]),
+ vcombine_u8(value_v[3], value_v[7]));
+
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+
+ const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c1.val[0]));
+ const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c1.val[1]));
+
+ if (width == 4) {
+ StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[0])));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[0])));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[0])));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[0])));
+ if (height == 4) return;
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[1])));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[1])));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[1])));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[1])));
+ } else {
+ vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[0])));
+ dst += stride;
+ vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[0])));
+ dst += stride;
+ vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[0])));
+ dst += stride;
+ vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[0])));
+ if (height == 4) return;
+ dst += stride;
+ vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[1])));
+ dst += stride;
+ vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[1])));
+ dst += stride;
+ vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[1])));
+ dst += stride;
+ vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[1])));
+ }
+}
+
+// Because the source values "move backwards" as the row index increases, the
+// indices derived from ystep are generally negative. This is accommodated by
+// making sure the relative indices are within [-15, 0] when the function is
+// called, and sliding them into the inclusive range [0, 15], relative to a
+// lower base address.
+constexpr int kPositiveIndexOffset = 15;
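+
+// Worked example (values assumed for illustration): with ystep == 128 the
+// per-lane offsets are {0, -2, -4, ..., -14}; adding kPositiveIndexOffset
+// gives table indices {15, 13, ..., 1}, so a single 16-byte load starting 15
+// bytes below the current row position covers every lane.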
+
+// Process 4 or 8 |width| by any |height|.
+template <int width>
+inline void DirectionalZone2FromLeftCol_WxH(uint8_t* dst,
+ const ptrdiff_t stride,
+ const int height,
+ const uint8_t* const left_column,
+ const int16x8_t left_y,
+ const int upsample_shift) {
+ assert(width == 4 || width == 8);
+
+ // The shift argument must be a constant.
+ int16x8_t offset_y, shift_upsampled = left_y;
+ if (upsample_shift) {
+ offset_y = vshrq_n_s16(left_y, 5);
+ shift_upsampled = vshlq_n_s16(shift_upsampled, 1);
+ } else {
+ offset_y = vshrq_n_s16(left_y, 6);
+ }
+
+ // Select values to the left of the starting point.
+ // The 15th element (and 16th) will be all the way at the end, to the right.
+ // With a negative ystep everything else will be "left" of them.
+ // This supports cumulative steps up to 15. We could support up to 16 by doing
+ // separate loads for |left_values| and |right_values|. vtbl supports 2 Q
+ // registers as input which would allow for cumulative offsets of 32.
+ const int16x8_t sampler =
+ vaddq_s16(offset_y, vdupq_n_s16(kPositiveIndexOffset));
+ const uint8x8_t left_values = vqmovun_s16(sampler);
+ const uint8x8_t right_values = vadd_u8(left_values, vdup_n_u8(1));
+
+ const int16x8_t shift_masked = vandq_s16(shift_upsampled, vdupq_n_s16(0x3f));
+ const uint8x8_t shift_mul = vreinterpret_u8_s8(vshrn_n_s16(shift_masked, 1));
+ const uint8x8_t inv_shift_mul = vsub_u8(vdup_n_u8(32), shift_mul);
+
+ int y = 0;
+ do {
+ uint8x8_t src_left, src_right;
+ LoadStepwise(left_column - kPositiveIndexOffset + (y << upsample_shift),
+ left_values, right_values, &src_left, &src_right);
+ const uint8x8_t val =
+ WeightedBlend(src_left, src_right, inv_shift_mul, shift_mul);
+
+ if (width == 4) {
+ StoreLo4(dst, val);
+ } else {
+ vst1_u8(dst, val);
+ }
+ dst += stride;
+ } while (++y < height);
+}
+
+// Process 4 or 8 |width| by any |height|.
+template <int width>
+inline void DirectionalZone1Blend_WxH(uint8_t* dest, const ptrdiff_t stride,
+ const int height,
+ const uint8_t* const top_row,
+ int zone_bounds, int top_x,
+ const int xstep,
+ const int upsample_shift) {
+ assert(width == 4 || width == 8);
+
+ const int scale_bits_x = 6 - upsample_shift;
+
+ const uint8x8_t all = vcreate_u8(0x0706050403020100);
+ const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
+ const uint8x8_t base_step = upsample_shift ? even : all;
+ const uint8x8_t right_step = vadd_u8(base_step, vdup_n_u8(1));
+
+ int y = 0;
+ do {
+ const uint8_t* const src = top_row + (top_x >> scale_bits_x);
+ uint8x8_t left, right;
+ LoadStepwise(src, base_step, right_step, &left, &right);
+
+ const uint8_t shift = ((top_x << upsample_shift) & 0x3f) >> 1;
+ const uint8x8_t val = WeightedBlend(left, right, shift);
+
+ uint8x8_t dst_blend = vld1_u8(dest);
+ // |zone_bounds| values can be negative.
+ uint8x8_t blend =
+ vcge_s8(vreinterpret_s8_u8(all), vdup_n_s8((zone_bounds >> 6)));
+ uint8x8_t output = vbsl_u8(blend, val, dst_blend);
+
+ if (width == 4) {
+ StoreLo4(dest, output);
+ } else {
+ vst1_u8(dest, output);
+ }
+ dest += stride;
+ zone_bounds += xstep;
+ top_x -= xstep;
+ } while (++y < height);
+}
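+
+// Worked example of the blend mask above (illustrative numbers): when
+// zone_bounds >> 6 == 3, lanes 0..2 keep the left-column prediction already
+// in |dest| and lanes 3..7 are overwritten with the top-row blend. A negative
+// |zone_bounds| selects the top-row blend for every lane.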
+
+// The height at which a load of 16 bytes will not contain enough source pixels
+// from |left_column| to supply an accurate row when computing 8 pixels at a
+// time. The values are found by inspection. By coincidence, all angles that
+// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
+// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
+constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
+ 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
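+
+// For illustration: ystep values in [128, 191] index entry 2 (== 16), so for
+// those angles any rows at y >= 16 fall back to the slower left-column path
+// instead of the shuffle approach.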
+
+// 7.11.2.4 (8) 90 < angle < 180
+// The strategy for these functions (4xH and 8+xH) is to know how many blocks
+// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
+// then handle only blocks that take from |left_ptr|. Additionally, a fast
+// index-shuffle approach is used for pred values from |left_column| in sections
+// that permit it.
+inline void DirectionalZone2_4xH(uint8_t* dst, const ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const uint8_t* const left_column,
+ const int height, const int xstep,
+ const int ystep, const bool upsampled_top,
+ const bool upsampled_left) {
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+ // Helper vector.
+ const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  // Loop incrementers for moving by block (4xN). The vertical loop still steps
+  // by 8; if the height is only 4 it finishes in the first iteration.
+ const ptrdiff_t stride8 = stride << 3;
+ const int xstep8 = xstep << 3;
+
+ const int min_height = (height == 4) ? 4 : 8;
+
+ // All columns from |min_top_only_x| to the right will only need |top_row| to
+ // compute and can therefore call the Zone1 functions. This assumes |xstep| is
+ // at least 3.
+ assert(xstep >= 3);
+ const int min_top_only_x = std::min((height * xstep) >> 6, /* width */ 4);
+
+ // For steep angles, the source pixels from |left_column| may not fit in a
+ // 16-byte load for shuffling.
+ // TODO(petersonab): Find a more precise formula for this subject to x.
+ // TODO(johannkoenig): Revisit this for |width| == 4.
+ const int max_shuffle_height =
+ std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);
+
+  // Offsets the original zone bound value to simplify the test
+  // x < (y + 1) * xstep / 64 - 1.
+ int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+
+ // If the 64 scaling is regarded as a decimal point, the first value of the
+ // left_y vector omits the portion which is covered under the left_column
+ // offset. The following values need the full ystep as a relative offset.
+ int16x8_t left_y = vmulq_n_s16(zero_to_seven, -ystep);
+ left_y = vaddq_s16(left_y, vdupq_n_s16(-ystep_remainder));
+
+ // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
+ if (min_top_only_x > 0) {
+ // Round down to the nearest multiple of 8.
+ // TODO(johannkoenig): This never hits for Wx4 blocks but maybe it should.
+ const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7;
+ DirectionalZone1_WxH<4>(dst, stride, max_top_only_y, top_row, -xstep,
+ upsampled_top);
+
+ if (max_top_only_y == height) return;
+
+ int y = max_top_only_y;
+ dst += stride * y;
+ const int xstep_y = xstep * y;
+
+ // All rows from |min_left_only_y| down for this set of columns only need
+ // |left_column| to compute.
+ const int min_left_only_y = std::min((4 << 6) / xstep, height);
+ // At high angles such that min_left_only_y < 8, ystep is low and xstep is
+ // high. This means that max_shuffle_height is unbounded and xstep_bounds
+ // will overflow in 16 bits. This is prevented by stopping the first
+ // blending loop at min_left_only_y for such cases, which means we skip over
+ // the second blending loop as well.
+ const int left_shuffle_stop_y =
+ std::min(max_shuffle_height, min_left_only_y);
+ int xstep_bounds = xstep_bounds_base + xstep_y;
+ int top_x = -xstep - xstep_y;
+
+ // +8 increment is OK because if height is 4 this only goes once.
+ for (; y < left_shuffle_stop_y;
+ y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+ DirectionalZone2FromLeftCol_WxH<4>(
+ dst, stride, min_height,
+ left_column + ((y - left_base_increment) << upsample_left_shift),
+ left_y, upsample_left_shift);
+
+ DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row,
+ xstep_bounds, top_x, xstep,
+ upsample_top_shift);
+ }
+
+ // Pick up from the last y-value, using the slower but secure method for
+ // left prediction.
+ const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
+ for (; y < min_left_only_y;
+ y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+ DirectionalZone3_WxH<4>(
+ dst, stride, min_height,
+ left_column + ((y - left_base_increment) << upsample_left_shift),
+ base_left_y, -ystep, upsample_left_shift);
+
+ DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row,
+ xstep_bounds, top_x, xstep,
+ upsample_top_shift);
+ }
+ // Loop over y for left_only rows.
+ for (; y < height; y += 8, dst += stride8) {
+ DirectionalZone3_WxH<4>(
+ dst, stride, min_height,
+ left_column + ((y - left_base_increment) << upsample_left_shift),
+ base_left_y, -ystep, upsample_left_shift);
+ }
+ } else {
+ DirectionalZone1_WxH<4>(dst, stride, height, top_row, -xstep,
+ upsampled_top);
+ }
+}
+
+// Process a multiple of 8 |width|.
+inline void DirectionalZone2_8(uint8_t* const dst, const ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const uint8_t* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep,
+ const bool upsampled_top,
+ const bool upsampled_left) {
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+ // Helper vector.
+ const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  // Loop incrementers for moving by block (8x8). This function handles blocks
+  // with height 4 as well; those are computed in a single pass, so these
+  // variables go unused in that case.
+ const ptrdiff_t stride8 = stride << 3;
+ const int xstep8 = xstep << 3;
+ const int ystep8 = ystep << 3;
+
+ // Process Wx4 blocks.
+ const int min_height = (height == 4) ? 4 : 8;
+
+ // All columns from |min_top_only_x| to the right will only need |top_row| to
+ // compute and can therefore call the Zone1 functions. This assumes |xstep| is
+ // at least 3.
+ assert(xstep >= 3);
+ const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+ // For steep angles, the source pixels from |left_column| may not fit in a
+ // 16-byte load for shuffling.
+ // TODO(petersonab): Find a more precise formula for this subject to x.
+ const int max_shuffle_height =
+ std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);
+
+  // Offsets the original zone bound value to simplify the test
+  // x < (y + 1) * xstep / 64 - 1.
+ int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+
+ const int left_base_increment8 = ystep8 >> 6;
+ const int ystep_remainder8 = ystep8 & 0x3F;
+ const int16x8_t increment_left8 = vdupq_n_s16(ystep_remainder8);
+
+ // If the 64 scaling is regarded as a decimal point, the first value of the
+ // left_y vector omits the portion which is covered under the left_column
+  // offset. The following values need the full ystep as a relative offset.
+ int16x8_t left_y = vmulq_n_s16(zero_to_seven, -ystep);
+ left_y = vaddq_s16(left_y, vdupq_n_s16(-ystep_remainder));
+
+  // This loop treats each set of 8 columns in 3 stages with y-value boundaries.
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
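+  // Illustrative numbers (assumed for this example): with x == 0, xstep == 32
+  // and height == 32, max_top_only_y is 0 and min_left_only_y is 16, so rows
+  // 0..15 blend top and left sources and rows 16..31 come from |left_column|
+  // alone.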
+ int x = 0;
+ for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
+ xstep_bounds_base -= (8 << 6),
+ left_y = vsubq_s16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ uint8_t* dst_x = dst + x;
+
+ // Round down to the nearest multiple of 8.
+ const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
+ DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y,
+ top_row + (x << upsample_top_shift), -xstep,
+ upsampled_top);
+
+ if (max_top_only_y == height) continue;
+
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+
+ // All rows from |min_left_only_y| down for this set of columns only need
+ // |left_column| to compute.
+ const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
+ // At high angles such that min_left_only_y < 8, ystep is low and xstep is
+ // high. This means that max_shuffle_height is unbounded and xstep_bounds
+ // will overflow in 16 bits. This is prevented by stopping the first
+ // blending loop at min_left_only_y for such cases, which means we skip over
+ // the second blending loop as well.
+ const int left_shuffle_stop_y =
+ std::min(max_shuffle_height, min_left_only_y);
+ int xstep_bounds = xstep_bounds_base + xstep_y;
+ int top_x = -xstep - xstep_y;
+
+ for (; y < left_shuffle_stop_y;
+ y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+ DirectionalZone2FromLeftCol_WxH<8>(
+ dst_x, stride, min_height,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y,
+ upsample_left_shift);
+
+ DirectionalZone1Blend_WxH<8>(
+ dst_x, stride, min_height, top_row + (x << upsample_top_shift),
+ xstep_bounds, top_x, xstep, upsample_top_shift);
+ }
+
+ // Pick up from the last y-value, using the slower but secure method for
+ // left prediction.
+ const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
+ for (; y < min_left_only_y;
+ y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+ DirectionalZone3_WxH<8>(
+ dst_x, stride, min_height,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep, upsample_left_shift);
+
+ DirectionalZone1Blend_WxH<8>(
+ dst_x, stride, min_height, top_row + (x << upsample_top_shift),
+ xstep_bounds, top_x, xstep, upsample_top_shift);
+ }
+ // Loop over y for left_only rows.
+ for (; y < height; y += 8, dst_x += stride8) {
+ DirectionalZone3_WxH<8>(
+ dst_x, stride, min_height,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep, upsample_left_shift);
+ }
+ }
+ // TODO(johannkoenig): May be able to remove this branch.
+ if (x < width) {
+ DirectionalZone1_WxH(dst + x, stride, width - x, height,
+ top_row + (x << upsample_top_shift), -xstep,
+ upsampled_top);
+ }
+}
+
+void DirectionalIntraPredictorZone2_NEON(
+ void* const dest, const ptrdiff_t stride, const void* const top_row,
+ const void* const left_column, const int width, const int height,
+ const int xstep, const int ystep, const bool upsampled_top,
+ const bool upsampled_left) {
+ // Increasing the negative buffer for this function allows more rows to be
+ // processed at a time without branching in an inner loop to check the base.
+ uint8_t top_buffer[288];
+ uint8_t left_buffer[288];
+ memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
+ memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
+ const uint8_t* top_ptr = top_buffer + 144;
+ const uint8_t* left_ptr = left_buffer + 144;
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ if (width == 4) {
+ DirectionalZone2_4xH(dst, stride, top_ptr, left_ptr, height, xstep, ystep,
+ upsampled_top, upsampled_left);
+ } else {
+ DirectionalZone2_8(dst, stride, top_ptr, left_ptr, width, height, xstep,
+ ystep, upsampled_top, upsampled_left);
+ }
+}
+
+void DirectionalIntraPredictorZone3_NEON(void* const dest,
+ const ptrdiff_t stride,
+ const void* const left_column,
+ const int width, const int height,
+ const int ystep,
+ const bool upsampled_left) {
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+
+ assert(ystep > 0);
+
+ const int upsample_shift = static_cast<int>(upsampled_left);
+ const int scale_bits = 6 - upsample_shift;
+ const int base_step = 1 << upsample_shift;
+
+ if (width == 4 || height == 4) {
+ // This block can handle all sizes but the specializations for other sizes
+ // are faster.
+ const uint8x8_t all = vcreate_u8(0x0706050403020100);
+ const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
+ const uint8x8_t base_step_v = upsampled_left ? even : all;
+ const uint8x8_t right_step = vadd_u8(base_step_v, vdup_n_u8(1));
+
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+ dst += y * stride + x;
+ uint8x8_t left_v[4], right_v[4], value_v[4];
+ const int ystep_base = ystep * x;
+ const int offset = y * base_step;
+
+ const int index_0 = ystep_base + ystep * 1;
+ LoadStepwise(left + offset + (index_0 >> scale_bits), base_step_v,
+ right_step, &left_v[0], &right_v[0]);
+ value_v[0] = WeightedBlend(left_v[0], right_v[0],
+ ((index_0 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_1 = ystep_base + ystep * 2;
+ LoadStepwise(left + offset + (index_1 >> scale_bits), base_step_v,
+ right_step, &left_v[1], &right_v[1]);
+ value_v[1] = WeightedBlend(left_v[1], right_v[1],
+ ((index_1 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_2 = ystep_base + ystep * 3;
+ LoadStepwise(left + offset + (index_2 >> scale_bits), base_step_v,
+ right_step, &left_v[2], &right_v[2]);
+ value_v[2] = WeightedBlend(left_v[2], right_v[2],
+ ((index_2 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_3 = ystep_base + ystep * 4;
+ LoadStepwise(left + offset + (index_3 >> scale_bits), base_step_v,
+ right_step, &left_v[3], &right_v[3]);
+ value_v[3] = WeightedBlend(left_v[3], right_v[3],
+ ((index_3 << upsample_shift) & 0x3F) >> 1);
+
+ // 8x4 transpose.
+ const uint8x8x2_t b0 = vtrn_u8(value_v[0], value_v[1]);
+ const uint8x8x2_t b1 = vtrn_u8(value_v[2], value_v[3]);
+
+ const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u8(b0.val[0]),
+ vreinterpret_u16_u8(b1.val[0]));
+ const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u8(b0.val[1]),
+ vreinterpret_u16_u8(b1.val[1]));
+
+ StoreLo4(dst, vreinterpret_u8_u16(c0.val[0]));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u16(c1.val[0]));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u16(c0.val[1]));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u16(c1.val[1]));
+
+ if (height > 4) {
+ dst += stride;
+ StoreHi4(dst, vreinterpret_u8_u16(c0.val[0]));
+ dst += stride;
+ StoreHi4(dst, vreinterpret_u8_u16(c1.val[0]));
+ dst += stride;
+ StoreHi4(dst, vreinterpret_u8_u16(c0.val[1]));
+ dst += stride;
+ StoreHi4(dst, vreinterpret_u8_u16(c1.val[1]));
+ }
+ x += 4;
+ } while (x < width);
+ y += 8;
+ } while (y < height);
+ } else { // 8x8 at a time.
+ // Limited improvement for 8x8. ~20% faster for 64x64.
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+ dst += y * stride + x;
+ const int ystep_base = ystep * (x + 1);
+
+ DirectionalZone3_WxH<8>(dst, stride, 8, left + (y << upsample_shift),
+ ystep_base, ystep, upsample_shift);
+ x += 8;
+ } while (x < width);
+ y += 8;
+ } while (y < height);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
+ dsp->directional_intra_predictor_zone2 = DirectionalIntraPredictorZone2_NEON;
+ dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void IntraPredDirectionalInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredDirectionalInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/intrapred_filter_intra_neon.cc b/src/dsp/arm/intrapred_filter_intra_neon.cc
new file mode 100644
index 0000000..411708e
--- /dev/null
+++ b/src/dsp/arm/intrapred_filter_intra_neon.cc
@@ -0,0 +1,176 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+
+namespace low_bitdepth {
+namespace {
+
+// Transpose kFilterIntraTaps and convert the first row to unsigned values.
+//
+// With the original orientation we could multiply all the input values by a
+// single tap, but that required gathering the inputs into one vector, which
+// needs expensive setup operations (shifts, vext, vtbl). All the elements of
+// the result then had to be summed (easy on A64 - vaddvq_s16), with the
+// shifting, rounding, and clamping done in GP registers.
+//
+// Switching to unsigned values allows multiplying the 8 bit inputs directly.
+// If any tap value were negative the inputs would first need vmovl_u8 so that
+// the results kept the proper sign.
+//
+// This is accounted for when summing the values by subtracting the product of
+// the first row, whose taps are stored as the magnitudes of negative values.
+alignas(8) constexpr uint8_t kTransposedTaps[kNumFilterIntraPredictors][7][8] =
+ {{{6, 5, 3, 3, 4, 3, 3, 3}, // Original values are negative.
+ {10, 2, 1, 1, 6, 2, 2, 1},
+ {0, 10, 1, 1, 0, 6, 2, 2},
+ {0, 0, 10, 2, 0, 0, 6, 2},
+ {0, 0, 0, 10, 0, 0, 0, 6},
+ {12, 9, 7, 5, 2, 2, 2, 3},
+ {0, 0, 0, 0, 12, 9, 7, 5}},
+ {{10, 6, 4, 2, 10, 6, 4, 2}, // Original values are negative.
+ {16, 0, 0, 0, 16, 0, 0, 0},
+ {0, 16, 0, 0, 0, 16, 0, 0},
+ {0, 0, 16, 0, 0, 0, 16, 0},
+ {0, 0, 0, 16, 0, 0, 0, 16},
+ {10, 6, 4, 2, 0, 0, 0, 0},
+ {0, 0, 0, 0, 10, 6, 4, 2}},
+ {{8, 8, 8, 8, 4, 4, 4, 4}, // Original values are negative.
+ {8, 0, 0, 0, 4, 0, 0, 0},
+ {0, 8, 0, 0, 0, 4, 0, 0},
+ {0, 0, 8, 0, 0, 0, 4, 0},
+ {0, 0, 0, 8, 0, 0, 0, 4},
+ {16, 16, 16, 16, 0, 0, 0, 0},
+ {0, 0, 0, 0, 16, 16, 16, 16}},
+ {{2, 1, 1, 0, 1, 1, 1, 1}, // Original values are negative.
+ {8, 3, 2, 1, 4, 3, 2, 2},
+ {0, 8, 3, 2, 0, 4, 3, 2},
+ {0, 0, 8, 3, 0, 0, 4, 3},
+ {0, 0, 0, 8, 0, 0, 0, 4},
+ {10, 6, 4, 2, 3, 4, 4, 3},
+ {0, 0, 0, 0, 10, 6, 4, 3}},
+ {{12, 10, 9, 8, 10, 9, 8, 7}, // Original values are negative.
+ {14, 0, 0, 0, 12, 1, 0, 0},
+ {0, 14, 0, 0, 0, 12, 0, 0},
+ {0, 0, 14, 0, 0, 0, 12, 1},
+ {0, 0, 0, 14, 0, 0, 0, 12},
+ {14, 12, 11, 10, 0, 0, 1, 1},
+ {0, 0, 0, 0, 14, 12, 11, 9}}};
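+
+// Scalar sketch of the computation above for a single output lane. This is an
+// illustrative assumption written for documentation; it is not called by the
+// NEON path. Row 0 of |taps| stores the magnitudes of the negative top-left
+// taps, so its product is subtracted before rounding.
+inline uint8_t FilterIntraPixelScalar(const uint8_t taps[7][8], const int lane,
+                                      const uint8_t top_left,
+                                      const uint8_t top[4],
+                                      const uint8_t left[2]) {
+  int sum = 0;
+  for (int i = 0; i < 4; ++i) sum += taps[i + 1][lane] * top[i];
+  for (int i = 0; i < 2; ++i) sum += taps[i + 5][lane] * left[i];
+  sum -= taps[0][lane] * top_left;
+  // Assumes an arithmetic shift for negative sums, matching vrshrq_n_s16().
+  return static_cast<uint8_t>(Clip3((sum + 8) >> 4, 0, 255));
+}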
+
+void FilterIntraPredictor_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column,
+ FilterIntraPredictor pred, int width,
+ int height) {
+ const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+ const uint8_t* const left = static_cast<const uint8_t*>(left_column);
+
+ assert(width <= 32 && height <= 32);
+
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ uint8x8_t transposed_taps[7];
+ for (int i = 0; i < 7; ++i) {
+ transposed_taps[i] = vld1_u8(kTransposedTaps[pred][i]);
+ }
+
+ uint8_t relative_top_left = top[-1];
+ const uint8_t* relative_top = top;
+ uint8_t relative_left[2] = {left[0], left[1]};
+
+ int y = 0;
+ do {
+ uint8_t* row_dst = dst;
+ int x = 0;
+ do {
+ uint16x8_t sum = vdupq_n_u16(0);
+ const uint16x8_t subtrahend =
+ vmull_u8(transposed_taps[0], vdup_n_u8(relative_top_left));
+ for (int i = 1; i < 5; ++i) {
+ sum = vmlal_u8(sum, transposed_taps[i], vdup_n_u8(relative_top[i - 1]));
+ }
+ for (int i = 5; i < 7; ++i) {
+ sum =
+ vmlal_u8(sum, transposed_taps[i], vdup_n_u8(relative_left[i - 5]));
+ }
+
+ const int16x8_t sum_signed =
+ vreinterpretq_s16_u16(vsubq_u16(sum, subtrahend));
+ const int16x8_t sum_shifted = vrshrq_n_s16(sum_signed, 4);
+
+ uint8x8_t sum_saturated = vqmovun_s16(sum_shifted);
+
+ StoreLo4(row_dst, sum_saturated);
+ StoreHi4(row_dst + stride, sum_saturated);
+
+      // Progress across the row.
+ relative_top_left = relative_top[3];
+ relative_top += 4;
+ relative_left[0] = row_dst[3];
+ relative_left[1] = row_dst[3 + stride];
+ row_dst += 4;
+ x += 4;
+ } while (x < width);
+
+ // Progress down.
+ relative_top_left = left[y + 1];
+ relative_top = dst + stride;
+ relative_left[0] = left[y + 2];
+ relative_left[1] = left[y + 3];
+
+ dst += 2 * stride;
+ y += 2;
+ } while (y < height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->filter_intra_predictor = FilterIntraPredictor_NEON;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void IntraPredFilterIntraInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredFilterIntraInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/intrapred_neon.cc b/src/dsp/arm/intrapred_neon.cc
new file mode 100644
index 0000000..c967d82
--- /dev/null
+++ b/src/dsp/arm/intrapred_neon.cc
@@ -0,0 +1,1144 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// DcPredFuncs_NEON
+
+using DcSumFunc = uint32x2_t (*)(const void* ref_0, const int ref_0_size_log2,
+ const bool use_ref_1, const void* ref_1,
+ const int ref_1_size_log2);
+using DcStoreFunc = void (*)(void* dest, ptrdiff_t stride, const uint32x2_t dc);
+
+// DC intra-predictors for square and rectangular blocks.
+template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
+ DcStoreFunc storefn>
+struct DcPredFuncs_NEON {
+ DcPredFuncs_NEON() = delete;
+
+ static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+};
+
+template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
+ DcStoreFunc storefn>
+void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn,
+ storefn>::DcTop(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* /*left_column*/) {
+ const uint32x2_t sum = sumfn(top_row, block_width_log2, false, nullptr, 0);
+ const uint32x2_t dc = vrshr_n_u32(sum, block_width_log2);
+ storefn(dest, stride, dc);
+}
+
+template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
+ DcStoreFunc storefn>
+void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn,
+ storefn>::DcLeft(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const uint32x2_t sum =
+ sumfn(left_column, block_height_log2, false, nullptr, 0);
+ const uint32x2_t dc = vrshr_n_u32(sum, block_height_log2);
+ storefn(dest, stride, dc);
+}
+
+template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
+ DcStoreFunc storefn>
+void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::Dc(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* const left_column) {
+ const uint32x2_t sum =
+ sumfn(top_row, block_width_log2, true, left_column, block_height_log2);
+ if (block_width_log2 == block_height_log2) {
+ const uint32x2_t dc = vrshr_n_u32(sum, block_width_log2 + 1);
+ storefn(dest, stride, dc);
+ } else {
+ // TODO(johannkoenig): Compare this to mul/shift in vectors.
+ const int divisor = (1 << block_width_log2) + (1 << block_height_log2);
+ uint32_t dc = vget_lane_u32(sum, 0);
+ dc += divisor >> 1;
+ dc /= divisor;
+ storefn(dest, stride, vdup_n_u32(dc));
+ }
+}
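+
+// Worked example of the rectangular path above (illustrative sizes): for a
+// 4x8 block the divisor is 4 + 8 = 12, so dc = (sum + 6) / 12, i.e. a rounded
+// integer division rather than the single shift used for square blocks.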
+
+// Sum all the elements in the vector into the low 32 bits.
+inline uint32x2_t Sum(const uint16x4_t val) {
+ const uint32x2_t sum = vpaddl_u16(val);
+ return vpadd_u32(sum, sum);
+}
+
+// Sum all the elements in the vector into the low 32 bits.
+inline uint32x2_t Sum(const uint16x8_t val) {
+ const uint32x4_t sum_0 = vpaddlq_u16(val);
+ const uint64x2_t sum_1 = vpaddlq_u32(sum_0);
+ return vadd_u32(vget_low_u32(vreinterpretq_u32_u64(sum_1)),
+ vget_high_u32(vreinterpretq_u32_u64(sum_1)));
+}
+
+} // namespace
+
+//------------------------------------------------------------------------------
+namespace low_bitdepth {
+namespace {
+
+// Add and expand the elements in the |val_[01]| to uint16_t but do not sum the
+// entire vector.
+inline uint16x8_t Add(const uint8x16_t val_0, const uint8x16_t val_1) {
+ const uint16x8_t sum_0 = vpaddlq_u8(val_0);
+ const uint16x8_t sum_1 = vpaddlq_u8(val_1);
+ return vaddq_u16(sum_0, sum_1);
+}
+
+// Add and expand the elements in the |val_[0123]| to uint16_t but do not sum
+// the entire vector.
+inline uint16x8_t Add(const uint8x16_t val_0, const uint8x16_t val_1,
+ const uint8x16_t val_2, const uint8x16_t val_3) {
+ const uint16x8_t sum_0 = Add(val_0, val_1);
+ const uint16x8_t sum_1 = Add(val_2, val_3);
+ return vaddq_u16(sum_0, sum_1);
+}
+
+// Load and combine 32 uint8_t values.
+inline uint16x8_t LoadAndAdd32(const uint8_t* buf) {
+ const uint8x16_t val_0 = vld1q_u8(buf);
+ const uint8x16_t val_1 = vld1q_u8(buf + 16);
+ return Add(val_0, val_1);
+}
+
+// Load and combine 64 uint8_t values.
+inline uint16x8_t LoadAndAdd64(const uint8_t* buf) {
+ const uint8x16_t val_0 = vld1q_u8(buf);
+ const uint8x16_t val_1 = vld1q_u8(buf + 16);
+ const uint8x16_t val_2 = vld1q_u8(buf + 32);
+ const uint8x16_t val_3 = vld1q_u8(buf + 48);
+ return Add(val_0, val_1, val_2, val_3);
+}
+
+// |ref_[01]| each point to 1 << |ref_[01]_size_log2| packed uint8_t values.
+// If |use_ref_1| is false then only sum |ref_0|.
+// For |ref_[01]_size_log2| == 2 this relies on |ref_[01]| being aligned to
+// uint32_t.
+inline uint32x2_t DcSum_NEON(const void* ref_0, const int ref_0_size_log2,
+ const bool use_ref_1, const void* ref_1,
+ const int ref_1_size_log2) {
+ const auto* const ref_0_u8 = static_cast<const uint8_t*>(ref_0);
+ const auto* const ref_1_u8 = static_cast<const uint8_t*>(ref_1);
+ if (ref_0_size_log2 == 2) {
+ uint8x8_t val = Load4(ref_0_u8);
+ if (use_ref_1) {
+ if (ref_1_size_log2 == 2) { // 4x4
+ val = Load4<1>(ref_1_u8, val);
+ return Sum(vpaddl_u8(val));
+ } else if (ref_1_size_log2 == 3) { // 4x8
+ const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+ const uint16x4_t sum_0 = vpaddl_u8(val);
+ const uint16x4_t sum_1 = vpaddl_u8(val_1);
+ return Sum(vadd_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 4) { // 4x16
+ const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+ return Sum(vaddw_u8(vpaddlq_u8(val_1), val));
+ }
+ }
+ // 4x1
+ const uint16x4_t sum = vpaddl_u8(val);
+ return vpaddl_u16(sum);
+ } else if (ref_0_size_log2 == 3) {
+ const uint8x8_t val_0 = vld1_u8(ref_0_u8);
+ if (use_ref_1) {
+ if (ref_1_size_log2 == 2) { // 8x4
+ const uint8x8_t val_1 = Load4(ref_1_u8);
+ const uint16x4_t sum_0 = vpaddl_u8(val_0);
+ const uint16x4_t sum_1 = vpaddl_u8(val_1);
+ return Sum(vadd_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 3) { // 8x8
+ const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+ const uint16x4_t sum_0 = vpaddl_u8(val_0);
+ const uint16x4_t sum_1 = vpaddl_u8(val_1);
+ return Sum(vadd_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 4) { // 8x16
+ const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+ return Sum(vaddw_u8(vpaddlq_u8(val_1), val_0));
+ } else if (ref_1_size_log2 == 5) { // 8x32
+ return Sum(vaddw_u8(LoadAndAdd32(ref_1_u8), val_0));
+ }
+ }
+ // 8x1
+ return Sum(vpaddl_u8(val_0));
+ } else if (ref_0_size_log2 == 4) {
+ const uint8x16_t val_0 = vld1q_u8(ref_0_u8);
+ if (use_ref_1) {
+ if (ref_1_size_log2 == 2) { // 16x4
+ const uint8x8_t val_1 = Load4(ref_1_u8);
+ return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
+ } else if (ref_1_size_log2 == 3) { // 16x8
+ const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+ return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
+ } else if (ref_1_size_log2 == 4) { // 16x16
+ const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+ return Sum(Add(val_0, val_1));
+ } else if (ref_1_size_log2 == 5) { // 16x32
+ const uint16x8_t sum_0 = vpaddlq_u8(val_0);
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 6) { // 16x64
+ const uint16x8_t sum_0 = vpaddlq_u8(val_0);
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ }
+ // 16x1
+ return Sum(vpaddlq_u8(val_0));
+ } else if (ref_0_size_log2 == 5) {
+ const uint16x8_t sum_0 = LoadAndAdd32(ref_0_u8);
+ if (use_ref_1) {
+ if (ref_1_size_log2 == 3) { // 32x8
+ const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+ return Sum(vaddw_u8(sum_0, val_1));
+ } else if (ref_1_size_log2 == 4) { // 32x16
+ const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+ const uint16x8_t sum_1 = vpaddlq_u8(val_1);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 5) { // 32x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 6) { // 32x64
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ }
+ // 32x1
+ return Sum(sum_0);
+ }
+
+ assert(ref_0_size_log2 == 6);
+ const uint16x8_t sum_0 = LoadAndAdd64(ref_0_u8);
+ if (use_ref_1) {
+ if (ref_1_size_log2 == 4) { // 64x16
+ const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+ const uint16x8_t sum_1 = vpaddlq_u8(val_1);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 5) { // 64x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 6) { // 64x64
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ }
+ // 64x1
+ return Sum(sum_0);
+}
+
+template <int width, int height>
+inline void DcStore_NEON(void* const dest, ptrdiff_t stride,
+ const uint32x2_t dc) {
+ const uint8x16_t dc_dup = vdupq_lane_u8(vreinterpret_u8_u32(dc), 0);
+ auto* dst = static_cast<uint8_t*>(dest);
+ if (width == 4) {
+ int i = height - 1;
+ do {
+ StoreLo4(dst, vget_low_u8(dc_dup));
+ dst += stride;
+ } while (--i != 0);
+ StoreLo4(dst, vget_low_u8(dc_dup));
+ } else if (width == 8) {
+ int i = height - 1;
+ do {
+ vst1_u8(dst, vget_low_u8(dc_dup));
+ dst += stride;
+ } while (--i != 0);
+ vst1_u8(dst, vget_low_u8(dc_dup));
+ } else if (width == 16) {
+ int i = height - 1;
+ do {
+ vst1q_u8(dst, dc_dup);
+ dst += stride;
+ } while (--i != 0);
+ vst1q_u8(dst, dc_dup);
+ } else if (width == 32) {
+ int i = height - 1;
+ do {
+ vst1q_u8(dst, dc_dup);
+ vst1q_u8(dst + 16, dc_dup);
+ dst += stride;
+ } while (--i != 0);
+ vst1q_u8(dst, dc_dup);
+ vst1q_u8(dst + 16, dc_dup);
+ } else {
+ assert(width == 64);
+ int i = height - 1;
+ do {
+ vst1q_u8(dst, dc_dup);
+ vst1q_u8(dst + 16, dc_dup);
+ vst1q_u8(dst + 32, dc_dup);
+ vst1q_u8(dst + 48, dc_dup);
+ dst += stride;
+ } while (--i != 0);
+ vst1q_u8(dst, dc_dup);
+ vst1q_u8(dst + 16, dc_dup);
+ vst1q_u8(dst + 32, dc_dup);
+ vst1q_u8(dst + 48, dc_dup);
+ }
+}
+
+template <int width, int height>
+inline void Paeth4Or8xN_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ auto* dest_u8 = static_cast<uint8_t*>(dest);
+ const auto* const top_row_u8 = static_cast<const uint8_t*>(top_row);
+ const auto* const left_col_u8 = static_cast<const uint8_t*>(left_column);
+
+ const uint8x8_t top_left = vdup_n_u8(top_row_u8[-1]);
+ const uint16x8_t top_left_x2 = vdupq_n_u16(top_row_u8[-1] + top_row_u8[-1]);
+ uint8x8_t top;
+ if (width == 4) {
+ top = Load4(top_row_u8);
+ } else { // width == 8
+ top = vld1_u8(top_row_u8);
+ }
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t left = vdup_n_u8(left_col_u8[y]);
+
+ const uint8x8_t left_dist = vabd_u8(top, top_left);
+ const uint8x8_t top_dist = vabd_u8(left, top_left);
+ const uint16x8_t top_left_dist =
+ vabdq_u16(vaddl_u8(top, left), top_left_x2);
+
+ const uint8x8_t left_le_top = vcle_u8(left_dist, top_dist);
+ const uint8x8_t left_le_top_left =
+ vmovn_u16(vcleq_u16(vmovl_u8(left_dist), top_left_dist));
+ const uint8x8_t top_le_top_left =
+ vmovn_u16(vcleq_u16(vmovl_u8(top_dist), top_left_dist));
+
+ // if (left_dist <= top_dist && left_dist <= top_left_dist)
+ const uint8x8_t left_mask = vand_u8(left_le_top, left_le_top_left);
+ // dest[x] = left_column[y];
+ // Fill all the unused spaces with 'top'. They will be overwritten when
+ // the positions for top_left are known.
+ uint8x8_t result = vbsl_u8(left_mask, left, top);
+ // else if (top_dist <= top_left_dist)
+ // dest[x] = top_row[x];
+ // Add these values to the mask. They were already set.
+ const uint8x8_t left_or_top_mask = vorr_u8(left_mask, top_le_top_left);
+ // else
+ // dest[x] = top_left;
+ result = vbsl_u8(left_or_top_mask, result, top_left);
+
+ if (width == 4) {
+ StoreLo4(dest_u8, result);
+ } else { // width == 8
+ vst1_u8(dest_u8, result);
+ }
+ dest_u8 += stride;
+ }
+}
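+
+// Scalar form of the selection above, recorded as an illustrative reference
+// (an assumption for documentation; the NEON code applies the same rule to a
+// full vector of pixels at once).
+inline int PaethAbsDiff(const int a, const int b) {
+  return (a > b) ? a - b : b - a;
+}
+
+inline uint8_t PaethScalar(const uint8_t left, const uint8_t top,
+                           const uint8_t top_left) {
+  const int left_dist = PaethAbsDiff(top, top_left);
+  const int top_dist = PaethAbsDiff(left, top_left);
+  const int top_left_dist = PaethAbsDiff(top + left, 2 * top_left);
+  if (left_dist <= top_dist && left_dist <= top_left_dist) return left;
+  if (top_dist <= top_left_dist) return top;
+  return top_left;
+}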
+
+// Calculate X distance <= TopLeft distance and pack the resulting masks into
+// a uint8x16_t.
+inline uint8x16_t XLeTopLeft(const uint8x16_t x_dist,
+ const uint16x8_t top_left_dist_low,
+ const uint16x8_t top_left_dist_high) {
+ // TODO(johannkoenig): cle() should work with vmovn(top_left_dist) instead of
+ // using movl(x_dist).
+ const uint8x8_t x_le_top_left_low =
+ vmovn_u16(vcleq_u16(vmovl_u8(vget_low_u8(x_dist)), top_left_dist_low));
+ const uint8x8_t x_le_top_left_high =
+ vmovn_u16(vcleq_u16(vmovl_u8(vget_high_u8(x_dist)), top_left_dist_high));
+ return vcombine_u8(x_le_top_left_low, x_le_top_left_high);
+}
+
+// Select the closest values and collect them.
+inline uint8x16_t SelectPaeth(const uint8x16_t top, const uint8x16_t left,
+ const uint8x16_t top_left,
+ const uint8x16_t left_le_top,
+ const uint8x16_t left_le_top_left,
+ const uint8x16_t top_le_top_left) {
+ // if (left_dist <= top_dist && left_dist <= top_left_dist)
+ const uint8x16_t left_mask = vandq_u8(left_le_top, left_le_top_left);
+ // dest[x] = left_column[y];
+ // Fill all the unused spaces with 'top'. They will be overwritten when
+ // the positions for top_left are known.
+ uint8x16_t result = vbslq_u8(left_mask, left, top);
+ // else if (top_dist <= top_left_dist)
+ // dest[x] = top_row[x];
+ // Add these values to the mask. They were already set.
+ const uint8x16_t left_or_top_mask = vorrq_u8(left_mask, top_le_top_left);
+ // else
+ // dest[x] = top_left;
+ return vbslq_u8(left_or_top_mask, result, top_left);
+}
+
+// Generate numbered and high/low versions of top_left_dist.
+#define TOP_LEFT_DIST(num) \
+ const uint16x8_t top_left_##num##_dist_low = vabdq_u16( \
+ vaddl_u8(vget_low_u8(top[num]), vget_low_u8(left)), top_left_x2); \
+ const uint16x8_t top_left_##num##_dist_high = vabdq_u16( \
+ vaddl_u8(vget_high_u8(top[num]), vget_low_u8(left)), top_left_x2)
+
+// Generate numbered versions of XLeTopLeft with x = left.
+#define LEFT_LE_TOP_LEFT(num) \
+ const uint8x16_t left_le_top_left_##num = \
+ XLeTopLeft(left_##num##_dist, top_left_##num##_dist_low, \
+ top_left_##num##_dist_high)
+
+// Generate numbered versions of XLeTopLeft with x = top.
+#define TOP_LE_TOP_LEFT(num) \
+ const uint8x16_t top_le_top_left_##num = XLeTopLeft( \
+ top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high)
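+
+// For reference, TOP_LEFT_DIST(0) expands to
+//   const uint16x8_t top_left_0_dist_low = vabdq_u16(
+//       vaddl_u8(vget_low_u8(top[0]), vget_low_u8(left)), top_left_x2);
+//   const uint16x8_t top_left_0_dist_high = vabdq_u16(
+//       vaddl_u8(vget_high_u8(top[0]), vget_low_u8(left)), top_left_x2);
+// and LEFT_LE_TOP_LEFT(0) / TOP_LE_TOP_LEFT(0) expand to the matching
+// XLeTopLeft() calls that produce left_le_top_left_0 and top_le_top_left_0.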
+
+template <int width, int height>
+inline void Paeth16PlusxN_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ auto* dest_u8 = static_cast<uint8_t*>(dest);
+ const auto* const top_row_u8 = static_cast<const uint8_t*>(top_row);
+ const auto* const left_col_u8 = static_cast<const uint8_t*>(left_column);
+
+ const uint8x16_t top_left = vdupq_n_u8(top_row_u8[-1]);
+ const uint16x8_t top_left_x2 = vdupq_n_u16(top_row_u8[-1] + top_row_u8[-1]);
+ uint8x16_t top[4];
+ top[0] = vld1q_u8(top_row_u8);
+ if (width > 16) {
+ top[1] = vld1q_u8(top_row_u8 + 16);
+ if (width == 64) {
+ top[2] = vld1q_u8(top_row_u8 + 32);
+ top[3] = vld1q_u8(top_row_u8 + 48);
+ }
+ }
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x16_t left = vdupq_n_u8(left_col_u8[y]);
+
+ const uint8x16_t top_dist = vabdq_u8(left, top_left);
+
+ const uint8x16_t left_0_dist = vabdq_u8(top[0], top_left);
+ TOP_LEFT_DIST(0);
+ const uint8x16_t left_0_le_top = vcleq_u8(left_0_dist, top_dist);
+ LEFT_LE_TOP_LEFT(0);
+ TOP_LE_TOP_LEFT(0);
+
+ const uint8x16_t result_0 =
+ SelectPaeth(top[0], left, top_left, left_0_le_top, left_le_top_left_0,
+ top_le_top_left_0);
+ vst1q_u8(dest_u8, result_0);
+
+ if (width > 16) {
+ const uint8x16_t left_1_dist = vabdq_u8(top[1], top_left);
+ TOP_LEFT_DIST(1);
+ const uint8x16_t left_1_le_top = vcleq_u8(left_1_dist, top_dist);
+ LEFT_LE_TOP_LEFT(1);
+ TOP_LE_TOP_LEFT(1);
+
+ const uint8x16_t result_1 =
+ SelectPaeth(top[1], left, top_left, left_1_le_top, left_le_top_left_1,
+ top_le_top_left_1);
+ vst1q_u8(dest_u8 + 16, result_1);
+
+ if (width == 64) {
+ const uint8x16_t left_2_dist = vabdq_u8(top[2], top_left);
+ TOP_LEFT_DIST(2);
+ const uint8x16_t left_2_le_top = vcleq_u8(left_2_dist, top_dist);
+ LEFT_LE_TOP_LEFT(2);
+ TOP_LE_TOP_LEFT(2);
+
+ const uint8x16_t result_2 =
+ SelectPaeth(top[2], left, top_left, left_2_le_top,
+ left_le_top_left_2, top_le_top_left_2);
+ vst1q_u8(dest_u8 + 32, result_2);
+
+ const uint8x16_t left_3_dist = vabdq_u8(top[3], top_left);
+ TOP_LEFT_DIST(3);
+ const uint8x16_t left_3_le_top = vcleq_u8(left_3_dist, top_dist);
+ LEFT_LE_TOP_LEFT(3);
+ TOP_LE_TOP_LEFT(3);
+
+ const uint8x16_t result_3 =
+ SelectPaeth(top[3], left, top_left, left_3_le_top,
+ left_le_top_left_3, top_le_top_left_3);
+ vst1q_u8(dest_u8 + 48, result_3);
+ }
+ }
+
+ dest_u8 += stride;
+ }
+}
+
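+// Maps each supported block size to a DcPredFuncs_NEON instantiation. The
+// first two template arguments are log2(width) and log2(height) of the block
+// (e.g. _8x32 uses <3, 5, ...>), matching the size_log2 values consumed by
+// DcSum_NEON.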
+struct DcDefs {
+ DcDefs() = delete;
+
+ using _4x4 = DcPredFuncs_NEON<2, 2, DcSum_NEON, DcStore_NEON<4, 4>>;
+ using _4x8 = DcPredFuncs_NEON<2, 3, DcSum_NEON, DcStore_NEON<4, 8>>;
+ using _4x16 = DcPredFuncs_NEON<2, 4, DcSum_NEON, DcStore_NEON<4, 16>>;
+ using _8x4 = DcPredFuncs_NEON<3, 2, DcSum_NEON, DcStore_NEON<8, 4>>;
+ using _8x8 = DcPredFuncs_NEON<3, 3, DcSum_NEON, DcStore_NEON<8, 8>>;
+ using _8x16 = DcPredFuncs_NEON<3, 4, DcSum_NEON, DcStore_NEON<8, 16>>;
+ using _8x32 = DcPredFuncs_NEON<3, 5, DcSum_NEON, DcStore_NEON<8, 32>>;
+ using _16x4 = DcPredFuncs_NEON<4, 2, DcSum_NEON, DcStore_NEON<16, 4>>;
+ using _16x8 = DcPredFuncs_NEON<4, 3, DcSum_NEON, DcStore_NEON<16, 8>>;
+ using _16x16 = DcPredFuncs_NEON<4, 4, DcSum_NEON, DcStore_NEON<16, 16>>;
+ using _16x32 = DcPredFuncs_NEON<4, 5, DcSum_NEON, DcStore_NEON<16, 32>>;
+ using _16x64 = DcPredFuncs_NEON<4, 6, DcSum_NEON, DcStore_NEON<16, 64>>;
+ using _32x8 = DcPredFuncs_NEON<5, 3, DcSum_NEON, DcStore_NEON<32, 8>>;
+ using _32x16 = DcPredFuncs_NEON<5, 4, DcSum_NEON, DcStore_NEON<32, 16>>;
+ using _32x32 = DcPredFuncs_NEON<5, 5, DcSum_NEON, DcStore_NEON<32, 32>>;
+ using _32x64 = DcPredFuncs_NEON<5, 6, DcSum_NEON, DcStore_NEON<32, 64>>;
+ using _64x16 = DcPredFuncs_NEON<6, 4, DcSum_NEON, DcStore_NEON<64, 16>>;
+ using _64x32 = DcPredFuncs_NEON<6, 5, DcSum_NEON, DcStore_NEON<64, 32>>;
+ using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
+};
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ // 4x4
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ DcDefs::_4x4::DcTop;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ DcDefs::_4x4::DcLeft;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+ DcDefs::_4x4::Dc;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+ Paeth4Or8xN_NEON<4, 4>;
+
+ // 4x8
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+ DcDefs::_4x8::DcTop;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+ DcDefs::_4x8::DcLeft;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+ DcDefs::_4x8::Dc;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+ Paeth4Or8xN_NEON<4, 8>;
+
+ // 4x16
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+ DcDefs::_4x16::DcTop;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+ DcDefs::_4x16::DcLeft;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+ DcDefs::_4x16::Dc;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+ Paeth4Or8xN_NEON<4, 16>;
+
+ // 8x4
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+ DcDefs::_8x4::DcTop;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+ DcDefs::_8x4::DcLeft;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+ DcDefs::_8x4::Dc;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+ Paeth4Or8xN_NEON<8, 4>;
+
+ // 8x8
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+ DcDefs::_8x8::DcTop;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+ DcDefs::_8x8::DcLeft;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+ DcDefs::_8x8::Dc;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+ Paeth4Or8xN_NEON<8, 8>;
+
+ // 8x16
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+ DcDefs::_8x16::DcTop;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+ DcDefs::_8x16::DcLeft;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+ DcDefs::_8x16::Dc;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+ Paeth4Or8xN_NEON<8, 16>;
+
+ // 8x32
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+ DcDefs::_8x32::DcTop;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+ DcDefs::_8x32::DcLeft;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+ DcDefs::_8x32::Dc;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+ Paeth4Or8xN_NEON<8, 32>;
+
+ // 16x4
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+ DcDefs::_16x4::DcTop;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+ DcDefs::_16x4::DcLeft;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+ DcDefs::_16x4::Dc;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<16, 4>;
+
+ // 16x8
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+ DcDefs::_16x8::DcTop;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+ DcDefs::_16x8::DcLeft;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+ DcDefs::_16x8::Dc;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<16, 8>;
+
+ // 16x16
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+ DcDefs::_16x16::DcTop;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+ DcDefs::_16x16::DcLeft;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+ DcDefs::_16x16::Dc;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<16, 16>;
+
+ // 16x32
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+ DcDefs::_16x32::DcTop;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+ DcDefs::_16x32::DcLeft;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+ DcDefs::_16x32::Dc;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<16, 32>;
+
+ // 16x64
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+ DcDefs::_16x64::DcTop;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+ DcDefs::_16x64::DcLeft;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+ DcDefs::_16x64::Dc;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<16, 64>;
+
+ // 32x8
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+ DcDefs::_32x8::DcTop;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+ DcDefs::_32x8::DcLeft;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+ DcDefs::_32x8::Dc;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<32, 8>;
+
+ // 32x16
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+ DcDefs::_32x16::DcTop;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+ DcDefs::_32x16::DcLeft;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+ DcDefs::_32x16::Dc;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<32, 16>;
+
+ // 32x32
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+ DcDefs::_32x32::DcTop;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+ DcDefs::_32x32::DcLeft;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+ DcDefs::_32x32::Dc;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<32, 32>;
+
+ // 32x64
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+ DcDefs::_32x64::DcTop;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+ DcDefs::_32x64::DcLeft;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+ DcDefs::_32x64::Dc;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<32, 64>;
+
+ // 64x16
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+ DcDefs::_64x16::DcTop;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+ DcDefs::_64x16::DcLeft;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+ DcDefs::_64x16::Dc;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<64, 16>;
+
+ // 64x32
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+ DcDefs::_64x32::DcTop;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+ DcDefs::_64x32::DcLeft;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+ DcDefs::_64x32::Dc;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<64, 32>;
+
+ // 64x64
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+ DcDefs::_64x64::DcTop;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+ DcDefs::_64x64::DcLeft;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+ DcDefs::_64x64::Dc;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<64, 64>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Add the four given vectors together lane by lane; the result is not
+// reduced to a single sum.
+inline uint16x8_t Add(const uint16x8_t val_0, const uint16x8_t val_1,
+ const uint16x8_t val_2, const uint16x8_t val_3) {
+ const uint16x8_t sum_0 = vaddq_u16(val_0, val_1);
+ const uint16x8_t sum_1 = vaddq_u16(val_2, val_3);
+ return vaddq_u16(sum_0, sum_1);
+}
+
+// Load and combine 16 uint16_t values.
+inline uint16x8_t LoadAndAdd16(const uint16_t* buf) {
+ const uint16x8_t val_0 = vld1q_u16(buf);
+ const uint16x8_t val_1 = vld1q_u16(buf + 8);
+ return vaddq_u16(val_0, val_1);
+}
+
+// Load and combine 32 uint16_t values.
+inline uint16x8_t LoadAndAdd32(const uint16_t* buf) {
+ const uint16x8_t val_0 = vld1q_u16(buf);
+ const uint16x8_t val_1 = vld1q_u16(buf + 8);
+ const uint16x8_t val_2 = vld1q_u16(buf + 16);
+ const uint16x8_t val_3 = vld1q_u16(buf + 24);
+ return Add(val_0, val_1, val_2, val_3);
+}
+
+// Load and combine 64 uint16_t values.
+inline uint16x8_t LoadAndAdd64(const uint16_t* buf) {
+ const uint16x8_t val_0 = vld1q_u16(buf);
+ const uint16x8_t val_1 = vld1q_u16(buf + 8);
+ const uint16x8_t val_2 = vld1q_u16(buf + 16);
+ const uint16x8_t val_3 = vld1q_u16(buf + 24);
+ const uint16x8_t val_4 = vld1q_u16(buf + 32);
+ const uint16x8_t val_5 = vld1q_u16(buf + 40);
+ const uint16x8_t val_6 = vld1q_u16(buf + 48);
+ const uint16x8_t val_7 = vld1q_u16(buf + 56);
+ const uint16x8_t sum_0 = Add(val_0, val_1, val_2, val_3);
+ const uint16x8_t sum_1 = Add(val_4, val_5, val_6, val_7);
+ return vaddq_u16(sum_0, sum_1);
+}
+
+// |ref_0| and |ref_1| each point to 1 << |ref_[01]_size_log2| packed uint16_t
+// values. If |use_ref_1| is false, only |ref_0| is summed.
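+// Only the raw sum is returned; the shared DcPredFuncs_NEON wrapper (defined
+// earlier in this file) is assumed to average it over the width + height
+// contributing samples to produce the DC value.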
+inline uint32x2_t DcSum_NEON(const void* ref_0, const int ref_0_size_log2,
+ const bool use_ref_1, const void* ref_1,
+ const int ref_1_size_log2) {
+ const auto* ref_0_u16 = static_cast<const uint16_t*>(ref_0);
+ const auto* ref_1_u16 = static_cast<const uint16_t*>(ref_1);
+ if (ref_0_size_log2 == 2) {
+ const uint16x4_t val_0 = vld1_u16(ref_0_u16);
+ if (use_ref_1) {
+ if (ref_1_size_log2 == 2) { // 4x4
+ const uint16x4_t val_1 = vld1_u16(ref_1_u16);
+ return Sum(vadd_u16(val_0, val_1));
+ } else if (ref_1_size_log2 == 3) { // 4x8
+ const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
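+ // Zero-pad the 4-lane |val_0| to 8 lanes so it can be added to the
+ // 8-lane |val_1|; the zero lanes do not affect the final sum.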
+ const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0);
+ return Sum(vaddq_u16(sum_0, val_1));
+ } else if (ref_1_size_log2 == 4) { // 4x16
+ const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0);
+ const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ }
+ // 4x1
+ return Sum(val_0);
+ } else if (ref_0_size_log2 == 3) {
+ const uint16x8_t val_0 = vld1q_u16(ref_0_u16);
+ if (use_ref_1) {
+ if (ref_1_size_log2 == 2) { // 8x4
+ const uint16x4_t val_1 = vld1_u16(ref_1_u16);
+ const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1);
+ return Sum(vaddq_u16(val_0, sum_1));
+ } else if (ref_1_size_log2 == 3) { // 8x8
+ const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
+ return Sum(vaddq_u16(val_0, val_1));
+ } else if (ref_1_size_log2 == 4) { // 8x16
+ const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+ return Sum(vaddq_u16(val_0, sum_1));
+ } else if (ref_1_size_log2 == 5) { // 8x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+ return Sum(vaddq_u16(val_0, sum_1));
+ }
+ }
+ // 8x1
+ return Sum(val_0);
+ } else if (ref_0_size_log2 == 4) {
+ const uint16x8_t sum_0 = LoadAndAdd16(ref_0_u16);
+ if (use_ref_1) {
+ if (ref_1_size_log2 == 2) { // 16x4
+ const uint16x4_t val_1 = vld1_u16(ref_1_u16);
+ const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 3) { // 16x8
+ const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, val_1));
+ } else if (ref_1_size_log2 == 4) { // 16x16
+ const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 5) { // 16x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 6) { // 16x64
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ }
+ // 16x1
+ return Sum(sum_0);
+ } else if (ref_0_size_log2 == 5) {
+ const uint16x8_t sum_0 = LoadAndAdd32(ref_0_u16);
+ if (use_ref_1) {
+ if (ref_1_size_log2 == 3) { // 32x8
+ const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, val_1));
+ } else if (ref_1_size_log2 == 4) { // 32x16
+ const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 5) { // 32x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 6) { // 32x64
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ }
+ // 32x1
+ return Sum(sum_0);
+ }
+
+ assert(ref_0_size_log2 == 6);
+ const uint16x8_t sum_0 = LoadAndAdd64(ref_0_u16);
+ if (use_ref_1) {
+ if (ref_1_size_log2 == 4) { // 64x16
+ const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 5) { // 64x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ } else if (ref_1_size_log2 == 6) { // 64x64
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ }
+ // 64x1
+ return Sum(sum_0);
+}
+
+template <int width, int height>
+inline void DcStore_NEON(void* const dest, ptrdiff_t stride,
+ const uint32x2_t dc) {
+ auto* dest_u16 = static_cast<uint16_t*>(dest);
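+ // |stride| is given in bytes; convert it to uint16_t units.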
+ ptrdiff_t stride_u16 = stride >> 1;
+ const uint16x8_t dc_dup = vdupq_lane_u16(vreinterpret_u16_u32(dc), 0);
+ if (width == 4) {
+ int i = height - 1;
+ do {
+ vst1_u16(dest_u16, vget_low_u16(dc_dup));
+ dest_u16 += stride_u16;
+ } while (--i != 0);
+ vst1_u16(dest_u16, vget_low_u16(dc_dup));
+ } else if (width == 8) {
+ int i = height - 1;
+ do {
+ vst1q_u16(dest_u16, dc_dup);
+ dest_u16 += stride_u16;
+ } while (--i != 0);
+ vst1q_u16(dest_u16, dc_dup);
+ } else if (width == 16) {
+ int i = height - 1;
+ do {
+ vst1q_u16(dest_u16, dc_dup);
+ vst1q_u16(dest_u16 + 8, dc_dup);
+ dest_u16 += stride_u16;
+ } while (--i != 0);
+ vst1q_u16(dest_u16, dc_dup);
+ vst1q_u16(dest_u16 + 8, dc_dup);
+ } else if (width == 32) {
+ int i = height - 1;
+ do {
+ vst1q_u16(dest_u16, dc_dup);
+ vst1q_u16(dest_u16 + 8, dc_dup);
+ vst1q_u16(dest_u16 + 16, dc_dup);
+ vst1q_u16(dest_u16 + 24, dc_dup);
+ dest_u16 += stride_u16;
+ } while (--i != 0);
+ vst1q_u16(dest_u16, dc_dup);
+ vst1q_u16(dest_u16 + 8, dc_dup);
+ vst1q_u16(dest_u16 + 16, dc_dup);
+ vst1q_u16(dest_u16 + 24, dc_dup);
+ } else {
+ assert(width == 64);
+ int i = height - 1;
+ do {
+ vst1q_u16(dest_u16, dc_dup);
+ vst1q_u16(dest_u16 + 8, dc_dup);
+ vst1q_u16(dest_u16 + 16, dc_dup);
+ vst1q_u16(dest_u16 + 24, dc_dup);
+ vst1q_u16(dest_u16 + 32, dc_dup);
+ vst1q_u16(dest_u16 + 40, dc_dup);
+ vst1q_u16(dest_u16 + 48, dc_dup);
+ vst1q_u16(dest_u16 + 56, dc_dup);
+ dest_u16 += stride_u16;
+ } while (--i != 0);
+ vst1q_u16(dest_u16, dc_dup);
+ vst1q_u16(dest_u16 + 8, dc_dup);
+ vst1q_u16(dest_u16 + 16, dc_dup);
+ vst1q_u16(dest_u16 + 24, dc_dup);
+ vst1q_u16(dest_u16 + 32, dc_dup);
+ vst1q_u16(dest_u16 + 40, dc_dup);
+ vst1q_u16(dest_u16 + 48, dc_dup);
+ vst1q_u16(dest_u16 + 56, dc_dup);
+ }
+}
+
+struct DcDefs {
+ DcDefs() = delete;
+
+ using _4x4 = DcPredFuncs_NEON<2, 2, DcSum_NEON, DcStore_NEON<4, 4>>;
+ using _4x8 = DcPredFuncs_NEON<2, 3, DcSum_NEON, DcStore_NEON<4, 8>>;
+ using _4x16 = DcPredFuncs_NEON<2, 4, DcSum_NEON, DcStore_NEON<4, 16>>;
+ using _8x4 = DcPredFuncs_NEON<3, 2, DcSum_NEON, DcStore_NEON<8, 4>>;
+ using _8x8 = DcPredFuncs_NEON<3, 3, DcSum_NEON, DcStore_NEON<8, 8>>;
+ using _8x16 = DcPredFuncs_NEON<3, 4, DcSum_NEON, DcStore_NEON<8, 16>>;
+ using _8x32 = DcPredFuncs_NEON<3, 5, DcSum_NEON, DcStore_NEON<8, 32>>;
+ using _16x4 = DcPredFuncs_NEON<4, 2, DcSum_NEON, DcStore_NEON<16, 4>>;
+ using _16x8 = DcPredFuncs_NEON<4, 3, DcSum_NEON, DcStore_NEON<16, 8>>;
+ using _16x16 = DcPredFuncs_NEON<4, 4, DcSum_NEON, DcStore_NEON<16, 16>>;
+ using _16x32 = DcPredFuncs_NEON<4, 5, DcSum_NEON, DcStore_NEON<16, 32>>;
+ using _16x64 = DcPredFuncs_NEON<4, 6, DcSum_NEON, DcStore_NEON<16, 64>>;
+ using _32x8 = DcPredFuncs_NEON<5, 3, DcSum_NEON, DcStore_NEON<32, 8>>;
+ using _32x16 = DcPredFuncs_NEON<5, 4, DcSum_NEON, DcStore_NEON<32, 16>>;
+ using _32x32 = DcPredFuncs_NEON<5, 5, DcSum_NEON, DcStore_NEON<32, 32>>;
+ using _32x64 = DcPredFuncs_NEON<5, 6, DcSum_NEON, DcStore_NEON<32, 64>>;
+ using _64x16 = DcPredFuncs_NEON<6, 4, DcSum_NEON, DcStore_NEON<64, 16>>;
+ using _64x32 = DcPredFuncs_NEON<6, 5, DcSum_NEON, DcStore_NEON<64, 32>>;
+ using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
+};
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
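+ // 4x4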
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ DcDefs::_4x4::DcTop;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ DcDefs::_4x4::DcLeft;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+ DcDefs::_4x4::Dc;
+
+ // 4x8
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+ DcDefs::_4x8::DcTop;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+ DcDefs::_4x8::DcLeft;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+ DcDefs::_4x8::Dc;
+
+ // 4x16
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+ DcDefs::_4x16::DcTop;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+ DcDefs::_4x16::DcLeft;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+ DcDefs::_4x16::Dc;
+
+ // 8x4
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+ DcDefs::_8x4::DcTop;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+ DcDefs::_8x4::DcLeft;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+ DcDefs::_8x4::Dc;
+
+ // 8x8
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+ DcDefs::_8x8::DcTop;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+ DcDefs::_8x8::DcLeft;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+ DcDefs::_8x8::Dc;
+
+ // 8x16
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+ DcDefs::_8x16::DcTop;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+ DcDefs::_8x16::DcLeft;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+ DcDefs::_8x16::Dc;
+
+ // 8x32
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+ DcDefs::_8x32::DcTop;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+ DcDefs::_8x32::DcLeft;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+ DcDefs::_8x32::Dc;
+
+ // 16x4
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+ DcDefs::_16x4::DcTop;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+ DcDefs::_16x4::DcLeft;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+ DcDefs::_16x4::Dc;
+
+ // 16x8
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+ DcDefs::_16x8::DcTop;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+ DcDefs::_16x8::DcLeft;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+ DcDefs::_16x8::Dc;
+
+ // 16x16
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+ DcDefs::_16x16::DcTop;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+ DcDefs::_16x16::DcLeft;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+ DcDefs::_16x16::Dc;
+
+ // 16x32
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+ DcDefs::_16x32::DcTop;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+ DcDefs::_16x32::DcLeft;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+ DcDefs::_16x32::Dc;
+
+ // 16x64
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+ DcDefs::_16x64::DcTop;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+ DcDefs::_16x64::DcLeft;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+ DcDefs::_16x64::Dc;
+
+ // 32x8
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+ DcDefs::_32x8::DcTop;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+ DcDefs::_32x8::DcLeft;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+ DcDefs::_32x8::Dc;
+
+ // 32x16
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+ DcDefs::_32x16::DcTop;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+ DcDefs::_32x16::DcLeft;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+ DcDefs::_32x16::Dc;
+
+ // 32x32
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+ DcDefs::_32x32::DcTop;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+ DcDefs::_32x32::DcLeft;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+ DcDefs::_32x32::Dc;
+
+ // 32x64
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+ DcDefs::_32x64::DcTop;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+ DcDefs::_32x64::DcLeft;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+ DcDefs::_32x64::Dc;
+
+ // 64x16
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+ DcDefs::_64x16::DcTop;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+ DcDefs::_64x16::DcLeft;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+ DcDefs::_64x16::Dc;
+
+ // 64x32
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+ DcDefs::_64x32::DcTop;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+ DcDefs::_64x32::DcLeft;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+ DcDefs::_64x32::Dc;
+
+ // 64x64
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+ DcDefs::_64x64::DcTop;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+ DcDefs::_64x64::DcLeft;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+ DcDefs::_64x64::Dc;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/intrapred_neon.h b/src/dsp/arm/intrapred_neon.h
new file mode 100644
index 0000000..16f858c
--- /dev/null
+++ b/src/dsp/arm/intrapred_neon.h
@@ -0,0 +1,418 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*,
+// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and
+// Dsp::filter_intra_predictor. See the defines below for specifics. These
+// functions are not thread-safe.
+void IntraPredCflInit_NEON();
+void IntraPredDirectionalInit_NEON();
+void IntraPredFilterIntraInit_NEON();
+void IntraPredInit_NEON();
+void IntraPredSmoothInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
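+// Each define below records that the named Dsp entry (bitdepth, transform
+// size, predictor) has a NEON implementation by mapping it to
+// LIBGAV1_CPU_NEON; the base dsp headers are assumed to consult these when
+// deciding whether a C fallback is still required.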
+// 8 bit
+#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
+
+// 4x4
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x64
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x64
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+// 64x16
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+// 64x32
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+// 64x64
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+// 10 bit
+// 4x4
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 16x64
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 32x64
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 64x16
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 64x32
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON
+
+// 64x64
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_
diff --git a/src/dsp/arm/intrapred_smooth_neon.cc b/src/dsp/arm/intrapred_smooth_neon.cc
new file mode 100644
index 0000000..abc93e8
--- /dev/null
+++ b/src/dsp/arm/intrapred_smooth_neon.cc
@@ -0,0 +1,616 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+
+namespace libgav1 {
+namespace dsp {
+
+namespace low_bitdepth {
+namespace {
+
+// Note these constants are duplicated from intrapred.cc to give the compiler
+// visibility of the values. This helps reduce loads and simplifies creating
+// the inverse weights.
+constexpr uint8_t kSmoothWeights[] = {
+ // block dimension = 4
+ 255, 149, 85, 64,
+ // block dimension = 8
+ 255, 197, 146, 105, 73, 50, 37, 32,
+ // block dimension = 16
+ 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+ // block dimension = 32
+ 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+ 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+ // block dimension = 64
+ 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+ 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
+ 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
+ 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4};
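+
+// The weights for block dimension d start at offset d - 4 in this table
+// (offsets 0, 4, 12, 28 and 60 for d = 4, 8, 16, 32 and 64), which is why
+// the functions below index it as kSmoothWeights + width - 4 and
+// kSmoothWeights + height - 4.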
+
+// TODO(b/150459137): Keeping the intermediate values in uint16_t would allow
+// processing more values at once. At the high end, it could do 4x4 or 8x2 at a
+// time.
+inline uint16x4_t CalculatePred(const uint16x4_t weighted_top,
+ const uint16x4_t weighted_left,
+ const uint16x4_t weighted_bl,
+ const uint16x4_t weighted_tr) {
+ const uint32x4_t pred_0 = vaddl_u16(weighted_top, weighted_left);
+ const uint32x4_t pred_1 = vaddl_u16(weighted_bl, weighted_tr);
+ const uint32x4_t pred_2 = vaddq_u32(pred_0, pred_1);
+ return vrshrn_n_u32(pred_2, kSmoothWeightScale + 1);
+}
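+
+// Together with the products formed by its callers, CalculatePred() yields,
+// per pixel (assuming kSmoothWeightScale == 8, i.e. weights in units of
+// 1/256):
+//   pred = RightShiftWithRounding(w_y * top + w_x * left +
+//                                 (256 - w_x) * top_right +
+//                                 (256 - w_y) * bottom_left,
+//                                 kSmoothWeightScale + 1)
+// The two weight pairs each sum to 256, hence the extra bit in the shift.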
+
+template <int width, int height>
+inline void Smooth4Or8xN_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+ const uint8_t* const left = static_cast<const uint8_t*>(left_column);
+ const uint8_t top_right = top[width - 1];
+ const uint8_t bottom_left = left[height - 1];
+ const uint8_t* const weights_y = kSmoothWeights + height - 4;
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ uint8x8_t top_v;
+ if (width == 4) {
+ top_v = Load4(top);
+ } else { // width == 8
+ top_v = vld1_u8(top);
+ }
+ const uint8x8_t top_right_v = vdup_n_u8(top_right);
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+ // Over-reads for 4xN but still within the array.
+ const uint8x8_t weights_x_v = vld1_u8(kSmoothWeights + width - 4);
+ // 256 - weights = vneg_s8(weights): negation is performed modulo 256, so
+ // -w is the same 8-bit value as 256 - w for the nonzero weights used here.
+ const uint8x8_t scaled_weights_x =
+ vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v)));
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t left_v = vdup_n_u8(left[y]);
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+ const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+ const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+
+ const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);
+ const uint16x8_t weighted_left = vmull_u8(weights_x_v, left_v);
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+ const uint16x4_t dest_0 =
+ CalculatePred(vget_low_u16(weighted_top), vget_low_u16(weighted_left),
+ vget_low_u16(weighted_tr), vget_low_u16(weighted_bl));
+
+ if (width == 4) {
+ StoreLo4(dst, vmovn_u16(vcombine_u16(dest_0, dest_0)));
+ } else { // width == 8
+ const uint16x4_t dest_1 = CalculatePred(
+ vget_high_u16(weighted_top), vget_high_u16(weighted_left),
+ vget_high_u16(weighted_tr), vget_high_u16(weighted_bl));
+ vst1_u8(dst, vmovn_u16(vcombine_u16(dest_0, dest_1)));
+ }
+ dst += stride;
+ }
+}
+
+inline uint8x16_t CalculateWeightsAndPred(
+ const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
+ const uint8x8_t weights_y, const uint8x16_t weights_x,
+ const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
+ const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top));
+ const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
+ const uint16x8_t weighted_tr_low =
+ vmull_u8(vget_low_u8(scaled_weights_x), top_right);
+ const uint16x4_t dest_0 = CalculatePred(
+ vget_low_u16(weighted_top_low), vget_low_u16(weighted_left_low),
+ vget_low_u16(weighted_tr_low), vget_low_u16(weighted_bl));
+ const uint16x4_t dest_1 = CalculatePred(
+ vget_high_u16(weighted_top_low), vget_high_u16(weighted_left_low),
+ vget_high_u16(weighted_tr_low), vget_high_u16(weighted_bl));
+ const uint8x8_t dest_0_u8 = vmovn_u16(vcombine_u16(dest_0, dest_1));
+
+ const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top));
+ const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
+ const uint16x8_t weighted_tr_high =
+ vmull_u8(vget_high_u8(scaled_weights_x), top_right);
+ const uint16x4_t dest_2 = CalculatePred(
+ vget_low_u16(weighted_top_high), vget_low_u16(weighted_left_high),
+ vget_low_u16(weighted_tr_high), vget_low_u16(weighted_bl));
+ const uint16x4_t dest_3 = CalculatePred(
+ vget_high_u16(weighted_top_high), vget_high_u16(weighted_left_high),
+ vget_high_u16(weighted_tr_high), vget_high_u16(weighted_bl));
+ const uint8x8_t dest_1_u8 = vmovn_u16(vcombine_u16(dest_2, dest_3));
+
+ return vcombine_u8(dest_0_u8, dest_1_u8);
+}
+
+template <int width, int height>
+inline void Smooth16PlusxN_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+ const uint8_t* const left = static_cast<const uint8_t*>(left_column);
+ const uint8_t top_right = top[width - 1];
+ const uint8_t bottom_left = left[height - 1];
+ const uint8_t* const weights_y = kSmoothWeights + height - 4;
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ uint8x16_t top_v[4];
+ top_v[0] = vld1q_u8(top);
+ if (width > 16) {
+ top_v[1] = vld1q_u8(top + 16);
+ if (width == 64) {
+ top_v[2] = vld1q_u8(top + 32);
+ top_v[3] = vld1q_u8(top + 48);
+ }
+ }
+
+ const uint8x8_t top_right_v = vdup_n_u8(top_right);
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+
+ // TODO(johannkoenig): Consider re-reading top_v and weights_x_v in the loop.
+ // The performance slope is currently similar to Paeth, so this does not
+ // appear to be register-bound on arm64.
+ uint8x16_t weights_x_v[4];
+ weights_x_v[0] = vld1q_u8(kSmoothWeights + width - 4);
+ if (width > 16) {
+ weights_x_v[1] = vld1q_u8(kSmoothWeights + width + 16 - 4);
+ if (width == 64) {
+ weights_x_v[2] = vld1q_u8(kSmoothWeights + width + 32 - 4);
+ weights_x_v[3] = vld1q_u8(kSmoothWeights + width + 48 - 4);
+ }
+ }
+
+ uint8x16_t scaled_weights_x[4];
+ scaled_weights_x[0] =
+ vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[0])));
+ if (width > 16) {
+ scaled_weights_x[1] =
+ vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[1])));
+ if (width == 64) {
+ scaled_weights_x[2] =
+ vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[2])));
+ scaled_weights_x[3] =
+ vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[3])));
+ }
+ }
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t left_v = vdup_n_u8(left[y]);
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+ const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+ const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+
+ vst1q_u8(dst, CalculateWeightsAndPred(top_v[0], left_v, top_right_v,
+ weights_y_v, weights_x_v[0],
+ scaled_weights_x[0], weighted_bl));
+
+ if (width > 16) {
+ vst1q_u8(dst + 16, CalculateWeightsAndPred(
+ top_v[1], left_v, top_right_v, weights_y_v,
+ weights_x_v[1], scaled_weights_x[1], weighted_bl));
+ if (width == 64) {
+ vst1q_u8(dst + 32,
+ CalculateWeightsAndPred(top_v[2], left_v, top_right_v,
+ weights_y_v, weights_x_v[2],
+ scaled_weights_x[2], weighted_bl));
+ vst1q_u8(dst + 48,
+ CalculateWeightsAndPred(top_v[3], left_v, top_right_v,
+ weights_y_v, weights_x_v[3],
+ scaled_weights_x[3], weighted_bl));
+ }
+ }
+
+ dst += stride;
+ }
+}
+
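+// SMOOTH_V predictor for 4- and 8-wide blocks: each row blends the top row
+// with the bottom-left corner pixel using that row's vertical weight and its
+// complement.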
+template <int width, int height>
+inline void SmoothVertical4Or8xN_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+ const uint8_t* const left = static_cast<const uint8_t*>(left_column);
+ const uint8_t bottom_left = left[height - 1];
+ const uint8_t* const weights_y = kSmoothWeights + height - 4;
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ uint8x8_t top_v;
+ if (width == 4) {
+ top_v = Load4(top);
+ } else { // width == 8
+ top_v = vld1_u8(top);
+ }
+
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+ const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+
+ const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);
+ const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+ const uint16x8_t pred = vaddq_u16(weighted_top, weighted_bl);
+ const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale);
+
+ if (width == 4) {
+ StoreLo4(dst, pred_scaled);
+ } else { // width == 8
+ vst1_u8(dst, pred_scaled);
+ }
+ dst += stride;
+ }
+}
+
+inline uint8x16_t CalculateVerticalWeightsAndPred(
+ const uint8x16_t top, const uint8x8_t weights_y,
+ const uint16x8_t weighted_bl) {
+ const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top));
+ const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top));
+ const uint16x8_t pred_low = vaddq_u16(weighted_top_low, weighted_bl);
+ const uint16x8_t pred_high = vaddq_u16(weighted_top_high, weighted_bl);
+ const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale);
+ const uint8x8_t pred_scaled_high =
+ vrshrn_n_u16(pred_high, kSmoothWeightScale);
+ return vcombine_u8(pred_scaled_low, pred_scaled_high);
+}
+
+template <int width, int height>
+inline void SmoothVertical16PlusxN_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+ const uint8_t* const left = static_cast<const uint8_t*>(left_column);
+ const uint8_t bottom_left = left[height - 1];
+ const uint8_t* const weights_y = kSmoothWeights + height - 4;
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ uint8x16_t top_v[4];
+ top_v[0] = vld1q_u8(top);
+ if (width > 16) {
+ top_v[1] = vld1q_u8(top + 16);
+ if (width == 64) {
+ top_v[2] = vld1q_u8(top + 32);
+ top_v[3] = vld1q_u8(top + 48);
+ }
+ }
+
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+ const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+ const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+
+ const uint8x16_t pred_0 =
+ CalculateVerticalWeightsAndPred(top_v[0], weights_y_v, weighted_bl);
+ vst1q_u8(dst, pred_0);
+
+ if (width > 16) {
+ const uint8x16_t pred_1 =
+ CalculateVerticalWeightsAndPred(top_v[1], weights_y_v, weighted_bl);
+ vst1q_u8(dst + 16, pred_1);
+
+ if (width == 64) {
+ const uint8x16_t pred_2 =
+ CalculateVerticalWeightsAndPred(top_v[2], weights_y_v, weighted_bl);
+ vst1q_u8(dst + 32, pred_2);
+
+ const uint8x16_t pred_3 =
+ CalculateVerticalWeightsAndPred(top_v[3], weights_y_v, weighted_bl);
+ vst1q_u8(dst + 48, pred_3);
+ }
+ }
+
+ dst += stride;
+ }
+}
+
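+// SMOOTH_H predictor for 4- and 8-wide blocks: each pixel blends the
+// left-column pixel of its row with the top-right corner pixel using that
+// column's horizontal weight and its complement.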
+template <int width, int height>
+inline void SmoothHorizontal4Or8xN_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+ const uint8_t* const left = static_cast<const uint8_t*>(left_column);
+ const uint8_t top_right = top[width - 1];
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ const uint8x8_t top_right_v = vdup_n_u8(top_right);
+ // Over-reads for 4xN but still within the array.
+ const uint8x8_t weights_x = vld1_u8(kSmoothWeights + width - 4);
+  // 256 - weights = vneg_s8(weights). The negation is modulo 256, which is
+  // exactly the unsigned complement needed to weight the top_right pixel.
+ const uint8x8_t scaled_weights_x =
+ vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x)));
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t left_v = vdup_n_u8(left[y]);
+
+ const uint16x8_t weighted_left = vmull_u8(weights_x, left_v);
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+ const uint16x8_t pred = vaddq_u16(weighted_left, weighted_tr);
+ const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale);
+
+ if (width == 4) {
+ StoreLo4(dst, pred_scaled);
+ } else { // width == 8
+ vst1_u8(dst, pred_scaled);
+ }
+ dst += stride;
+ }
+}
+
+inline uint8x16_t CalculateHorizontalWeightsAndPred(
+ const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x,
+ const uint8x16_t scaled_weights_x) {
+ const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
+ const uint16x8_t weighted_tr_low =
+ vmull_u8(vget_low_u8(scaled_weights_x), top_right);
+ const uint16x8_t pred_low = vaddq_u16(weighted_left_low, weighted_tr_low);
+ const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale);
+
+ const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
+ const uint16x8_t weighted_tr_high =
+ vmull_u8(vget_high_u8(scaled_weights_x), top_right);
+ const uint16x8_t pred_high = vaddq_u16(weighted_left_high, weighted_tr_high);
+ const uint8x8_t pred_scaled_high =
+ vrshrn_n_u16(pred_high, kSmoothWeightScale);
+
+ return vcombine_u8(pred_scaled_low, pred_scaled_high);
+}
+
+template <int width, int height>
+inline void SmoothHorizontal16PlusxN_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+ const uint8_t* const left = static_cast<const uint8_t*>(left_column);
+ const uint8_t top_right = top[width - 1];
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ const uint8x8_t top_right_v = vdup_n_u8(top_right);
+
+ uint8x16_t weights_x[4];
+ weights_x[0] = vld1q_u8(kSmoothWeights + width - 4);
+ if (width > 16) {
+ weights_x[1] = vld1q_u8(kSmoothWeights + width + 16 - 4);
+ if (width == 64) {
+ weights_x[2] = vld1q_u8(kSmoothWeights + width + 32 - 4);
+ weights_x[3] = vld1q_u8(kSmoothWeights + width + 48 - 4);
+ }
+ }
+
+ uint8x16_t scaled_weights_x[4];
+ scaled_weights_x[0] =
+ vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[0])));
+ if (width > 16) {
+ scaled_weights_x[1] =
+ vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[1])));
+ if (width == 64) {
+ scaled_weights_x[2] =
+ vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[2])));
+ scaled_weights_x[3] =
+ vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[3])));
+ }
+ }
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t left_v = vdup_n_u8(left[y]);
+
+ const uint8x16_t pred_0 = CalculateHorizontalWeightsAndPred(
+ left_v, top_right_v, weights_x[0], scaled_weights_x[0]);
+ vst1q_u8(dst, pred_0);
+
+ if (width > 16) {
+ const uint8x16_t pred_1 = CalculateHorizontalWeightsAndPred(
+ left_v, top_right_v, weights_x[1], scaled_weights_x[1]);
+ vst1q_u8(dst + 16, pred_1);
+
+ if (width == 64) {
+ const uint8x16_t pred_2 = CalculateHorizontalWeightsAndPred(
+ left_v, top_right_v, weights_x[2], scaled_weights_x[2]);
+ vst1q_u8(dst + 32, pred_2);
+
+ const uint8x16_t pred_3 = CalculateHorizontalWeightsAndPred(
+ left_v, top_right_v, weights_x[3], scaled_weights_x[3]);
+ vst1q_u8(dst + 48, pred_3);
+ }
+ }
+ dst += stride;
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ // 4x4
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+ Smooth4Or8xN_NEON<4, 4>;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical4Or8xN_NEON<4, 4>;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4Or8xN_NEON<4, 4>;
+
+ // 4x8
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+ Smooth4Or8xN_NEON<4, 8>;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical4Or8xN_NEON<4, 8>;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4Or8xN_NEON<4, 8>;
+
+ // 4x16
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+ Smooth4Or8xN_NEON<4, 16>;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical4Or8xN_NEON<4, 16>;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4Or8xN_NEON<4, 16>;
+
+ // 8x4
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+ Smooth4Or8xN_NEON<8, 4>;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical4Or8xN_NEON<8, 4>;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4Or8xN_NEON<8, 4>;
+
+ // 8x8
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+ Smooth4Or8xN_NEON<8, 8>;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical4Or8xN_NEON<8, 8>;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4Or8xN_NEON<8, 8>;
+
+ // 8x16
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+ Smooth4Or8xN_NEON<8, 16>;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical4Or8xN_NEON<8, 16>;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4Or8xN_NEON<8, 16>;
+
+ // 8x32
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+ Smooth4Or8xN_NEON<8, 32>;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical4Or8xN_NEON<8, 32>;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4Or8xN_NEON<8, 32>;
+
+ // 16x4
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<16, 4>;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<16, 4>;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<16, 4>;
+
+ // 16x8
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<16, 8>;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<16, 8>;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<16, 8>;
+
+ // 16x16
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<16, 16>;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<16, 16>;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<16, 16>;
+
+ // 16x32
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<16, 32>;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<16, 32>;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<16, 32>;
+
+ // 16x64
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<16, 64>;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<16, 64>;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<16, 64>;
+
+ // 32x8
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<32, 8>;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<32, 8>;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<32, 8>;
+
+ // 32x16
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<32, 16>;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<32, 16>;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<32, 16>;
+
+ // 32x32
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<32, 32>;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<32, 32>;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<32, 32>;
+
+ // 32x64
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<32, 64>;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<32, 64>;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<32, 64>;
+
+ // 64x16
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<64, 16>;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<64, 16>;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<64, 16>;
+
+ // 64x32
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<64, 32>;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<64, 32>;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<64, 32>;
+
+ // 64x64
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<64, 64>;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<64, 64>;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<64, 64>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void IntraPredSmoothInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredSmoothInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/inverse_transform_neon.cc b/src/dsp/arm/inverse_transform_neon.cc
new file mode 100644
index 0000000..072991a
--- /dev/null
+++ b/src/dsp/arm/inverse_transform_neon.cc
@@ -0,0 +1,3128 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+//------------------------------------------------------------------------------
+
+// TODO(slavarnway): Move transpose functions to transpose_neon.h or
+// common_neon.h.
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int16x8_t in[4],
+ int16x8_t out[4]) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+ const int16x4_t a0 = vget_low_s16(in[0]);
+ const int16x4_t a1 = vget_low_s16(in[1]);
+ const int16x4_t a2 = vget_low_s16(in[2]);
+ const int16x4_t a3 = vget_low_s16(in[3]);
+
+ const int16x4x2_t b0 = vtrn_s16(a0, a1);
+ const int16x4x2_t b1 = vtrn_s16(a2, a3);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
+ vreinterpret_s32_s16(b1.val[0]));
+ const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
+ vreinterpret_s32_s16(b1.val[1]));
+
+ const int16x4_t d0 = vreinterpret_s16_s32(c0.val[0]);
+ const int16x4_t d1 = vreinterpret_s16_s32(c1.val[0]);
+ const int16x4_t d2 = vreinterpret_s16_s32(c0.val[1]);
+ const int16x4_t d3 = vreinterpret_s16_s32(c1.val[1]);
+
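+  // Only the low four lanes of each result are meaningful; duplicate them
+  // into both halves of the output registers.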
+ out[0] = vcombine_s16(d0, d0);
+ out[1] = vcombine_s16(d1, d1);
+ out[2] = vcombine_s16(d2, d2);
+ out[3] = vcombine_s16(d3, d3);
+}
+
+// Note: this out-of-place version is only used in the final stage of Dct32/64
+// and Adst16 because the in-place version causes additional stack usage with
+// clang.
+LIBGAV1_ALWAYS_INLINE void Transpose8x8(const int16x8_t in[8],
+ int16x8_t out[8]) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]);
+ const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]);
+ const int16x8x2_t b2 = vtrnq_s16(in[4], in[5]);
+ const int16x8x2_t b3 = vtrnq_s16(in[6], in[7]);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+ const int16x8x2_t d0 = VtrnqS64(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = VtrnqS64(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = VtrnqS64(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = VtrnqS64(c1.val[1], c3.val[1]);
+
+ out[0] = d0.val[0];
+ out[1] = d1.val[0];
+ out[2] = d2.val[0];
+ out[3] = d3.val[0];
+ out[4] = d0.val[1];
+ out[5] = d1.val[1];
+ out[6] = d2.val[1];
+ out[7] = d3.val[1];
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x8To8x4(const uint16x8_t in[8],
+ uint16x8_t out[4]) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // a4: 40 41 42 43
+ // a5: 50 51 52 53
+ // a6: 60 61 62 63
+ // a7: 70 71 72 73
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+ // b2.val[0]: 40 50 42 52
+ // b2.val[1]: 41 51 43 53
+ // b3.val[0]: 60 70 62 72
+ // b3.val[1]: 61 71 63 73
+
+ uint16x4x2_t b0 = vtrn_u16(vget_low_u16(in[0]), vget_low_u16(in[1]));
+ uint16x4x2_t b1 = vtrn_u16(vget_low_u16(in[2]), vget_low_u16(in[3]));
+ uint16x4x2_t b2 = vtrn_u16(vget_low_u16(in[4]), vget_low_u16(in[5]));
+ uint16x4x2_t b3 = vtrn_u16(vget_low_u16(in[6]), vget_low_u16(in[7]));
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+ // c2.val[0]: 40 50 60 70
+ // c2.val[1]: 42 52 62 72
+ // c3.val[0]: 41 51 61 71
+ // c3.val[1]: 43 53 63 73
+
+ uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
+ vreinterpret_u32_u16(b1.val[0]));
+ uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]),
+ vreinterpret_u32_u16(b1.val[1]));
+ uint32x2x2_t c2 = vtrn_u32(vreinterpret_u32_u16(b2.val[0]),
+ vreinterpret_u32_u16(b3.val[0]));
+ uint32x2x2_t c3 = vtrn_u32(vreinterpret_u32_u16(b2.val[1]),
+ vreinterpret_u32_u16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // o0: 00 10 20 30 40 50 60 70
+ // o1: 01 11 21 31 41 51 61 71
+ // o2: 02 12 22 32 42 52 62 72
+ // o3: 03 13 23 33 43 53 63 73
+
+ out[0] = vcombine_u16(vreinterpret_u16_u32(c0.val[0]),
+ vreinterpret_u16_u32(c2.val[0]));
+ out[1] = vcombine_u16(vreinterpret_u16_u32(c1.val[0]),
+ vreinterpret_u16_u32(c3.val[0]));
+ out[2] = vcombine_u16(vreinterpret_u16_u32(c0.val[1]),
+ vreinterpret_u16_u32(c2.val[1]));
+ out[3] = vcombine_u16(vreinterpret_u16_u32(c1.val[1]),
+ vreinterpret_u16_u32(c3.val[1]));
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x8To8x4(const int16x8_t in[8],
+ int16x8_t out[4]) {
+ Transpose4x8To8x4(reinterpret_cast<const uint16x8_t*>(in),
+ reinterpret_cast<uint16x8_t*>(out));
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose8x4To4x8(const int16x8_t in[4],
+ int16x8_t out[8]) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]);
+ const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+
+ // The upper 8 bytes are don't cares.
+ // out[0]: 00 10 20 30 04 14 24 34
+ // out[1]: 01 11 21 31 05 15 25 35
+ // out[2]: 02 12 22 32 06 16 26 36
+ // out[3]: 03 13 23 33 07 17 27 37
+ // out[4]: 04 14 24 34 04 14 24 34
+ // out[5]: 05 15 25 35 05 15 25 35
+ // out[6]: 06 16 26 36 06 16 26 36
+ // out[7]: 07 17 27 37 07 17 27 37
+ out[0] = vreinterpretq_s16_s32(c0.val[0]);
+ out[1] = vreinterpretq_s16_s32(c1.val[0]);
+ out[2] = vreinterpretq_s16_s32(c0.val[1]);
+ out[3] = vreinterpretq_s16_s32(c1.val[1]);
+ out[4] = vreinterpretq_s16_s32(
+ vcombine_s32(vget_high_s32(c0.val[0]), vget_high_s32(c0.val[0])));
+ out[5] = vreinterpretq_s16_s32(
+ vcombine_s32(vget_high_s32(c1.val[0]), vget_high_s32(c1.val[0])));
+ out[6] = vreinterpretq_s16_s32(
+ vcombine_s32(vget_high_s32(c0.val[1]), vget_high_s32(c0.val[1])));
+ out[7] = vreinterpretq_s16_s32(
+ vcombine_s32(vget_high_s32(c1.val[1]), vget_high_s32(c1.val[1])));
+}
+
+//------------------------------------------------------------------------------
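+// Stores |store_count| registers from |s| to consecutive rows of |dst| at
+// column |idx|. |store_width| is in bytes: 16 stores all 8 lanes of each
+// register, 8 stores only the low 4 lanes.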
+template <int store_width, int store_count>
+LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* dst, int32_t stride, int32_t idx,
+ const int16x8_t* const s) {
+ assert(store_count % 4 == 0);
+ assert(store_width == 8 || store_width == 16);
+ // NOTE: It is expected that the compiler will unroll these loops.
+ if (store_width == 16) {
+ for (int i = 0; i < store_count; i += 4) {
+ vst1q_s16(&dst[i * stride + idx], (s[i]));
+ vst1q_s16(&dst[(i + 1) * stride + idx], (s[i + 1]));
+ vst1q_s16(&dst[(i + 2) * stride + idx], (s[i + 2]));
+ vst1q_s16(&dst[(i + 3) * stride + idx], (s[i + 3]));
+ }
+ } else {
+ // store_width == 8
+ for (int i = 0; i < store_count; i += 4) {
+ vst1_s16(&dst[i * stride + idx], vget_low_s16(s[i]));
+ vst1_s16(&dst[(i + 1) * stride + idx], vget_low_s16(s[i + 1]));
+ vst1_s16(&dst[(i + 2) * stride + idx], vget_low_s16(s[i + 2]));
+ vst1_s16(&dst[(i + 3) * stride + idx], vget_low_s16(s[i + 3]));
+ }
+ }
+}
+
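+// Mirror of StoreDst: |load_width| 16 fills all 8 lanes of each register,
+// while 8 loads 8 bytes into the low half and leaves the high half zero.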
+template <int load_width, int load_count>
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int16_t* src, int32_t stride,
+ int32_t idx, int16x8_t* x) {
+ assert(load_count % 4 == 0);
+ assert(load_width == 8 || load_width == 16);
+ // NOTE: It is expected that the compiler will unroll these loops.
+ if (load_width == 16) {
+ for (int i = 0; i < load_count; i += 4) {
+ x[i] = vld1q_s16(&src[i * stride + idx]);
+ x[i + 1] = vld1q_s16(&src[(i + 1) * stride + idx]);
+ x[i + 2] = vld1q_s16(&src[(i + 2) * stride + idx]);
+ x[i + 3] = vld1q_s16(&src[(i + 3) * stride + idx]);
+ }
+ } else {
+ // load_width == 8
+ const int64x2_t zero = vdupq_n_s64(0);
+ for (int i = 0; i < load_count; i += 4) {
+      // The src buffer is aligned to 32 bytes. Each load will always be
+      // 8-byte aligned.
+ x[i] = vreinterpretq_s16_s64(vld1q_lane_s64(
+ reinterpret_cast<const int64_t*>(&src[i * stride + idx]), zero, 0));
+ x[i + 1] = vreinterpretq_s16_s64(vld1q_lane_s64(
+ reinterpret_cast<const int64_t*>(&src[(i + 1) * stride + idx]), zero,
+ 0));
+ x[i + 2] = vreinterpretq_s16_s64(vld1q_lane_s64(
+ reinterpret_cast<const int64_t*>(&src[(i + 2) * stride + idx]), zero,
+ 0));
+ x[i + 3] = vreinterpretq_s16_s64(vld1q_lane_s64(
+ reinterpret_cast<const int64_t*>(&src[(i + 3) * stride + idx]), zero,
+ 0));
+ }
+ }
+}
+
+// Butterfly rotate 4 values.
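+// Computes x = (a * cos128(angle) - b * sin128(angle) + 2048) >> 12 and
+// y = (a * sin128(angle) + b * cos128(angle) + 2048) >> 12 on the low four
+// lanes, with saturation, then writes them back (swapped when |flip| is set).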
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(int16x8_t* a, int16x8_t* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const int32x4_t acc_x = vmull_n_s16(vget_low_s16(*a), cos128);
+ const int32x4_t acc_y = vmull_n_s16(vget_low_s16(*a), sin128);
+ const int32x4_t x0 = vmlsl_n_s16(acc_x, vget_low_s16(*b), sin128);
+ const int32x4_t y0 = vmlal_n_s16(acc_y, vget_low_s16(*b), cos128);
+ const int16x4_t x1 = vqrshrn_n_s32(x0, 12);
+ const int16x4_t y1 = vqrshrn_n_s32(y0, 12);
+ const int16x8_t x = vcombine_s16(x1, x1);
+ const int16x8_t y = vcombine_s16(y1, y1);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
+// Butterfly rotate 8 values.
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(int16x8_t* a, int16x8_t* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const int32x4_t acc_x = vmull_n_s16(vget_low_s16(*a), cos128);
+ const int32x4_t acc_y = vmull_n_s16(vget_low_s16(*a), sin128);
+ const int32x4_t x0 = vmlsl_n_s16(acc_x, vget_low_s16(*b), sin128);
+ const int32x4_t y0 = vmlal_n_s16(acc_y, vget_low_s16(*b), cos128);
+ const int16x4_t x1 = vqrshrn_n_s32(x0, 12);
+ const int16x4_t y1 = vqrshrn_n_s32(y0, 12);
+
+ const int32x4_t acc_x_hi = vmull_n_s16(vget_high_s16(*a), cos128);
+ const int32x4_t acc_y_hi = vmull_n_s16(vget_high_s16(*a), sin128);
+ const int32x4_t x0_hi = vmlsl_n_s16(acc_x_hi, vget_high_s16(*b), sin128);
+ const int32x4_t y0_hi = vmlal_n_s16(acc_y_hi, vget_high_s16(*b), cos128);
+ const int16x4_t x1_hi = vqrshrn_n_s32(x0_hi, 12);
+ const int16x4_t y1_hi = vqrshrn_n_s32(y0_hi, 12);
+
+ const int16x8_t x = vcombine_s16(x1, x1_hi);
+ const int16x8_t y = vcombine_s16(y1, y1_hi);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
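+// Butterfly rotation for the case where the first input is known to be zero,
+// so x = -b * sin128(angle) and y = b * cos128(angle). vqrdmulhq_n_s16 with
+// the constant pre-shifted left by 3 computes a saturating, rounded
+// (value * constant) >> 12.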
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int16x8_t* a,
+ int16x8_t* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ // For this function, the max value returned by Sin128() is 4091, which fits
+ // inside 12 bits. This leaves room for the sign bit and the 3 left shifted
+ // bits.
+ assert(sin128 <= 0xfff);
+ const int16x8_t x = vqrdmulhq_n_s16(*b, -sin128 << 3);
+ const int16x8_t y = vqrdmulhq_n_s16(*b, cos128 << 3);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
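+// Butterfly rotation for the case where the second input is known to be zero,
+// so x = a * cos128(angle) and y = a * sin128(angle), using the same
+// fixed-point trick as above.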
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(int16x8_t* a,
+ int16x8_t* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const int16x8_t x = vqrdmulhq_n_s16(*a, cos128 << 3);
+ const int16x8_t y = vqrdmulhq_n_s16(*a, sin128 << 3);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
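+// Saturating butterfly: a' = a + b and b' = a - b. When |flip| is set the
+// outputs become a' = b - a and b' = b + a instead.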
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int16x8_t* a, int16x8_t* b,
+ bool flip) {
+ int16x8_t x, y;
+ if (flip) {
+ y = vqaddq_s16(*b, *a);
+ x = vqsubq_s16(*b, *a);
+ } else {
+ x = vqaddq_s16(*a, *b);
+ y = vqsubq_s16(*a, *b);
+ }
+ *a = x;
+ *b = y;
+}
+
+using ButterflyRotationFunc = void (*)(int16x8_t* a, int16x8_t* b, int angle,
+ bool flip);
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
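+// DC-only fast path for the row transform. When |adjusted_tx_height| is 1,
+// dst[0] is (optionally) pre-rounded, scaled by cos128(32), shifted right by
+// |row_shift| and broadcast across the first row; otherwise this returns
+// false so the full transform runs.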
+template <int width>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16x8_t v_src = vdupq_n_s16(dst[0]);
+ const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
+ const int16x8_t v_src_round =
+ vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
+ const int16x8_t s0 = vbslq_s16(v_mask, v_src_round, v_src);
+ const int16_t cos128 = Cos128(32);
+ const int16x8_t xy = vqrdmulhq_n_s16(s0, cos128 << 3);
+  // vqrshlq_s16 will shift right if the shift value is negative.
+ const int16x8_t xy_shifted = vqrshlq_s16(xy, vdupq_n_s16(-row_shift));
+
+ if (width == 4) {
+ vst1_s16(dst, vget_low_s16(xy_shifted));
+ } else {
+ for (int i = 0; i < width; i += 8) {
+ vst1q_s16(dst, xy_shifted);
+ dst += 8;
+ }
+ }
+ return true;
+}
+
+template <int height>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16_t cos128 = Cos128(32);
+
+ // Calculate dc values for first row.
+ if (width == 4) {
+ const int16x4_t v_src = vld1_s16(dst);
+ const int16x4_t xy = vqrdmulh_n_s16(v_src, cos128 << 3);
+ vst1_s16(dst, xy);
+ } else {
+ int i = 0;
+ do {
+ const int16x8_t v_src = vld1q_s16(&dst[i]);
+ const int16x8_t xy = vqrdmulhq_n_s16(v_src, cos128 << 3);
+ vst1q_s16(&dst[i], xy);
+ i += 8;
+ } while (i < width);
+ }
+
+ // Copy first row to the rest of the block.
+ for (int y = 1; y < height; ++y) {
+ memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
+ }
+ return true;
+}
+
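+// The stage numbers in the Dct*Stages helpers below are not consecutive
+// within a single helper: together with the stages in Dct64_NEON they form
+// one shared numbering for the combined DCT, and each helper keeps its
+// stages' positions in that sequence.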
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct4Stages(int16x8_t* s) {
+ // stage 12.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
+ ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
+ } else {
+ butterfly_rotation(&s[0], &s[1], 32, true);
+ butterfly_rotation(&s[2], &s[3], 48, false);
+ }
+
+ // stage 17.
+ HadamardRotation(&s[0], &s[3], false);
+ HadamardRotation(&s[1], &s[2], false);
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int16x8_t s[4], x[4];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ int16x8_t input[8];
+ LoadSrc<8, 8>(dst, step, 0, input);
+ Transpose4x8To8x4(input, x);
+ } else {
+ LoadSrc<16, 4>(dst, step, 0, x);
+ }
+ } else {
+ LoadSrc<8, 4>(dst, step, 0, x);
+ if (transpose) {
+ Transpose4x4(x, x);
+ }
+ }
+
+ // stage 1.
+ // kBitReverseLookup 0, 2, 1, 3
+ s[0] = x[0];
+ s[1] = x[2];
+ s[2] = x[1];
+ s[3] = x[3];
+
+ Dct4Stages<butterfly_rotation>(s);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ int16x8_t output[8];
+ Transpose8x4To4x8(s, output);
+ StoreDst<8, 8>(dst, step, 0, output);
+ } else {
+ StoreDst<16, 4>(dst, step, 0, s);
+ }
+ } else {
+ if (transpose) {
+ Transpose4x4(s, s);
+ }
+ StoreDst<8, 4>(dst, step, 0, s);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct8Stages(int16x8_t* s) {
+ // stage 8.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
+ ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
+ } else {
+ butterfly_rotation(&s[4], &s[7], 56, false);
+ butterfly_rotation(&s[5], &s[6], 24, false);
+ }
+
+ // stage 13.
+ HadamardRotation(&s[4], &s[5], false);
+ HadamardRotation(&s[6], &s[7], true);
+
+ // stage 18.
+ butterfly_rotation(&s[6], &s[5], 32, true);
+
+ // stage 22.
+ HadamardRotation(&s[0], &s[7], false);
+ HadamardRotation(&s[1], &s[6], false);
+ HadamardRotation(&s[2], &s[5], false);
+ HadamardRotation(&s[3], &s[4], false);
+}
+
+// Process dct8 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int16x8_t s[8], x[8];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ int16x8_t input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8(input, x);
+ } else {
+ LoadSrc<8, 8>(dst, step, 0, x);
+ }
+ } else if (transpose) {
+ LoadSrc<16, 8>(dst, step, 0, x);
+ dsp::Transpose8x8(x);
+ } else {
+ LoadSrc<16, 8>(dst, step, 0, x);
+ }
+
+ // stage 1.
+ // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
+ s[0] = x[0];
+ s[1] = x[4];
+ s[2] = x[2];
+ s[3] = x[6];
+ s[4] = x[1];
+ s[5] = x[5];
+ s[6] = x[3];
+ s[7] = x[7];
+
+ Dct4Stages<butterfly_rotation>(s);
+ Dct8Stages<butterfly_rotation>(s);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ int16x8_t output[4];
+ Transpose4x8To8x4(s, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ } else {
+ StoreDst<8, 8>(dst, step, 0, s);
+ }
+ } else if (transpose) {
+ dsp::Transpose8x8(s);
+ StoreDst<16, 8>(dst, step, 0, s);
+ } else {
+ StoreDst<16, 8>(dst, step, 0, s);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct16Stages(int16x8_t* s) {
+ // stage 5.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
+ ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
+ ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
+ ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
+ } else {
+ butterfly_rotation(&s[8], &s[15], 60, false);
+ butterfly_rotation(&s[9], &s[14], 28, false);
+ butterfly_rotation(&s[10], &s[13], 44, false);
+ butterfly_rotation(&s[11], &s[12], 12, false);
+ }
+
+ // stage 9.
+ HadamardRotation(&s[8], &s[9], false);
+ HadamardRotation(&s[10], &s[11], true);
+ HadamardRotation(&s[12], &s[13], false);
+ HadamardRotation(&s[14], &s[15], true);
+
+ // stage 14.
+ butterfly_rotation(&s[14], &s[9], 48, true);
+ butterfly_rotation(&s[13], &s[10], 112, true);
+
+ // stage 19.
+ HadamardRotation(&s[8], &s[11], false);
+ HadamardRotation(&s[9], &s[10], false);
+ HadamardRotation(&s[12], &s[15], true);
+ HadamardRotation(&s[13], &s[14], true);
+
+ // stage 23.
+ butterfly_rotation(&s[13], &s[10], 32, true);
+ butterfly_rotation(&s[12], &s[11], 32, true);
+
+ // stage 26.
+ HadamardRotation(&s[0], &s[15], false);
+ HadamardRotation(&s[1], &s[14], false);
+ HadamardRotation(&s[2], &s[13], false);
+ HadamardRotation(&s[3], &s[12], false);
+ HadamardRotation(&s[4], &s[11], false);
+ HadamardRotation(&s[5], &s[10], false);
+ HadamardRotation(&s[6], &s[9], false);
+ HadamardRotation(&s[7], &s[8], false);
+}
+
+// Process dct16 rows or columns, depending on the |is_row| flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int16x8_t s[16], x[16];
+
+ if (stage_is_rectangular) {
+ if (is_row) {
+ int16x8_t input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8(input, x);
+ LoadSrc<16, 4>(dst, step, 8, input);
+ Transpose8x4To4x8(input, &x[8]);
+ } else {
+ LoadSrc<8, 16>(dst, step, 0, x);
+ }
+ } else if (is_row) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+ dsp::Transpose8x8(&x[idx]);
+ }
+ } else {
+ LoadSrc<16, 16>(dst, step, 0, x);
+ }
+
+ // stage 1
+ // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+ s[0] = x[0];
+ s[1] = x[8];
+ s[2] = x[4];
+ s[3] = x[12];
+ s[4] = x[2];
+ s[5] = x[10];
+ s[6] = x[6];
+ s[7] = x[14];
+ s[8] = x[1];
+ s[9] = x[9];
+ s[10] = x[5];
+ s[11] = x[13];
+ s[12] = x[3];
+ s[13] = x[11];
+ s[14] = x[7];
+ s[15] = x[15];
+
+ Dct4Stages<butterfly_rotation>(s);
+ Dct8Stages<butterfly_rotation>(s);
+ Dct16Stages<butterfly_rotation>(s);
+
+ if (is_row) {
+ const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+ for (int i = 0; i < 16; ++i) {
+ s[i] = vqrshlq_s16(s[i], v_row_shift);
+ }
+ }
+
+ if (stage_is_rectangular) {
+ if (is_row) {
+ int16x8_t output[4];
+ Transpose4x8To8x4(s, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ Transpose4x8To8x4(&s[8], output);
+ StoreDst<16, 4>(dst, step, 8, output);
+ } else {
+ StoreDst<8, 16>(dst, step, 0, s);
+ }
+ } else if (is_row) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ dsp::Transpose8x8(&s[idx]);
+ StoreDst<16, 8>(dst, step, idx, &s[idx]);
+ }
+ } else {
+ StoreDst<16, 16>(dst, step, 0, s);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct32Stages(int16x8_t* s) {
+ // stage 3
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
+ ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
+ ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
+ ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
+ ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
+ ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
+ ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
+ ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
+ } else {
+ butterfly_rotation(&s[16], &s[31], 62, false);
+ butterfly_rotation(&s[17], &s[30], 30, false);
+ butterfly_rotation(&s[18], &s[29], 46, false);
+ butterfly_rotation(&s[19], &s[28], 14, false);
+ butterfly_rotation(&s[20], &s[27], 54, false);
+ butterfly_rotation(&s[21], &s[26], 22, false);
+ butterfly_rotation(&s[22], &s[25], 38, false);
+ butterfly_rotation(&s[23], &s[24], 6, false);
+ }
+ // stage 6.
+ HadamardRotation(&s[16], &s[17], false);
+ HadamardRotation(&s[18], &s[19], true);
+ HadamardRotation(&s[20], &s[21], false);
+ HadamardRotation(&s[22], &s[23], true);
+ HadamardRotation(&s[24], &s[25], false);
+ HadamardRotation(&s[26], &s[27], true);
+ HadamardRotation(&s[28], &s[29], false);
+ HadamardRotation(&s[30], &s[31], true);
+
+ // stage 10.
+ butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+ butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+ butterfly_rotation(&s[26], &s[21], 24, true);
+ butterfly_rotation(&s[25], &s[22], 24 + 64, true);
+
+ // stage 15.
+ HadamardRotation(&s[16], &s[19], false);
+ HadamardRotation(&s[17], &s[18], false);
+ HadamardRotation(&s[20], &s[23], true);
+ HadamardRotation(&s[21], &s[22], true);
+ HadamardRotation(&s[24], &s[27], false);
+ HadamardRotation(&s[25], &s[26], false);
+ HadamardRotation(&s[28], &s[31], true);
+ HadamardRotation(&s[29], &s[30], true);
+
+ // stage 20.
+ butterfly_rotation(&s[29], &s[18], 48, true);
+ butterfly_rotation(&s[28], &s[19], 48, true);
+ butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+ butterfly_rotation(&s[26], &s[21], 48 + 64, true);
+
+ // stage 24.
+ HadamardRotation(&s[16], &s[23], false);
+ HadamardRotation(&s[17], &s[22], false);
+ HadamardRotation(&s[18], &s[21], false);
+ HadamardRotation(&s[19], &s[20], false);
+ HadamardRotation(&s[24], &s[31], true);
+ HadamardRotation(&s[25], &s[30], true);
+ HadamardRotation(&s[26], &s[29], true);
+ HadamardRotation(&s[27], &s[28], true);
+
+ // stage 27.
+ butterfly_rotation(&s[27], &s[20], 32, true);
+ butterfly_rotation(&s[26], &s[21], 32, true);
+ butterfly_rotation(&s[25], &s[22], 32, true);
+ butterfly_rotation(&s[24], &s[23], 32, true);
+
+ // stage 29.
+ HadamardRotation(&s[0], &s[31], false);
+ HadamardRotation(&s[1], &s[30], false);
+ HadamardRotation(&s[2], &s[29], false);
+ HadamardRotation(&s[3], &s[28], false);
+ HadamardRotation(&s[4], &s[27], false);
+ HadamardRotation(&s[5], &s[26], false);
+ HadamardRotation(&s[6], &s[25], false);
+ HadamardRotation(&s[7], &s[24], false);
+ HadamardRotation(&s[8], &s[23], false);
+ HadamardRotation(&s[9], &s[22], false);
+ HadamardRotation(&s[10], &s[21], false);
+ HadamardRotation(&s[11], &s[20], false);
+ HadamardRotation(&s[12], &s[19], false);
+ HadamardRotation(&s[13], &s[18], false);
+ HadamardRotation(&s[14], &s[17], false);
+ HadamardRotation(&s[15], &s[16], false);
+}
+
+// Process dct32 rows or columns, depending on the |is_row| flag.
+LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step,
+ const bool is_row, int row_shift) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int16x8_t s[32], x[32];
+
+ if (is_row) {
+ for (int idx = 0; idx < 32; idx += 8) {
+ LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+ dsp::Transpose8x8(&x[idx]);
+ }
+ } else {
+ LoadSrc<16, 32>(dst, step, 0, x);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+ s[0] = x[0];
+ s[1] = x[16];
+ s[2] = x[8];
+ s[3] = x[24];
+ s[4] = x[4];
+ s[5] = x[20];
+ s[6] = x[12];
+ s[7] = x[28];
+ s[8] = x[2];
+ s[9] = x[18];
+ s[10] = x[10];
+ s[11] = x[26];
+ s[12] = x[6];
+ s[13] = x[22];
+ s[14] = x[14];
+ s[15] = x[30];
+
+ // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+ s[16] = x[1];
+ s[17] = x[17];
+ s[18] = x[9];
+ s[19] = x[25];
+ s[20] = x[5];
+ s[21] = x[21];
+ s[22] = x[13];
+ s[23] = x[29];
+ s[24] = x[3];
+ s[25] = x[19];
+ s[26] = x[11];
+ s[27] = x[27];
+ s[28] = x[7];
+ s[29] = x[23];
+ s[30] = x[15];
+ s[31] = x[31];
+
+ Dct4Stages<ButterflyRotation_8>(s);
+ Dct8Stages<ButterflyRotation_8>(s);
+ Dct16Stages<ButterflyRotation_8>(s);
+ Dct32Stages<ButterflyRotation_8>(s);
+
+ if (is_row) {
+ const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+ for (int idx = 0; idx < 32; idx += 8) {
+ int16x8_t output[8];
+ Transpose8x8(&s[idx], output);
+ for (int i = 0; i < 8; ++i) {
+ output[i] = vqrshlq_s16(output[i], v_row_shift);
+ }
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 32>(dst, step, 0, s);
+ }
+}
+
+// Allow the compiler to call this function instead of forcing it to be
+// inlined. Tests show this is slightly faster.
+void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int16x8_t s[64], x[32];
+
+ if (is_row) {
+ // The last 32 values of every row are always zero if the |tx_width| is
+ // 64.
+ for (int idx = 0; idx < 32; idx += 8) {
+ LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+ dsp::Transpose8x8(&x[idx]);
+ }
+ } else {
+ // The last 32 values of every column are always zero if the |tx_height| is
+ // 64.
+ LoadSrc<16, 32>(dst, step, 0, x);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+ s[0] = x[0];
+ s[2] = x[16];
+ s[4] = x[8];
+ s[6] = x[24];
+ s[8] = x[4];
+ s[10] = x[20];
+ s[12] = x[12];
+ s[14] = x[28];
+
+ // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+ s[16] = x[2];
+ s[18] = x[18];
+ s[20] = x[10];
+ s[22] = x[26];
+ s[24] = x[6];
+ s[26] = x[22];
+ s[28] = x[14];
+ s[30] = x[30];
+
+ // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+ s[32] = x[1];
+ s[34] = x[17];
+ s[36] = x[9];
+ s[38] = x[25];
+ s[40] = x[5];
+ s[42] = x[21];
+ s[44] = x[13];
+ s[46] = x[29];
+
+ // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
+ s[48] = x[3];
+ s[50] = x[19];
+ s[52] = x[11];
+ s[54] = x[27];
+ s[56] = x[7];
+ s[58] = x[23];
+ s[60] = x[15];
+ s[62] = x[31];
+
+ Dct4Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+ Dct8Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+ Dct16Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+ Dct32Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+
+ //-- start dct 64 stages
+ // stage 2.
+ ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
+ ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
+ ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
+ ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
+ ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
+ ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
+ ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
+ ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
+ ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
+ ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
+ ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
+ ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
+ ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
+ ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
+ ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
+ ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
+
+ // stage 4.
+ HadamardRotation(&s[32], &s[33], false);
+ HadamardRotation(&s[34], &s[35], true);
+ HadamardRotation(&s[36], &s[37], false);
+ HadamardRotation(&s[38], &s[39], true);
+ HadamardRotation(&s[40], &s[41], false);
+ HadamardRotation(&s[42], &s[43], true);
+ HadamardRotation(&s[44], &s[45], false);
+ HadamardRotation(&s[46], &s[47], true);
+ HadamardRotation(&s[48], &s[49], false);
+ HadamardRotation(&s[50], &s[51], true);
+ HadamardRotation(&s[52], &s[53], false);
+ HadamardRotation(&s[54], &s[55], true);
+ HadamardRotation(&s[56], &s[57], false);
+ HadamardRotation(&s[58], &s[59], true);
+ HadamardRotation(&s[60], &s[61], false);
+ HadamardRotation(&s[62], &s[63], true);
+
+ // stage 7.
+ ButterflyRotation_8(&s[62], &s[33], 60 - 0, true);
+ ButterflyRotation_8(&s[61], &s[34], 60 - 0 + 64, true);
+ ButterflyRotation_8(&s[58], &s[37], 60 - 32, true);
+ ButterflyRotation_8(&s[57], &s[38], 60 - 32 + 64, true);
+ ButterflyRotation_8(&s[54], &s[41], 60 - 16, true);
+ ButterflyRotation_8(&s[53], &s[42], 60 - 16 + 64, true);
+ ButterflyRotation_8(&s[50], &s[45], 60 - 48, true);
+ ButterflyRotation_8(&s[49], &s[46], 60 - 48 + 64, true);
+
+ // stage 11.
+ HadamardRotation(&s[32], &s[35], false);
+ HadamardRotation(&s[33], &s[34], false);
+ HadamardRotation(&s[36], &s[39], true);
+ HadamardRotation(&s[37], &s[38], true);
+ HadamardRotation(&s[40], &s[43], false);
+ HadamardRotation(&s[41], &s[42], false);
+ HadamardRotation(&s[44], &s[47], true);
+ HadamardRotation(&s[45], &s[46], true);
+ HadamardRotation(&s[48], &s[51], false);
+ HadamardRotation(&s[49], &s[50], false);
+ HadamardRotation(&s[52], &s[55], true);
+ HadamardRotation(&s[53], &s[54], true);
+ HadamardRotation(&s[56], &s[59], false);
+ HadamardRotation(&s[57], &s[58], false);
+ HadamardRotation(&s[60], &s[63], true);
+ HadamardRotation(&s[61], &s[62], true);
+
+ // stage 16.
+ ButterflyRotation_8(&s[61], &s[34], 56, true);
+ ButterflyRotation_8(&s[60], &s[35], 56, true);
+ ButterflyRotation_8(&s[59], &s[36], 56 + 64, true);
+ ButterflyRotation_8(&s[58], &s[37], 56 + 64, true);
+ ButterflyRotation_8(&s[53], &s[42], 56 - 32, true);
+ ButterflyRotation_8(&s[52], &s[43], 56 - 32, true);
+ ButterflyRotation_8(&s[51], &s[44], 56 - 32 + 64, true);
+ ButterflyRotation_8(&s[50], &s[45], 56 - 32 + 64, true);
+
+ // stage 21.
+ HadamardRotation(&s[32], &s[39], false);
+ HadamardRotation(&s[33], &s[38], false);
+ HadamardRotation(&s[34], &s[37], false);
+ HadamardRotation(&s[35], &s[36], false);
+ HadamardRotation(&s[40], &s[47], true);
+ HadamardRotation(&s[41], &s[46], true);
+ HadamardRotation(&s[42], &s[45], true);
+ HadamardRotation(&s[43], &s[44], true);
+ HadamardRotation(&s[48], &s[55], false);
+ HadamardRotation(&s[49], &s[54], false);
+ HadamardRotation(&s[50], &s[53], false);
+ HadamardRotation(&s[51], &s[52], false);
+ HadamardRotation(&s[56], &s[63], true);
+ HadamardRotation(&s[57], &s[62], true);
+ HadamardRotation(&s[58], &s[61], true);
+ HadamardRotation(&s[59], &s[60], true);
+
+ // stage 25.
+ ButterflyRotation_8(&s[59], &s[36], 48, true);
+ ButterflyRotation_8(&s[58], &s[37], 48, true);
+ ButterflyRotation_8(&s[57], &s[38], 48, true);
+ ButterflyRotation_8(&s[56], &s[39], 48, true);
+ ButterflyRotation_8(&s[55], &s[40], 112, true);
+ ButterflyRotation_8(&s[54], &s[41], 112, true);
+ ButterflyRotation_8(&s[53], &s[42], 112, true);
+ ButterflyRotation_8(&s[52], &s[43], 112, true);
+
+ // stage 28.
+ HadamardRotation(&s[32], &s[47], false);
+ HadamardRotation(&s[33], &s[46], false);
+ HadamardRotation(&s[34], &s[45], false);
+ HadamardRotation(&s[35], &s[44], false);
+ HadamardRotation(&s[36], &s[43], false);
+ HadamardRotation(&s[37], &s[42], false);
+ HadamardRotation(&s[38], &s[41], false);
+ HadamardRotation(&s[39], &s[40], false);
+ HadamardRotation(&s[48], &s[63], true);
+ HadamardRotation(&s[49], &s[62], true);
+ HadamardRotation(&s[50], &s[61], true);
+ HadamardRotation(&s[51], &s[60], true);
+ HadamardRotation(&s[52], &s[59], true);
+ HadamardRotation(&s[53], &s[58], true);
+ HadamardRotation(&s[54], &s[57], true);
+ HadamardRotation(&s[55], &s[56], true);
+
+ // stage 30.
+ ButterflyRotation_8(&s[55], &s[40], 32, true);
+ ButterflyRotation_8(&s[54], &s[41], 32, true);
+ ButterflyRotation_8(&s[53], &s[42], 32, true);
+ ButterflyRotation_8(&s[52], &s[43], 32, true);
+ ButterflyRotation_8(&s[51], &s[44], 32, true);
+ ButterflyRotation_8(&s[50], &s[45], 32, true);
+ ButterflyRotation_8(&s[49], &s[46], 32, true);
+ ButterflyRotation_8(&s[48], &s[47], 32, true);
+
+ // stage 31.
+ for (int i = 0; i < 32; i += 4) {
+ HadamardRotation(&s[i], &s[63 - i], false);
+ HadamardRotation(&s[i + 1], &s[63 - i - 1], false);
+ HadamardRotation(&s[i + 2], &s[63 - i - 2], false);
+ HadamardRotation(&s[i + 3], &s[63 - i - 3], false);
+ }
+ //-- end dct 64 stages
+
+ if (is_row) {
+ const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+ for (int idx = 0; idx < 64; idx += 8) {
+ int16x8_t output[8];
+ Transpose8x8(&s[idx], output);
+ for (int i = 0; i < 8; ++i) {
+ output[i] = vqrshlq_s16(output[i], v_row_shift);
+ }
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 64>(dst, step, 0, s);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
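+// As with the DCTs, each ADST size has a full row/column implementation plus
+// DcOnly fast paths for the case where only the DC coefficient is nonzero.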
+template <bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int32x4_t s[8];
+ int16x8_t x[4];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ int16x8_t input[8];
+ LoadSrc<8, 8>(dst, step, 0, input);
+ Transpose4x8To8x4(input, x);
+ } else {
+ LoadSrc<16, 4>(dst, step, 0, x);
+ }
+ } else {
+ LoadSrc<8, 4>(dst, step, 0, x);
+ if (transpose) {
+ Transpose4x4(x, x);
+ }
+ }
+
+ // stage 1.
+ s[5] = vmull_n_s16(vget_low_s16(x[3]), kAdst4Multiplier[1]);
+ s[6] = vmull_n_s16(vget_low_s16(x[3]), kAdst4Multiplier[3]);
+
+ // stage 2.
+ const int32x4_t a7 = vsubl_s16(vget_low_s16(x[0]), vget_low_s16(x[2]));
+ const int32x4_t b7 = vaddw_s16(a7, vget_low_s16(x[3]));
+
+ // stage 3.
+ s[0] = vmull_n_s16(vget_low_s16(x[0]), kAdst4Multiplier[0]);
+ s[1] = vmull_n_s16(vget_low_s16(x[0]), kAdst4Multiplier[1]);
+ // s[0] = s[0] + s[3]
+ s[0] = vmlal_n_s16(s[0], vget_low_s16(x[2]), kAdst4Multiplier[3]);
+ // s[1] = s[1] - s[4]
+ s[1] = vmlsl_n_s16(s[1], vget_low_s16(x[2]), kAdst4Multiplier[0]);
+
+ s[3] = vmull_n_s16(vget_low_s16(x[1]), kAdst4Multiplier[2]);
+ s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]);
+
+ // stage 4.
+ s[0] = vaddq_s32(s[0], s[5]);
+ s[1] = vsubq_s32(s[1], s[6]);
+
+ // stages 5 and 6.
+ const int32x4_t x0 = vaddq_s32(s[0], s[3]);
+ const int32x4_t x1 = vaddq_s32(s[1], s[3]);
+ const int32x4_t x3_a = vaddq_s32(s[0], s[1]);
+ const int32x4_t x3 = vsubq_s32(x3_a, s[3]);
+ const int16x4_t dst_0 = vqrshrn_n_s32(x0, 12);
+ const int16x4_t dst_1 = vqrshrn_n_s32(x1, 12);
+ const int16x4_t dst_2 = vqrshrn_n_s32(s[2], 12);
+ const int16x4_t dst_3 = vqrshrn_n_s32(x3, 12);
+
+ x[0] = vcombine_s16(dst_0, dst_0);
+ x[1] = vcombine_s16(dst_1, dst_1);
+ x[2] = vcombine_s16(dst_2, dst_2);
+ x[3] = vcombine_s16(dst_3, dst_3);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ int16x8_t output[8];
+ Transpose8x4To4x8(x, output);
+ StoreDst<8, 8>(dst, step, 0, output);
+ } else {
+ StoreDst<16, 4>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ Transpose4x4(x, x);
+ }
+ StoreDst<8, 4>(dst, step, 0, x);
+ }
+}
+
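+// Laid out as {k0, k1, k2, k1} (see the product comment below) so that a
+// single vmull_s16 against the broadcast DC value produces all four products
+// at once.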
+alignas(8) constexpr int16_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344,
+ 2482};
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int32x4_t s[2];
+
+ const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+ const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
+ const int16x4_t v_src_round =
+ vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+ const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0);
+ const int16x4_t kAdst4DcOnlyMultipliers = vld1_s16(kAdst4DcOnlyMultiplier);
+ s[1] = vdupq_n_s32(0);
+
+ // s0*k0 s0*k1 s0*k2 s0*k1
+ s[0] = vmull_s16(kAdst4DcOnlyMultipliers, v_src);
+ // 0 0 0 s0*k0
+ s[1] = vextq_s32(s[1], s[0], 1);
+
+ const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+ const int16x4_t dst_0 = vqrshrn_n_s32(x3, 12);
+
+  // vqrshl_s16 will shift right if the shift value is negative.
+ vst1_s16(dst, vqrshl_s16(dst_0, vdup_n_s16(-row_shift)));
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int32x4_t s[4];
+
+ int i = 0;
+ do {
+ const int16x4_t v_src = vld1_s16(&dst[i]);
+
+ s[0] = vmull_n_s16(v_src, kAdst4Multiplier[0]);
+ s[1] = vmull_n_s16(v_src, kAdst4Multiplier[1]);
+ s[2] = vmull_n_s16(v_src, kAdst4Multiplier[2]);
+
+ const int32x4_t x0 = s[0];
+ const int32x4_t x1 = s[1];
+ const int32x4_t x2 = s[2];
+ const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+ const int16x4_t dst_0 = vqrshrn_n_s32(x0, 12);
+ const int16x4_t dst_1 = vqrshrn_n_s32(x1, 12);
+ const int16x4_t dst_2 = vqrshrn_n_s32(x2, 12);
+ const int16x4_t dst_3 = vqrshrn_n_s32(x3, 12);
+
+ vst1_s16(&dst[i], dst_0);
+ vst1_s16(&dst[i + width * 1], dst_1);
+ vst1_s16(&dst[i + width * 2], dst_2);
+ vst1_s16(&dst[i + width * 3], dst_3);
+
+ i += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int16x8_t s[8], x[8];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ int16x8_t input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8(input, x);
+ } else {
+ LoadSrc<8, 8>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ LoadSrc<16, 8>(dst, step, 0, x);
+ dsp::Transpose8x8(x);
+ } else {
+ LoadSrc<16, 8>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1.
+ s[0] = x[7];
+ s[1] = x[0];
+ s[2] = x[5];
+ s[3] = x[2];
+ s[4] = x[3];
+ s[5] = x[4];
+ s[6] = x[1];
+ s[7] = x[6];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 60 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 60 - 16, true);
+ butterfly_rotation(&s[4], &s[5], 60 - 32, true);
+ butterfly_rotation(&s[6], &s[7], 60 - 48, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[4], false);
+ HadamardRotation(&s[1], &s[5], false);
+ HadamardRotation(&s[2], &s[6], false);
+ HadamardRotation(&s[3], &s[7], false);
+
+ // stage 4.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[2], false);
+ HadamardRotation(&s[4], &s[6], false);
+ HadamardRotation(&s[1], &s[3], false);
+ HadamardRotation(&s[5], &s[7], false);
+
+ // stage 6.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ x[0] = s[0];
+ x[1] = vqnegq_s16(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s16(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s16(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s16(s[1]);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ int16x8_t output[4];
+ Transpose4x8To8x4(x, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ } else {
+ StoreDst<8, 8>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ dsp::Transpose8x8(x);
+ StoreDst<16, 8>(dst, step, 0, x);
+ } else {
+ StoreDst<16, 8>(dst, step, 0, x);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int16x8_t s[8];
+
+ const int16x8_t v_src = vdupq_n_s16(dst[0]);
+ const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
+ const int16x8_t v_src_round =
+ vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
+ // stage 1.
+ s[1] = vbslq_s16(v_mask, v_src_round, v_src);
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
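+  // s[0] is deliberately left uninitialized; per its name,
+  // ButterflyRotation_FirstIsZero assumes its first operand is zero and
+  // derives both outputs from s[1] alone.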
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
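+  // With every other input zero in this DC-only path, the HadamardRotation
+  // stages of the full Adst8 collapse to plain copies, which is what stages
+  // 3 and 5 do here.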
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ int16x8_t x[8];
+ x[0] = s[0];
+ x[1] = vqnegq_s16(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s16(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s16(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s16(s[1]);
+
+ for (int i = 0; i < 8; ++i) {
+ // vqrshlq_s16 will shift right if shift value is negative.
+ x[i] = vqrshlq_s16(x[i], vdupq_n_s16(-row_shift));
+ vst1q_lane_s16(&dst[i], x[i], 0);
+ }
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int16x8_t s[8];
+
+ int i = 0;
+ do {
+ const int16x8_t v_src = vld1q_s16(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ int16x8_t x[8];
+ x[0] = s[0];
+ x[1] = vqnegq_s16(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s16(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s16(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s16(s[1]);
+
+ for (int j = 0; j < 8; ++j) {
+ vst1_s16(&dst[j * width], vget_low_s16(x[j]));
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int16x8_t s[16], x[16];
+
+ if (stage_is_rectangular) {
+ if (is_row) {
+ int16x8_t input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8(input, x);
+ LoadSrc<16, 4>(dst, step, 8, input);
+ Transpose8x4To4x8(input, &x[8]);
+ } else {
+ LoadSrc<8, 16>(dst, step, 0, x);
+ }
+ } else {
+ if (is_row) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+ dsp::Transpose8x8(&x[idx]);
+ }
+ } else {
+ LoadSrc<16, 16>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1.
+ s[0] = x[15];
+ s[1] = x[0];
+ s[2] = x[13];
+ s[3] = x[2];
+ s[4] = x[11];
+ s[5] = x[4];
+ s[6] = x[9];
+ s[7] = x[6];
+ s[8] = x[7];
+ s[9] = x[8];
+ s[10] = x[5];
+ s[11] = x[10];
+ s[12] = x[3];
+ s[13] = x[12];
+ s[14] = x[1];
+ s[15] = x[14];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 62 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 62 - 8, true);
+ butterfly_rotation(&s[4], &s[5], 62 - 16, true);
+ butterfly_rotation(&s[6], &s[7], 62 - 24, true);
+ butterfly_rotation(&s[8], &s[9], 62 - 32, true);
+ butterfly_rotation(&s[10], &s[11], 62 - 40, true);
+ butterfly_rotation(&s[12], &s[13], 62 - 48, true);
+ butterfly_rotation(&s[14], &s[15], 62 - 56, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[8], false);
+ HadamardRotation(&s[1], &s[9], false);
+ HadamardRotation(&s[2], &s[10], false);
+ HadamardRotation(&s[3], &s[11], false);
+ HadamardRotation(&s[4], &s[12], false);
+ HadamardRotation(&s[5], &s[13], false);
+ HadamardRotation(&s[6], &s[14], false);
+ HadamardRotation(&s[7], &s[15], false);
+
+ // stage 4.
+ butterfly_rotation(&s[8], &s[9], 56 - 0, true);
+ butterfly_rotation(&s[13], &s[12], 8 + 0, true);
+ butterfly_rotation(&s[10], &s[11], 56 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 8 + 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[4], false);
+ HadamardRotation(&s[8], &s[12], false);
+ HadamardRotation(&s[1], &s[5], false);
+ HadamardRotation(&s[9], &s[13], false);
+ HadamardRotation(&s[2], &s[6], false);
+ HadamardRotation(&s[10], &s[14], false);
+ HadamardRotation(&s[3], &s[7], false);
+ HadamardRotation(&s[11], &s[15], false);
+
+ // stage 6.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[12], &s[13], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 48 - 32, true);
+
+ // stage 7.
+ HadamardRotation(&s[0], &s[2], false);
+ HadamardRotation(&s[4], &s[6], false);
+ HadamardRotation(&s[8], &s[10], false);
+ HadamardRotation(&s[12], &s[14], false);
+ HadamardRotation(&s[1], &s[3], false);
+ HadamardRotation(&s[5], &s[7], false);
+ HadamardRotation(&s[9], &s[11], false);
+ HadamardRotation(&s[13], &s[15], false);
+
+ // stage 8.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+ butterfly_rotation(&s[10], &s[11], 32, true);
+ butterfly_rotation(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ x[0] = s[0];
+ x[1] = vqnegq_s16(s[8]);
+ x[2] = s[12];
+ x[3] = vqnegq_s16(s[4]);
+ x[4] = s[6];
+ x[5] = vqnegq_s16(s[14]);
+ x[6] = s[10];
+ x[7] = vqnegq_s16(s[2]);
+ x[8] = s[3];
+ x[9] = vqnegq_s16(s[11]);
+ x[10] = s[15];
+ x[11] = vqnegq_s16(s[7]);
+ x[12] = s[5];
+ x[13] = vqnegq_s16(s[13]);
+ x[14] = s[9];
+ x[15] = vqnegq_s16(s[1]);
+
+ if (stage_is_rectangular) {
+ if (is_row) {
+ const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+ int16x8_t output[4];
+ Transpose4x8To8x4(x, output);
+ for (int i = 0; i < 4; ++i) {
+ output[i] = vqrshlq_s16(output[i], v_row_shift);
+ }
+ StoreDst<16, 4>(dst, step, 0, output);
+ Transpose4x8To8x4(&x[8], output);
+ for (int i = 0; i < 4; ++i) {
+ output[i] = vqrshlq_s16(output[i], v_row_shift);
+ }
+ StoreDst<16, 4>(dst, step, 8, output);
+ } else {
+ StoreDst<8, 16>(dst, step, 0, x);
+ }
+ } else {
+ if (is_row) {
+ const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+ for (int idx = 0; idx < 16; idx += 8) {
+ int16x8_t output[8];
+ Transpose8x8(&x[idx], output);
+ for (int i = 0; i < 8; ++i) {
+ output[i] = vqrshlq_s16(output[i], v_row_shift);
+ }
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 16>(dst, step, 0, x);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(int16x8_t* s, int16x8_t* x) {
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);
+
+ // stage 3.
+ s[8] = s[0];
+ s[9] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[8], &s[9], 56, true);
+
+ // stage 5.
+ s[4] = s[0];
+ s[12] = s[8];
+ s[5] = s[1];
+ s[13] = s[9];
+
+ // stage 6.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+ ButterflyRotation_4(&s[12], &s[13], 48, true);
+
+ // stage 7.
+ s[2] = s[0];
+ s[6] = s[4];
+ s[10] = s[8];
+ s[14] = s[12];
+ s[3] = s[1];
+ s[7] = s[5];
+ s[11] = s[9];
+ s[15] = s[13];
+
+ // stage 8.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+ ButterflyRotation_4(&s[10], &s[11], 32, true);
+ ButterflyRotation_4(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ x[0] = s[0];
+ x[1] = vqnegq_s16(s[8]);
+ x[2] = s[12];
+ x[3] = vqnegq_s16(s[4]);
+ x[4] = s[6];
+ x[5] = vqnegq_s16(s[14]);
+ x[6] = s[10];
+ x[7] = vqnegq_s16(s[2]);
+ x[8] = s[3];
+ x[9] = vqnegq_s16(s[11]);
+ x[10] = s[15];
+ x[11] = vqnegq_s16(s[7]);
+ x[12] = s[5];
+ x[13] = vqnegq_s16(s[13]);
+ x[14] = s[9];
+ x[15] = vqnegq_s16(s[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int16x8_t s[16];
+ int16x8_t x[16];
+
+ const int16x8_t v_src = vdupq_n_s16(dst[0]);
+ const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
+ const int16x8_t v_src_round =
+ vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
+ // stage 1.
+ s[1] = vbslq_s16(v_mask, v_src_round, v_src);
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int i = 0; i < 16; ++i) {
+ // vqrshlq_s16 will shift right if shift value is negative.
+ x[i] = vqrshlq_s16(x[i], vdupq_n_s16(-row_shift));
+ vst1q_lane_s16(&dst[i], x[i], 0);
+ }
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
+ int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int i = 0;
+ do {
+ int16x8_t s[16];
+ int16x8_t x[16];
+ const int16x8_t v_src = vld1q_s16(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int j = 0; j < 16; ++j) {
+ vst1_s16(&dst[j * width], vget_low_s16(x[j]));
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+//------------------------------------------------------------------------------
+// Identity Transforms.
+
+template <bool is_row_shift>
+LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ if (is_row_shift) {
+ const int shift = 1;
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int16x4_t v_multiplier = vdup_n_s16(kIdentity4Multiplier);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
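+    // With shift == 1, v_dual_round is 2048 + 4096: the rounding term for
+    // the Q12 multiply plus the rounding term for the row shift, so the
+    // single shift by -(12 + shift) below completes both steps at once.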
+ for (int i = 0; i < 4; i += 2) {
+ const int16x8_t v_src = vld1q_s16(&dst[i * step]);
+ const int32x4_t v_src_mult_lo =
+ vmlal_s16(v_dual_round, vget_low_s16(v_src), v_multiplier);
+ const int32x4_t v_src_mult_hi =
+ vmlal_s16(v_dual_round, vget_high_s16(v_src), v_multiplier);
+ const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+ const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift);
+ vst1q_s16(&dst[i * step],
+ vcombine_s16(vqmovn_s32(shift_lo), vqmovn_s32(shift_hi)));
+ }
+ } else {
+ for (int i = 0; i < 4; i += 2) {
+ const int16x8_t v_src = vld1q_s16(&dst[i * step]);
+ const int16x8_t a =
+ vqrdmulhq_n_s16(v_src, kIdentity4MultiplierFraction << 3);
+ const int16x8_t b = vqaddq_s16(v_src, a);
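+      // vqrdmulhq_n_s16 with the fraction shifted left by 3 produces
+      // (x * kIdentity4MultiplierFraction + 2048) >> 12, so b = x + a applies
+      // the identity4 scale of (1 + fraction / 4096) without leaving 16 bits.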
+ vst1q_s16(&dst[i * step], b);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int tx_height) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+ const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
+ const int16x4_t v_src_round =
+ vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+ const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0);
+ const int shift = tx_height < 16 ? 0 : 1;
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int16x4_t v_multiplier = vdup_n_s16(kIdentity4Multiplier);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+ const int32x4_t v_src_mult_lo = vmlal_s16(v_dual_round, v_src, v_multiplier);
+ const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift);
+ vst1_lane_s16(dst, vqmovn_s32(dst_0), 0);
+ return true;
+}
+
+template <int identity_size>
+LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source) {
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+
+ if (identity_size < 32) {
+ if (tx_width == 4) {
+ uint8x8_t frame_data = vdup_n_u8(0);
+ int i = 0;
+ do {
+ const int16x4_t v_src = vld1_s16(&source[i * tx_width]);
+
+ int16x4_t v_dst_i;
+ if (identity_size == 4) {
+ const int16x4_t v_src_fraction =
+ vqrdmulh_n_s16(v_src, kIdentity4MultiplierFraction << 3);
+ v_dst_i = vqadd_s16(v_src, v_src_fraction);
+ } else if (identity_size == 8) {
+ v_dst_i = vqadd_s16(v_src, v_src);
+ } else { // identity_size == 16
+ const int16x4_t v_src_mult =
+ vqrdmulh_n_s16(v_src, kIdentity4MultiplierFraction << 4);
+ const int16x4_t v_srcx2 = vqadd_s16(v_src, v_src);
+ v_dst_i = vqadd_s16(v_srcx2, v_src_mult);
+ }
+
+ frame_data = Load4<0>(dst, frame_data);
+ const int16x4_t a = vrshr_n_s16(v_dst_i, 4);
+ const uint16x8_t b =
+ vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(a, a)), frame_data);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+ StoreLo4(dst, d);
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const int16x8_t v_src = vld1q_s16(&source[row + j]);
+
+ int16x8_t v_dst_i;
+ if (identity_size == 4) {
+ const int16x8_t v_src_fraction =
+ vqrdmulhq_n_s16(v_src, kIdentity4MultiplierFraction << 3);
+ v_dst_i = vqaddq_s16(v_src, v_src_fraction);
+ } else if (identity_size == 8) {
+ v_dst_i = vqaddq_s16(v_src, v_src);
+ } else { // identity_size == 16
+ const int16x8_t v_src_mult =
+ vqrdmulhq_n_s16(v_src, kIdentity4MultiplierFraction << 4);
+ const int16x8_t v_srcx2 = vqaddq_s16(v_src, v_src);
+ v_dst_i = vqaddq_s16(v_src_mult, v_srcx2);
+ }
+
+ const uint8x8_t frame_data = vld1_u8(dst + j);
+ const int16x8_t a = vrshrq_n_s16(v_dst_i, 4);
+ const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+ vst1_u8(dst + j, d);
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const int16x8_t v_dst_i = vld1q_s16(&source[row + j]);
+ const uint8x8_t frame_data = vld1_u8(dst + j);
+ const int16x8_t a = vrshrq_n_s16(v_dst_i, 2);
+ const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+ vst1_u8(dst + j, d);
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source) {
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+
+ if (tx_width == 4) {
+ uint8x8_t frame_data = vdup_n_u8(0);
+ int i = 0;
+ do {
+ const int16x4_t v_src = vld1_s16(&source[i * tx_width]);
+ const int16x4_t v_src_mult =
+ vqrdmulh_n_s16(v_src, kIdentity4MultiplierFraction << 3);
+ const int16x4_t v_dst_row = vqadd_s16(v_src, v_src_mult);
+ const int16x4_t v_src_mult2 =
+ vqrdmulh_n_s16(v_dst_row, kIdentity4MultiplierFraction << 3);
+ const int16x4_t v_dst_col = vqadd_s16(v_dst_row, v_src_mult2);
+ frame_data = Load4<0>(dst, frame_data);
+ const int16x4_t a = vrshr_n_s16(v_dst_col, 4);
+ const uint16x8_t b =
+ vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(a, a)), frame_data);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+ StoreLo4(dst, d);
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const int16x8_t v_src = vld1q_s16(&source[row + j]);
+ const int16x8_t v_src_round =
+ vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
+ const int16x8_t v_dst_row = vqaddq_s16(v_src_round, v_src_round);
+ const int16x8_t v_src_mult2 =
+ vqrdmulhq_n_s16(v_dst_row, kIdentity4MultiplierFraction << 3);
+ const int16x8_t v_dst_col = vqaddq_s16(v_dst_row, v_src_mult2);
+ const uint8x8_t frame_data = vld1_u8(dst + j);
+ const int16x8_t a = vrshrq_n_s16(v_dst_col, 4);
+ const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+ vst1_u8(dst + j, d);
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height equal to 32 can be simplified from
+  // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
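+  // vrshrq_n_s16(v, 1) computes exactly (A + 1) >> 1, so the rounding shift
+  // below implements the simplified form directly.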
+ for (int i = 0; i < 4; ++i) {
+ const int16x8_t v_src = vld1q_s16(&dst[i * step]);
+ const int16x8_t a = vrshrq_n_s16(v_src, 1);
+ vst1q_s16(&dst[i * step], a);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ for (int i = 0; i < 4; ++i) {
+ const int16x8_t v_src = vld1q_s16(&dst[i * step]);
+ // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
+ // saturating add here is ok.
+ const int16x8_t v_srcx2 = vqaddq_s16(v_src, v_src);
+ vst1q_s16(&dst[i * step], v_srcx2);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+ const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
+ const int16x4_t v_src_round =
+ vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+ const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0);
+ const int32x4_t v_srcx2 = vaddl_s16(v_src, v_src);
+ const int32x4_t dst_0 = vqrshlq_s32(v_srcx2, vdupq_n_s32(-row_shift));
+ vst1_lane_s16(dst, vqmovn_s32(dst_0), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, int32_t step,
+ int shift) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ const int16x8_t v_src = vld1q_s16(&dst[i * step + j * 8]);
+ const int32x4_t v_src_mult_lo =
+ vmlal_n_s16(v_dual_round, vget_low_s16(v_src), kIdentity16Multiplier);
+ const int32x4_t v_src_mult_hi = vmlal_n_s16(
+ v_dual_round, vget_high_s16(v_src), kIdentity16Multiplier);
+ const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+ const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift);
+ vst1q_s16(&dst[i * step + j * 8],
+ vcombine_s16(vqmovn_s32(shift_lo), vqmovn_s32(shift_hi)));
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+ const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
+ const int16x4_t v_src_round =
+ vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+ const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0);
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int16x4_t v_multiplier = vdup_n_s16(kIdentity16Multiplier);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+ const int32x4_t v_src_mult_lo =
+      vmlal_s16(v_dual_round, v_src, v_multiplier);
+ const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift);
+ vst1_lane_s16(dst, vqmovn_s32(dst_0), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity32Row16_NEON(void* dest,
+ const int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ // When combining the identity32 multiplier with the row shift, the
+ // calculation for tx_height equal to 16 can be simplified from
+  // (((A * 4) + 1) >> 1) to (A * 2).
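+  // (A * 4) is even, so the +1 rounding bit is dropped by the shift and the
+  // result is just 2 * A; the saturating add below produces that doubling
+  // with 16-bit clamping.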
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 32; j += 8) {
+ const int16x8_t v_src = vld1q_s16(&dst[i * step + j]);
+ // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
+ // saturating add here is ok.
+ const int16x8_t v_dst_i = vqaddq_s16(v_src, v_src);
+ vst1q_s16(&dst[i * step + j], v_dst_i);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest,
+ int adjusted_tx_height) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+ const int16x4_t v_src = vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+ // When combining the identity32 multiplier with the row shift, the
+ // calculation for tx_height equal to 16 can be simplified from
+  // (((A * 4) + 1) >> 1) to (A * 2).
+ const int16x4_t v_dst_0 = vqadd_s16(v_src, v_src);
+ vst1_lane_s16(dst, v_dst_0, 0);
+ return true;
+}
+
+//------------------------------------------------------------------------------
+// Walsh Hadamard Transform.
+
+// Transposes a 4x4 matrix and then permutes the rows of the transposed matrix
+// for the WHT. The input matrix is in two "wide" int16x8_t variables. The
+// output matrix is in four int16x4_t variables.
+//
+// Input:
+// in[0]: 00 01 02 03 10 11 12 13
+// in[1]: 20 21 22 23 30 31 32 33
+// Output:
+// out[0]: 00 10 20 30
+// out[1]: 03 13 23 33
+// out[2]: 01 11 21 31
+// out[3]: 02 12 22 32
+LIBGAV1_ALWAYS_INLINE void TransposeAndPermute4x4WideInput(
+ const int16x8_t in[2], int16x4_t out[4]) {
+ // Swap 32 bit elements. Goes from:
+ // in[0]: 00 01 02 03 10 11 12 13
+ // in[1]: 20 21 22 23 30 31 32 33
+ // to:
+ // b0.val[0]: 00 01 20 21 10 11 30 31
+ // b0.val[1]: 02 03 22 23 12 13 32 33
+
+ const int32x4x2_t b0 =
+ vtrnq_s32(vreinterpretq_s32_s16(in[0]), vreinterpretq_s32_s16(in[1]));
+
+ // Swap 16 bit elements. Goes from:
+ // vget_low_s32(b0.val[0]): 00 01 20 21
+ // vget_high_s32(b0.val[0]): 10 11 30 31
+ // vget_low_s32(b0.val[1]): 02 03 22 23
+ // vget_high_s32(b0.val[1]): 12 13 32 33
+ // to:
+ // c0.val[0]: 00 10 20 30
+  // c0.val[1]: 01 11 21 31
+ // c1.val[0]: 02 12 22 32
+ // c1.val[1]: 03 13 23 33
+
+ const int16x4x2_t c0 =
+ vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[0])),
+ vreinterpret_s16_s32(vget_high_s32(b0.val[0])));
+ const int16x4x2_t c1 =
+ vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[1])),
+ vreinterpret_s16_s32(vget_high_s32(b0.val[1])));
+
+ out[0] = c0.val[0];
+ out[1] = c1.val[1];
+ out[2] = c0.val[1];
+ out[3] = c1.val[0];
+}
+
+// Process 4 wht4 rows and columns.
+LIBGAV1_ALWAYS_INLINE void Wht4_NEON(uint8_t* dst, const int dst_stride,
+ const void* source,
+ const int adjusted_tx_height) {
+ const auto* const src = static_cast<const int16_t*>(source);
+ int16x4_t s[4];
+
+ if (adjusted_tx_height == 1) {
+ // Special case: only src[0] is nonzero.
+ // src[0] 0 0 0
+ // 0 0 0 0
+ // 0 0 0 0
+ // 0 0 0 0
+ //
+ // After the row and column transforms are applied, we have:
+ // f h h h
+ // g i i i
+ // g i i i
+ // g i i i
+ // where f, g, h, i are computed as follows.
+ int16_t f = (src[0] >> 2) - (src[0] >> 3);
+ const int16_t g = f >> 1;
+ f = f - (f >> 1);
+ const int16_t h = (src[0] >> 3) - (src[0] >> 4);
+ const int16_t i = (src[0] >> 4);
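+    // These follow from pushing (src[0] >> 2, 0, 0, 0) plus three zero rows
+    // through the row and column passes below: f is the top-left output, g
+    // fills the rest of column 0, h the rest of row 0, and i the inner 3x3.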
+ s[0] = vdup_n_s16(h);
+ s[0] = vset_lane_s16(f, s[0], 0);
+ s[1] = vdup_n_s16(i);
+ s[1] = vset_lane_s16(g, s[1], 0);
+ s[2] = s[3] = s[1];
+ } else {
+ // Load the 4x4 source in transposed form.
+ int16x4x4_t columns = vld4_s16(src);
+ // Shift right and permute the columns for the WHT.
+ s[0] = vshr_n_s16(columns.val[0], 2);
+ s[2] = vshr_n_s16(columns.val[1], 2);
+ s[3] = vshr_n_s16(columns.val[2], 2);
+ s[1] = vshr_n_s16(columns.val[3], 2);
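+    // The column order (0, 3, 1, 2) matches the permutation produced by
+    // TransposeAndPermute4x4WideInput, so the row and column passes share
+    // the same butterfly ordering.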
+
+ // Row transforms.
+ s[0] = vadd_s16(s[0], s[2]);
+ s[3] = vsub_s16(s[3], s[1]);
+ int16x4_t e = vhsub_s16(s[0], s[3]); // e = (s[0] - s[3]) >> 1
+ s[1] = vsub_s16(e, s[1]);
+ s[2] = vsub_s16(e, s[2]);
+ s[0] = vsub_s16(s[0], s[1]);
+ s[3] = vadd_s16(s[3], s[2]);
+
+ int16x8_t x[2];
+ x[0] = vcombine_s16(s[0], s[1]);
+ x[1] = vcombine_s16(s[2], s[3]);
+ TransposeAndPermute4x4WideInput(x, s);
+
+ // Column transforms.
+ s[0] = vadd_s16(s[0], s[2]);
+ s[3] = vsub_s16(s[3], s[1]);
+ e = vhsub_s16(s[0], s[3]); // e = (s[0] - s[3]) >> 1
+ s[1] = vsub_s16(e, s[1]);
+ s[2] = vsub_s16(e, s[2]);
+ s[0] = vsub_s16(s[0], s[1]);
+ s[3] = vadd_s16(s[3], s[2]);
+ }
+
+ // Store to frame.
+ uint8x8_t frame_data = vdup_n_u8(0);
+ for (int row = 0; row < 4; row += 2) {
+ frame_data = Load4<0>(dst, frame_data);
+ frame_data = Load4<1>(dst + dst_stride, frame_data);
+ const int16x8_t residual = vcombine_s16(s[row], s[row + 1]);
+ const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(residual), frame_data);
+ frame_data = vqmovun_s16(vreinterpretq_s16_u16(b));
+ StoreLo4(dst, frame_data);
+ dst += dst_stride;
+ StoreHi4(dst, frame_data);
+ dst += dst_stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loops
+
+template <int tx_height>
+LIBGAV1_ALWAYS_INLINE void FlipColumns(int16_t* source, int tx_width) {
+ if (tx_width >= 16) {
+ int i = 0;
+ do {
+ const int16x8_t a = vld1q_s16(&source[i]);
+ const int16x8_t b = vld1q_s16(&source[i + 8]);
+ const int16x8_t c = vrev64q_s16(a);
+ const int16x8_t d = vrev64q_s16(b);
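+      // vrev64q reverses within each 64-bit half; recombining high-then-low
+      // completes the 8-lane reverse. Storing reversed |b| to the first slot
+      // and reversed |a| to the second mirrors all 16 columns.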
+ vst1q_s16(&source[i], vcombine_s16(vget_high_s16(d), vget_low_s16(d)));
+ vst1q_s16(&source[i + 8],
+ vcombine_s16(vget_high_s16(c), vget_low_s16(c)));
+ i += 16;
+ } while (i < tx_width * tx_height);
+ } else if (tx_width == 8) {
+ for (int i = 0; i < 8 * tx_height; i += 8) {
+ const int16x8_t a = vld1q_s16(&source[i]);
+ const int16x8_t b = vrev64q_s16(a);
+ vst1q_s16(&source[i], vcombine_s16(vget_high_s16(b), vget_low_s16(b)));
+ }
+ } else {
+ // Process two rows per iteration.
+ for (int i = 0; i < 4 * tx_height; i += 8) {
+ const int16x8_t a = vld1q_s16(&source[i]);
+ vst1q_s16(&source[i], vrev64q_s16(a));
+ }
+ }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void ApplyRounding(int16_t* source, int num_rows) {
+ if (tx_width == 4) {
+ // Process two rows per iteration.
+ int i = 0;
+ do {
+ const int16x8_t a = vld1q_s16(&source[i]);
+ const int16x8_t b = vqrdmulhq_n_s16(a, kTransformRowMultiplier << 3);
+ vst1q_s16(&source[i], b);
+ i += 8;
+ } while (i < tx_width * num_rows);
+ } else {
+ int i = 0;
+ do {
+ // The last 32 values of every row are always zero if the |tx_width| is
+ // 64.
+ const int non_zero_width = (tx_width < 64) ? tx_width : 32;
+ int j = 0;
+ do {
+ const int16x8_t a = vld1q_s16(&source[i * tx_width + j]);
+ const int16x8_t b = vqrdmulhq_n_s16(a, kTransformRowMultiplier << 3);
+ vst1q_s16(&source[i * tx_width + j], b);
+ j += 8;
+ } while (j < non_zero_width);
+ } while (++i < num_rows);
+ }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void RowShift(int16_t* source, int num_rows,
+ int row_shift) {
+ // vqrshlq_s16 will shift right if shift value is negative.
+ row_shift = -row_shift;
+
+ if (tx_width == 4) {
+ // Process two rows per iteration.
+ int i = 0;
+ do {
+ const int16x8_t residual = vld1q_s16(&source[i]);
+ vst1q_s16(&source[i], vqrshlq_s16(residual, vdupq_n_s16(row_shift)));
+ i += 8;
+ } while (i < tx_width * num_rows);
+ } else {
+ int i = 0;
+ do {
+ for (int j = 0; j < tx_width; j += 8) {
+ const int16x8_t residual = vld1q_s16(&source[i * tx_width + j]);
+ const int16x8_t residual_shifted =
+ vqrshlq_s16(residual, vdupq_n_s16(row_shift));
+ vst1q_s16(&source[i * tx_width + j], residual_shifted);
+ }
+ } while (++i < num_rows);
+ }
+}
+
+template <int tx_height, bool enable_flip_rows = false>
+LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int16_t* source, TransformType tx_type) {
+ const bool flip_rows =
+ enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
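+  // Every path below applies a rounding shift of 4 to the column-transform
+  // output and clamps the frame sum to [0, 255] with vqmovun before writing
+  // it back.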
+
+ // Enable for 4x4, 4x8, 4x16
+ if (tx_height < 32 && tx_width == 4) {
+ uint8x8_t frame_data = vdup_n_u8(0);
+ for (int i = 0; i < tx_height; ++i) {
+ const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
+ const int16x4_t residual = vld1_s16(&source[row]);
+ frame_data = Load4<0>(dst, frame_data);
+ const int16x4_t a = vrshr_n_s16(residual, 4);
+ const uint16x8_t b =
+ vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(a, a)), frame_data);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+ StoreLo4(dst, d);
+ dst += stride;
+ }
+ // Enable for 8x4, 8x8, 8x16, 8x32
+ } else if (tx_height < 64 && tx_width == 8) {
+ for (int i = 0; i < tx_height; ++i) {
+ const int row = flip_rows ? (tx_height - i - 1) * 8 : i * 8;
+ const int16x8_t residual = vld1q_s16(&source[row]);
+ const uint8x8_t frame_data = vld1_u8(dst);
+ const int16x8_t a = vrshrq_n_s16(residual, 4);
+ const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+ vst1_u8(dst, d);
+ dst += stride;
+ }
+ // Remaining widths >= 16.
+ } else {
+ for (int i = 0; i < tx_height; ++i) {
+ const int y = start_y + i;
+ const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
+ int j = 0;
+ do {
+ const int x = start_x + j;
+ const int16x8_t residual = vld1q_s16(&source[row + j]);
+ const int16x8_t residual_hi = vld1q_s16(&source[row + j + 8]);
+ const uint8x16_t frame_data = vld1q_u8(frame[y] + x);
+ const int16x8_t a = vrshrq_n_s16(residual, 4);
+ const int16x8_t a_hi = vrshrq_n_s16(residual_hi, 4);
+ const uint16x8_t b =
+ vaddw_u8(vreinterpretq_u16_s16(a), vget_low_u8(frame_data));
+ const uint16x8_t b_hi =
+ vaddw_u8(vreinterpretq_u16_s16(a_hi), vget_high_u8(frame_data));
+ vst1q_u8(frame[y] + x,
+ vcombine_u8(vqmovun_s16(vreinterpretq_s16_u16(b)),
+ vqmovun_s16(vreinterpretq_s16_u16(b_hi))));
+ j += 16;
+ } while (j < tx_width);
+ }
+ }
+}
+
+void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+ const int row_shift = (tx_height == 16);
+
+ if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height == 4) {
+ // Process 4 1d dct4 rows in parallel.
+ Dct4_NEON<ButterflyRotation_4, false>(src, /*step=*/4, /*transpose=*/true);
+ } else {
+ // Process 8 1d dct4 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Dct4_NEON<ButterflyRotation_8, true>(data, /*step=*/4,
+ /*transpose=*/true);
+ data += 32;
+ i -= 8;
+ } while (i != 0);
+ }
+ if (tx_height == 16) {
+ RowShift<4>(src, adjusted_tx_height, 1);
+ }
+}
+
+void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d dct4 columns in parallel.
+ Dct4_NEON<ButterflyRotation_4, false>(src, tx_width, /*transpose=*/false);
+ } else {
+ // Process 8 1d dct4 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct4_NEON<ButterflyRotation_8, true>(data, tx_width,
+ /*transpose=*/false);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ }
+
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<4>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height == 4) {
+ // Process 4 1d dct8 rows in parallel.
+ Dct8_NEON<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
+ } else {
+ // Process 8 1d dct8 rows in parallel per iteration.
+ assert(adjusted_tx_height % 8 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Dct8_NEON<ButterflyRotation_8, false>(data, /*step=*/8,
+ /*transpose=*/true);
+ data += 64;
+ i -= 8;
+ } while (i != 0);
+ }
+ if (row_shift > 0) {
+ RowShift<8>(src, adjusted_tx_height, row_shift);
+ }
+}
+
+void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d dct8 columns in parallel.
+ Dct8_NEON<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ // Process 8 1d dct8 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct8_NEON<ButterflyRotation_8, false>(data, tx_width,
+ /*transpose=*/false);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<8>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height == 4) {
+ // Process 4 1d dct16 rows in parallel.
+ Dct16_NEON<ButterflyRotation_4, true>(src, 16, /*is_row=*/true, row_shift);
+ } else {
+ assert(adjusted_tx_height % 8 == 0);
+ int i = adjusted_tx_height;
+ do {
+ // Process 8 1d dct16 rows in parallel per iteration.
+ Dct16_NEON<ButterflyRotation_8, false>(src, 16, /*is_row=*/true,
+ row_shift);
+ src += 128;
+ i -= 8;
+ } while (i != 0);
+ }
+}
+
+void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d dct16 columns in parallel.
+ Dct16_NEON<ButterflyRotation_4, true>(src, 4, /*is_row=*/false,
+ /*row_shift=*/0);
+ } else {
+ int i = tx_width;
+ auto* data = src;
+ do {
+ // Process 8 1d dct16 columns in parallel per iteration.
+ Dct16_NEON<ButterflyRotation_8, false>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<16>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<32>(src, adjusted_tx_height);
+ }
+ // Process 8 1d dct32 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct32_NEON(&src[i * 32], 32, /*is_row=*/true, row_shift);
+ i += 8;
+ } while (i < adjusted_tx_height);
+}
+
+void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
+ // Process 8 1d dct32 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct32_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<32>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<64>(src, adjusted_tx_height);
+ }
+ // Process 8 1d dct64 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct64_NEON(&src[i * 64], 64, /*is_row=*/true, row_shift);
+ i += 8;
+ } while (i < adjusted_tx_height);
+}
+
+void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
+ // Process 8 1d dct64 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct64_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<64>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const int row_shift = static_cast<int>(tx_height == 16);
+ const bool should_round = (tx_height == 8);
+
+ if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d adst4 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Adst4_NEON<false>(data, /*step=*/4, /*transpose=*/true);
+ data += 16;
+ i -= 4;
+ } while (i != 0);
+
+ if (tx_height == 16) {
+ RowShift<4>(src, adjusted_tx_height, 1);
+ }
+}
+
+void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d adst4 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Adst4_NEON<false>(data, tx_width, /*transpose=*/false);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<4, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height == 4) {
+ // Process 4 1d adst8 rows in parallel.
+ Adst8_NEON<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
+ } else {
+ // Process 8 1d adst8 rows in parallel per iteration.
+ assert(adjusted_tx_height % 8 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Adst8_NEON<ButterflyRotation_8, false>(data, /*step=*/8,
+ /*transpose=*/true);
+ data += 64;
+ i -= 8;
+ } while (i != 0);
+ }
+ if (row_shift > 0) {
+ RowShift<8>(src, adjusted_tx_height, row_shift);
+ }
+}
+
+void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d adst8 columns in parallel.
+ Adst8_NEON<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ // Process 8 1d adst8 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Adst8_NEON<ButterflyRotation_8, false>(data, tx_width,
+ /*transpose=*/false);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<8, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height == 4) {
+ // Process 4 1d adst16 rows in parallel.
+ Adst16_NEON<ButterflyRotation_4, true>(src, 16, /*is_row=*/true, row_shift);
+ } else {
+ assert(adjusted_tx_height % 8 == 0);
+ int i = adjusted_tx_height;
+ do {
+ // Process 8 1d adst16 rows in parallel per iteration.
+ Adst16_NEON<ButterflyRotation_8, false>(src, 16, /*is_row=*/true,
+ row_shift);
+ src += 128;
+ i -= 8;
+ } while (i != 0);
+ }
+}
+
+void Adst16TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d adst16 columns in parallel.
+ Adst16_NEON<ButterflyRotation_4, true>(src, 4, /*is_row=*/false,
+ /*row_shift=*/0);
+ } else {
+ int i = tx_width;
+ auto* data = src;
+ do {
+ // Process 8 1d adst16 columns in parallel per iteration.
+ Adst16_NEON<ButterflyRotation_8, false>(
+ data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<16, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Identity4TransformLoopRow_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize4x4) {
+ return;
+ }
+
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+
+ if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+ if (tx_height < 16) {
+ int i = adjusted_tx_height;
+ do {
+ Identity4_NEON<false>(src, /*step=*/4);
+ src += 16;
+ i -= 4;
+ } while (i != 0);
+ } else {
+ int i = adjusted_tx_height;
+ do {
+ Identity4_NEON<true>(src, /*step=*/4);
+ src += 16;
+ i -= 4;
+ } while (i != 0);
+ }
+}
+
+void Identity4TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ // Special case: Process row calculations during column transform call.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
+ Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+ return;
+ }
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity8TransformLoopRow_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize8x4) {
+ return;
+ }
+
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 16 can be simplified
+  // from (((A * 2) + 1) >> 1) to A.
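+  // (0x18 selects tx_height 8 or 16, the cases where the row pass is a
+  // no-op.)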
+ if ((tx_height & 0x18) != 0) {
+ return;
+ }
+ if (tx_height == 32) {
+ int i = adjusted_tx_height;
+ do {
+ Identity8Row32_NEON(src, /*step=*/8);
+ src += 32;
+ i -= 4;
+ } while (i != 0);
+ return;
+ }
+
+ assert(tx_size == kTransformSize8x4);
+ int i = adjusted_tx_height;
+ do {
+ Identity8Row4_NEON(src, /*step=*/8);
+ src += 32;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity8TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+ int i = adjusted_tx_height;
+ do {
+ Identity16Row_NEON(src, /*step=*/16, kTransformRowShift[tx_size]);
+ src += 64;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity16TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity32TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ const int tx_height = kTransformHeight[tx_size];
+
+ // When combining the identity32 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 32 can be simplified
+  // from (((A * 4) + 2) >> 2) to A.
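+  // (0x28 selects tx_height 8 or 32, the cases where the row pass is a
+  // no-op.)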
+ if ((tx_height & 0x28) != 0) {
+ return;
+ }
+
+ // Process kTransformSize32x16. The src is always rounded before the
+ // identity transform and shifted by 1 afterwards.
+ auto* src = static_cast<int16_t*>(src_buffer);
+ if (Identity32DcOnly(src, adjusted_tx_height)) {
+ return;
+ }
+
+ assert(tx_size == kTransformSize32x16);
+ ApplyRounding<32>(src, adjusted_tx_height);
+ int i = adjusted_tx_height;
+ do {
+ Identity32Row16_NEON(src, /*step=*/32);
+ src += 128;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity32TransformLoopColumn_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ IdentityColumnStoreToFrame<32>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Wht4TransformLoopRow_NEON(TransformType tx_type, TransformSize tx_size,
+ int /*adjusted_tx_height*/, void* /*src_buffer*/,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ assert(tx_type == kTransformTypeDctDct);
+ assert(tx_size == kTransformSize4x4);
+ static_cast<void>(tx_type);
+ static_cast<void>(tx_size);
+ // Do both row and column transforms in the column-transform pass.
+}
+
+void Wht4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ assert(tx_type == kTransformTypeDctDct);
+ assert(tx_size == kTransformSize4x4);
+ static_cast<void>(tx_type);
+ static_cast<void>(tx_size);
+
+ // Process 4 1d wht4 rows and columns in parallel.
+ const auto* src = static_cast<int16_t*>(src_buffer);
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ uint8_t* dst = frame[start_y] + start_x;
+ const int dst_stride = frame.columns();
+ Wht4_NEON(dst, dst_stride, src, adjusted_tx_height);
+}
+
+//------------------------------------------------------------------------------
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ // Maximum transform size for Dct is 64.
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+ Dct4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ Dct4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+ Dct8TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ Dct8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+ Dct16TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ Dct16TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ Dct32TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ Dct32TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ Dct64TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ Dct64TransformLoopColumn_NEON;
+
+ // Maximum transform size for Adst is 16.
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+ Adst4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ Adst4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+ Adst8TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ Adst8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ Adst16TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ Adst16TransformLoopColumn_NEON;
+
+ // Maximum transform size for Identity transform is 32.
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+ Identity4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ Identity4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+ Identity8TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ Identity8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+ Identity16TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ Identity16TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+ Identity32TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+ Identity32TransformLoopColumn_NEON;
+
+ // Maximum transform size for Wht is 4.
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+ Wht4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+ Wht4TransformLoopColumn_NEON;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void InverseTransformInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void InverseTransformInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/inverse_transform_neon.h b/src/dsp/arm/inverse_transform_neon.h
new file mode 100644
index 0000000..af647e8
--- /dev/null
+++ b/src/dsp/arm/inverse_transform_neon.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::inverse_transforms, see the defines below for specifics.
+// This function is not thread-safe.
+void InverseTransformInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_1DTransformSize64_1DTransformDct LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformAdst LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_
diff --git a/src/dsp/arm/loop_filter_neon.cc b/src/dsp/arm/loop_filter_neon.cc
new file mode 100644
index 0000000..146c983
--- /dev/null
+++ b/src/dsp/arm/loop_filter_neon.cc
@@ -0,0 +1,1190 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
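+// Convention used throughout this file: each uint8x8_t holds four p-side
+// pixels in its low 32 bits and the four corresponding q-side pixels in its
+// high 32 bits (e.g. |p0q0| = {p0[0..3], q0[0..3]}). RightShift<32> and
+// Transpose32 move the q half into the p lanes so that both sides can be
+// tested or accumulated with a single 64-bit operation.
+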
+// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
+inline uint8x8_t Hev(const uint8x8_t abd_p0p1_q0q1, const uint8_t thresh) {
+ const uint8x8_t a = vcgt_u8(abd_p0p1_q0q1, vdup_n_u8(thresh));
+ return vorr_u8(a, RightShift<32>(a));
+}
+
+// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
+inline uint8x8_t OuterThreshold(const uint8x8_t p0q0, const uint8x8_t p1q1,
+ const uint8_t outer_thresh) {
+ const uint8x8x2_t a = Interleave32(p0q0, p1q1);
+ const uint8x8_t b = vabd_u8(a.val[0], a.val[1]);
+ const uint8x8_t p0q0_double = vqadd_u8(b, b);
+ const uint8x8_t p1q1_half = RightShift<32>(vshr_n_u8(b, 1));
+ const uint8x8_t c = vqadd_u8(p0q0_double, p1q1_half);
+ return vcle_u8(c, vdup_n_u8(outer_thresh));
+}
+
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+// OuterThreshold()
+inline uint8x8_t NeedsFilter4(const uint8x8_t abd_p0p1_q0q1,
+ const uint8x8_t p0q0, const uint8x8_t p1q1,
+ const uint8_t inner_thresh,
+ const uint8_t outer_thresh) {
+ const uint8x8_t a = vcle_u8(abd_p0p1_q0q1, vdup_n_u8(inner_thresh));
+ const uint8x8_t inner_mask = vand_u8(a, RightShift<32>(a));
+ const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
+ return vand_u8(inner_mask, outer_mask);
+}
+
+inline void Filter4Masks(const uint8x8_t p0q0, const uint8x8_t p1q1,
+ const uint8_t hev_thresh, const uint8_t outer_thresh,
+ const uint8_t inner_thresh, uint8x8_t* const hev_mask,
+ uint8x8_t* const needs_filter4_mask) {
+ const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
+ // This includes cases where NeedsFilter4() is not true and so Filter2() will
+ // not be applied.
+ const uint8x8_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh);
+
+ *needs_filter4_mask =
+ NeedsFilter4(p0p1_q0q1, p0q0, p1q1, inner_thresh, outer_thresh);
+
+ // Filter2() will only be applied if both NeedsFilter4() and Hev() are true.
+ *hev_mask = vand_u8(hev_tmp_mask, *needs_filter4_mask);
+}
+
+// Calculate Filter4() or Filter2() based on |hev_mask|.
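+// A scalar sketch of the update performed below, following the in-code
+// formulas (Filter2() is the Hev() path, where only p0/q0 change):
+//   a  = 3 * (q0 - p0) + Clip3(p1 - q1)   // the Clip3 term only when hev
+//   a1 = Clip3(a + 4) >> 3;  a2 = Clip3(a + 3) >> 3;  a3 = (a1 + 1) >> 1
+//   p0 += a2;  q0 -= a1;  when !hev also: p1 += a3;  q1 -= a3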
+inline void Filter4(const uint8x8_t q0p1, const uint8x8_t p0q1,
+ const uint8x8_t hev_mask, uint8x8_t* const p1q1_result,
+ uint8x8_t* const p0q0_result) {
+ const int16x4_t zero = vdup_n_s16(0);
+
+ // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+ const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubl_u8(q0p1, p0q1));
+ const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
+
+ // If this is for Filter2() then include |p1mq1|. Otherwise zero it.
+ const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
+ const int8x8_t p1mq1_saturated = vqmovn_s16(vcombine_s16(p1mq1, zero));
+ const int8x8_t hev_option =
+ vand_s8(vreinterpret_s8_u8(hev_mask), p1mq1_saturated);
+
+ const int16x4_t a =
+ vget_low_s16(vaddw_s8(vcombine_s16(q0mp0_3, zero), hev_option));
+
+  // We cannot shift with rounding because the clamp comes *before* the shift.
+  // a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+  // a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+ const int16x4_t plus_four = vadd_s16(a, vdup_n_s16(4));
+ const int16x4_t plus_three = vadd_s16(a, vdup_n_s16(3));
+ const int8x8_t a2_a1 =
+ vshr_n_s8(vqmovn_s16(vcombine_s16(plus_three, plus_four)), 3);
+
+ // a3 is in the high 4 values.
+ // a3 = (a1 + 1) >> 1;
+ const int8x8_t a3 = vrshr_n_s8(a2_a1, 1);
+
+ const int16x8_t p0q1_l = vreinterpretq_s16_u16(vmovl_u8(p0q1));
+ const int16x8_t q0p1_l = vreinterpretq_s16_u16(vmovl_u8(q0p1));
+
+ const int16x8_t p1q1_l =
+ vcombine_s16(vget_high_s16(q0p1_l), vget_high_s16(p0q1_l));
+
+ const int8x8_t a3_ma3 = InterleaveHigh32(a3, vneg_s8(a3));
+ const int16x8_t p1q1_a3 = vaddw_s8(p1q1_l, a3_ma3);
+
+ const int16x8_t p0q0_l =
+ vcombine_s16(vget_low_s16(p0q1_l), vget_low_s16(q0p1_l));
+ // Need to shift the second term or we end up with a2_ma2.
+ const int8x8_t a2_ma1 =
+ InterleaveLow32(a2_a1, RightShift<32>(vneg_s8(a2_a1)));
+ const int16x8_t p0q0_a = vaddw_s8(p0q0_l, a2_ma1);
+
+ *p1q1_result = vqmovun_s16(p1q1_a3);
+ *p0q0_result = vqmovun_s16(p0q0_a);
+}
+
+void Horizontal4_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ const uint8x8_t p1_v = Load4(dst - 2 * stride);
+ const uint8x8_t p0_v = Load4(dst - stride);
+ const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+ const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+
+ uint8x8_t hev_mask;
+ uint8x8_t needs_filter4_mask;
+ Filter4Masks(p0q0, p1q1, hev_thresh, outer_thresh, inner_thresh, &hev_mask,
+ &needs_filter4_mask);
+
+ // Copy the masks to the high bits for packed comparisons later.
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+ needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask);
+
+#if defined(__aarch64__)
+ // This provides a good speedup for the unit test. Not sure how applicable it
+ // is to valid streams though.
+ // Consider doing this on armv7 if there is a quick way to check if a vector
+ // is zero.
+ if (vaddv_u8(needs_filter4_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t f_p1q1;
+ uint8x8_t f_p0q0;
+ const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+ Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+
+ // Already integrated the Hev mask when calculating the filtered values.
+ const uint8x8_t p0q0_output = vbsl_u8(needs_filter4_mask, f_p0q0, p0q0);
+
+  // p1/q1 are left unmodified when Hev() is true, since only Filter2() applies
+  // there. The xor works because |hev_mask| was and'd with
+  // |needs_filter4_mask| above, leaving needs_filter4 && !hev.
+ const uint8x8_t p1q1_mask = veor_u8(hev_mask, needs_filter4_mask);
+ const uint8x8_t p1q1_output = vbsl_u8(p1q1_mask, f_p1q1, p1q1);
+
+ StoreLo4(dst - 2 * stride, p1q1_output);
+ StoreLo4(dst - stride, p0q0_output);
+ StoreHi4(dst, p0q0_output);
+ StoreHi4(dst + stride, p1q1_output);
+}
+
+void Vertical4_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ // Move |dst| to the left side of the filter window.
+ dst -= 2;
+
+ // |p1q0| and |p0q1| are named for the values they will contain after the
+ // transpose.
+ const uint8x8_t row0 = Load4(dst);
+ uint8x8_t p1q0 = Load4<1>(dst + stride, row0);
+ const uint8x8_t row2 = Load4(dst + 2 * stride);
+ uint8x8_t p0q1 = Load4<1>(dst + 3 * stride, row2);
+
+ Transpose4x4(&p1q0, &p0q1);
+  // Rearrange the transposed columns into packed |p1q1| and |p0q0| vectors.
+ const uint8x8x2_t p1q1xq0p0 = Interleave32(p1q0, Transpose32(p0q1));
+ const uint8x8x2_t p1q1xp0q0 = {p1q1xq0p0.val[0],
+ Transpose32(p1q1xq0p0.val[1])};
+
+ uint8x8_t hev_mask;
+ uint8x8_t needs_filter4_mask;
+ Filter4Masks(p1q1xp0q0.val[1], p1q1xp0q0.val[0], hev_thresh, outer_thresh,
+ inner_thresh, &hev_mask, &needs_filter4_mask);
+
+ // Copy the masks to the high bits for packed comparisons later.
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+ needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask);
+
+#if defined(__aarch64__)
+ // This provides a good speedup for the unit test. Not sure how applicable it
+ // is to valid streams though.
+ // Consider doing this on armv7 if there is a quick way to check if a vector
+ // is zero.
+ if (vaddv_u8(needs_filter4_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t f_p1q1;
+ uint8x8_t f_p0q0;
+ Filter4(Transpose32(p1q0), p0q1, hev_mask, &f_p1q1, &f_p0q0);
+
+ // Already integrated the Hev mask when calculating the filtered values.
+ const uint8x8_t p0q0_output =
+ vbsl_u8(needs_filter4_mask, f_p0q0, p1q1xp0q0.val[1]);
+
+  // p1/q1 are left unmodified when Hev() is true, since only Filter2() applies
+  // there. The xor works because |hev_mask| was and'd with
+  // |needs_filter4_mask| above, leaving needs_filter4 && !hev.
+ const uint8x8_t p1q1_mask = veor_u8(hev_mask, needs_filter4_mask);
+ const uint8x8_t p1q1_output = vbsl_u8(p1q1_mask, f_p1q1, p1q1xp0q0.val[0]);
+
+ // Put things back in order to reverse the transpose.
+ const uint8x8x2_t p1p0xq1q0 = Interleave32(p1q1_output, p0q0_output);
+ uint8x8_t output_0 = p1p0xq1q0.val[0],
+ output_1 = Transpose32(p1p0xq1q0.val[1]);
+
+ Transpose4x4(&output_0, &output_1);
+
+ StoreLo4(dst, output_0);
+ StoreLo4(dst + stride, output_1);
+ StoreHi4(dst + 2 * stride, output_0);
+ StoreHi4(dst + 3 * stride, output_1);
+}
+
+// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh &&
+// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh
+// |flat_thresh| == 1 for 8 bit decode.
+inline uint8x8_t IsFlat3(const uint8x8_t abd_p0p1_q0q1,
+ const uint8x8_t abd_p0p2_q0q2) {
+ const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p0p2_q0q2);
+ const uint8x8_t b = vcle_u8(a, vdup_n_u8(1));
+ return vand_u8(b, RightShift<32>(b));
+}
+
+// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
+// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
+// OuterThreshold()
+inline uint8x8_t NeedsFilter6(const uint8x8_t abd_p0p1_q0q1,
+ const uint8x8_t abd_p1p2_q1q2,
+ const uint8x8_t p0q0, const uint8x8_t p1q1,
+ const uint8_t inner_thresh,
+ const uint8_t outer_thresh) {
+ const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
+ const uint8x8_t b = vcle_u8(a, vdup_n_u8(inner_thresh));
+ const uint8x8_t inner_mask = vand_u8(b, RightShift<32>(b));
+ const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
+ return vand_u8(inner_mask, outer_mask);
+}
+
+inline void Filter6Masks(const uint8x8_t p2q2, const uint8x8_t p1q1,
+ const uint8x8_t p0q0, const uint8_t hev_thresh,
+ const uint8_t outer_thresh, const uint8_t inner_thresh,
+ uint8x8_t* const needs_filter6_mask,
+ uint8x8_t* const is_flat3_mask,
+ uint8x8_t* const hev_mask) {
+ const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
+ *hev_mask = Hev(p0p1_q0q1, hev_thresh);
+ *is_flat3_mask = IsFlat3(p0p1_q0q1, vabd_u8(p0q0, p2q2));
+ *needs_filter6_mask = NeedsFilter6(p0p1_q0q1, vabd_u8(p1q1, p2q2), p0q0, p1q1,
+ inner_thresh, outer_thresh);
+}
+
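+// Each half of the packed registers accumulates its own side's sum. Where a
+// tap comes from the opposite side (q0 for the p1 output, p0 for the q1
+// output), Transpose32() swaps the halves to line it up.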
+inline void Filter6(const uint8x8_t p2q2, const uint8x8_t p1q1,
+ const uint8x8_t p0q0, uint8x8_t* const p1q1_output,
+ uint8x8_t* const p0q0_output) {
+ // Sum p1 and q1 output from opposite directions
+ // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+ // ^^^^^^^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+ // ^^^^^^^^
+ const uint16x8_t p2q2_double = vaddl_u8(p2q2, p2q2);
+ uint16x8_t sum = vaddw_u8(p2q2_double, p2q2);
+
+ // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+ // ^^^^^^^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+ // ^^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p1q1, p1q1), sum);
+
+ // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+ // ^^^^^^^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+ // ^^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p0q0, p0q0), sum);
+
+ // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+ // ^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+ // ^^
+ const uint8x8_t q0p0 = Transpose32(p0q0);
+ sum = vaddw_u8(sum, q0p0);
+
+ *p1q1_output = vrshrn_n_u16(sum, 3);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - (2 * p2) + q0 + q1
+ // q0 = q1 - (2 * q2) + p0 + p1
+ sum = vsubq_u16(sum, p2q2_double);
+ const uint8x8_t q1p1 = Transpose32(p1q1);
+ sum = vaddq_u16(vaddl_u8(q0p0, q1p1), sum);
+
+ *p0q0_output = vrshrn_n_u16(sum, 3);
+}
+
+void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ const uint8x8_t p2_v = Load4(dst - 3 * stride);
+ const uint8x8_t p1_v = Load4(dst - 2 * stride);
+ const uint8x8_t p0_v = Load4(dst - stride);
+ const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+ const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+ const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
+
+ uint8x8_t needs_filter6_mask, is_flat3_mask, hev_mask;
+ Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+ &needs_filter6_mask, &is_flat3_mask, &hev_mask);
+
+ needs_filter6_mask = InterleaveLow32(needs_filter6_mask, needs_filter6_mask);
+ is_flat3_mask = InterleaveLow32(is_flat3_mask, is_flat3_mask);
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+ // This provides a good speedup for the unit test. Not sure how applicable it
+ // is to valid streams though.
+ // Consider doing this on armv7 if there is a quick way to check if a vector
+ // is zero.
+ if (vaddv_u8(needs_filter6_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t f_p1q1;
+ uint8x8_t f_p0q0;
+ const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+ Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+ // Reset the outer values if only a Hev() mask was required.
+ f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+ uint8x8_t f6_p1q1, f6_p0q0;
+#if defined(__aarch64__)
+ if (vaddv_u8(vand_u8(is_flat3_mask, needs_filter6_mask)) == 0) {
+ // Filter6() does not apply.
+ const uint8x8_t zero = vdup_n_u8(0);
+ f6_p1q1 = zero;
+ f6_p0q0 = zero;
+ } else {
+#endif // defined(__aarch64__)
+ Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t p1q1_output = vbsl_u8(is_flat3_mask, f6_p1q1, f_p1q1);
+ p1q1_output = vbsl_u8(needs_filter6_mask, p1q1_output, p1q1);
+ StoreLo4(dst - 2 * stride, p1q1_output);
+ StoreHi4(dst + stride, p1q1_output);
+
+ uint8x8_t p0q0_output = vbsl_u8(is_flat3_mask, f6_p0q0, f_p0q0);
+ p0q0_output = vbsl_u8(needs_filter6_mask, p0q0_output, p0q0);
+ StoreLo4(dst - stride, p0q0_output);
+ StoreHi4(dst, p0q0_output);
+}
+
+void Vertical6_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ // Move |dst| to the left side of the filter window.
+ dst -= 3;
+
+ // |p2q1|, |p1q2|, |p0xx| and |q0xx| are named for the values they will
+ // contain after the transpose.
+ // These over-read by 2 bytes. We only need 6.
+ uint8x8_t p2q1 = vld1_u8(dst);
+ uint8x8_t p1q2 = vld1_u8(dst + stride);
+ uint8x8_t p0xx = vld1_u8(dst + 2 * stride);
+ uint8x8_t q0xx = vld1_u8(dst + 3 * stride);
+
+ Transpose8x4(&p2q1, &p1q2, &p0xx, &q0xx);
+
+ const uint8x8x2_t p2q2xq1p1 = Interleave32(p2q1, Transpose32(p1q2));
+ const uint8x8_t p2q2 = p2q2xq1p1.val[0];
+ const uint8x8_t p1q1 = Transpose32(p2q2xq1p1.val[1]);
+ const uint8x8_t p0q0 = InterleaveLow32(p0xx, q0xx);
+
+ uint8x8_t needs_filter6_mask, is_flat3_mask, hev_mask;
+ Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+ &needs_filter6_mask, &is_flat3_mask, &hev_mask);
+
+ needs_filter6_mask = InterleaveLow32(needs_filter6_mask, needs_filter6_mask);
+ is_flat3_mask = InterleaveLow32(is_flat3_mask, is_flat3_mask);
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+ // This provides a good speedup for the unit test. Not sure how applicable it
+ // is to valid streams though.
+ // Consider doing this on armv7 if there is a quick way to check if a vector
+ // is zero.
+ if (vaddv_u8(needs_filter6_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t f_p1q1;
+ uint8x8_t f_p0q0;
+ const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+ Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+ // Reset the outer values if only a Hev() mask was required.
+ f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+ uint8x8_t f6_p1q1, f6_p0q0;
+#if defined(__aarch64__)
+ if (vaddv_u8(vand_u8(is_flat3_mask, needs_filter6_mask)) == 0) {
+ // Filter6() does not apply.
+ const uint8x8_t zero = vdup_n_u8(0);
+ f6_p1q1 = zero;
+ f6_p0q0 = zero;
+ } else {
+#endif // defined(__aarch64__)
+ Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t p1q1_output = vbsl_u8(is_flat3_mask, f6_p1q1, f_p1q1);
+ p1q1_output = vbsl_u8(needs_filter6_mask, p1q1_output, p1q1);
+
+ uint8x8_t p0q0_output = vbsl_u8(is_flat3_mask, f6_p0q0, f_p0q0);
+ p0q0_output = vbsl_u8(needs_filter6_mask, p0q0_output, p0q0);
+
+  // The six tap filter reads six input pixels, but only p1-q1 are written, so
+  // advance |dst| to the p1 column.
+ dst += 1;
+ // Put things back in order to reverse the transpose.
+ const uint8x8x2_t p1p0xq1q0 = Interleave32(p1q1_output, p0q0_output);
+ uint8x8_t output_0 = p1p0xq1q0.val[0];
+ uint8x8_t output_1 = Transpose32(p1p0xq1q0.val[1]);
+
+ Transpose4x4(&output_0, &output_1);
+
+ StoreLo4(dst, output_0);
+ StoreLo4(dst + stride, output_1);
+ StoreHi4(dst + 2 * stride, output_0);
+ StoreHi4(dst + 3 * stride, output_1);
+}
+
+// IsFlat4 uses N=1, IsFlatOuter4 uses N=4.
+// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
+// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
+// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh
+// |flat_thresh| == 1 for 8 bit decode.
+inline uint8x8_t IsFlat4(const uint8x8_t abd_p0n0_q0n0,
+ const uint8x8_t abd_p0n1_q0n1,
+ const uint8x8_t abd_p0n2_q0n2) {
+ const uint8x8_t a = vmax_u8(abd_p0n0_q0n0, abd_p0n1_q0n1);
+ const uint8x8_t b = vmax_u8(a, abd_p0n2_q0n2);
+ const uint8x8_t c = vcle_u8(b, vdup_n_u8(1));
+ return vand_u8(c, RightShift<32>(c));
+}
+
+// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh &&
+// OuterThreshold()
+inline uint8x8_t NeedsFilter8(const uint8x8_t abd_p0p1_q0q1,
+ const uint8x8_t abd_p1p2_q1q2,
+ const uint8x8_t abd_p2p3_q2q3,
+ const uint8x8_t p0q0, const uint8x8_t p1q1,
+ const uint8_t inner_thresh,
+ const uint8_t outer_thresh) {
+ const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
+ const uint8x8_t b = vmax_u8(a, abd_p2p3_q2q3);
+ const uint8x8_t c = vcle_u8(b, vdup_n_u8(inner_thresh));
+ const uint8x8_t inner_mask = vand_u8(c, RightShift<32>(c));
+ const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
+ return vand_u8(inner_mask, outer_mask);
+}
+
+inline void Filter8Masks(const uint8x8_t p3q3, const uint8x8_t p2q2,
+ const uint8x8_t p1q1, const uint8x8_t p0q0,
+ const uint8_t hev_thresh, const uint8_t outer_thresh,
+ const uint8_t inner_thresh,
+ uint8x8_t* const needs_filter8_mask,
+ uint8x8_t* const is_flat4_mask,
+ uint8x8_t* const hev_mask) {
+ const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
+ *hev_mask = Hev(p0p1_q0q1, hev_thresh);
+ *is_flat4_mask = IsFlat4(p0p1_q0q1, vabd_u8(p0q0, p2q2), vabd_u8(p0q0, p3q3));
+ *needs_filter8_mask =
+ NeedsFilter8(p0p1_q0q1, vabd_u8(p1q1, p2q2), vabd_u8(p2q2, p3q3), p0q0,
+ p1q1, inner_thresh, outer_thresh);
+}
+
+inline void Filter8(const uint8x8_t p3q3, const uint8x8_t p2q2,
+ const uint8x8_t p1q1, const uint8x8_t p0q0,
+ uint8x8_t* const p2q2_output, uint8x8_t* const p1q1_output,
+ uint8x8_t* const p0q0_output) {
+ // Sum p2 and q2 output from opposite directions
+ // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+ // ^^^^^^^^
+ // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+ // ^^^^^^^^
+ uint16x8_t sum = vaddw_u8(vaddl_u8(p3q3, p3q3), p3q3);
+
+ // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+ // ^^^^^^^^
+ // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+ // ^^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p2q2, p2q2), sum);
+
+ // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+ // ^^^^^^^
+ // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+ // ^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p1q1, p0q0), sum);
+
+ // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+ // ^^
+ // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+ // ^^
+ const uint8x8_t q0p0 = Transpose32(p0q0);
+ sum = vaddw_u8(sum, q0p0);
+
+ *p2q2_output = vrshrn_n_u16(sum, 3);
+
+ // Convert to p1 and q1 output:
+ // p1 = p2 - p3 - p2 + p1 + q1
+  // q1 = q2 - q3 - q2 + q1 + p1
+ sum = vsubq_u16(sum, vaddl_u8(p3q3, p2q2));
+ const uint8x8_t q1p1 = Transpose32(p1q1);
+ sum = vaddq_u16(vaddl_u8(p1q1, q1p1), sum);
+
+ *p1q1_output = vrshrn_n_u16(sum, 3);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - p3 - p1 + p0 + q2
+ // q0 = q1 - q3 - q1 + q0 + p2
+ sum = vsubq_u16(sum, vaddl_u8(p3q3, p1q1));
+ const uint8x8_t q2p2 = Transpose32(p2q2);
+ sum = vaddq_u16(vaddl_u8(p0q0, q2p2), sum);
+
+ *p0q0_output = vrshrn_n_u16(sum, 3);
+}
+
+void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ const uint8x8_t p3_v = Load4(dst - 4 * stride);
+ const uint8x8_t p2_v = Load4(dst - 3 * stride);
+ const uint8x8_t p1_v = Load4(dst - 2 * stride);
+ const uint8x8_t p0_v = Load4(dst - stride);
+ const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+ const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+ const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
+ const uint8x8_t p3q3 = Load4<1>(dst + 3 * stride, p3_v);
+
+ uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+ &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+ needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+ is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+ is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+ // This provides a good speedup for the unit test. Not sure how applicable it
+ // is to valid streams though.
+ // Consider doing this on armv7 if there is a quick way to check if a vector
+ // is zero.
+ if (vaddv_u8(needs_filter8_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t f_p1q1;
+ uint8x8_t f_p0q0;
+ const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+ Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+ // Reset the outer values if only a Hev() mask was required.
+ f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+ uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+#if defined(__aarch64__)
+ if (vaddv_u8(is_flat4_mask) == 0) {
+ // Filter8() does not apply.
+ const uint8x8_t zero = vdup_n_u8(0);
+ f8_p2q2 = zero;
+ f8_p1q1 = zero;
+ f8_p0q0 = zero;
+ } else {
+#endif // defined(__aarch64__)
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+ const uint8x8_t p2p2_output = vbsl_u8(is_flat4_mask, f8_p2q2, p2q2);
+ StoreLo4(dst - 3 * stride, p2p2_output);
+ StoreHi4(dst + 2 * stride, p2p2_output);
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t p1q1_output = vbsl_u8(is_flat4_mask, f8_p1q1, f_p1q1);
+ p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
+ StoreLo4(dst - 2 * stride, p1q1_output);
+ StoreHi4(dst + stride, p1q1_output);
+
+ uint8x8_t p0q0_output = vbsl_u8(is_flat4_mask, f8_p0q0, f_p0q0);
+ p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
+ StoreLo4(dst - stride, p0q0_output);
+ StoreHi4(dst, p0q0_output);
+}
+
+void Vertical8_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ // Move |dst| to the left side of the filter window.
+ dst -= 4;
+
+ // |p3q0|, |p2q1|, |p1q2| and |p0q3| are named for the values they will
+ // contain after the transpose.
+ uint8x8_t p3q0 = vld1_u8(dst);
+ uint8x8_t p2q1 = vld1_u8(dst + stride);
+ uint8x8_t p1q2 = vld1_u8(dst + 2 * stride);
+ uint8x8_t p0q3 = vld1_u8(dst + 3 * stride);
+
+ Transpose8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+ const uint8x8x2_t p3q3xq0p0 = Interleave32(p3q0, Transpose32(p0q3));
+ const uint8x8_t p3q3 = p3q3xq0p0.val[0];
+ const uint8x8_t p0q0 = Transpose32(p3q3xq0p0.val[1]);
+ const uint8x8x2_t p2q2xq1p1 = Interleave32(p2q1, Transpose32(p1q2));
+ const uint8x8_t p2q2 = p2q2xq1p1.val[0];
+ const uint8x8_t p1q1 = Transpose32(p2q2xq1p1.val[1]);
+
+ uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+ &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+ needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+ is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+ is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+ // This provides a good speedup for the unit test. Not sure how applicable it
+ // is to valid streams though.
+ // Consider doing this on armv7 if there is a quick way to check if a vector
+ // is zero.
+ if (vaddv_u8(needs_filter8_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t f_p1q1;
+ uint8x8_t f_p0q0;
+ const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+ Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+ // Reset the outer values if only a Hev() mask was required.
+ f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+ uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+#if defined(__aarch64__)
+ if (vaddv_u8(is_flat4_mask) == 0) {
+ // Filter8() does not apply.
+ const uint8x8_t zero = vdup_n_u8(0);
+ f8_p2q2 = zero;
+ f8_p1q1 = zero;
+ f8_p0q0 = zero;
+ } else {
+#endif // defined(__aarch64__)
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+
+ // Always prepare and store p2/q2 because we need to transpose it anyway.
+ const uint8x8_t p2q2_output = vbsl_u8(is_flat4_mask, f8_p2q2, p2q2);
+
+ uint8x8_t p1q1_output = vbsl_u8(is_flat4_mask, f8_p1q1, f_p1q1);
+ p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
+
+ uint8x8_t p0q0_output = vbsl_u8(is_flat4_mask, f8_p0q0, f_p0q0);
+ p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
+
+ // Write out p3/q3 as well. There isn't a good way to write out 6 bytes.
+ // Variable names reflect the values before transposition.
+ const uint8x8x2_t p3q0xq3p0_output =
+ Interleave32(p3q3, Transpose32(p0q0_output));
+ uint8x8_t p3q0_output = p3q0xq3p0_output.val[0];
+ uint8x8_t p0q3_output = Transpose32(p3q0xq3p0_output.val[1]);
+ const uint8x8x2_t p2q1xq2p1_output =
+ Interleave32(p2q2_output, Transpose32(p1q1_output));
+ uint8x8_t p2q1_output = p2q1xq2p1_output.val[0];
+ uint8x8_t p1q2_output = Transpose32(p2q1xq2p1_output.val[1]);
+
+ Transpose8x4(&p3q0_output, &p2q1_output, &p1q2_output, &p0q3_output);
+
+ vst1_u8(dst, p3q0_output);
+ vst1_u8(dst + stride, p2q1_output);
+ vst1_u8(dst + 2 * stride, p1q2_output);
+ vst1_u8(dst + 3 * stride, p0q3_output);
+}
+
+inline void Filter14(const uint8x8_t p6q6, const uint8x8_t p5q5,
+ const uint8x8_t p4q4, const uint8x8_t p3q3,
+ const uint8x8_t p2q2, const uint8x8_t p1q1,
+ const uint8x8_t p0q0, uint8x8_t* const p5q5_output,
+ uint8x8_t* const p4q4_output, uint8x8_t* const p3q3_output,
+ uint8x8_t* const p2q2_output, uint8x8_t* const p1q1_output,
+ uint8x8_t* const p0q0_output) {
+ // Sum p5 and q5 output from opposite directions
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^^
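+  // 7 * p6q6 is computed as (p6q6 << 3) - p6q6.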
+ uint16x8_t sum = vsubw_u8(vshll_n_u8(p6q6, 3), p6q6);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p5q5, p5q5), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p4q4, p4q4), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p3q3, p2q2), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p1q1, p0q0), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^
+ const uint8x8_t q0p0 = Transpose32(p0q0);
+ sum = vaddw_u8(sum, q0p0);
+
+ *p5q5_output = vrshrn_n_u16(sum, 4);
+
+ // Convert to p4 and q4 output:
+ // p4 = p5 - (2 * p6) + p3 + q1
+ // q4 = q5 - (2 * q6) + q3 + p1
+ sum = vsubq_u16(sum, vaddl_u8(p6q6, p6q6));
+ const uint8x8_t q1p1 = Transpose32(p1q1);
+ sum = vaddq_u16(vaddl_u8(p3q3, q1p1), sum);
+
+ *p4q4_output = vrshrn_n_u16(sum, 4);
+
+ // Convert to p3 and q3 output:
+ // p3 = p4 - p6 - p5 + p2 + q2
+ // q3 = q4 - q6 - q5 + q2 + p2
+ sum = vsubq_u16(sum, vaddl_u8(p6q6, p5q5));
+ const uint8x8_t q2p2 = Transpose32(p2q2);
+ sum = vaddq_u16(vaddl_u8(p2q2, q2p2), sum);
+
+ *p3q3_output = vrshrn_n_u16(sum, 4);
+
+ // Convert to p2 and q2 output:
+ // p2 = p3 - p6 - p4 + p1 + q3
+ // q2 = q3 - q6 - q4 + q1 + p3
+ sum = vsubq_u16(sum, vaddl_u8(p6q6, p4q4));
+ const uint8x8_t q3p3 = Transpose32(p3q3);
+ sum = vaddq_u16(vaddl_u8(p1q1, q3p3), sum);
+
+ *p2q2_output = vrshrn_n_u16(sum, 4);
+
+ // Convert to p1 and q1 output:
+ // p1 = p2 - p6 - p3 + p0 + q4
+ // q1 = q2 - q6 - q3 + q0 + p4
+ sum = vsubq_u16(sum, vaddl_u8(p6q6, p3q3));
+ const uint8x8_t q4p4 = Transpose32(p4q4);
+ sum = vaddq_u16(vaddl_u8(p0q0, q4p4), sum);
+
+ *p1q1_output = vrshrn_n_u16(sum, 4);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - p6 - p2 + q0 + q5
+ // q0 = q1 - q6 - q2 + p0 + p5
+ sum = vsubq_u16(sum, vaddl_u8(p6q6, p2q2));
+ const uint8x8_t q5p5 = Transpose32(p5q5);
+ sum = vaddq_u16(vaddl_u8(q0p0, q5p5), sum);
+
+ *p0q0_output = vrshrn_n_u16(sum, 4);
+}
+
+void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ const uint8x8_t p6_v = Load4(dst - 7 * stride);
+ const uint8x8_t p5_v = Load4(dst - 6 * stride);
+ const uint8x8_t p4_v = Load4(dst - 5 * stride);
+ const uint8x8_t p3_v = Load4(dst - 4 * stride);
+ const uint8x8_t p2_v = Load4(dst - 3 * stride);
+ const uint8x8_t p1_v = Load4(dst - 2 * stride);
+ const uint8x8_t p0_v = Load4(dst - stride);
+ const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+ const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+ const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
+ const uint8x8_t p3q3 = Load4<1>(dst + 3 * stride, p3_v);
+ const uint8x8_t p4q4 = Load4<1>(dst + 4 * stride, p4_v);
+ const uint8x8_t p5q5 = Load4<1>(dst + 5 * stride, p5_v);
+ const uint8x8_t p6q6 = Load4<1>(dst + 6 * stride, p6_v);
+
+ uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+ &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+ needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+ is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+ is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+ // This provides a good speedup for the unit test. Not sure how applicable it
+ // is to valid streams though.
+ // Consider doing this on armv7 if there is a quick way to check if a vector
+ // is zero.
+ if (vaddv_u8(needs_filter8_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Decide between Filter8() and Filter14().
+ uint8x8_t is_flat_outer4_mask =
+ IsFlat4(vabd_u8(p0q0, p4q4), vabd_u8(p0q0, p5q5), vabd_u8(p0q0, p6q6));
+ is_flat_outer4_mask = vand_u8(is_flat4_mask, is_flat_outer4_mask);
+ is_flat_outer4_mask =
+ InterleaveLow32(is_flat_outer4_mask, is_flat_outer4_mask);
+
+ uint8x8_t f_p1q1;
+ uint8x8_t f_p0q0;
+ const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+ Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+ // Reset the outer values if only a Hev() mask was required.
+ f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+ uint8x8_t f8_p1q1, f8_p0q0;
+ uint8x8_t f14_p2q2, f14_p1q1, f14_p0q0;
+#if defined(__aarch64__)
+ if (vaddv_u8(is_flat4_mask) == 0) {
+ // Filter8() and Filter14() do not apply.
+ const uint8x8_t zero = vdup_n_u8(0);
+ f8_p1q1 = zero;
+ f8_p0q0 = zero;
+ f14_p1q1 = zero;
+ f14_p0q0 = zero;
+ } else {
+#endif // defined(__aarch64__)
+ uint8x8_t f8_p2q2;
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+#if defined(__aarch64__)
+ if (vaddv_u8(is_flat_outer4_mask) == 0) {
+ // Filter14() does not apply.
+ const uint8x8_t zero = vdup_n_u8(0);
+ f14_p2q2 = zero;
+ f14_p1q1 = zero;
+ f14_p0q0 = zero;
+ } else {
+#endif // defined(__aarch64__)
+ uint8x8_t f14_p5q5, f14_p4q4, f14_p3q3;
+ Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+ &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+
+ const uint8x8_t p5q5_output =
+ vbsl_u8(is_flat_outer4_mask, f14_p5q5, p5q5);
+ StoreLo4(dst - 6 * stride, p5q5_output);
+ StoreHi4(dst + 5 * stride, p5q5_output);
+
+ const uint8x8_t p4q4_output =
+ vbsl_u8(is_flat_outer4_mask, f14_p4q4, p4q4);
+ StoreLo4(dst - 5 * stride, p4q4_output);
+ StoreHi4(dst + 4 * stride, p4q4_output);
+
+ const uint8x8_t p3q3_output =
+ vbsl_u8(is_flat_outer4_mask, f14_p3q3, p3q3);
+ StoreLo4(dst - 4 * stride, p3q3_output);
+ StoreHi4(dst + 3 * stride, p3q3_output);
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t p2q2_output = vbsl_u8(is_flat_outer4_mask, f14_p2q2, f8_p2q2);
+ p2q2_output = vbsl_u8(is_flat4_mask, p2q2_output, p2q2);
+ StoreLo4(dst - 3 * stride, p2q2_output);
+ StoreHi4(dst + 2 * stride, p2q2_output);
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t p1q1_output = vbsl_u8(is_flat_outer4_mask, f14_p1q1, f8_p1q1);
+ p1q1_output = vbsl_u8(is_flat4_mask, p1q1_output, f_p1q1);
+ p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
+ StoreLo4(dst - 2 * stride, p1q1_output);
+ StoreHi4(dst + stride, p1q1_output);
+
+ uint8x8_t p0q0_output = vbsl_u8(is_flat_outer4_mask, f14_p0q0, f8_p0q0);
+ p0q0_output = vbsl_u8(is_flat4_mask, p0q0_output, f_p0q0);
+ p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
+ StoreLo4(dst - stride, p0q0_output);
+ StoreHi4(dst, p0q0_output);
+}
+
+void Vertical14_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ dst -= 8;
+ // input
+ // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+ const uint8x16_t x0 = vld1q_u8(dst);
+ dst += stride;
+ const uint8x16_t x1 = vld1q_u8(dst);
+ dst += stride;
+ const uint8x16_t x2 = vld1q_u8(dst);
+ dst += stride;
+ const uint8x16_t x3 = vld1q_u8(dst);
+ dst -= (stride * 3);
+
+ // re-order input
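+  // vcreate_u8() places the least significant byte of its u64 argument in
+  // lane 0, so index_qp3toqp0 gathers bytes {7, 6, 5, 4, 8, 9, 10, 11}
+  // (p0 p1 p2 p3 q0 q1 q2 q3) and index_qp7toqp4 gathers
+  // bytes {3, 2, 1, 0, 12, 13, 14, 15} (p4 p5 p6 p7 q4 q5 q6 q7).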
+#if defined(__aarch64__)
+ const uint8x8_t index_qp3toqp0 = vcreate_u8(0x0b0a090804050607);
+ const uint8x8_t index_qp7toqp4 = vcreate_u8(0x0f0e0d0c00010203);
+ const uint8x16_t index_qp7toqp0 = vcombine_u8(index_qp3toqp0, index_qp7toqp4);
+
+ uint8x16_t input_0 = vqtbl1q_u8(x0, index_qp7toqp0);
+ uint8x16_t input_1 = vqtbl1q_u8(x1, index_qp7toqp0);
+ uint8x16_t input_2 = vqtbl1q_u8(x2, index_qp7toqp0);
+ uint8x16_t input_3 = vqtbl1q_u8(x3, index_qp7toqp0);
+#else
+ const uint8x8_t index_qp3toqp0 = vcreate_u8(0x0b0a090804050607);
+ const uint8x8_t index_qp7toqp4 = vcreate_u8(0x0f0e0d0c00010203);
+
+ const uint8x8_t x0_qp3qp0 = VQTbl1U8(x0, index_qp3toqp0);
+ const uint8x8_t x1_qp3qp0 = VQTbl1U8(x1, index_qp3toqp0);
+ const uint8x8_t x2_qp3qp0 = VQTbl1U8(x2, index_qp3toqp0);
+ const uint8x8_t x3_qp3qp0 = VQTbl1U8(x3, index_qp3toqp0);
+
+ const uint8x8_t x0_qp7qp4 = VQTbl1U8(x0, index_qp7toqp4);
+ const uint8x8_t x1_qp7qp4 = VQTbl1U8(x1, index_qp7toqp4);
+ const uint8x8_t x2_qp7qp4 = VQTbl1U8(x2, index_qp7toqp4);
+ const uint8x8_t x3_qp7qp4 = VQTbl1U8(x3, index_qp7toqp4);
+
+ const uint8x16_t input_0 = vcombine_u8(x0_qp3qp0, x0_qp7qp4);
+ const uint8x16_t input_1 = vcombine_u8(x1_qp3qp0, x1_qp7qp4);
+ const uint8x16_t input_2 = vcombine_u8(x2_qp3qp0, x2_qp7qp4);
+ const uint8x16_t input_3 = vcombine_u8(x3_qp3qp0, x3_qp7qp4);
+#endif
+ // input after re-order
+ // p0 p1 p2 p3 q0 q1 q2 q3 p4 p5 p6 p7 q4 q5 q6 q7
+
+ const uint8x16x2_t in01 = vtrnq_u8(input_0, input_1);
+ const uint8x16x2_t in23 = vtrnq_u8(input_2, input_3);
+ const uint16x8x2_t in02 = vtrnq_u16(vreinterpretq_u16_u8(in01.val[0]),
+ vreinterpretq_u16_u8(in23.val[0]));
+ const uint16x8x2_t in13 = vtrnq_u16(vreinterpretq_u16_u8(in01.val[1]),
+ vreinterpretq_u16_u8(in23.val[1]));
+
+ const uint8x8_t p0q0 = vget_low_u8(vreinterpretq_u8_u16(in02.val[0]));
+ const uint8x8_t p1q1 = vget_low_u8(vreinterpretq_u8_u16(in13.val[0]));
+
+ const uint8x8_t p2q2 = vget_low_u8(vreinterpretq_u8_u16(in02.val[1]));
+ const uint8x8_t p3q3 = vget_low_u8(vreinterpretq_u8_u16(in13.val[1]));
+
+ const uint8x8_t p4q4 = vget_high_u8(vreinterpretq_u8_u16(in02.val[0]));
+ const uint8x8_t p5q5 = vget_high_u8(vreinterpretq_u8_u16(in13.val[0]));
+
+ const uint8x8_t p6q6 = vget_high_u8(vreinterpretq_u8_u16(in02.val[1]));
+ const uint8x8_t p7q7 = vget_high_u8(vreinterpretq_u8_u16(in13.val[1]));
+
+ uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+ &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+ needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+ is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+ is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+ // This provides a good speedup for the unit test. Not sure how applicable it
+ // is to valid streams though.
+ // Consider doing this on armv7 if there is a quick way to check if a vector
+ // is zero.
+ if (vaddv_u8(needs_filter8_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Decide between Filter8() and Filter14().
+ uint8x8_t is_flat_outer4_mask =
+ IsFlat4(vabd_u8(p0q0, p4q4), vabd_u8(p0q0, p5q5), vabd_u8(p0q0, p6q6));
+ is_flat_outer4_mask = vand_u8(is_flat4_mask, is_flat_outer4_mask);
+ is_flat_outer4_mask =
+ InterleaveLow32(is_flat_outer4_mask, is_flat_outer4_mask);
+
+ uint8x8_t f_p0q0, f_p1q1;
+ const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+ Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+ // Reset the outer values if only a Hev() mask was required.
+ f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+ uint8x8_t p1q1_output, p0q0_output;
+ uint8x8_t p5q5_output, p4q4_output, p3q3_output, p2q2_output;
+
+#if defined(__aarch64__)
+ if (vaddv_u8(is_flat4_mask) == 0) {
+ // Filter8() and Filter14() do not apply.
+ p1q1_output = p1q1;
+ p0q0_output = p0q0;
+
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = p2q2;
+ } else {
+#endif // defined(__aarch64__)
+ uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+#if defined(__aarch64__)
+ if (vaddv_u8(is_flat_outer4_mask) == 0) {
+ // Filter14() does not apply.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = f8_p2q2;
+ p1q1_output = f8_p1q1;
+ p0q0_output = f8_p0q0;
+ } else {
+#endif // defined(__aarch64__)
+ uint8x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+ Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+ &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+
+ p5q5_output = vbsl_u8(is_flat_outer4_mask, f14_p5q5, p5q5);
+ p4q4_output = vbsl_u8(is_flat_outer4_mask, f14_p4q4, p4q4);
+ p3q3_output = vbsl_u8(is_flat_outer4_mask, f14_p3q3, p3q3);
+ p2q2_output = vbsl_u8(is_flat_outer4_mask, f14_p2q2, f8_p2q2);
+ p1q1_output = vbsl_u8(is_flat_outer4_mask, f14_p1q1, f8_p1q1);
+ p0q0_output = vbsl_u8(is_flat_outer4_mask, f14_p0q0, f8_p0q0);
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+ p2q2_output = vbsl_u8(is_flat4_mask, p2q2_output, p2q2);
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+
+ p1q1_output = vbsl_u8(is_flat4_mask, p1q1_output, f_p1q1);
+ p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
+ p0q0_output = vbsl_u8(is_flat4_mask, p0q0_output, f_p0q0);
+ p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
+
+ const uint8x16_t p0q0_p4q4 = vcombine_u8(p0q0_output, p4q4_output);
+ const uint8x16_t p2q2_p6q6 = vcombine_u8(p2q2_output, p6q6);
+ const uint8x16_t p1q1_p5q5 = vcombine_u8(p1q1_output, p5q5_output);
+ const uint8x16_t p3q3_p7q7 = vcombine_u8(p3q3_output, p7q7);
+
+ const uint16x8x2_t out02 = vtrnq_u16(vreinterpretq_u16_u8(p0q0_p4q4),
+ vreinterpretq_u16_u8(p2q2_p6q6));
+ const uint16x8x2_t out13 = vtrnq_u16(vreinterpretq_u16_u8(p1q1_p5q5),
+ vreinterpretq_u16_u8(p3q3_p7q7));
+ const uint8x16x2_t out01 = vtrnq_u8(vreinterpretq_u8_u16(out02.val[0]),
+ vreinterpretq_u8_u16(out13.val[0]));
+ const uint8x16x2_t out23 = vtrnq_u8(vreinterpretq_u8_u16(out02.val[1]),
+ vreinterpretq_u8_u16(out13.val[1]));
+
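+  // Reverse the shuffle for the stores: index_p7top0 gathers bytes
+  // {11, 10, 9, 8, 3, 2, 1, 0} (p7 p6 p5 p4 p3 p2 p1 p0) and index_q7toq0
+  // gathers bytes {4, 5, 6, 7, 12, 13, 14, 15} (q0 q1 q2 q3 q4 q5 q6 q7),
+  // restoring the original p7..p0 q0..q7 row order.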
+#if defined(__aarch64__)
+ const uint8x8_t index_p7top0 = vcreate_u8(0x0001020308090a0b);
+ const uint8x8_t index_q7toq0 = vcreate_u8(0x0f0e0d0c07060504);
+ const uint8x16_t index_p7toq7 = vcombine_u8(index_p7top0, index_q7toq0);
+
+ const uint8x16_t output_0 = vqtbl1q_u8(out01.val[0], index_p7toq7);
+ const uint8x16_t output_1 = vqtbl1q_u8(out01.val[1], index_p7toq7);
+ const uint8x16_t output_2 = vqtbl1q_u8(out23.val[0], index_p7toq7);
+ const uint8x16_t output_3 = vqtbl1q_u8(out23.val[1], index_p7toq7);
+#else
+ const uint8x8_t index_p7top0 = vcreate_u8(0x0001020308090a0b);
+ const uint8x8_t index_q7toq0 = vcreate_u8(0x0f0e0d0c07060504);
+
+ const uint8x8_t x0_p7p0 = VQTbl1U8(out01.val[0], index_p7top0);
+ const uint8x8_t x1_p7p0 = VQTbl1U8(out01.val[1], index_p7top0);
+ const uint8x8_t x2_p7p0 = VQTbl1U8(out23.val[0], index_p7top0);
+ const uint8x8_t x3_p7p0 = VQTbl1U8(out23.val[1], index_p7top0);
+
+ const uint8x8_t x0_q7q0 = VQTbl1U8(out01.val[0], index_q7toq0);
+ const uint8x8_t x1_q7q0 = VQTbl1U8(out01.val[1], index_q7toq0);
+ const uint8x8_t x2_q7q0 = VQTbl1U8(out23.val[0], index_q7toq0);
+ const uint8x8_t x3_q7q0 = VQTbl1U8(out23.val[1], index_q7toq0);
+
+ const uint8x16_t output_0 = vcombine_u8(x0_p7p0, x0_q7q0);
+ const uint8x16_t output_1 = vcombine_u8(x1_p7p0, x1_q7q0);
+ const uint8x16_t output_2 = vcombine_u8(x2_p7p0, x2_q7q0);
+ const uint8x16_t output_3 = vcombine_u8(x3_p7p0, x3_q7q0);
+#endif
+
+ vst1q_u8(dst, output_0);
+ dst += stride;
+ vst1q_u8(dst, output_1);
+ dst += stride;
+ vst1q_u8(dst, output_2);
+ dst += stride;
+ vst1q_u8(dst, output_3);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Horizontal4_NEON;
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON;
+
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Horizontal6_NEON;
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON;
+
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Horizontal8_NEON;
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON;
+
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Horizontal14_NEON;
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Vertical14_NEON;
+}
+} // namespace
+} // namespace low_bitdepth
+
+void LoopFilterInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void LoopFilterInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/loop_filter_neon.h b/src/dsp/arm/loop_filter_neon.h
new file mode 100644
index 0000000..5f79200
--- /dev/null
+++ b/src/dsp/arm/loop_filter_neon.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_LOOP_FILTER_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_LOOP_FILTER_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_filters, see the defines below for specifics. This
+// function is not thread-safe.
+void LoopFilterInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+
+#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_LOOP_FILTER_NEON_H_
diff --git a/src/dsp/arm/loop_restoration_neon.cc b/src/dsp/arm/loop_restoration_neon.cc
new file mode 100644
index 0000000..337c9b4
--- /dev/null
+++ b/src/dsp/arm/loop_restoration_neon.cc
@@ -0,0 +1,1901 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
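+// Returns the concatenation of src.val[0] (low half) and src.val[1] (high
+// half) shifted right by |bytes| bytes, truncated to one register's width.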
+template <int bytes>
+inline uint8x8_t VshrU128(const uint8x8x2_t src) {
+ return vext_u8(src.val[0], src.val[1], bytes);
+}
+
+template <int bytes>
+inline uint16x8_t VshrU128(const uint16x8x2_t src) {
+ return vextq_u16(src.val[0], src.val[1], bytes / 2);
+}
+
+// Wiener
+
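+// The Wiener filter is applied in two separable passes: the horizontal taps
+// write 16-bit intermediate rows into |wiener_buffer|, and the vertical taps
+// read them back, round, and store the 8-bit result to |dst|.
+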
+// Make a local copy of the coefficients to let the compiler know they do not
+// overlap other buffers; the 'const' keyword alone is not enough. In practice
+// the compiler does not emit a copy, since there are enough registers here.
+inline void PopulateWienerCoefficients(
+ const RestorationUnitInfo& restoration_info, const int direction,
+ int16_t filter[4]) {
+ // In order to keep the horizontal pass intermediate values within 16 bits we
+ // offset |filter[3]| by 128. The 128 offset will be added back in the loop.
+ for (int i = 0; i < 4; ++i) {
+ filter[i] = restoration_info.wiener_info.filter[direction][i];
+ }
+ if (direction == WienerInfo::kHorizontal) {
+ filter[3] -= 128;
+ }
+}
+
+inline int16x8_t WienerHorizontal2(const uint8x8_t s0, const uint8x8_t s1,
+ const int16_t filter, const int16x8_t sum) {
+ const int16x8_t ss = vreinterpretq_s16_u16(vaddl_u8(s0, s1));
+ return vmlaq_n_s16(sum, ss, filter);
+}
+
+inline int16x8x2_t WienerHorizontal2(const uint8x16_t s0, const uint8x16_t s1,
+ const int16_t filter,
+ const int16x8x2_t sum) {
+ int16x8x2_t d;
+ d.val[0] =
+ WienerHorizontal2(vget_low_u8(s0), vget_low_u8(s1), filter, sum.val[0]);
+ d.val[1] =
+ WienerHorizontal2(vget_high_u8(s0), vget_high_u8(s1), filter, sum.val[1]);
+ return d;
+}
+
+inline void WienerHorizontalSum(const uint8x8_t s[3], const int16_t filter[4],
+ int16x8_t sum, int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit = (offset << 2) - 1;
+ const int16x8_t s_0_2 = vreinterpretq_s16_u16(vaddl_u8(s[0], s[2]));
+ const int16x8_t s_1 = ZeroExtend(s[1]);
+ sum = vmlaq_n_s16(sum, s_0_2, filter[2]);
+ sum = vmlaq_n_s16(sum, s_1, filter[3]);
+  // Add the scaled-down 128 offset correction here, computed as
+  // s_1 << (7 - kInterRoundBitsHorizontal), so that the signed 16-bit
+  // intermediate does not overflow.
+ sum = vrsraq_n_s16(vshlq_n_s16(s_1, 7 - kInterRoundBitsHorizontal), sum,
+ kInterRoundBitsHorizontal);
+ sum = vmaxq_s16(sum, vdupq_n_s16(-offset));
+ sum = vminq_s16(sum, vdupq_n_s16(limit - offset));
+ vst1q_s16(wiener_buffer, sum);
+}
+
+inline void WienerHorizontalSum(const uint8x16_t src[3],
+ const int16_t filter[4], int16x8x2_t sum,
+ int16_t* const wiener_buffer) {
+ uint8x8_t s[3];
+ s[0] = vget_low_u8(src[0]);
+ s[1] = vget_low_u8(src[1]);
+ s[2] = vget_low_u8(src[2]);
+ WienerHorizontalSum(s, filter, sum.val[0], wiener_buffer);
+ s[0] = vget_high_u8(src[0]);
+ s[1] = vget_high_u8(src[1]);
+ s[2] = vget_high_u8(src[2]);
+ WienerHorizontalSum(s, filter, sum.val[1], wiener_buffer + 8);
+}
+
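+// The horizontal taps are applied 16 pixels at a time: |s[0]| holds the
+// current 16 source bytes and vextq_u8() against the next 16 bytes forms the
+// shifted windows, so each output pixel sees its full tap neighborhood.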
+inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4],
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ const uint8_t* src_ptr = src;
+ uint8x16_t s[8];
+ s[0] = vld1q_u8(src_ptr);
+ ptrdiff_t x = width;
+ do {
+ src_ptr += 16;
+ s[7] = vld1q_u8(src_ptr);
+ s[1] = vextq_u8(s[0], s[7], 1);
+ s[2] = vextq_u8(s[0], s[7], 2);
+ s[3] = vextq_u8(s[0], s[7], 3);
+ s[4] = vextq_u8(s[0], s[7], 4);
+ s[5] = vextq_u8(s[0], s[7], 5);
+ s[6] = vextq_u8(s[0], s[7], 6);
+ int16x8x2_t sum;
+ sum.val[0] = sum.val[1] = vdupq_n_s16(0);
+ sum = WienerHorizontal2(s[0], s[6], filter[0], sum);
+ sum = WienerHorizontal2(s[1], s[5], filter[1], sum);
+ WienerHorizontalSum(s + 2, filter, sum, *wiener_buffer);
+ s[0] = s[7];
+ *wiener_buffer += 16;
+ x -= 16;
+ } while (x != 0);
+ src += src_stride;
+ }
+}
+
+inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4],
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ const uint8_t* src_ptr = src;
+ uint8x16_t s[6];
+ s[0] = vld1q_u8(src_ptr);
+ ptrdiff_t x = width;
+ do {
+ src_ptr += 16;
+ s[5] = vld1q_u8(src_ptr);
+ s[1] = vextq_u8(s[0], s[5], 1);
+ s[2] = vextq_u8(s[0], s[5], 2);
+ s[3] = vextq_u8(s[0], s[5], 3);
+ s[4] = vextq_u8(s[0], s[5], 4);
+ int16x8x2_t sum;
+ sum.val[0] = sum.val[1] = vdupq_n_s16(0);
+ sum = WienerHorizontal2(s[0], s[4], filter[1], sum);
+ WienerHorizontalSum(s + 1, filter, sum, *wiener_buffer);
+ s[0] = s[5];
+ *wiener_buffer += 16;
+ x -= 16;
+ } while (x != 0);
+ src += src_stride;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4],
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ const uint8_t* src_ptr = src;
+ uint8x16_t s[4];
+ s[0] = vld1q_u8(src_ptr);
+ ptrdiff_t x = width;
+ do {
+ src_ptr += 16;
+ s[3] = vld1q_u8(src_ptr);
+ s[1] = vextq_u8(s[0], s[3], 1);
+ s[2] = vextq_u8(s[0], s[3], 2);
+ int16x8x2_t sum;
+ sum.val[0] = sum.val[1] = vdupq_n_s16(0);
+ WienerHorizontalSum(s, filter, sum, *wiener_buffer);
+ s[0] = s[3];
+ *wiener_buffer += 16;
+ x -= 16;
+ } while (x != 0);
+ src += src_stride;
+ }
+}
+
+inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ const uint8_t* src_ptr = src;
+ ptrdiff_t x = width;
+ do {
+ const uint8x16_t s = vld1q_u8(src_ptr);
+ const uint8x8_t s0 = vget_low_u8(s);
+ const uint8x8_t s1 = vget_high_u8(s);
+ const int16x8_t d0 = vreinterpretq_s16_u16(vshll_n_u8(s0, 4));
+ const int16x8_t d1 = vreinterpretq_s16_u16(vshll_n_u8(s1, 4));
+ vst1q_s16(*wiener_buffer + 0, d0);
+ vst1q_s16(*wiener_buffer + 8, d1);
+ src_ptr += 16;
+ *wiener_buffer += 16;
+ x -= 16;
+ } while (x != 0);
+ src += src_stride;
+ }
+}
+
+inline int32x4x2_t WienerVertical2(const int16x8_t a0, const int16x8_t a1,
+ const int16_t filter,
+ const int32x4x2_t sum) {
+ const int16x8_t a = vaddq_s16(a0, a1);
+ int32x4x2_t d;
+ d.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(a), filter);
+ d.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(a), filter);
+ return d;
+}
+
+inline uint8x8_t WienerVertical(const int16x8_t a[3], const int16_t filter[4],
+ const int32x4x2_t sum) {
+ int32x4x2_t d = WienerVertical2(a[0], a[2], filter[2], sum);
+ d.val[0] = vmlal_n_s16(d.val[0], vget_low_s16(a[1]), filter[3]);
+ d.val[1] = vmlal_n_s16(d.val[1], vget_high_s16(a[1]), filter[3]);
+ const uint16x4_t sum_lo_16 = vqrshrun_n_s32(d.val[0], 11);
+ const uint16x4_t sum_hi_16 = vqrshrun_n_s32(d.val[1], 11);
+ return vqmovn_u16(vcombine_u16(sum_lo_16, sum_hi_16));
+}
+
+inline uint8x8_t WienerVerticalTap7Kernel(const int16_t* const wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const int16_t filter[4],
+ int16x8_t a[7]) {
+ int32x4x2_t sum;
+ a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride);
+ a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride);
+ a[5] = vld1q_s16(wiener_buffer + 5 * wiener_stride);
+ a[6] = vld1q_s16(wiener_buffer + 6 * wiener_stride);
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ sum = WienerVertical2(a[0], a[6], filter[0], sum);
+ sum = WienerVertical2(a[1], a[5], filter[1], sum);
+ a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride);
+ a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride);
+ a[4] = vld1q_s16(wiener_buffer + 4 * wiener_stride);
+ return WienerVertical(a + 2, filter, sum);
+}
+
+inline uint8x8x2_t WienerVerticalTap7Kernel2(const int16_t* const wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const int16_t filter[4]) {
+ int16x8_t a[8];
+ int32x4x2_t sum;
+ uint8x8x2_t d;
+ d.val[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[7] = vld1q_s16(wiener_buffer + 7 * wiener_stride);
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ sum = WienerVertical2(a[1], a[7], filter[0], sum);
+ sum = WienerVertical2(a[2], a[6], filter[1], sum);
+ d.val[1] = WienerVertical(a + 3, filter, sum);
+ return d;
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y != 0; --y) {
+ uint8_t* dst_ptr = dst;
+ ptrdiff_t x = width;
+ do {
+ uint8x8x2_t d[2];
+ d[0] = WienerVerticalTap7Kernel2(wiener_buffer + 0, width, filter);
+ d[1] = WienerVerticalTap7Kernel2(wiener_buffer + 8, width, filter);
+ vst1q_u8(dst_ptr, vcombine_u8(d[0].val[0], d[1].val[0]));
+ vst1q_u8(dst_ptr + dst_stride, vcombine_u8(d[0].val[1], d[1].val[1]));
+ wiener_buffer += 16;
+ dst_ptr += 16;
+ x -= 16;
+ } while (x != 0);
+ wiener_buffer += width;
+ dst += 2 * dst_stride;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = width;
+ do {
+ int16x8_t a[7];
+ const uint8x8_t d0 =
+ WienerVerticalTap7Kernel(wiener_buffer + 0, width, filter, a);
+ const uint8x8_t d1 =
+ WienerVerticalTap7Kernel(wiener_buffer + 8, width, filter, a);
+ vst1q_u8(dst, vcombine_u8(d0, d1));
+ wiener_buffer += 16;
+ dst += 16;
+ x -= 16;
+ } while (x != 0);
+ }
+}
+
+inline uint8x8_t WienerVerticalTap5Kernel(const int16_t* const wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const int16_t filter[4],
+ int16x8_t a[5]) {
+ a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride);
+ a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride);
+ a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride);
+ a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride);
+ a[4] = vld1q_s16(wiener_buffer + 4 * wiener_stride);
+ int32x4x2_t sum;
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ sum = WienerVertical2(a[0], a[4], filter[1], sum);
+ return WienerVertical(a + 1, filter, sum);
+}
+
+inline uint8x8x2_t WienerVerticalTap5Kernel2(const int16_t* const wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const int16_t filter[4]) {
+ int16x8_t a[6];
+ int32x4x2_t sum;
+ uint8x8x2_t d;
+ d.val[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[5] = vld1q_s16(wiener_buffer + 5 * wiener_stride);
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ sum = WienerVertical2(a[1], a[5], filter[1], sum);
+ d.val[1] = WienerVertical(a + 2, filter, sum);
+ return d;
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y != 0; --y) {
+ uint8_t* dst_ptr = dst;
+ ptrdiff_t x = width;
+ do {
+ uint8x8x2_t d[2];
+ d[0] = WienerVerticalTap5Kernel2(wiener_buffer + 0, width, filter);
+ d[1] = WienerVerticalTap5Kernel2(wiener_buffer + 8, width, filter);
+ vst1q_u8(dst_ptr, vcombine_u8(d[0].val[0], d[1].val[0]));
+ vst1q_u8(dst_ptr + dst_stride, vcombine_u8(d[0].val[1], d[1].val[1]));
+ wiener_buffer += 16;
+ dst_ptr += 16;
+ x -= 16;
+ } while (x != 0);
+ wiener_buffer += width;
+ dst += 2 * dst_stride;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = width;
+ do {
+ int16x8_t a[5];
+ const uint8x8_t d0 =
+ WienerVerticalTap5Kernel(wiener_buffer + 0, width, filter, a);
+ const uint8x8_t d1 =
+ WienerVerticalTap5Kernel(wiener_buffer + 8, width, filter, a);
+ vst1q_u8(dst, vcombine_u8(d0, d1));
+ wiener_buffer += 16;
+ dst += 16;
+ x -= 16;
+ } while (x != 0);
+ }
+}
+
+inline uint8x8_t WienerVerticalTap3Kernel(const int16_t* const wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const int16_t filter[4],
+ int16x8_t a[3]) {
+ a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride);
+ a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride);
+ a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride);
+ int32x4x2_t sum;
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ return WienerVertical(a, filter, sum);
+}
+
+inline uint8x8x2_t WienerVerticalTap3Kernel2(const int16_t* const wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const int16_t filter[4]) {
+ int16x8_t a[4];
+ int32x4x2_t sum;
+ uint8x8x2_t d;
+ d.val[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride);
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ d.val[1] = WienerVertical(a + 1, filter, sum);
+ return d;
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y != 0; --y) {
+ uint8_t* dst_ptr = dst;
+ ptrdiff_t x = width;
+ do {
+ uint8x8x2_t d[2];
+ d[0] = WienerVerticalTap3Kernel2(wiener_buffer + 0, width, filter);
+ d[1] = WienerVerticalTap3Kernel2(wiener_buffer + 8, width, filter);
+ vst1q_u8(dst_ptr, vcombine_u8(d[0].val[0], d[1].val[0]));
+ vst1q_u8(dst_ptr + dst_stride, vcombine_u8(d[0].val[1], d[1].val[1]));
+ wiener_buffer += 16;
+ dst_ptr += 16;
+ x -= 16;
+ } while (x != 0);
+ wiener_buffer += width;
+ dst += 2 * dst_stride;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = width;
+ do {
+ int16x8_t a[3];
+ const uint8x8_t d0 =
+ WienerVerticalTap3Kernel(wiener_buffer + 0, width, filter, a);
+ const uint8x8_t d1 =
+ WienerVerticalTap3Kernel(wiener_buffer + 8, width, filter, a);
+ vst1q_u8(dst, vcombine_u8(d0, d1));
+ wiener_buffer += 16;
+ dst += 16;
+ x -= 16;
+ } while (x != 0);
+ }
+}
+
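+// With only the center vertical tap, the output is the 16-bit intermediate
+// rounded back down by 4 bits and saturated to 8 bits.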
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint8_t* const dst) {
+ const int16x8_t a0 = vld1q_s16(wiener_buffer + 0);
+ const int16x8_t a1 = vld1q_s16(wiener_buffer + 8);
+ const uint8x8_t d0 = vqrshrun_n_s16(a0, 4);
+ const uint8x8_t d1 = vqrshrun_n_s16(a1, 4);
+ vst1q_u8(dst, vcombine_u8(d0, d1));
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y != 0; --y) {
+ uint8_t* dst_ptr = dst;
+ ptrdiff_t x = width;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer, dst_ptr);
+ WienerVerticalTap1Kernel(wiener_buffer + width, dst_ptr + dst_stride);
+ wiener_buffer += 16;
+ dst_ptr += 16;
+ x -= 16;
+ } while (x != 0);
+ wiener_buffer += width;
+ dst += 2 * dst_stride;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = width;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer, dst);
+ wiener_buffer += 16;
+ dst += 16;
+ x -= 16;
+ } while (x != 0);
+ }
+}
+
+// For width 16 and up, store the horizontal results first, then apply the
+// vertical filter row by row. This is faster than processing column by column
+// because of better cache locality.
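+// The horizontal pass writes |height| + kWienerFilterTaps - 1 -
+// 2 * |number_rows_to_skip| rows of 16-bit intermediates into |wiener_buffer|
+// at a stride of Align(width, 16); the vertical pass then reads a sliding
+// window of 7, 5, 3 or 1 of those rows per output row, depending on how many
+// leading filter coefficients are zero.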
+void WienerFilter_NEON(const RestorationUnitInfo& restoration_info,
+ const void* const source, const void* const top_border,
+ const void* const bottom_border, const ptrdiff_t stride,
+ const int width, const int height,
+ RestorationBuffer* const restoration_buffer,
+ void* const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 16);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 13 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+ int16_t filter_horizontal[(kWienerFilterTaps + 1) / 2];
+ int16_t filter_vertical[(kWienerFilterTaps + 1) / 2];
+ PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal,
+ filter_horizontal);
+ PopulateWienerCoefficients(restoration_info, WienerInfo::kVertical,
+ filter_vertical);
+
+  // Horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* const top = static_cast<const uint8_t*>(top_border);
+ const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
+ wiener_stride, height_extra, filter_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ filter_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
+ wiener_stride, height_extra, filter_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ filter_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
+ wiener_stride, height_extra, filter_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ filter_horizontal, &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
+ wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ }
+
+  // Vertical filtering.
+ // Over-writes up to 15 values.
+ auto* dst = static_cast<uint8_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, the top
+    // and bottom rows of |wiener_buffer| can be duplicated accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+inline void Prepare3_8(const uint8x8x2_t src, uint8x8_t dst[3]) {
+ dst[0] = VshrU128<0>(src);
+ dst[1] = VshrU128<1>(src);
+ dst[2] = VshrU128<2>(src);
+}
+
+inline void Prepare3_16(const uint16x8x2_t src, uint16x4_t low[3],
+ uint16x4_t high[3]) {
+ uint16x8_t s[3];
+ s[0] = VshrU128<0>(src);
+ s[1] = VshrU128<2>(src);
+ s[2] = VshrU128<4>(src);
+ low[0] = vget_low_u16(s[0]);
+ low[1] = vget_low_u16(s[1]);
+ low[2] = vget_low_u16(s[2]);
+ high[0] = vget_high_u16(s[0]);
+ high[1] = vget_high_u16(s[1]);
+ high[2] = vget_high_u16(s[2]);
+}
+
+inline void Prepare5_8(const uint8x8x2_t src, uint8x8_t dst[5]) {
+ dst[0] = VshrU128<0>(src);
+ dst[1] = VshrU128<1>(src);
+ dst[2] = VshrU128<2>(src);
+ dst[3] = VshrU128<3>(src);
+ dst[4] = VshrU128<4>(src);
+}
+
+inline void Prepare5_16(const uint16x8x2_t src, uint16x4_t low[5],
+ uint16x4_t high[5]) {
+ Prepare3_16(src, low, high);
+ const uint16x8_t s3 = VshrU128<6>(src);
+ const uint16x8_t s4 = VshrU128<8>(src);
+ low[3] = vget_low_u16(s3);
+ low[4] = vget_low_u16(s4);
+ high[3] = vget_high_u16(s3);
+ high[4] = vget_high_u16(s4);
+}
+
+inline uint16x8_t Sum3_16(const uint16x8_t src0, const uint16x8_t src1,
+ const uint16x8_t src2) {
+ const uint16x8_t sum = vaddq_u16(src0, src1);
+ return vaddq_u16(sum, src2);
+}
+
+inline uint16x8_t Sum3_16(const uint16x8_t src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline uint32x4_t Sum3_32(const uint32x4_t src0, const uint32x4_t src1,
+ const uint32x4_t src2) {
+ const uint32x4_t sum = vaddq_u32(src0, src1);
+ return vaddq_u32(sum, src2);
+}
+
+inline uint32x4x2_t Sum3_32(const uint32x4x2_t src[3]) {
+ uint32x4x2_t d;
+ d.val[0] = Sum3_32(src[0].val[0], src[1].val[0], src[2].val[0]);
+ d.val[1] = Sum3_32(src[0].val[1], src[1].val[1], src[2].val[1]);
+ return d;
+}
+
+inline uint16x8_t Sum3W_16(const uint8x8_t src[3]) {
+ const uint16x8_t sum = vaddl_u8(src[0], src[1]);
+ return vaddw_u8(sum, src[2]);
+}
+
+inline uint32x4_t Sum3W_32(const uint16x4_t src[3]) {
+ const uint32x4_t sum = vaddl_u16(src[0], src[1]);
+ return vaddw_u16(sum, src[2]);
+}
+
+inline uint16x8_t Sum5_16(const uint16x8_t src[5]) {
+ const uint16x8_t sum01 = vaddq_u16(src[0], src[1]);
+ const uint16x8_t sum23 = vaddq_u16(src[2], src[3]);
+ const uint16x8_t sum = vaddq_u16(sum01, sum23);
+ return vaddq_u16(sum, src[4]);
+}
+
+inline uint32x4_t Sum5_32(const uint32x4_t src0, const uint32x4_t src1,
+ const uint32x4_t src2, const uint32x4_t src3,
+ const uint32x4_t src4) {
+ const uint32x4_t sum01 = vaddq_u32(src0, src1);
+ const uint32x4_t sum23 = vaddq_u32(src2, src3);
+ const uint32x4_t sum = vaddq_u32(sum01, sum23);
+ return vaddq_u32(sum, src4);
+}
+
+inline uint32x4x2_t Sum5_32(const uint32x4x2_t src[5]) {
+ uint32x4x2_t d;
+ d.val[0] = Sum5_32(src[0].val[0], src[1].val[0], src[2].val[0], src[3].val[0],
+ src[4].val[0]);
+ d.val[1] = Sum5_32(src[0].val[1], src[1].val[1], src[2].val[1], src[3].val[1],
+ src[4].val[1]);
+ return d;
+}
+
+inline uint32x4_t Sum5W_32(const uint16x4_t src[5]) {
+ const uint32x4_t sum01 = vaddl_u16(src[0], src[1]);
+ const uint32x4_t sum23 = vaddl_u16(src[2], src[3]);
+ const uint32x4_t sum0123 = vaddq_u32(sum01, sum23);
+ return vaddw_u16(sum0123, src[4]);
+}
+
+inline uint16x8_t Sum3Horizontal(const uint8x8x2_t src) {
+ uint8x8_t s[3];
+ Prepare3_8(src, s);
+ return Sum3W_16(s);
+}
+
+inline uint32x4x2_t Sum3WHorizontal(const uint16x8x2_t src) {
+ uint16x4_t low[3], high[3];
+ uint32x4x2_t sum;
+ Prepare3_16(src, low, high);
+ sum.val[0] = Sum3W_32(low);
+ sum.val[1] = Sum3W_32(high);
+ return sum;
+}
+
+inline uint16x8_t Sum5Horizontal(const uint8x8x2_t src) {
+ uint8x8_t s[5];
+ Prepare5_8(src, s);
+ const uint16x8_t sum01 = vaddl_u8(s[0], s[1]);
+ const uint16x8_t sum23 = vaddl_u8(s[2], s[3]);
+ const uint16x8_t sum0123 = vaddq_u16(sum01, sum23);
+ return vaddw_u8(sum0123, s[4]);
+}
+
+inline uint32x4x2_t Sum5WHorizontal(const uint16x8x2_t src) {
+ uint16x4_t low[5], high[5];
+ Prepare5_16(src, low, high);
+ uint32x4x2_t sum;
+ sum.val[0] = Sum5W_32(low);
+ sum.val[1] = Sum5W_32(high);
+ return sum;
+}
+
+void SumHorizontal(const uint16x4_t src[5], uint32x4_t* const row_sq3,
+ uint32x4_t* const row_sq5) {
+ const uint32x4_t sum04 = vaddl_u16(src[0], src[4]);
+ const uint32x4_t sum12 = vaddl_u16(src[1], src[2]);
+ *row_sq3 = vaddw_u16(sum12, src[3]);
+ *row_sq5 = vaddq_u32(sum04, *row_sq3);
+}
+
+void SumHorizontal(const uint8x8x2_t src, const uint16x8x2_t sq,
+ uint16x8_t* const row3, uint16x8_t* const row5,
+ uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) {
+ uint8x8_t s[5];
+ Prepare5_8(src, s);
+ const uint16x8_t sum04 = vaddl_u8(s[0], s[4]);
+ const uint16x8_t sum12 = vaddl_u8(s[1], s[2]);
+ *row3 = vaddw_u8(sum12, s[3]);
+ *row5 = vaddq_u16(sum04, *row3);
+ uint16x4_t low[5], high[5];
+ Prepare5_16(sq, low, high);
+ SumHorizontal(low, &row_sq3->val[0], &row_sq5->val[0]);
+ SumHorizontal(high, &row_sq3->val[1], &row_sq5->val[1]);
+}
+
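+// Sum343() and Sum565() compute the weighted horizontal sums
+// 3*a + 4*b + 3*c and 5*a + 6*b + 5*c over three adjacent positions; the *W
+// variants do the same on the widened (squared-sum) values. These are the
+// row weightings used by the SGR pass 2 and pass 1 box filters, respectively.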
+inline uint16x8_t Sum343(const uint8x8x2_t src) {
+ uint8x8_t s[3];
+ Prepare3_8(src, s);
+ const uint16x8_t sum = Sum3W_16(s);
+ const uint16x8_t sum3 = Sum3_16(sum, sum, sum);
+ return vaddw_u8(sum3, s[1]);
+}
+
+inline uint32x4_t Sum343W(const uint16x4_t src[3]) {
+ const uint32x4_t sum = Sum3W_32(src);
+ const uint32x4_t sum3 = Sum3_32(sum, sum, sum);
+ return vaddw_u16(sum3, src[1]);
+}
+
+inline uint32x4x2_t Sum343W(const uint16x8x2_t src) {
+ uint16x4_t low[3], high[3];
+ uint32x4x2_t d;
+ Prepare3_16(src, low, high);
+ d.val[0] = Sum343W(low);
+ d.val[1] = Sum343W(high);
+ return d;
+}
+
+inline uint16x8_t Sum565(const uint8x8x2_t src) {
+ uint8x8_t s[3];
+ Prepare3_8(src, s);
+ const uint16x8_t sum = Sum3W_16(s);
+ const uint16x8_t sum4 = vshlq_n_u16(sum, 2);
+ const uint16x8_t sum5 = vaddq_u16(sum4, sum);
+ return vaddw_u8(sum5, s[1]);
+}
+
+inline uint32x4_t Sum565W(const uint16x4_t src[3]) {
+ const uint32x4_t sum = Sum3W_32(src);
+ const uint32x4_t sum4 = vshlq_n_u32(sum, 2);
+ const uint32x4_t sum5 = vaddq_u32(sum4, sum);
+ return vaddw_u16(sum5, src[1]);
+}
+
+inline uint32x4x2_t Sum565W(const uint16x8x2_t src) {
+ uint16x4_t low[3], high[3];
+ uint32x4x2_t d;
+ Prepare3_16(src, low, high);
+ d.val[0] = Sum565W(low);
+ d.val[1] = Sum565W(high);
+ return d;
+}
+
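+// For each input row, BoxSum() stores the horizontal sums of 3 and 5
+// consecutive pixels and of their squares into |sum3|/|sum5| and
+// |square_sum3|/|square_sum5|; the templated overload below computes only
+// one of the two window sizes.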
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const int height, const ptrdiff_t sum_stride, uint16_t* sum3,
+ uint16_t* sum5, uint32_t* square_sum3,
+ uint32_t* square_sum5) {
+ int y = height;
+ do {
+ uint8x8x2_t s;
+ uint16x8x2_t sq;
+ s.val[0] = vld1_u8(src);
+ sq.val[0] = vmull_u8(s.val[0], s.val[0]);
+ ptrdiff_t x = 0;
+ do {
+ uint16x8_t row3, row5;
+ uint32x4x2_t row_sq3, row_sq5;
+ s.val[1] = vld1_u8(src + x + 8);
+ sq.val[1] = vmull_u8(s.val[1], s.val[1]);
+ SumHorizontal(s, sq, &row3, &row5, &row_sq3, &row_sq5);
+ vst1q_u16(sum3, row3);
+ vst1q_u16(sum5, row5);
+ vst1q_u32(square_sum3 + 0, row_sq3.val[0]);
+ vst1q_u32(square_sum3 + 4, row_sq3.val[1]);
+ vst1q_u32(square_sum5 + 0, row_sq5.val[0]);
+ vst1q_u32(square_sum5 + 4, row_sq5.val[1]);
+ s.val[0] = s.val[1];
+ sq.val[0] = sq.val[1];
+ sum3 += 8;
+ sum5 += 8;
+ square_sum3 += 8;
+ square_sum5 += 8;
+ x += 8;
+ } while (x < sum_stride);
+ src += src_stride;
+ } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const int height, const ptrdiff_t sum_stride, uint16_t* sums,
+ uint32_t* square_sums) {
+ static_assert(size == 3 || size == 5, "");
+ int y = height;
+ do {
+ uint8x8x2_t s;
+ uint16x8x2_t sq;
+ s.val[0] = vld1_u8(src);
+ sq.val[0] = vmull_u8(s.val[0], s.val[0]);
+ ptrdiff_t x = 0;
+ do {
+ uint16x8_t row;
+ uint32x4x2_t row_sq;
+ s.val[1] = vld1_u8(src + x + 8);
+ sq.val[1] = vmull_u8(s.val[1], s.val[1]);
+ if (size == 3) {
+ row = Sum3Horizontal(s);
+ row_sq = Sum3WHorizontal(sq);
+ } else {
+ row = Sum5Horizontal(s);
+ row_sq = Sum5WHorizontal(sq);
+ }
+ vst1q_u16(sums, row);
+ vst1q_u32(square_sums + 0, row_sq.val[0]);
+ vst1q_u32(square_sums + 4, row_sq.val[1]);
+ s.val[0] = s.val[1];
+ sq.val[0] = sq.val[1];
+ sums += 8;
+ square_sums += 8;
+ x += 8;
+ } while (x < sum_stride);
+ src += src_stride;
+ } while (--y != 0);
+}
+
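+// CalculateMa() and CalculateIntermediate() derive per-pixel |ma| and |b|
+// from the box sums. A scalar sketch (illustrative only):
+//   p  = std::max(0, n * sum_sq - sum * sum)
+//   z  = RoundShift(p * scale, kSgrProjScaleBits)
+//   ma = kSgrMaLookup[std::min(z, 255)]
+//   b  = RoundShift(ma * sum * one_over_n, kSgrProjReciprocalBits)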
+template <int n>
+inline uint16x4_t CalculateMa(const uint16x4_t sum, const uint32x4_t sum_sq,
+ const uint32_t scale) {
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const uint32x4_t dxd = vmull_u16(sum, sum);
+ const uint32x4_t axn = vmulq_n_u32(sum_sq, n);
+ // Ensure |p| does not underflow by using saturating subtraction.
+ const uint32x4_t p = vqsubq_u32(axn, dxd);
+ const uint32x4_t pxs = vmulq_n_u32(p, scale);
+  // vrshrn_n_u32() (rounding narrowing shift) can shift by at most 16, but
+  // kSgrProjScaleBits is 20, so do the full rounding shift first and then
+  // narrow separately.
+ const uint32x4_t shifted = vrshrq_n_u32(pxs, kSgrProjScaleBits);
+ return vmovn_u32(shifted);
+}
+
+template <int n>
+inline void CalculateIntermediate(const uint16x8_t sum,
+ const uint32x4x2_t sum_sq,
+ const uint32_t scale, uint8x8_t* const ma,
+ uint16x8_t* const b) {
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+ const uint16x4_t z0 = CalculateMa<n>(vget_low_u16(sum), sum_sq.val[0], scale);
+ const uint16x4_t z1 =
+ CalculateMa<n>(vget_high_u16(sum), sum_sq.val[1], scale);
+ const uint16x8_t z01 = vcombine_u16(z0, z1);
+ // Using vqmovn_u16() needs an extra sign extension instruction.
+ const uint16x8_t z = vminq_u16(z01, vdupq_n_u16(255));
+ // Using vgetq_lane_s16() can save the sign extension instruction.
+ const uint8_t lookup[8] = {
+ kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 0)],
+ kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 1)],
+ kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 2)],
+ kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 3)],
+ kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 4)],
+ kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 5)],
+ kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 6)],
+ kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 7)]};
+ *ma = vld1_u8(lookup);
+  // b = ma * sum * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const uint16x8_t maq = vmovl_u8(*ma);
+ const uint32x4_t m0 = vmull_u16(vget_low_u16(maq), vget_low_u16(sum));
+ const uint32x4_t m1 = vmull_u16(vget_high_u16(maq), vget_high_u16(sum));
+ const uint32x4_t m2 = vmulq_n_u32(m0, one_over_n);
+ const uint32x4_t m3 = vmulq_n_u32(m1, one_over_n);
+ const uint16x4_t b_lo = vrshrn_n_u32(m2, kSgrProjReciprocalBits);
+ const uint16x4_t b_hi = vrshrn_n_u32(m3, kSgrProjReciprocalBits);
+ *b = vcombine_u16(b_lo, b_hi);
+}
+
+inline void CalculateIntermediate5(const uint16x8_t s5[5],
+ const uint32x4x2_t sq5[5],
+ const uint32_t scale, uint8x8_t* const ma,
+ uint16x8_t* const b) {
+ const uint16x8_t sum = Sum5_16(s5);
+ const uint32x4x2_t sum_sq = Sum5_32(sq5);
+ CalculateIntermediate<25>(sum, sum_sq, scale, ma, b);
+}
+
+inline void CalculateIntermediate3(const uint16x8_t s3[3],
+ const uint32x4x2_t sq3[3],
+ const uint32_t scale, uint8x8_t* const ma,
+ uint16x8_t* const b) {
+ const uint16x8_t sum = Sum3_16(s3);
+ const uint32x4x2_t sum_sq = Sum3_32(sq3);
+ CalculateIntermediate<9>(sum, sum_sq, scale, ma, b);
+}
+
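+// Store343_444() turns one row of 3x3 box outputs into both the 4-4-4 and
+// 3-4-3 weighted sums needed by pass 2: sum444 = 4 * (a + b + c) and
+// sum343 = sum444 - (a + b + c) + b = 3*a + 4*b + 3*c, applied to |ma| and
+// |b| alike, and writes the results to |ma343|/|ma444|/|b343|/|b444|.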
+inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
+ const ptrdiff_t x, uint16x8_t* const sum_ma343,
+ uint16x8_t* const sum_ma444,
+ uint32x4x2_t* const sum_b343,
+ uint32x4x2_t* const sum_b444, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ uint8x8_t s[3];
+ Prepare3_8(ma3, s);
+ const uint16x8_t sum_ma111 = Sum3W_16(s);
+ *sum_ma444 = vshlq_n_u16(sum_ma111, 2);
+ const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111);
+ *sum_ma343 = vaddw_u8(sum333, s[1]);
+ uint16x4_t low[3], high[3];
+ uint32x4x2_t sum_b111;
+ Prepare3_16(b3, low, high);
+ sum_b111.val[0] = Sum3W_32(low);
+ sum_b111.val[1] = Sum3W_32(high);
+ sum_b444->val[0] = vshlq_n_u32(sum_b111.val[0], 2);
+ sum_b444->val[1] = vshlq_n_u32(sum_b111.val[1], 2);
+ sum_b343->val[0] = vsubq_u32(sum_b444->val[0], sum_b111.val[0]);
+ sum_b343->val[1] = vsubq_u32(sum_b444->val[1], sum_b111.val[1]);
+ sum_b343->val[0] = vaddw_u16(sum_b343->val[0], low[1]);
+ sum_b343->val[1] = vaddw_u16(sum_b343->val[1], high[1]);
+ vst1q_u16(ma343 + x, *sum_ma343);
+ vst1q_u16(ma444 + x, *sum_ma444);
+ vst1q_u32(b343 + x + 0, sum_b343->val[0]);
+ vst1q_u32(b343 + x + 4, sum_b343->val[1]);
+ vst1q_u32(b444 + x + 0, sum_b444->val[0]);
+ vst1q_u32(b444 + x + 4, sum_b444->val[1]);
+}
+
+inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
+ const ptrdiff_t x, uint16x8_t* const sum_ma343,
+ uint32x4x2_t* const sum_b343, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ uint16x8_t sum_ma444;
+ uint32x4x2_t sum_b444;
+ Store343_444(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ uint16x8_t sum_ma343;
+ uint32x4x2_t sum_b343;
+ Store343_444(ma3, b3, x, &sum_ma343, &sum_b343, ma343, ma444, b343, b444);
+}
+
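+// The BoxFilterPreProcess*() helpers load the next 8 pixels of the incoming
+// row(s), append the 3- or 5-wide horizontal sums and squared sums to the
+// circular row buffers, reload the previously stored rows, and reduce the
+// whole vertical window to one |ma|/|b| pair per pixel via
+// CalculateIntermediate3()/CalculateIntermediate5().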
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x,
+ const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], uint8x8x2_t s[2], uint16x8x2_t sq[2],
+ uint8x8_t* const ma, uint16x8_t* const b) {
+ uint16x8_t s5[5];
+ uint32x4x2_t sq5[5];
+ s[0].val[1] = vld1_u8(src0 + x + 8);
+ s[1].val[1] = vld1_u8(src1 + x + 8);
+ sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]);
+ sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]);
+ s5[3] = Sum5Horizontal(s[0]);
+ s5[4] = Sum5Horizontal(s[1]);
+ sq5[3] = Sum5WHorizontal(sq[0]);
+ sq5[4] = Sum5WHorizontal(sq[1]);
+ vst1q_u16(sum5[3] + x, s5[3]);
+ vst1q_u16(sum5[4] + x, s5[4]);
+ vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]);
+ s5[0] = vld1q_u16(sum5[0] + x);
+ s5[1] = vld1q_u16(sum5[1] + x);
+ s5[2] = vld1q_u16(sum5[2] + x);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
+ CalculateIntermediate5(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+ const uint8_t* const src, const ptrdiff_t x, const uint32_t scale,
+ const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+ uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma,
+ uint16x8_t* const b) {
+ uint16x8_t s5[5];
+ uint32x4x2_t sq5[5];
+ s->val[1] = vld1_u8(src + x + 8);
+ sq->val[1] = vmull_u8(s->val[1], s->val[1]);
+ s5[3] = s5[4] = Sum5Horizontal(*s);
+ sq5[3] = sq5[4] = Sum5WHorizontal(*sq);
+ s5[0] = vld1q_u16(sum5[0] + x);
+ s5[1] = vld1q_u16(sum5[1] + x);
+ s5[2] = vld1q_u16(sum5[2] + x);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
+ CalculateIntermediate5(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const uint8_t* const src, const ptrdiff_t x, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma,
+ uint16x8_t* const b) {
+ uint16x8_t s3[3];
+ uint32x4x2_t sq3[3];
+ s->val[1] = vld1_u8(src + x + 8);
+ sq->val[1] = vmull_u8(s->val[1], s->val[1]);
+ s3[2] = Sum3Horizontal(*s);
+ sq3[2] = Sum3WHorizontal(*sq);
+ vst1q_u16(sum3[2] + x, s3[2]);
+ vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]);
+ s3[0] = vld1q_u16(sum3[0] + x);
+ s3[1] = vld1q_u16(sum3[1] + x);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
+ CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint8x8x2_t s[2], uint16x8x2_t sq[2], uint8x8_t* const ma3_0,
+ uint8x8_t* const ma3_1, uint16x8_t* const b3_0, uint16x8_t* const b3_1,
+ uint8x8_t* const ma5, uint16x8_t* const b5) {
+ uint16x8_t s3[4], s5[5];
+ uint32x4x2_t sq3[4], sq5[5];
+ s[0].val[1] = vld1_u8(src0 + x + 8);
+ s[1].val[1] = vld1_u8(src1 + x + 8);
+ sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]);
+ sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]);
+ SumHorizontal(s[0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]);
+ SumHorizontal(s[1], sq[1], &s3[3], &s5[4], &sq3[3], &sq5[4]);
+ vst1q_u16(sum3[2] + x, s3[2]);
+ vst1q_u16(sum3[3] + x, s3[3]);
+ vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]);
+ vst1q_u32(square_sum3[3] + x + 0, sq3[3].val[0]);
+ vst1q_u32(square_sum3[3] + x + 4, sq3[3].val[1]);
+ vst1q_u16(sum5[3] + x, s5[3]);
+ vst1q_u16(sum5[4] + x, s5[4]);
+ vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]);
+ s3[0] = vld1q_u16(sum3[0] + x);
+ s3[1] = vld1q_u16(sum3[1] + x);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
+ s5[0] = vld1q_u16(sum5[0] + x);
+ s5[1] = vld1q_u16(sum5[1] + x);
+ s5[2] = vld1q_u16(sum5[2] + x);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
+ CalculateIntermediate3(s3, sq3, scales[1], ma3_0, b3_0);
+ CalculateIntermediate3(s3 + 1, sq3 + 1, scales[1], ma3_1, b3_1);
+ CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+ const uint8_t* const src, const ptrdiff_t x, const uint16_t scales[2],
+ const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+ uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma3,
+ uint8x8_t* const ma5, uint16x8_t* const b3, uint16x8_t* const b5) {
+ uint16x8_t s3[3], s5[5];
+ uint32x4x2_t sq3[3], sq5[5];
+ s->val[1] = vld1_u8(src + x + 8);
+ sq->val[1] = vmull_u8(s->val[1], s->val[1]);
+ SumHorizontal(*s, *sq, &s3[2], &s5[3], &sq3[2], &sq5[3]);
+ s5[0] = vld1q_u16(sum5[0] + x);
+ s5[1] = vld1q_u16(sum5[1] + x);
+ s5[2] = vld1q_u16(sum5[2] + x);
+ s5[4] = s5[3];
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
+ sq5[4] = sq5[3];
+ CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+ s3[0] = vld1q_u16(sum3[0] + x);
+ s3[1] = vld1q_u16(sum3[1] + x);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
+ CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+ const uint8_t* const src1, const int width,
+ const uint32_t scale,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ uint16_t* ma565, uint32_t* b565) {
+ uint8x8x2_t s[2], mas;
+ uint16x8x2_t sq[2], bs;
+ s[0].val[0] = vld1_u8(src0);
+ s[1].val[0] = vld1_u8(src1);
+ sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
+ sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
+ BoxFilterPreProcess5(src0, src1, 0, scale, sum5, square_sum5, s, sq,
+ &mas.val[0], &bs.val[0]);
+
+ int x = 0;
+ do {
+ s[0].val[0] = s[0].val[1];
+ s[1].val[0] = s[1].val[1];
+ sq[0].val[0] = sq[0].val[1];
+ sq[1].val[0] = sq[1].val[1];
+ BoxFilterPreProcess5(src0, src1, x + 8, scale, sum5, square_sum5, s, sq,
+ &mas.val[1], &bs.val[1]);
+ const uint16x8_t ma = Sum565(mas);
+ const uint32x4x2_t b = Sum565W(bs);
+ vst1q_u16(ma565, ma);
+ vst1q_u32(b565 + 0, b.val[0]);
+ vst1q_u32(b565 + 4, b.val[1]);
+ mas.val[0] = mas.val[1];
+ bs.val[0] = bs.val[1];
+ ma565 += 8;
+ b565 += 8;
+ x += 8;
+ } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+ const uint8_t* const src, const int width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16_t* ma343,
+ uint16_t* ma444, uint32_t* b343, uint32_t* b444) {
+ uint8x8x2_t s, mas;
+ uint16x8x2_t sq, bs;
+ s.val[0] = vld1_u8(src);
+ sq.val[0] = vmull_u8(s.val[0], s.val[0]);
+ BoxFilterPreProcess3(src, 0, scale, sum3, square_sum3, &s, &sq, &mas.val[0],
+ &bs.val[0]);
+
+ int x = 0;
+ do {
+ s.val[0] = s.val[1];
+ sq.val[0] = sq.val[1];
+ BoxFilterPreProcess3(src, x + 8, scale, sum3, square_sum3, &s, &sq,
+ &mas.val[1], &bs.val[1]);
+ if (calculate444) {
+ Store343_444(mas, bs, 0, ma343, ma444, b343, b444);
+ ma444 += 8;
+ b444 += 8;
+ } else {
+ const uint16x8_t ma = Sum343(mas);
+ const uint32x4x2_t b = Sum343W(bs);
+ vst1q_u16(ma343, ma);
+ vst1q_u32(b343 + 0, b.val[0]);
+ vst1q_u32(b343 + 4, b.val[1]);
+ }
+ mas.val[0] = mas.val[1];
+ bs.val[0] = bs.val[1];
+ ma343 += 8;
+ b343 += 8;
+ x += 8;
+ } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1, const int width,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343[4], uint16_t* const ma444[2], uint16_t* ma565,
+ uint32_t* const b343[4], uint32_t* const b444[2], uint32_t* b565) {
+ uint8x8x2_t s[2];
+ uint8x8x2_t ma3[2], ma5;
+ uint16x8x2_t sq[2], b3[2], b5;
+ s[0].val[0] = vld1_u8(src0);
+ s[1].val[0] = vld1_u8(src1);
+ sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
+ sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
+ BoxFilterPreProcess(src0, src1, 0, scales, sum3, sum5, square_sum3,
+ square_sum5, s, sq, &ma3[0].val[0], &ma3[1].val[0],
+ &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]);
+
+ int x = 0;
+ do {
+ s[0].val[0] = s[0].val[1];
+ s[1].val[0] = s[1].val[1];
+ sq[0].val[0] = sq[0].val[1];
+ sq[1].val[0] = sq[1].val[1];
+ BoxFilterPreProcess(src0, src1, x + 8, scales, sum3, sum5, square_sum3,
+ square_sum5, s, sq, &ma3[0].val[1], &ma3[1].val[1],
+ &b3[0].val[1], &b3[1].val[1], &ma5.val[1], &b5.val[1]);
+ uint16x8_t ma = Sum343(ma3[0]);
+ uint32x4x2_t b = Sum343W(b3[0]);
+ vst1q_u16(ma343[0] + x, ma);
+ vst1q_u32(b343[0] + x, b.val[0]);
+ vst1q_u32(b343[0] + x + 4, b.val[1]);
+ Store343_444(ma3[1], b3[1], x, ma343[1], ma444[0], b343[1], b444[0]);
+ ma = Sum565(ma5);
+ b = Sum565W(b5);
+ vst1q_u16(ma565, ma);
+ vst1q_u32(b565 + 0, b.val[0]);
+ vst1q_u32(b565 + 4, b.val[1]);
+ ma3[0].val[0] = ma3[0].val[1];
+ ma3[1].val[0] = ma3[1].val[1];
+ b3[0].val[0] = b3[0].val[1];
+ b3[1].val[0] = b3[1].val[1];
+ ma5.val[0] = ma5.val[1];
+ b5.val[0] = b5.val[1];
+ ma565 += 8;
+ b565 += 8;
+ x += 8;
+ } while (x < width);
+}
+
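+// The per-pass filter output is computed from the weighted |ma|/|b| sums
+// (weights totaling 32, or 16 in the single-row boundary case) as, roughly,
+//   p = RoundShift(b_sum - ma_sum * src,
+//                  kSgrProjSgrBits + shift - kSgrProjRestoreBits)
+// and is later scaled by w0/w2 and added back to the source pixel by the
+// SelfGuided*Multiplier() helpers.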
+template <int shift>
+inline int16x4_t FilterOutput(const uint16x4_t src, const uint16x4_t ma,
+ const uint32x4_t b) {
+ // ma: 255 * 32 = 8160 (13 bits)
+ // b: 65088 * 32 = 2082816 (21 bits)
+ // v: b - ma * 255 (22 bits)
+ const int32x4_t v = vreinterpretq_s32_u32(vmlsl_u16(b, ma, src));
+ // kSgrProjSgrBits = 8
+ // kSgrProjRestoreBits = 4
+ // shift = 4 or 5
+ // v >> 8 or 9 (13 bits)
+ return vrshrn_n_s32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline int16x8_t CalculateFilteredOutput(const uint8x8_t src,
+ const uint16x8_t ma,
+ const uint32x4x2_t b) {
+ const uint16x8_t src_u16 = vmovl_u8(src);
+ const int16x4_t dst_lo =
+ FilterOutput<shift>(vget_low_u16(src_u16), vget_low_u16(ma), b.val[0]);
+ const int16x4_t dst_hi =
+ FilterOutput<shift>(vget_high_u16(src_u16), vget_high_u16(ma), b.val[1]);
+ return vcombine_s16(dst_lo, dst_hi); // 13 bits
+}
+
+inline int16x8_t CalculateFilteredOutputPass1(const uint8x8_t s,
+ uint16x8_t ma[2],
+ uint32x4x2_t b[2]) {
+ const uint16x8_t ma_sum = vaddq_u16(ma[0], ma[1]);
+ uint32x4x2_t b_sum;
+ b_sum.val[0] = vaddq_u32(b[0].val[0], b[1].val[0]);
+ b_sum.val[1] = vaddq_u32(b[0].val[1], b[1].val[1]);
+ return CalculateFilteredOutput<5>(s, ma_sum, b_sum);
+}
+
+inline int16x8_t CalculateFilteredOutputPass2(const uint8x8_t s,
+ uint16x8_t ma[3],
+ uint32x4x2_t b[3]) {
+ const uint16x8_t ma_sum = Sum3_16(ma);
+ const uint32x4x2_t b_sum = Sum3_32(b);
+ return CalculateFilteredOutput<5>(s, ma_sum, b_sum);
+}
+
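+// Final combine, in scalar terms (illustrative only):
+//   dst = ClipU8(src + RoundShift(w0 * p0 [+ w2 * p1],
+//                kSgrProjRestoreBits + kSgrProjPrecisionBits))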
+inline void SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2],
+ uint8_t* const dst) {
+ const int16x4_t v_lo =
+ vrshrn_n_s32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const int16x4_t v_hi =
+ vrshrn_n_s32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const int16x8_t vv = vcombine_s16(v_lo, v_hi);
+ const int16x8_t s = ZeroExtend(src);
+ const int16x8_t d = vaddq_s16(s, vv);
+ vst1_u8(dst, vqmovun_s16(d));
+}
+
+inline void SelfGuidedDoubleMultiplier(const uint8x8_t src,
+ const int16x8_t filter[2], const int w0,
+ const int w2, uint8_t* const dst) {
+ int32x4_t v[2];
+ v[0] = vmull_n_s16(vget_low_s16(filter[0]), w0);
+ v[1] = vmull_n_s16(vget_high_s16(filter[0]), w0);
+ v[0] = vmlal_n_s16(v[0], vget_low_s16(filter[1]), w2);
+ v[1] = vmlal_n_s16(v[1], vget_high_s16(filter[1]), w2);
+ SelfGuidedFinal(src, v, dst);
+}
+
+inline void SelfGuidedSingleMultiplier(const uint8x8_t src,
+ const int16x8_t filter, const int w0,
+ uint8_t* const dst) {
+ // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+ int32x4_t v[2];
+ v[0] = vmull_n_s16(vget_low_s16(filter), w0);
+ v[1] = vmull_n_s16(vget_high_s16(filter), w0);
+ SelfGuidedFinal(src, v, dst);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width, const uint32_t scale,
+ const int16_t w0, uint16_t* const ma565[2], uint32_t* const b565[2],
+ uint8_t* const dst) {
+ uint8x8x2_t s[2], mas;
+ uint16x8x2_t sq[2], bs;
+ s[0].val[0] = vld1_u8(src0);
+ s[1].val[0] = vld1_u8(src1);
+ sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
+ sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
+ BoxFilterPreProcess5(src0, src1, 0, scale, sum5, square_sum5, s, sq,
+ &mas.val[0], &bs.val[0]);
+
+ int x = 0;
+ do {
+ s[0].val[0] = s[0].val[1];
+ s[1].val[0] = s[1].val[1];
+ sq[0].val[0] = sq[0].val[1];
+ sq[1].val[0] = sq[1].val[1];
+ BoxFilterPreProcess5(src0, src1, x + 8, scale, sum5, square_sum5, s, sq,
+ &mas.val[1], &bs.val[1]);
+ uint16x8_t ma[2];
+ uint32x4x2_t b[2];
+ ma[1] = Sum565(mas);
+ b[1] = Sum565W(bs);
+ vst1q_u16(ma565[1] + x, ma[1]);
+ vst1q_u32(b565[1] + x + 0, b[1].val[0]);
+ vst1q_u32(b565[1] + x + 4, b[1].val[1]);
+ const uint8x8_t sr0 = vld1_u8(src + x);
+ const uint8x8_t sr1 = vld1_u8(src + stride + x);
+ int16x8_t p0, p1;
+ ma[0] = vld1q_u16(ma565[0] + x);
+ b[0].val[0] = vld1q_u32(b565[0] + x + 0);
+ b[0].val[1] = vld1q_u32(b565[0] + x + 4);
+ p0 = CalculateFilteredOutputPass1(sr0, ma, b);
+ p1 = CalculateFilteredOutput<4>(sr1, ma[1], b[1]);
+ SelfGuidedSingleMultiplier(sr0, p0, w0, dst + x);
+ SelfGuidedSingleMultiplier(sr1, p1, w0, dst + stride + x);
+ mas.val[0] = mas.val[1];
+ bs.val[0] = bs.val[1];
+ x += 8;
+ } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(const uint8_t* const src,
+ const uint8_t* const src0, const int width,
+ const uint32_t scale, const int16_t w0,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ uint16_t* ma565, uint32_t* b565,
+ uint8_t* const dst) {
+ uint8x8x2_t s, mas;
+ uint16x8x2_t sq, bs;
+ s.val[0] = vld1_u8(src0);
+ sq.val[0] = vmull_u8(s.val[0], s.val[0]);
+ BoxFilterPreProcess5LastRow(src0, 0, scale, sum5, square_sum5, &s, &sq,
+ &mas.val[0], &bs.val[0]);
+
+ int x = 0;
+ do {
+ s.val[0] = s.val[1];
+ sq.val[0] = sq.val[1];
+ BoxFilterPreProcess5LastRow(src0, x + 8, scale, sum5, square_sum5, &s, &sq,
+ &mas.val[1], &bs.val[1]);
+ uint16x8_t ma[2];
+ uint32x4x2_t b[2];
+ ma[1] = Sum565(mas);
+ b[1] = Sum565W(bs);
+ mas.val[0] = mas.val[1];
+ bs.val[0] = bs.val[1];
+ ma[0] = vld1q_u16(ma565);
+ b[0].val[0] = vld1q_u32(b565 + 0);
+ b[0].val[1] = vld1q_u32(b565 + 4);
+ const uint8x8_t sr = vld1_u8(src + x);
+ const int16x8_t p = CalculateFilteredOutputPass1(sr, ma, b);
+ SelfGuidedSingleMultiplier(sr, p, w0, dst + x);
+ ma565 += 8;
+ b565 += 8;
+ x += 8;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const uint32_t scale, const int16_t w0, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], uint16_t* const ma343[3],
+ uint16_t* const ma444[2], uint32_t* const b343[3], uint32_t* const b444[2],
+ uint8_t* const dst) {
+ uint8x8x2_t s, mas;
+ uint16x8x2_t sq, bs;
+ s.val[0] = vld1_u8(src0);
+ sq.val[0] = vmull_u8(s.val[0], s.val[0]);
+ BoxFilterPreProcess3(src0, 0, scale, sum3, square_sum3, &s, &sq, &mas.val[0],
+ &bs.val[0]);
+
+ int x = 0;
+ do {
+ s.val[0] = s.val[1];
+ sq.val[0] = sq.val[1];
+ BoxFilterPreProcess3(src0, x + 8, scale, sum3, square_sum3, &s, &sq,
+ &mas.val[1], &bs.val[1]);
+ uint16x8_t ma[3];
+ uint32x4x2_t b[3];
+ Store343_444(mas, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2],
+ b444[1]);
+ const uint8x8_t sr = vld1_u8(src + x);
+ ma[0] = vld1q_u16(ma343[0] + x);
+ ma[1] = vld1q_u16(ma444[0] + x);
+ b[0].val[0] = vld1q_u32(b343[0] + x + 0);
+ b[0].val[1] = vld1q_u32(b343[0] + x + 4);
+ b[1].val[0] = vld1q_u32(b444[0] + x + 0);
+ b[1].val[1] = vld1q_u32(b444[0] + x + 4);
+ const int16x8_t p = CalculateFilteredOutputPass2(sr, ma, b);
+ SelfGuidedSingleMultiplier(sr, p, w0, dst + x);
+ mas.val[0] = mas.val[1];
+ bs.val[0] = bs.val[1];
+ x += 8;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343[4], uint16_t* const ma444[3],
+ uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
+ uint32_t* const b565[2], uint8_t* const dst) {
+ uint8x8x2_t s[2], ma3[2], ma5;
+ uint16x8x2_t sq[2], b3[2], b5;
+ s[0].val[0] = vld1_u8(src0);
+ s[1].val[0] = vld1_u8(src1);
+ sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
+ sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
+ BoxFilterPreProcess(src0, src1, 0, scales, sum3, sum5, square_sum3,
+ square_sum5, s, sq, &ma3[0].val[0], &ma3[1].val[0],
+ &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]);
+
+ int x = 0;
+ do {
+ s[0].val[0] = s[0].val[1];
+ s[1].val[0] = s[1].val[1];
+ sq[0].val[0] = sq[0].val[1];
+ sq[1].val[0] = sq[1].val[1];
+ BoxFilterPreProcess(src0, src1, x + 8, scales, sum3, sum5, square_sum3,
+ square_sum5, s, sq, &ma3[0].val[1], &ma3[1].val[1],
+ &b3[0].val[1], &b3[1].val[1], &ma5.val[1], &b5.val[1]);
+ uint16x8_t ma[3][3];
+ uint32x4x2_t b[3][3];
+ Store343_444(ma3[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1],
+ ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444(ma3[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2],
+ b343[3], b444[2]);
+ ma[0][1] = Sum565(ma5);
+ b[0][1] = Sum565W(b5);
+ vst1q_u16(ma565[1] + x, ma[0][1]);
+ vst1q_u32(b565[1] + x, b[0][1].val[0]);
+ vst1q_u32(b565[1] + x + 4, b[0][1].val[1]);
+ ma3[0].val[0] = ma3[0].val[1];
+ ma3[1].val[0] = ma3[1].val[1];
+ b3[0].val[0] = b3[0].val[1];
+ b3[1].val[0] = b3[1].val[1];
+ ma5.val[0] = ma5.val[1];
+ b5.val[0] = b5.val[1];
+ int16x8_t p[2][2];
+ const uint8x8_t sr0 = vld1_u8(src + x);
+ const uint8x8_t sr1 = vld1_u8(src + stride + x);
+ ma[0][0] = vld1q_u16(ma565[0] + x);
+ b[0][0].val[0] = vld1q_u32(b565[0] + x);
+ b[0][0].val[1] = vld1q_u32(b565[0] + x + 4);
+ p[0][0] = CalculateFilteredOutputPass1(sr0, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1, ma[0][1], b[0][1]);
+ ma[1][0] = vld1q_u16(ma343[0] + x);
+ ma[1][1] = vld1q_u16(ma444[0] + x);
+ b[1][0].val[0] = vld1q_u32(b343[0] + x);
+ b[1][0].val[1] = vld1q_u32(b343[0] + x + 4);
+ b[1][1].val[0] = vld1q_u32(b444[0] + x);
+ b[1][1].val[1] = vld1q_u32(b444[0] + x + 4);
+ p[0][1] = CalculateFilteredOutputPass2(sr0, ma[1], b[1]);
+ ma[2][0] = vld1q_u16(ma343[1] + x);
+ b[2][0].val[0] = vld1q_u32(b343[1] + x);
+ b[2][0].val[1] = vld1q_u32(b343[1] + x + 4);
+ p[1][1] = CalculateFilteredOutputPass2(sr1, ma[2], b[2]);
+ SelfGuidedDoubleMultiplier(sr0, p[0], w0, w2, dst + x);
+ SelfGuidedDoubleMultiplier(sr1, p[1], w0, w2, dst + stride + x);
+ x += 8;
+ } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343[4], uint16_t* const ma444[3],
+ uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
+ uint32_t* const b565[2], uint8_t* const dst) {
+ uint8x8x2_t s, ma3, ma5;
+ uint16x8x2_t sq, b3, b5;
+ uint16x8_t ma[3];
+ uint32x4x2_t b[3];
+ s.val[0] = vld1_u8(src0);
+ sq.val[0] = vmull_u8(s.val[0], s.val[0]);
+ BoxFilterPreProcessLastRow(src0, 0, scales, sum3, sum5, square_sum3,
+ square_sum5, &s, &sq, &ma3.val[0], &ma5.val[0],
+ &b3.val[0], &b5.val[0]);
+
+ int x = 0;
+ do {
+ s.val[0] = s.val[1];
+ sq.val[0] = sq.val[1];
+ BoxFilterPreProcessLastRow(src0, x + 8, scales, sum3, sum5, square_sum3,
+ square_sum5, &s, &sq, &ma3.val[1], &ma5.val[1],
+ &b3.val[1], &b5.val[1]);
+ ma[1] = Sum565(ma5);
+ b[1] = Sum565W(b5);
+ ma5.val[0] = ma5.val[1];
+ b5.val[0] = b5.val[1];
+ ma[2] = Sum343(ma3);
+ b[2] = Sum343W(b3);
+ ma3.val[0] = ma3.val[1];
+ b3.val[0] = b3.val[1];
+ const uint8x8_t sr = vld1_u8(src + x);
+ int16x8_t p[2];
+ ma[0] = vld1q_u16(ma565[0] + x);
+ b[0].val[0] = vld1q_u32(b565[0] + x + 0);
+ b[0].val[1] = vld1q_u32(b565[0] + x + 4);
+ p[0] = CalculateFilteredOutputPass1(sr, ma, b);
+ ma[0] = vld1q_u16(ma343[0] + x);
+ ma[1] = vld1q_u16(ma444[0] + x);
+ b[0].val[0] = vld1q_u32(b343[0] + x + 0);
+ b[0].val[1] = vld1q_u32(b343[0] + x + 4);
+ b[1].val[0] = vld1q_u32(b444[0] + x + 0);
+ b[1].val[1] = vld1q_u32(b444[0] + x + 4);
+ p[1] = CalculateFilteredOutputPass2(sr, ma, b);
+ SelfGuidedDoubleMultiplier(sr, p, w0, w2, dst + x);
+ x += 8;
+ } while (x < width);
+}
+
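+// BoxFilterProcess() applies both SGR passes: it consumes two source rows per
+// iteration, rotating the circular sum buffers (sum3/sum5, square_sum3/5) and
+// the intermediate rows (ma343/ma444/ma565 and b343/b444/b565), and finishes
+// with the bottom border rows, falling back to BoxFilterLastRow() when
+// |height| is odd.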
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+ const RestorationUnitInfo& restoration_info, const uint8_t* src,
+ const uint8_t* const top_border, const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const ptrdiff_t sum_stride = temp_stride + 8;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum(top_border, stride, 2, sum_stride, sum3[0], sum5[1], square_sum3[0],
+ square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+ square_sum5, ma343, ma444, ma565[0], b343, b444,
+ b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5, ma343,
+ ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+ square_sum3, square_sum5, ma343, ma444, ma565, b343, b444, b565,
+ dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxFilterLastRow(src + 3, bottom_border + stride, width, scales, w0, w2,
+ sum3, sum5, square_sum3, square_sum5, ma343, ma444, ma565,
+ b343, b444, b565, dst);
+ }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src,
+ const uint8_t* const top_border,
+ const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height, SgrBuffer* const sgr_buffer,
+ uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const ptrdiff_t sum_stride = temp_stride + 8;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<5>(top_border, stride, 2, sum_stride, sum5[1], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, ma565[0],
+ b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+ square_sum5, width, scale, w0, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+ scale, w0, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxFilterPass1LastRow(src + 3, bottom_border + stride, width, scale, w0,
+ sum5, square_sum5, ma565[0], b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src,
+ const uint8_t* const top_border,
+ const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height, SgrBuffer* const sgr_buffer,
+ uint8_t* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const ptrdiff_t sum_stride = temp_stride + 8;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<3>(top_border, stride, 2, sum_stride, sum3[0], square_sum3[0]);
+ BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, ma343[0],
+ nullptr, b343[0], nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const uint8_t* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += stride;
+ }
+ BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, ma343[1],
+ ma444[0], b343[1], b444[0]);
+
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src + 2, src + 2 * stride, width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ src += 2;
+ int y = std::min(height, 2);
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src, bottom_border, width, scale, w0, sum3, square_sum3,
+ ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
+
+// If |width| is not a multiple of 8, up to 7 extra pixels are written to
+// |dest| at the end of each row. It is safe to overwrite this output as it
+// will not be part of the visible frame.
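+// Depending on |restoration_info.sgr_proj_info.index|, the filter below runs
+// pass 1 only (radius_pass_1 == 0), pass 2 only (radius_pass_0 == 0), or both
+// passes.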
+void SelfGuidedFilter_NEON(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const void* const top_border, const void* const bottom_border,
+ const ptrdiff_t stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* top = static_cast<const uint8_t*>(top_border);
+ const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+ auto* const dst = static_cast<uint8_t*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3,
+ stride, width, height, sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2,
+ stride, width, height, sgr_buffer, dst);
+ } else {
+ BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride,
+ width, height, sgr_buffer, dst);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->loop_restorations[0] = WienerFilter_NEON;
+ dsp->loop_restorations[1] = SelfGuidedFilter_NEON;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void LoopRestorationInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/loop_restoration_neon.h b/src/dsp/arm/loop_restoration_neon.h
new file mode 100644
index 0000000..b551610
--- /dev/null
+++ b/src/dsp/arm/loop_restoration_neon.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_LOOP_RESTORATION_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_LOOP_RESTORATION_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_restorations, see the defines below for specifics.
+// This function is not thread-safe.
+void LoopRestorationInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+
+#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_LOOP_RESTORATION_NEON_H_
diff --git a/src/dsp/arm/mask_blend_neon.cc b/src/dsp/arm/mask_blend_neon.cc
new file mode 100644
index 0000000..084f42f
--- /dev/null
+++ b/src/dsp/arm/mask_blend_neon.cc
@@ -0,0 +1,444 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/mask_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// TODO(b/150461164): Consider combining with GetInterIntraMask4x2().
+// Compound predictors use int16_t values and need to multiply long because the
+// Convolve range * 64 is 20 bits. Unfortunately there is no multiply int16_t by
+// int8_t and accumulate into int32_t instruction.
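+// The mask weight is at most 64 (2^6), so with the 14-bit convolve range the
+// products need roughly 20 bits. The mask is therefore widened to int16x8_t
+// here and the blend uses vmull_s16()/vmlal_s16() into int32x4_t lanes.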
+template <int subsampling_x, int subsampling_y>
+inline int16x8_t GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) {
+ if (subsampling_x == 1) {
+ const int16x4_t mask_val0 = vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask)));
+ const int16x4_t mask_val1 = vreinterpret_s16_u16(
+ vpaddl_u8(vld1_u8(mask + (mask_stride << subsampling_y))));
+ int16x8_t final_val;
+ if (subsampling_y == 1) {
+ const int16x4_t next_mask_val0 =
+ vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask + mask_stride)));
+ const int16x4_t next_mask_val1 =
+ vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask + mask_stride * 3)));
+ final_val = vaddq_s16(vcombine_s16(mask_val0, mask_val1),
+ vcombine_s16(next_mask_val0, next_mask_val1));
+ } else {
+ final_val = vreinterpretq_s16_u16(
+ vpaddlq_u8(vreinterpretq_u8_s16(vcombine_s16(mask_val0, mask_val1))));
+ }
+ return vrshrq_n_s16(final_val, subsampling_y + 1);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ const uint8x8_t mask_val0 = Load4(mask);
+ const uint8x8_t mask_val = Load4<1>(mask + mask_stride, mask_val0);
+ return vreinterpretq_s16_u16(vmovl_u8(mask_val));
+}
+
+template <int subsampling_x, int subsampling_y>
+inline int16x8_t GetMask8(const uint8_t* mask, ptrdiff_t mask_stride) {
+ if (subsampling_x == 1) {
+ int16x8_t mask_val = vreinterpretq_s16_u16(vpaddlq_u8(vld1q_u8(mask)));
+ if (subsampling_y == 1) {
+ const int16x8_t next_mask_val =
+ vreinterpretq_s16_u16(vpaddlq_u8(vld1q_u8(mask + mask_stride)));
+ mask_val = vaddq_s16(mask_val, next_mask_val);
+ }
+ return vrshrq_n_s16(mask_val, 1 + subsampling_y);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ const uint8x8_t mask_val = vld1_u8(mask);
+ return vreinterpretq_s16_u16(vmovl_u8(mask_val));
+}
+
+inline void WriteMaskBlendLine4x2(const int16_t* const pred_0,
+ const int16_t* const pred_1,
+ const int16x8_t pred_mask_0,
+ const int16x8_t pred_mask_1, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const int16x8_t pred_val_0 = vld1q_s16(pred_0);
+ const int16x8_t pred_val_1 = vld1q_s16(pred_1);
+ // int res = (mask_value * prediction_0[x] +
+ // (64 - mask_value) * prediction_1[x]) >> 6;
+ const int32x4_t weighted_pred_0_lo =
+ vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0));
+ const int32x4_t weighted_pred_0_hi =
+ vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0));
+ const int32x4_t weighted_combo_lo = vmlal_s16(
+ weighted_pred_0_lo, vget_low_s16(pred_mask_1), vget_low_s16(pred_val_1));
+ const int32x4_t weighted_combo_hi =
+ vmlal_s16(weighted_pred_0_hi, vget_high_s16(pred_mask_1),
+ vget_high_s16(pred_val_1));
+ // dst[x] = static_cast<Pixel>(
+ // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+ // (1 << kBitdepth8) - 1));
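+  // The >> 6 from |res| above is done by the truncating vshrn_n_s32(); the
+  // rounding shift by inter_post_round_bits (4 here) and the clip to
+  // [0, 255] are folded into the saturating vqrshrun_n_s16(..., 4) below.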
+ const uint8x8_t result =
+ vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6),
+ vshrn_n_s32(weighted_combo_hi, 6)),
+ 4);
+ StoreLo4(dst, result);
+ StoreHi4(dst + dst_stride, result);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4x4_NEON(const int16_t* pred_0, const int16_t* pred_1,
+ const uint8_t* mask,
+ const ptrdiff_t mask_stride, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const int16x8_t mask_inverter = vdupq_n_s16(64);
+ int16x8_t pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ // TODO(b/150461164): Arm tends to do better with load(val); val += stride
+ // It may be possible to turn this into a loop with a templated height.
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4xH_NEON(const int16_t* pred_0, const int16_t* pred_1,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ const uint8_t* mask = mask_ptr;
+ if (height == 4) {
+ MaskBlending4x4_NEON<subsampling_x, subsampling_y>(
+ pred_0, pred_1, mask, mask_stride, dst, dst_stride);
+ return;
+ }
+ const int16x8_t mask_inverter = vdupq_n_s16(64);
+ int y = 0;
+ do {
+ int16x8_t pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+ y += 8;
+ } while (y < height);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend_NEON(const void* prediction_0, const void* prediction_1,
+ const ptrdiff_t /*prediction_stride_1*/,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride, const int width,
+ const int height, void* dest,
+ const ptrdiff_t dst_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ if (width == 4) {
+ MaskBlending4xH_NEON<subsampling_x, subsampling_y>(
+ pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride);
+ return;
+ }
+ const uint8_t* mask = mask_ptr;
+ const int16x8_t mask_inverter = vdupq_n_s16(64);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const int16x8_t pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride);
+ // 64 - mask
+ const int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+ const int16x8_t pred_val_0 = vld1q_s16(pred_0 + x);
+ const int16x8_t pred_val_1 = vld1q_s16(pred_1 + x);
+ uint8x8_t result;
+ // int res = (mask_value * prediction_0[x] +
+ // (64 - mask_value) * prediction_1[x]) >> 6;
+ const int32x4_t weighted_pred_0_lo =
+ vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0));
+ const int32x4_t weighted_pred_0_hi =
+ vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0));
+ const int32x4_t weighted_combo_lo =
+ vmlal_s16(weighted_pred_0_lo, vget_low_s16(pred_mask_1),
+ vget_low_s16(pred_val_1));
+ const int32x4_t weighted_combo_hi =
+ vmlal_s16(weighted_pred_0_hi, vget_high_s16(pred_mask_1),
+ vget_high_s16(pred_val_1));
+
+ // dst[x] = static_cast<Pixel>(
+ // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+ // (1 << kBitdepth8) - 1));
+ result = vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6),
+ vshrn_n_s32(weighted_combo_hi, 6)),
+ 4);
+ vst1_u8(dst + x, result);
+
+ x += 8;
+ } while (x < width);
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+ mask += mask_stride << subsampling_y;
+ } while (++y < height);
+}
+
+// TODO(b/150461164): This is much faster for inter_intra (input is Pixel
+// values) but regresses compound versions (input is int16_t). Try to
+// consolidate these.
+template <int subsampling_x, int subsampling_y>
+inline uint8x8_t GetInterIntraMask4x2(const uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ if (subsampling_x == 1) {
+ const uint8x8_t mask_val =
+ vpadd_u8(vld1_u8(mask), vld1_u8(mask + (mask_stride << subsampling_y)));
+ if (subsampling_y == 1) {
+ const uint8x8_t next_mask_val = vpadd_u8(vld1_u8(mask + mask_stride),
+ vld1_u8(mask + mask_stride * 3));
+
+ // Use a saturating add to work around the case where all |mask| values
+ // are 64. Together with the rounding shift this ensures the correct
+ // result.
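+      // For example, with four mask values of 64 the pairwise sums are 128
+      // and 128, so a plain vadd_u8() would wrap 256 to 0; vqadd_u8()
+      // saturates to 255 and vrshr_n_u8(255, 2) rounds back to the exact
+      // average of 64.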
+ const uint8x8_t sum = vqadd_u8(mask_val, next_mask_val);
+ return vrshr_n_u8(sum, /*subsampling_x=*/1 + subsampling_y);
+ }
+
+ return vrshr_n_u8(mask_val, /*subsampling_x=*/1);
+ }
+
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ const uint8x8_t mask_val0 = Load4(mask);
+ // TODO(b/150461164): Investigate the source of |mask| and see if the stride
+ // can be removed.
+ // TODO(b/150461164): The unit tests start at 8x8. Does this get run?
+ return Load4<1>(mask + mask_stride, mask_val0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline uint8x8_t GetInterIntraMask8(const uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ if (subsampling_x == 1) {
+ const uint8x16_t mask_val = vld1q_u8(mask);
+ const uint8x8_t mask_paired =
+ vpadd_u8(vget_low_u8(mask_val), vget_high_u8(mask_val));
+ if (subsampling_y == 1) {
+ const uint8x16_t next_mask_val = vld1q_u8(mask + mask_stride);
+ const uint8x8_t next_mask_paired =
+ vpadd_u8(vget_low_u8(next_mask_val), vget_high_u8(next_mask_val));
+
+ // Use a saturating add to work around the case where all |mask| values
+ // are 64. Together with the rounding shift this ensures the correct
+ // result.
+ const uint8x8_t sum = vqadd_u8(mask_paired, next_mask_paired);
+ return vrshr_n_u8(sum, /*subsampling_x=*/1 + subsampling_y);
+ }
+
+ return vrshr_n_u8(mask_paired, /*subsampling_x=*/1);
+ }
+
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ return vld1_u8(mask);
+}
+
+inline void InterIntraWriteMaskBlendLine8bpp4x2(const uint8_t* const pred_0,
+ uint8_t* const pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8x8_t pred_mask_0,
+ const uint8x8_t pred_mask_1) {
+ const uint8x8_t pred_val_0 = vld1_u8(pred_0);
+ uint8x8_t pred_val_1 = Load4(pred_1);
+ pred_val_1 = Load4<1>(pred_1 + pred_stride_1, pred_val_1);
+
+ const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0);
+ const uint16x8_t weighted_combo =
+ vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1);
+ const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6);
+ StoreLo4(pred_1, result);
+ StoreHi4(pred_1 + pred_stride_1, result);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4x4_NEON(const uint8_t* pred_0,
+ uint8_t* pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8_t* mask,
+ const ptrdiff_t mask_stride) {
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ uint8x8_t pred_mask_1 =
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1);
+ InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1);
+ pred_0 += 4 << 1;
+ pred_1 += pred_stride_1 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+
+ pred_mask_1 =
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1);
+ InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4xH_NEON(
+ const uint8_t* pred_0, uint8_t* pred_1, const ptrdiff_t pred_stride_1,
+ const uint8_t* mask, const ptrdiff_t mask_stride, const int height) {
+ if (height == 4) {
+ InterIntraMaskBlending8bpp4x4_NEON<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride);
+ return;
+ }
+ int y = 0;
+ do {
+ InterIntraMaskBlending8bpp4x4_NEON<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride);
+ pred_0 += 4 << 2;
+ pred_1 += pred_stride_1 << 2;
+ mask += mask_stride << (2 + subsampling_y);
+
+ InterIntraMaskBlending8bpp4x4_NEON<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride);
+ pred_0 += 4 << 2;
+ pred_1 += pred_stride_1 << 2;
+ mask += mask_stride << (2 + subsampling_y);
+ y += 8;
+ } while (y < height);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend8bpp_NEON(const uint8_t* prediction_0,
+ uint8_t* prediction_1,
+ const ptrdiff_t prediction_stride_1,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride,
+ const int width, const int height) {
+ if (width == 4) {
+ InterIntraMaskBlending8bpp4xH_NEON<subsampling_x, subsampling_y>(
+ prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
+ height);
+ return;
+ }
+ const uint8_t* mask = mask_ptr;
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ // TODO(b/150461164): Consider a 16 wide specialization (at least for the
+ // unsampled version) to take advantage of vld1q_u8().
+ const uint8x8_t pred_mask_1 =
+ GetInterIntraMask8<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride);
+ // 64 - mask
+ const uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1);
+ const uint8x8_t pred_val_0 = vld1_u8(prediction_0);
+ prediction_0 += 8;
+ const uint8x8_t pred_val_1 = vld1_u8(prediction_1 + x);
+ const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0);
+ // weighted_pred0 + weighted_pred1
+ const uint16x8_t weighted_combo =
+ vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1);
+ const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6);
+ vst1_u8(prediction_1 + x, result);
+
+ x += 8;
+ } while (x < width);
+ prediction_1 += prediction_stride_1;
+ mask += mask_stride << subsampling_y;
+ } while (++y < height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->mask_blend[0][0] = MaskBlend_NEON<0, 0>;
+ dsp->mask_blend[1][0] = MaskBlend_NEON<1, 0>;
+ dsp->mask_blend[2][0] = MaskBlend_NEON<1, 1>;
+  // For 8-bit, the is_inter_intra dimension of mask_blend[][] is replaced by
+  // the separate inter_intra_mask_blend_8bpp[] entries.
+ dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_NEON<0, 0>;
+ dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_NEON<1, 0>;
+ dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_NEON<1, 1>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void MaskBlendInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void MaskBlendInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/mask_blend_neon.h b/src/dsp/arm/mask_blend_neon.h
new file mode 100644
index 0000000..3829274
--- /dev/null
+++ b/src/dsp/arm/mask_blend_neon.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_MASK_BLEND_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_MASK_BLEND_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mask_blend. This function is not thread-safe.
+void MaskBlendInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_MaskBlend444 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_MaskBlend422 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_MaskBlend420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420 LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_MASK_BLEND_NEON_H_
diff --git a/src/dsp/arm/motion_field_projection_neon.cc b/src/dsp/arm/motion_field_projection_neon.cc
new file mode 100644
index 0000000..8caba7d
--- /dev/null
+++ b/src/dsp/arm/motion_field_projection_neon.cc
@@ -0,0 +1,393 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
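+// Gathers one 16-bit division factor per lane: |reference_offset| is doubled
+// to form byte offsets into |division_table|, expanded to adjacent
+// {low, high} byte index pairs via the 0x0100... constant, and looked up with
+// vtbl2_s8().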
+inline int16x8_t LoadDivision(const int8x8x2_t division_table,
+ const int8x8_t reference_offset) {
+ const int8x8_t kOne = vcreate_s8(0x0100010001000100);
+ const int8x16_t kOneQ = vcombine_s8(kOne, kOne);
+ const int8x8_t t = vadd_s8(reference_offset, reference_offset);
+ const int8x8x2_t tt = vzip_s8(t, t);
+ const int8x16_t t1 = vcombine_s8(tt.val[0], tt.val[1]);
+ const int8x16_t idx = vaddq_s8(t1, kOneQ);
+ const int8x8_t idx_low = vget_low_s8(idx);
+ const int8x8_t idx_high = vget_high_s8(idx);
+ const int16x4_t d0 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_low));
+ const int16x4_t d1 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_high));
+ return vcombine_s16(d0, d1);
+}
+
+inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator,
+ const int numerator) {
+ const int32x4_t m0 = vmull_s16(mv, denominator);
+ const int32x4_t m = vmulq_n_s32(m0, numerator);
+ // Add the sign (0 or -1) to round towards zero.
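+  // vsraq_n_s32(m, m, 31) adds m >> 31, i.e. 0 for non-negative lanes and -1
+  // for negative lanes, so the rounding narrowing shift below treats negative
+  // projections the same way as the scalar GetMvProjection().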
+ const int32x4_t add_sign = vsraq_n_s32(m, m, 31);
+ return vqrshrn_n_s32(add_sign, 14);
+}
+
+inline int16x8_t MvProjectionClip(const int16x8_t mv,
+ const int16x8_t denominator,
+ const int numerator) {
+ const int16x4_t mv0 = vget_low_s16(mv);
+ const int16x4_t mv1 = vget_high_s16(mv);
+ const int16x4_t s0 = MvProjection(mv0, vget_low_s16(denominator), numerator);
+ const int16x4_t s1 = MvProjection(mv1, vget_high_s16(denominator), numerator);
+ const int16x8_t projection = vcombine_s16(s0, s1);
+ const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp);
+ const int16x8_t clamp = vminq_s16(projection, projection_mv_clamp);
+ return vmaxq_s16(clamp, vnegq_s16(projection_mv_clamp));
+}
+
+inline int8x8_t Project_NEON(const int16x8_t delta, const int16x8_t dst_sign) {
+ // Add 63 to negative delta so that it shifts towards zero.
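+  // delta_sign is 0 or -1 per lane; shifting its unsigned bit pattern right
+  // by 10 yields 0 or 63, so negative lanes become delta + 63 before the
+  // >> 6, e.g. delta = -1 maps to 62 >> 6 = 0 rather than -1. The final
+  // xor/subtract with dst_sign conditionally negates the result when dst_sign
+  // is -1.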
+ const int16x8_t delta_sign = vshrq_n_s16(delta, 15);
+ const uint16x8_t delta_u = vreinterpretq_u16_s16(delta);
+ const uint16x8_t delta_sign_u = vreinterpretq_u16_s16(delta_sign);
+ const uint16x8_t delta_adjust_u = vsraq_n_u16(delta_u, delta_sign_u, 10);
+ const int16x8_t delta_adjust = vreinterpretq_s16_u16(delta_adjust_u);
+ const int16x8_t offset0 = vshrq_n_s16(delta_adjust, 6);
+ const int16x8_t offset1 = veorq_s16(offset0, dst_sign);
+ const int16x8_t offset2 = vsubq_s16(offset1, dst_sign);
+ return vqmovn_s16(offset2);
+}
+
+inline void GetPosition(
+ const int8x8x2_t division_table, const MotionVector* const mv,
+ const int numerator, const int x8_start, const int x8_end, const int x8,
+ const int8x8_t r_offsets, const int8x8_t source_reference_type8,
+ const int8x8_t skip_r, const int8x8_t y8_floor8, const int8x8_t y8_ceiling8,
+ const int16x8_t d_sign, const int delta, int8x8_t* const r,
+ int8x8_t* const position_y8, int8x8_t* const position_x8,
+ int64_t* const skip_64, int32x4_t mvs[2]) {
+ const auto* const mv_int = reinterpret_cast<const int32_t*>(mv + x8);
+ *r = vtbl1_s8(r_offsets, source_reference_type8);
+ const int16x8_t denorm = LoadDivision(division_table, source_reference_type8);
+ int16x8_t projection_mv[2];
+ mvs[0] = vld1q_s32(mv_int + 0);
+ mvs[1] = vld1q_s32(mv_int + 4);
+  // Deinterleave the x and y components.
+ const int16x8_t mv0 = vreinterpretq_s16_s32(mvs[0]);
+ const int16x8_t mv1 = vreinterpretq_s16_s32(mvs[1]);
+ const int16x8x2_t mv_yx = vuzpq_s16(mv0, mv1);
+ // numerator could be 0.
+ projection_mv[0] = MvProjectionClip(mv_yx.val[0], denorm, numerator);
+ projection_mv[1] = MvProjectionClip(mv_yx.val[1], denorm, numerator);
+ // Do not update the motion vector if the block position is not valid or
+ // if position_x8 is outside the current range of x8_start and x8_end.
+ // Note that position_y8 will always be within the range of y8_start and
+ // y8_end.
+ // After subtracting the base, valid projections are within 8-bit.
+ *position_y8 = Project_NEON(projection_mv[0], d_sign);
+ const int8x8_t position_x = Project_NEON(projection_mv[1], d_sign);
+ const int8x8_t k01234567 = vcreate_s8(uint64_t{0x0706050403020100});
+ *position_x8 = vqadd_s8(position_x, k01234567);
+ const int8x16_t position_xy = vcombine_s8(*position_x8, *position_y8);
+ const int x8_floor = std::max(
+ x8_start - x8, delta - kProjectionMvMaxHorizontalOffset); // [-8, 8]
+ const int x8_ceiling = std::min(
+ x8_end - x8, delta + 8 + kProjectionMvMaxHorizontalOffset); // [0, 16]
+ const int8x8_t x8_floor8 = vdup_n_s8(x8_floor);
+ const int8x8_t x8_ceiling8 = vdup_n_s8(x8_ceiling);
+ const int8x16_t floor_xy = vcombine_s8(x8_floor8, y8_floor8);
+ const int8x16_t ceiling_xy = vcombine_s8(x8_ceiling8, y8_ceiling8);
+ const uint8x16_t underflow = vcltq_s8(position_xy, floor_xy);
+ const uint8x16_t overflow = vcgeq_s8(position_xy, ceiling_xy);
+ const int8x16_t out = vreinterpretq_s8_u8(vorrq_u8(underflow, overflow));
+ const int8x8_t skip_low = vorr_s8(skip_r, vget_low_s8(out));
+ const int8x8_t skip = vorr_s8(skip_low, vget_high_s8(out));
+ *skip_64 = vget_lane_s64(vreinterpret_s64_s8(skip), 0);
+}
+
+template <int idx>
+inline void Store(const int16x8_t position, const int8x8_t reference_offset,
+ const int32x4_t mv, int8_t* dst_reference_offset,
+ MotionVector* dst_mv) {
+ const ptrdiff_t offset = vgetq_lane_s16(position, idx);
+ auto* const d_mv = reinterpret_cast<int32_t*>(&dst_mv[offset]);
+ vst1q_lane_s32(d_mv, mv, idx & 3);
+ vst1_lane_s8(&dst_reference_offset[offset], reference_offset, idx);
+}
+
+template <int idx>
+inline void CheckStore(const int8_t* skips, const int16x8_t position,
+ const int8x8_t reference_offset, const int32x4_t mv,
+ int8_t* dst_reference_offset, MotionVector* dst_mv) {
+ if (skips[idx] == 0) {
+ Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv);
+ }
+}
+
+// 7.9.2.
+void MotionFieldProjectionKernel_NEON(const ReferenceInfo& reference_info,
+ const int reference_to_current_with_sign,
+ const int dst_sign, const int y8_start,
+ const int y8_end, const int x8_start,
+ const int x8_end,
+ TemporalMotionField* const motion_field) {
+ const ptrdiff_t stride = motion_field->mv.columns();
+ // The column range has to be offset by kProjectionMvMaxHorizontalOffset since
+ // coordinates in that range could end up being position_x8 because of
+ // projection.
+ const int adjusted_x8_start =
+ std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
+ const int adjusted_x8_end = std::min(
+ x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
+ const int adjusted_x8_end8 = adjusted_x8_end & ~7;
+ const int leftover = adjusted_x8_end - adjusted_x8_end8;
+ const int8_t* const reference_offsets =
+ reference_info.relative_distance_to.data();
+ const bool* const skip_references = reference_info.skip_references.data();
+ const int16_t* const projection_divisions =
+ reference_info.projection_divisions.data();
+ const ReferenceFrameType* source_reference_types =
+ &reference_info.motion_field_reference_frame[y8_start][0];
+ const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
+ int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
+ MotionVector* dst_mv = motion_field->mv[y8_start];
+ const int16x8_t d_sign = vdupq_n_s16(dst_sign);
+
+ static_assert(sizeof(int8_t) == sizeof(bool), "");
+ static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), "");
+ static_assert(sizeof(int32_t) == sizeof(MotionVector), "");
+ assert(dst_sign == 0 || dst_sign == -1);
+ assert(stride == motion_field->reference_offset.columns());
+ assert((y8_start & 7) == 0);
+ assert((adjusted_x8_start & 7) == 0);
+  // The final position calculation is represented with int16_t. Valid
+  // position_y8 relative to its base is at most 7. After adding the
+  // horizontal offset, which is at most |stride - 1|, we arrive at the
+  // following assertion, which means this optimization works for frame widths
+  // up to 32K pixels (each position is an 8x8 block).
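+  // In the worst case the linearized offset is 7 * stride + (stride - 1) =
+  // 8 * stride - 1, which must fit in int16_t, hence the bound below; with
+  // 8-pixel blocks that is a frame width of up to 32768 pixels.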
+ assert(8 * stride <= 32768);
+ const int8x8_t skip_reference =
+ vld1_s8(reinterpret_cast<const int8_t*>(skip_references));
+ const int8x8_t r_offsets = vld1_s8(reference_offsets);
+ const int8x16_t table = vreinterpretq_s8_s16(vld1q_s16(projection_divisions));
+ int8x8x2_t division_table;
+ division_table.val[0] = vget_low_s8(table);
+ division_table.val[1] = vget_high_s8(table);
+
+ int y8 = y8_start;
+ do {
+ const int y8_floor = (y8 & ~7) - y8; // [-7, 0]
+ const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8); // [1, 8]
+ const int8x8_t y8_floor8 = vdup_n_s8(y8_floor);
+ const int8x8_t y8_ceiling8 = vdup_n_s8(y8_ceiling);
+ int x8;
+
+ for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) {
+ const int8x8_t source_reference_type8 =
+ vld1_s8(reinterpret_cast<const int8_t*>(source_reference_types + x8));
+ const int8x8_t skip_r = vtbl1_s8(skip_reference, source_reference_type8);
+ const int64_t early_skip = vget_lane_s64(vreinterpret_s64_s8(skip_r), 0);
+ // Early termination #1 if all are skips. Chance is typically ~30-40%.
+ if (early_skip == -1) continue;
+ int64_t skip_64;
+ int8x8_t r, position_x8, position_y8;
+ int32x4_t mvs[2];
+ GetPosition(division_table, mv, reference_to_current_with_sign, x8_start,
+ x8_end, x8, r_offsets, source_reference_type8, skip_r,
+ y8_floor8, y8_ceiling8, d_sign, 0, &r, &position_y8,
+ &position_x8, &skip_64, mvs);
+ // Early termination #2 if all are skips.
+ // Chance is typically ~15-25% after Early termination #1.
+ if (skip_64 == -1) continue;
+ const int16x8_t p_y = vmovl_s8(position_y8);
+ const int16x8_t p_x = vmovl_s8(position_x8);
+ const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride);
+ const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8));
+ if (skip_64 == 0) {
+ // Store all. Chance is typically ~70-85% after Early termination #2.
+ Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ } else {
+ // Check and store each.
+ // Chance is typically ~15-30% after Early termination #2.
+ // The compiler is smart enough to not create the local buffer skips[].
+ int8_t skips[8];
+ memcpy(skips, &skip_64, sizeof(skips));
+ CheckStore<0>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ }
+ }
+
+    // The following leftover processing cannot be moved out of the do...while
+    // loop, since doing so may change the order in which results are stored
+    // to the same position.
+ if (leftover > 0) {
+ // Use SIMD only when leftover is at least 4, and there are at least 8
+ // elements in a row.
+ if (leftover >= 4 && adjusted_x8_start < adjusted_x8_end8) {
+ // Process the last 8 elements to avoid loading invalid memory. Some
+ // elements may have been processed in the above loop, which is OK.
+ const int delta = 8 - leftover;
+ x8 = adjusted_x8_end - 8;
+ const int8x8_t source_reference_type8 = vld1_s8(
+ reinterpret_cast<const int8_t*>(source_reference_types + x8));
+ const int8x8_t skip_r =
+ vtbl1_s8(skip_reference, source_reference_type8);
+ const int64_t early_skip =
+ vget_lane_s64(vreinterpret_s64_s8(skip_r), 0);
+ // Early termination #1 if all are skips.
+ if (early_skip != -1) {
+ int64_t skip_64;
+ int8x8_t r, position_x8, position_y8;
+ int32x4_t mvs[2];
+ GetPosition(division_table, mv, reference_to_current_with_sign,
+ x8_start, x8_end, x8, r_offsets, source_reference_type8,
+ skip_r, y8_floor8, y8_ceiling8, d_sign, delta, &r,
+ &position_y8, &position_x8, &skip_64, mvs);
+ // Early termination #2 if all are skips.
+ if (skip_64 != -1) {
+ const int16x8_t p_y = vmovl_s8(position_y8);
+ const int16x8_t p_x = vmovl_s8(position_x8);
+ const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride);
+ const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8));
+ // Store up to 7 elements since leftover is at most 7.
+ if (skip_64 == 0) {
+ // Store all.
+ Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ } else {
+ // Check and store each.
+ // The compiler is smart enough to not create the local buffer
+ // skips[].
+ int8_t skips[8];
+ memcpy(skips, &skip_64, sizeof(skips));
+ CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ }
+ }
+ }
+ } else {
+ for (; x8 < adjusted_x8_end; ++x8) {
+ const int source_reference_type = source_reference_types[x8];
+ if (skip_references[source_reference_type]) continue;
+ MotionVector projection_mv;
+ // reference_to_current_with_sign could be 0.
+ GetMvProjection(mv[x8], reference_to_current_with_sign,
+ projection_divisions[source_reference_type],
+ &projection_mv);
+ // Do not update the motion vector if the block position is not valid
+ // or if position_x8 is outside the current range of x8_start and
+ // x8_end. Note that position_y8 will always be within the range of
+ // y8_start and y8_end.
+ const int position_y8 = Project(0, projection_mv.mv[0], dst_sign);
+ if (position_y8 < y8_floor || position_y8 >= y8_ceiling) continue;
+ const int x8_base = x8 & ~7;
+ const int x8_floor =
+ std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset);
+ const int x8_ceiling =
+ std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset);
+ const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign);
+ if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue;
+ dst_mv[position_y8 * stride + position_x8] = mv[x8];
+ dst_reference_offset[position_y8 * stride + position_x8] =
+ reference_offsets[source_reference_type];
+ }
+ }
+ }
+
+ source_reference_types += stride;
+ mv += stride;
+ dst_reference_offset += stride;
+ dst_mv += stride;
+ } while (++y8 < y8_end);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_NEON;
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_NEON;
+}
+#endif
+
+} // namespace
+
+void MotionFieldProjectionInit_NEON() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void MotionFieldProjectionInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/motion_field_projection_neon.h b/src/dsp/arm/motion_field_projection_neon.h
new file mode 100644
index 0000000..41ab6a6
--- /dev/null
+++ b/src/dsp/arm/motion_field_projection_neon.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_MOTION_FIELD_PROJECTION_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_MOTION_FIELD_PROJECTION_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::motion_field_projection_kernel. This function is not
+// thread-safe.
+void MotionFieldProjectionInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+
+#define LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_MOTION_FIELD_PROJECTION_NEON_H_
diff --git a/src/dsp/arm/motion_vector_search_neon.cc b/src/dsp/arm/motion_vector_search_neon.cc
new file mode 100644
index 0000000..8a403a6
--- /dev/null
+++ b/src/dsp/arm/motion_vector_search_neon.cc
@@ -0,0 +1,267 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator,
+ const int32x4_t numerator) {
+ const int32x4_t m0 = vmull_s16(mv, denominator);
+ const int32x4_t m = vmulq_s32(m0, numerator);
+ // Add the sign (0 or -1) to round towards zero.
+ const int32x4_t add_sign = vsraq_n_s32(m, m, 31);
+ return vqrshrn_n_s32(add_sign, 14);
+}
+
+inline int16x4_t MvProjectionCompound(const int16x4_t mv,
+ const int temporal_reference_offsets,
+ const int reference_offsets[2]) {
+ const int16x4_t denominator =
+ vdup_n_s16(kProjectionMvDivisionLookup[temporal_reference_offsets]);
+ const int32x2_t offset = vld1_s32(reference_offsets);
+ const int32x2x2_t offsets = vzip_s32(offset, offset);
+ const int32x4_t numerator = vcombine_s32(offsets.val[0], offsets.val[1]);
+ return MvProjection(mv, denominator, numerator);
+}
+
+inline int16x8_t ProjectionClip(const int16x4_t mv0, const int16x4_t mv1) {
+ const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp);
+ const int16x8_t mv = vcombine_s16(mv0, mv1);
+ const int16x8_t clamp = vminq_s16(mv, projection_mv_clamp);
+ return vmaxq_s16(clamp, vnegq_s16(projection_mv_clamp));
+}
+
+inline int16x8_t MvProjectionCompoundClip(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets,
+ const int reference_offsets[2]) {
+ const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
+ const int32x2_t temporal_mv = vld1_s32(tmvs);
+ const int16x4_t tmv0 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 0));
+ const int16x4_t tmv1 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 1));
+ const int16x4_t mv0 = MvProjectionCompound(
+ tmv0, temporal_reference_offsets[0], reference_offsets);
+ const int16x4_t mv1 = MvProjectionCompound(
+ tmv1, temporal_reference_offsets[1], reference_offsets);
+ return ProjectionClip(mv0, mv1);
+}
+
+inline int16x8_t MvProjectionSingleClip(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets, const int reference_offset,
+ int16x4_t* const lookup) {
+ const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs);
+ const int16x8_t temporal_mv = vld1q_s16(tmvs);
+ *lookup = vld1_lane_s16(
+ &kProjectionMvDivisionLookup[temporal_reference_offsets[0]], *lookup, 0);
+ *lookup = vld1_lane_s16(
+ &kProjectionMvDivisionLookup[temporal_reference_offsets[1]], *lookup, 1);
+ *lookup = vld1_lane_s16(
+ &kProjectionMvDivisionLookup[temporal_reference_offsets[2]], *lookup, 2);
+ *lookup = vld1_lane_s16(
+ &kProjectionMvDivisionLookup[temporal_reference_offsets[3]], *lookup, 3);
+ const int16x4x2_t denominator = vzip_s16(*lookup, *lookup);
+ const int16x4_t tmv0 = vget_low_s16(temporal_mv);
+ const int16x4_t tmv1 = vget_high_s16(temporal_mv);
+ const int32x4_t numerator = vdupq_n_s32(reference_offset);
+ const int16x4_t mv0 = MvProjection(tmv0, denominator.val[0], numerator);
+ const int16x4_t mv1 = MvProjection(tmv1, denominator.val[1], numerator);
+ return ProjectionClip(mv0, mv1);
+}
+
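+// Motion vectors are stored in 1/8-pel units. Adding the sign bit and then
+// clearing bit 0 rounds odd components toward zero, reducing the candidates
+// to 1/4-pel precision.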
+inline void LowPrecision(const int16x8_t mv, void* const candidate_mvs) {
+ const int16x8_t kRoundDownMask = vdupq_n_s16(1);
+ const uint16x8_t mvu = vreinterpretq_u16_s16(mv);
+ const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15));
+ const int16x8_t mv1 = vbicq_s16(mv0, kRoundDownMask);
+ vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv1);
+}
+
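+// Snaps each component to a full-pel multiple of 8: adding the sign bit and
+// then 3 before clearing the three low bits is equivalent to
+// sign * (((|mv| + 3) >> 3) << 3).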
+inline void ForceInteger(const int16x8_t mv, void* const candidate_mvs) {
+ const int16x8_t kRoundDownMask = vdupq_n_s16(7);
+ const uint16x8_t mvu = vreinterpretq_u16_s16(mv);
+ const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15));
+ const int16x8_t mv1 = vaddq_s16(mv0, vdupq_n_s16(3));
+ const int16x8_t mv2 = vbicq_s16(mv1, kRoundDownMask);
+ vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv2);
+}
+
+void MvProjectionCompoundLowPrecision_NEON(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true, so it is skipped
+  // here. To help the compiler, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One more element could be calculated.
+ int loop_count = (count + 1) >> 1;
+ do {
+ const int16x8_t mv = MvProjectionCompoundClip(
+ temporal_mvs, temporal_reference_offsets, offsets);
+ LowPrecision(mv, candidate_mvs);
+ temporal_mvs += 2;
+ temporal_reference_offsets += 2;
+ candidate_mvs += 2;
+ } while (--loop_count);
+}
+
+void MvProjectionCompoundForceInteger_NEON(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true, so it is skipped
+  // here. To help the compiler, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One more element could be calculated.
+ int loop_count = (count + 1) >> 1;
+ do {
+ const int16x8_t mv = MvProjectionCompoundClip(
+ temporal_mvs, temporal_reference_offsets, offsets);
+ ForceInteger(mv, candidate_mvs);
+ temporal_mvs += 2;
+ temporal_reference_offsets += 2;
+ candidate_mvs += 2;
+ } while (--loop_count);
+}
+
+void MvProjectionCompoundHighPrecision_NEON(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true, so it is skipped
+  // here. To help the compiler, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One more element could be calculated.
+ int loop_count = (count + 1) >> 1;
+ do {
+ const int16x8_t mv = MvProjectionCompoundClip(
+ temporal_mvs, temporal_reference_offsets, offsets);
+ vst1q_s16(reinterpret_cast<int16_t*>(candidate_mvs), mv);
+ temporal_mvs += 2;
+ temporal_reference_offsets += 2;
+ candidate_mvs += 2;
+ } while (--loop_count);
+}
+
+void MvProjectionSingleLowPrecision_NEON(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ // Up to three more elements could be calculated.
+ int loop_count = (count + 3) >> 2;
+ int16x4_t lookup = vdup_n_s16(0);
+ do {
+ const int16x8_t mv = MvProjectionSingleClip(
+ temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
+ LowPrecision(mv, candidate_mvs);
+ temporal_mvs += 4;
+ temporal_reference_offsets += 4;
+ candidate_mvs += 4;
+ } while (--loop_count);
+}
+
+void MvProjectionSingleForceInteger_NEON(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ // Up to three more elements could be calculated.
+ int loop_count = (count + 3) >> 2;
+ int16x4_t lookup = vdup_n_s16(0);
+ do {
+ const int16x8_t mv = MvProjectionSingleClip(
+ temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
+ ForceInteger(mv, candidate_mvs);
+ temporal_mvs += 4;
+ temporal_reference_offsets += 4;
+ candidate_mvs += 4;
+ } while (--loop_count);
+}
+
+void MvProjectionSingleHighPrecision_NEON(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ // Up to three more elements could be calculated.
+ int loop_count = (count + 3) >> 2;
+ int16x4_t lookup = vdup_n_s16(0);
+ do {
+ const int16x8_t mv = MvProjectionSingleClip(
+ temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
+ vst1q_s16(reinterpret_cast<int16_t*>(candidate_mvs), mv);
+ temporal_mvs += 4;
+ temporal_reference_offsets += 4;
+ candidate_mvs += 4;
+ } while (--loop_count);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_NEON;
+ dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_NEON;
+ dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_NEON;
+ dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_NEON;
+ dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_NEON;
+ dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_NEON;
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_NEON;
+ dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_NEON;
+ dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_NEON;
+ dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_NEON;
+ dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_NEON;
+ dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_NEON;
+}
+#endif
+
+} // namespace
+
+void MotionVectorSearchInit_NEON() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void MotionVectorSearchInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/motion_vector_search_neon.h b/src/dsp/arm/motion_vector_search_neon.h
new file mode 100644
index 0000000..19b4519
--- /dev/null
+++ b/src/dsp/arm/motion_vector_search_neon.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_MOTION_VECTOR_SEARCH_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_MOTION_VECTOR_SEARCH_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This
+// function is not thread-safe.
+void MotionVectorSearchInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+
+#define LIBGAV1_Dsp8bpp_MotionVectorSearch LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_MOTION_VECTOR_SEARCH_NEON_H_
diff --git a/src/dsp/arm/obmc_neon.cc b/src/dsp/arm/obmc_neon.cc
new file mode 100644
index 0000000..66ad663
--- /dev/null
+++ b/src/dsp/arm/obmc_neon.cc
@@ -0,0 +1,392 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
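+// Blends one 4-pixel row: pred = (pred_mask * pred + obmc_pred_mask *
+// obmc_pred + 32) >> 6, computed with a widening multiply-accumulate and a
+// rounding narrowing shift.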
+inline void WriteObmcLine4(uint8_t* const pred, const uint8_t* const obmc_pred,
+ const uint8x8_t pred_mask,
+ const uint8x8_t obmc_pred_mask) {
+ const uint8x8_t pred_val = Load4(pred);
+ const uint8x8_t obmc_pred_val = Load4(obmc_pred);
+ const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+ const uint8x8_t result =
+ vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+ StoreLo4(pred, result);
+}
+
+template <bool from_left>
+inline void OverlapBlend2xH_NEON(uint8_t* const prediction,
+ const ptrdiff_t prediction_stride,
+ const int height,
+ const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ const uint8_t* obmc_pred = obmc_prediction;
+ uint8x8_t pred_mask;
+ uint8x8_t obmc_pred_mask;
+ int compute_height;
+ const int mask_offset = height - 2;
+ if (from_left) {
+ pred_mask = Load2(kObmcMask);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ compute_height = height;
+ } else {
+ // Weights for the last line are all 64, which is a no-op.
+ compute_height = height - 1;
+ }
+ uint8x8_t pred_val = vdup_n_u8(0);
+ uint8x8_t obmc_pred_val = vdup_n_u8(0);
+ int y = 0;
+ do {
+ if (!from_left) {
+ pred_mask = vdup_n_u8(kObmcMask[mask_offset + y]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ }
+ pred_val = Load2<0>(pred, pred_val);
+ const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+ obmc_pred_val = Load2<0>(obmc_pred, obmc_pred_val);
+ const uint8x8_t result =
+ vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+ Store2<0>(pred, result);
+
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (++y != compute_height);
+}
+
+inline void OverlapBlendFromLeft4xH_NEON(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ const uint8x8_t pred_mask = Load4(kObmcMask + 2);
+ // 64 - mask
+ const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ int y = 0;
+ do {
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ y += 2;
+ } while (y != height);
+}
+
+inline void OverlapBlendFromLeft8xH_NEON(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ const uint8x8_t pred_mask = vld1_u8(kObmcMask + 6);
+ // 64 - mask
+ const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ int y = 0;
+ do {
+ const uint8x8_t pred_val = vld1_u8(pred);
+ const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+ const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred);
+ const uint8x8_t result =
+ vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+
+ vst1_u8(pred, result);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (++y != height);
+}
+
+void OverlapBlendFromLeft_NEON(void* const prediction,
+ const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint8_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+
+ if (width == 2) {
+ OverlapBlend2xH_NEON<true>(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+ if (width == 4) {
+ OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+ if (width == 8) {
+ OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
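+  // Wider blocks are processed in 16-pixel columns. The mask varies
+  // horizontally, so each column loads its own 16 mask values.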
+ const uint8x16_t mask_inverter = vdupq_n_u8(64);
+ const uint8_t* mask = kObmcMask + width - 2;
+ int x = 0;
+ do {
+ pred = static_cast<uint8_t*>(prediction) + x;
+ obmc_pred = static_cast<const uint8_t*>(obmc_prediction) + x;
+ const uint8x16_t pred_mask = vld1q_u8(mask + x);
+ // 64 - mask
+ const uint8x16_t obmc_pred_mask = vsubq_u8(mask_inverter, pred_mask);
+ int y = 0;
+ do {
+ const uint8x16_t pred_val = vld1q_u8(pred);
+ const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred);
+ const uint16x8_t weighted_pred_lo =
+ vmull_u8(vget_low_u8(pred_mask), vget_low_u8(pred_val));
+ const uint8x8_t result_lo =
+ vrshrn_n_u16(vmlal_u8(weighted_pred_lo, vget_low_u8(obmc_pred_mask),
+ vget_low_u8(obmc_pred_val)),
+ 6);
+ const uint16x8_t weighted_pred_hi =
+ vmull_u8(vget_high_u8(pred_mask), vget_high_u8(pred_val));
+ const uint8x8_t result_hi =
+ vrshrn_n_u16(vmlal_u8(weighted_pred_hi, vget_high_u8(obmc_pred_mask),
+ vget_high_u8(obmc_pred_val)),
+ 6);
+ vst1q_u8(pred, vcombine_u8(result_lo, result_hi));
+
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (++y < height);
+ x += 16;
+ } while (x < width);
+}
+
+inline void OverlapBlendFromTop4x4_NEON(uint8_t* const prediction,
+ const ptrdiff_t prediction_stride,
+ const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride,
+ const int height) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ uint8x8_t pred_mask = vdup_n_u8(kObmcMask[height - 2]);
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ if (height == 2) {
+ return;
+ }
+
+ pred_mask = vdup_n_u8(kObmcMask[3]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ pred_mask = vdup_n_u8(kObmcMask[4]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+}
+
+inline void OverlapBlendFromTop4xH_NEON(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ if (height < 8) {
+ OverlapBlendFromTop4x4_NEON(prediction, prediction_stride, obmc_prediction,
+ obmc_prediction_stride, height);
+ return;
+ }
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const uint8_t* mask = kObmcMask + height - 2;
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ int y = 0;
+ // Compute 6 lines for height 8, or 12 lines for height 16. The remaining
+ // lines are unchanged as the corresponding mask value is 64.
+ do {
+ uint8x8_t pred_mask = vdup_n_u8(mask[y]);
+ uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ pred_mask = vdup_n_u8(mask[y + 1]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ pred_mask = vdup_n_u8(mask[y + 2]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ pred_mask = vdup_n_u8(mask[y + 3]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ pred_mask = vdup_n_u8(mask[y + 4]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ pred_mask = vdup_n_u8(mask[y + 5]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+    // Advance the mask index past the six lines just processed.
+ y += 6;
+ } while (y < height - 4);
+}
+
+inline void OverlapBlendFromTop8xH_NEON(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ const uint8_t* mask = kObmcMask + height - 2;
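+  // The mask values for the bottom quarter of the block are 64 (a no-op
+  // blend), so only height - height / 4 rows need to be computed.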
+ const int compute_height = height - (height >> 2);
+ int y = 0;
+ do {
+ const uint8x8_t pred_mask = vdup_n_u8(mask[y]);
+ // 64 - mask
+ const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ const uint8x8_t pred_val = vld1_u8(pred);
+ const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+ const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred);
+ const uint8x8_t result =
+ vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+
+ vst1_u8(pred, result);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (++y != compute_height);
+}
+
+void OverlapBlendFromTop_NEON(void* const prediction,
+ const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint8_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+
+ if (width == 2) {
+ OverlapBlend2xH_NEON<false>(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+ if (width == 4) {
+ OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+
+ if (width == 8) {
+ OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+
+ const uint8_t* mask = kObmcMask + height - 2;
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+  // Stop when the mask value becomes 64. The 4xH path relies on this
+  // implicitly.
+ const int compute_height = height - (height >> 2);
+ int y = 0;
+ do {
+ const uint8x8_t pred_mask = vdup_n_u8(mask[y]);
+ // 64 - mask
+ const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ int x = 0;
+ do {
+ const uint8x16_t pred_val = vld1q_u8(pred + x);
+ const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred + x);
+ const uint16x8_t weighted_pred_lo =
+ vmull_u8(pred_mask, vget_low_u8(pred_val));
+ const uint8x8_t result_lo =
+ vrshrn_n_u16(vmlal_u8(weighted_pred_lo, obmc_pred_mask,
+ vget_low_u8(obmc_pred_val)),
+ 6);
+ const uint16x8_t weighted_pred_hi =
+ vmull_u8(pred_mask, vget_high_u8(pred_val));
+ const uint8x8_t result_hi =
+ vrshrn_n_u16(vmlal_u8(weighted_pred_hi, obmc_pred_mask,
+ vget_high_u8(obmc_pred_val)),
+ 6);
+ vst1q_u8(pred + x, vcombine_u8(result_lo, result_hi));
+
+ x += 16;
+ } while (x < width);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (++y < compute_height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_NEON;
+ dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_NEON;
+}
+
+} // namespace
+
+void ObmcInit_NEON() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void ObmcInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/obmc_neon.h b/src/dsp/arm/obmc_neon.h
new file mode 100644
index 0000000..d5c9d9c
--- /dev/null
+++ b/src/dsp/arm/obmc_neon.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_OBMC_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_OBMC_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::obmc_blend. This function is not thread-safe.
+void ObmcInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If NEON is enabled, signal that the NEON implementation should be used.
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_ObmcVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ObmcHorizontal LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_OBMC_NEON_H_
diff --git a/src/dsp/arm/super_res_neon.cc b/src/dsp/arm/super_res_neon.cc
new file mode 100644
index 0000000..1680450
--- /dev/null
+++ b/src/dsp/arm/super_res_neon.cc
@@ -0,0 +1,166 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/super_res.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+
+namespace low_bitdepth {
+namespace {
+
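+// Gathers the 8-tap upscale filters for each group of 8 output pixels and
+// transposes them so that tap k of all 8 filters is stored contiguously.
+// SuperRes() below then consumes two taps' worth of coefficients per 16-byte
+// load.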
+void SuperResCoefficients_NEON(const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const coefficients) {
+ auto* dst = static_cast<uint8_t*>(coefficients);
+ int subpixel_x = initial_subpixel_x;
+ int x = RightShiftWithCeiling(upscaled_width, 3);
+ do {
+ uint8x8_t filter[8];
+ uint8x16_t d[kSuperResFilterTaps / 2];
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ filter[i] =
+ vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >>
+ kSuperResExtraBits]);
+ }
+ Transpose8x8(filter, d);
+ vst1q_u8(dst, d[0]);
+ dst += 16;
+ vst1q_u8(dst, d[1]);
+ dst += 16;
+ vst1q_u8(dst, d[2]);
+ dst += 16;
+ vst1q_u8(dst, d[3]);
+ dst += 16;
+ } while (--x != 0);
+}
+
+// Maximum sum of positive taps: 171 = 7 + 86 + 71 + 7
+// Maximum sum: 255*171 == 0xAA55
+// The sum is clipped to [0, 255], so adding all positive and then
+// subtracting all negative with saturation is sufficient.
+// 0 1 2 3 4 5 6 7
+// tap sign: - + - + + - + -
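+// Equivalently, the positive taps are accumulated in |res| and the negative
+// taps in |temp|; the saturating subtract followed by vqrshrn_n_u16() yields
+// the signed filter sum, rounded by kFilterBits and clipped to [0, 255].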
+inline uint8x8_t SuperRes(const uint8x8_t src[kSuperResFilterTaps],
+ const uint8_t** coefficients) {
+ uint8x16_t f[kSuperResFilterTaps / 2];
+ for (int i = 0; i < kSuperResFilterTaps / 2; ++i, *coefficients += 16) {
+ f[i] = vld1q_u8(*coefficients);
+ }
+ uint16x8_t res = vmull_u8(src[1], vget_high_u8(f[0]));
+ res = vmlal_u8(res, src[3], vget_high_u8(f[1]));
+ res = vmlal_u8(res, src[4], vget_low_u8(f[2]));
+ res = vmlal_u8(res, src[6], vget_low_u8(f[3]));
+ uint16x8_t temp = vmull_u8(src[0], vget_low_u8(f[0]));
+ temp = vmlal_u8(temp, src[2], vget_low_u8(f[1]));
+ temp = vmlal_u8(temp, src[5], vget_high_u8(f[2]));
+ temp = vmlal_u8(temp, src[7], vget_high_u8(f[3]));
+ res = vqsubq_u16(res, temp);
+ return vqrshrn_n_u16(res, kFilterBits);
+}
+
+void SuperRes_NEON(const void* const coefficients, void* const source,
+ const ptrdiff_t stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const dest) {
+ auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = height;
+ do {
+ const auto* filter = static_cast<const uint8_t*>(coefficients);
+ uint8_t* dst_ptr = dst;
+ ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+ int subpixel_x = initial_subpixel_x;
+ uint8x8_t sr[8];
+ uint8x16_t s[8];
+ int x = RightShiftWithCeiling(upscaled_width, 4);
+    // The code below calculates up to 15 extra upscaled pixels, which will
+    // over-read up to 15 downscaled pixels at the end of each row.
+    // kSuperResHorizontalBorder accounts for this.
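+    // Each iteration below gathers the 8 source taps for 16 consecutive
+    // output pixels and transposes them so that tap k of all 16 pixels is
+    // contiguous; two SuperRes() calls then produce 8 results each.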
+ do {
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ sr[i] = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
+ }
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ const uint8x8_t s_hi = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
+ s[i] = vcombine_u8(sr[i], s_hi);
+ }
+ Transpose8x16(s);
+      // Do not use a loop for the following 8 instructions, since the
+      // compiler will generate redundant code.
+ sr[0] = vget_low_u8(s[0]);
+ sr[1] = vget_low_u8(s[1]);
+ sr[2] = vget_low_u8(s[2]);
+ sr[3] = vget_low_u8(s[3]);
+ sr[4] = vget_low_u8(s[4]);
+ sr[5] = vget_low_u8(s[5]);
+ sr[6] = vget_low_u8(s[6]);
+ sr[7] = vget_low_u8(s[7]);
+ const uint8x8_t d0 = SuperRes(sr, &filter);
+      // Do not use a loop for the following 8 instructions, since the
+      // compiler will generate redundant code.
+ sr[0] = vget_high_u8(s[0]);
+ sr[1] = vget_high_u8(s[1]);
+ sr[2] = vget_high_u8(s[2]);
+ sr[3] = vget_high_u8(s[3]);
+ sr[4] = vget_high_u8(s[4]);
+ sr[5] = vget_high_u8(s[5]);
+ sr[6] = vget_high_u8(s[6]);
+ sr[7] = vget_high_u8(s[7]);
+ const uint8x8_t d1 = SuperRes(sr, &filter);
+ vst1q_u8(dst_ptr, vcombine_u8(d0, d1));
+ dst_ptr += 16;
+ } while (--x != 0);
+ src += stride;
+ dst += stride;
+ } while (--y != 0);
+}
+
+void Init8bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ dsp->super_res_coefficients = SuperResCoefficients_NEON;
+ dsp->super_res = SuperRes_NEON;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void SuperResInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void SuperResInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/super_res_neon.h b/src/dsp/arm/super_res_neon.h
new file mode 100644
index 0000000..f51785d
--- /dev/null
+++ b/src/dsp/arm/super_res_neon.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::super_res. This function is not thread-safe.
+void SuperResInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_SuperResClip LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
diff --git a/src/dsp/arm/warp_neon.cc b/src/dsp/arm/warp_neon.cc
new file mode 100644
index 0000000..7a41998
--- /dev/null
+++ b/src/dsp/arm/warp_neon.cc
@@ -0,0 +1,453 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/warp.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <type_traits>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Number of extra bits of precision in warped filtering.
+constexpr int kWarpedDiffPrecisionBits = 10;
+constexpr int kFirstPassOffset = 1 << 14;
+constexpr int kOffsetRemoval =
+ (kFirstPassOffset >> kInterRoundBitsHorizontal) * 128;
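+// Each horizontally filtered value carries a bias of
+// kFirstPassOffset >> kInterRoundBitsHorizontal. The vertical filter taps sum
+// to 128, so the accumulated bias removed in the second pass is that value
+// multiplied by 128.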
+
+// Applies the horizontal filter to one source row and stores the result in
+// |intermediate_result_row|. |intermediate_result_row| is a row in the 15x8
+// |intermediate_result| two-dimensional array.
+//
+// src_row_centered contains 16 "centered" samples of a source row. (We center
+// the samples by subtracting 128 from each sample.)
+void HorizontalFilter(const int sx4, const int16_t alpha,
+ const int8x16_t src_row_centered,
+ int16_t intermediate_result_row[8]) {
+ int sx = sx4 - MultiplyBy4(alpha);
+ int8x8_t filter[8];
+ for (int x = 0; x < 8; ++x) {
+ const int offset = RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ filter[x] = vld1_s8(kWarpedFilters8[offset]);
+ sx += alpha;
+ }
+ Transpose8x8(filter);
+ // Add kFirstPassOffset to ensure |sum| stays within uint16_t.
+  // Add 128 (offset) * 128 (filter sum) (also 1 << 14) to account for the
+  // centering of the source samples. Combined, these equal 1 << 15, which is
+  // -32768 as an int16_t.
+ int16x8_t sum =
+ vdupq_n_s16(static_cast<int16_t>(kFirstPassOffset + 128 * 128));
+ // Unrolled k = 0..7 loop. We need to manually unroll the loop because the
+ // third argument (an index value) to vextq_s8() must be a constant
+ // (immediate). src_row_window is a sliding window of length 8 into
+ // src_row_centered.
+ // k = 0.
+ int8x8_t src_row_window = vget_low_s8(src_row_centered);
+ sum = vmlal_s8(sum, filter[0], src_row_window);
+ // k = 1.
+ src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 1));
+ sum = vmlal_s8(sum, filter[1], src_row_window);
+ // k = 2.
+ src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 2));
+ sum = vmlal_s8(sum, filter[2], src_row_window);
+ // k = 3.
+ src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 3));
+ sum = vmlal_s8(sum, filter[3], src_row_window);
+ // k = 4.
+ src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 4));
+ sum = vmlal_s8(sum, filter[4], src_row_window);
+ // k = 5.
+ src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 5));
+ sum = vmlal_s8(sum, filter[5], src_row_window);
+ // k = 6.
+ src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 6));
+ sum = vmlal_s8(sum, filter[6], src_row_window);
+ // k = 7.
+ src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 7));
+ sum = vmlal_s8(sum, filter[7], src_row_window);
+ // End of unrolled k = 0..7 loop.
+  // Due to the offset, |sum| is guaranteed to be non-negative.
+ uint16x8_t sum_unsigned = vreinterpretq_u16_s16(sum);
+ sum_unsigned = vrshrq_n_u16(sum_unsigned, kInterRoundBitsHorizontal);
+ // After the shift |sum_unsigned| will fit into int16_t.
+ vst1q_s16(intermediate_result_row, vreinterpretq_s16_u16(sum_unsigned));
+}
+
+template <bool is_compound>
+void Warp_NEON(const void* const source, const ptrdiff_t source_stride,
+ const int source_width, const int source_height,
+ const int* const warp_params, const int subsampling_x,
+ const int subsampling_y, const int block_start_x,
+ const int block_start_y, const int block_width,
+ const int block_height, const int16_t alpha, const int16_t beta,
+ const int16_t gamma, const int16_t delta, void* dest,
+ const ptrdiff_t dest_stride) {
+ constexpr int kRoundBitsVertical =
+ is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
+ union {
+ // Intermediate_result is the output of the horizontal filtering and
+ // rounding. The range is within 13 (= bitdepth + kFilterBits + 1 -
+ // kInterRoundBitsHorizontal) bits (unsigned). We use the signed int16_t
+ // type so that we can multiply it by kWarpedFilters (which has signed
+ // values) using vmlal_s16().
+ int16_t intermediate_result[15][8]; // 15 rows, 8 columns.
+ // In the simple special cases where the samples in each row are all the
+ // same, store one sample per row in a column vector.
+ int16_t intermediate_result_column[15];
+ };
+
+ const auto* const src = static_cast<const uint8_t*>(source);
+ using DestType =
+ typename std::conditional<is_compound, int16_t, uint8_t>::type;
+ auto* dst = static_cast<DestType*>(dest);
+
+ assert(block_width >= 8);
+ assert(block_height >= 8);
+
+ // Warp process applies for each 8x8 block.
+ int start_y = block_start_y;
+ do {
+ int start_x = block_start_x;
+ do {
+ const int src_x = (start_x + 4) << subsampling_x;
+ const int src_y = (start_y + 4) << subsampling_y;
+ const int dst_x =
+ src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
+ const int dst_y =
+ src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
+ const int x4 = dst_x >> subsampling_x;
+ const int y4 = dst_y >> subsampling_y;
+ const int ix4 = x4 >> kWarpedModelPrecisionBits;
+ const int iy4 = y4 >> kWarpedModelPrecisionBits;
+ // A prediction block may fall outside the frame's boundaries. If a
+ // prediction block is calculated using only samples outside the frame's
+ // boundary, the filtering can be simplified. We can divide the plane
+ // into several regions and handle them differently.
+ //
+ // | |
+ // 1 | 3 | 1
+ // | |
+ // -------+-----------+-------
+ // |***********|
+ // 2 |*****4*****| 2
+ // |***********|
+ // -------+-----------+-------
+ // | |
+ // 1 | 3 | 1
+ // | |
+ //
+ // At the center, region 4 represents the frame and is the general case.
+ //
+ // In regions 1 and 2, the prediction block is outside the frame's
+ // boundary horizontally. Therefore the horizontal filtering can be
+      // simplified. Furthermore, in region 1 (at the four corners), the
+ // prediction is outside the frame's boundary both horizontally and
+ // vertically, so we get a constant prediction block.
+ //
+ // In region 3, the prediction block is outside the frame's boundary
+      // vertically. Unfortunately, because we apply the horizontal filters
+ // first, by the time we apply the vertical filters, they no longer see
+ // simple inputs. So the only simplification is that all the rows are
+ // the same, but we still need to apply all the horizontal and vertical
+ // filters.
+
+ // Check for two simple special cases, where the horizontal filter can
+ // be significantly simplified.
+ //
+ // In general, for each row, the horizontal filter is calculated as
+ // follows:
+ // for (int x = -4; x < 4; ++x) {
+ // const int offset = ...;
+ // int sum = first_pass_offset;
+ // for (int k = 0; k < 8; ++k) {
+ // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
+ // sum += kWarpedFilters[offset][k] * src_row[column];
+ // }
+ // ...
+ // }
+ // The column index before clipping, ix4 + x + k - 3, varies in the range
+ // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
+ // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
+ // border index (source_width - 1 or 0, respectively). Then for each x,
+ // the inner for loop of the horizontal filter is reduced to multiplying
+ // the border pixel by the sum of the filter coefficients.
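+      // Since the filter coefficients sum to 128 (1 << kFilterBits), that
+      // product is simply border_pixel << kFilterBits, which regions 1 and 2
+      // below take advantage of.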
+ if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
+ // Regions 1 and 2.
+ // Points to the left or right border of the first row of |src|.
+ const uint8_t* first_row_border =
+ (ix4 + 7 <= 0) ? src : src + source_width - 1;
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+ // Region 1.
+ // Every sample used to calculate the prediction block has the same
+ // value. So the whole prediction block has the same value.
+ const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const uint8_t row_border_pixel =
+ first_row_border[row * source_stride];
+
+ DestType* dst_row = dst + start_x - block_start_x;
+ for (int y = 0; y < 8; ++y) {
+ if (is_compound) {
+ const int16x8_t sum =
+ vdupq_n_s16(row_border_pixel << (kInterRoundBitsVertical -
+ kRoundBitsVertical));
+ vst1q_s16(reinterpret_cast<int16_t*>(dst_row), sum);
+ } else {
+ memset(dst_row, row_border_pixel, 8);
+ }
+ dst_row += dest_stride;
+ }
+ // End of region 1. Continue the |start_x| do-while loop.
+ start_x += 8;
+ continue;
+ }
+
+ // Region 2.
+ // Horizontal filter.
+ // The input values in this region are generated by extending the border
+        // The input values in this region are generated by extending the
+        // border, which makes them identical in the horizontal direction.
+        // This computation could be inlined in the vertical pass, but most
+ // It is not necessary to use the offset values here because the
+ // horizontal pass is a simple shift and the vertical pass will always
+ // require using 32 bits.
+ for (int y = -7; y < 8; ++y) {
+ // We may over-read up to 13 pixels above the top source row, or up
+ // to 13 pixels below the bottom source row. This is proved in
+ // warp.cc.
+ const int row = iy4 + y;
+ int sum = first_row_border[row * source_stride];
+ sum <<= (kFilterBits - kInterRoundBitsHorizontal);
+ intermediate_result_column[y + 7] = sum;
+ }
+ // Vertical filter.
+ DestType* dst_row = dst + start_x - block_start_x;
+ int sy4 =
+ (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+#if defined(__aarch64__)
+ const int16x8_t intermediate =
+ vld1q_s16(&intermediate_result_column[y]);
+ int16_t tmp[8];
+ for (int x = 0; x < 8; ++x) {
+ const int offset =
+ RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ const int16x8_t filter = vld1q_s16(kWarpedFilters[offset]);
+ const int32x4_t product_low =
+ vmull_s16(vget_low_s16(filter), vget_low_s16(intermediate));
+ const int32x4_t product_high =
+ vmull_s16(vget_high_s16(filter), vget_high_s16(intermediate));
+ // vaddvq_s32 is only available on __aarch64__.
+ const int32_t sum =
+ vaddvq_s32(product_low) + vaddvq_s32(product_high);
+ const int16_t sum_descale =
+ RightShiftWithRounding(sum, kRoundBitsVertical);
+ if (is_compound) {
+ dst_row[x] = sum_descale;
+ } else {
+ tmp[x] = sum_descale;
+ }
+ sy += gamma;
+ }
+ if (!is_compound) {
+ const int16x8_t sum = vld1q_s16(tmp);
+ vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
+ }
+#else // !defined(__aarch64__)
+ int16x8_t filter[8];
+ for (int x = 0; x < 8; ++x) {
+ const int offset =
+ RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ filter[x] = vld1q_s16(kWarpedFilters[offset]);
+ sy += gamma;
+ }
+ Transpose8x8(filter);
+ int32x4_t sum_low = vdupq_n_s32(0);
+ int32x4_t sum_high = sum_low;
+ for (int k = 0; k < 8; ++k) {
+ const int16_t intermediate = intermediate_result_column[y + k];
+ sum_low =
+ vmlal_n_s16(sum_low, vget_low_s16(filter[k]), intermediate);
+ sum_high =
+ vmlal_n_s16(sum_high, vget_high_s16(filter[k]), intermediate);
+ }
+ const int16x8_t sum =
+ vcombine_s16(vrshrn_n_s32(sum_low, kRoundBitsVertical),
+ vrshrn_n_s32(sum_high, kRoundBitsVertical));
+ if (is_compound) {
+ vst1q_s16(reinterpret_cast<int16_t*>(dst_row), sum);
+ } else {
+ vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
+ }
+#endif // defined(__aarch64__)
+ dst_row += dest_stride;
+ sy4 += delta;
+ }
+ // End of region 2. Continue the |start_x| do-while loop.
+ start_x += 8;
+ continue;
+ }
+
+ // Regions 3 and 4.
+ // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+ // Region 3.
+ // Horizontal filter.
+ const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const uint8_t* const src_row = src + row * source_stride;
+ // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+ // read but is ignored.
+ //
+ // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+ // bytes after src_row[source_width - 1]. We assume the source frame
+ // has left and right borders of at least 13 bytes that extend the
+ // frame boundary pixels. We also assume there is at least one extra
+ // padding byte after the right border of the last source row.
+ const uint8x16_t src_row_v = vld1q_u8(&src_row[ix4 - 7]);
+ // Convert src_row_v to int8 (subtract 128).
+ const int8x16_t src_row_centered =
+ vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128)));
+ int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ HorizontalFilter(sx4, alpha, src_row_centered,
+ intermediate_result[y + 7]);
+ sx4 += beta;
+ }
+ } else {
+ // Region 4.
+ // Horizontal filter.
+ int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ // We may over-read up to 13 pixels above the top source row, or up
+ // to 13 pixels below the bottom source row. This is proved in
+ // warp.cc.
+ const int row = iy4 + y;
+ const uint8_t* const src_row = src + row * source_stride;
+ // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+ // read but is ignored.
+ //
+ // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+ // bytes after src_row[source_width - 1]. We assume the source frame
+ // has left and right borders of at least 13 bytes that extend the
+ // frame boundary pixels. We also assume there is at least one extra
+ // padding byte after the right border of the last source row.
+ const uint8x16_t src_row_v = vld1q_u8(&src_row[ix4 - 7]);
+ // Convert src_row_v to int8 (subtract 128).
+ const int8x16_t src_row_centered =
+ vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128)));
+ HorizontalFilter(sx4, alpha, src_row_centered,
+ intermediate_result[y + 7]);
+ sx4 += beta;
+ }
+ }
+
+ // Regions 3 and 4.
+ // Vertical filter.
+ DestType* dst_row = dst + start_x - block_start_x;
+ int sy4 =
+ (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+ int16x8_t filter[8];
+ for (int x = 0; x < 8; ++x) {
+ const int offset =
+ RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ filter[x] = vld1q_s16(kWarpedFilters[offset]);
+ sy += gamma;
+ }
+ Transpose8x8(filter);
+ int32x4_t sum_low = vdupq_n_s32(-kOffsetRemoval);
+ int32x4_t sum_high = sum_low;
+ for (int k = 0; k < 8; ++k) {
+ const int16x8_t intermediate = vld1q_s16(intermediate_result[y + k]);
+ sum_low = vmlal_s16(sum_low, vget_low_s16(filter[k]),
+ vget_low_s16(intermediate));
+ sum_high = vmlal_s16(sum_high, vget_high_s16(filter[k]),
+ vget_high_s16(intermediate));
+ }
+ const int16x8_t sum =
+ vcombine_s16(vrshrn_n_s32(sum_low, kRoundBitsVertical),
+ vrshrn_n_s32(sum_high, kRoundBitsVertical));
+ if (is_compound) {
+ vst1q_s16(reinterpret_cast<int16_t*>(dst_row), sum);
+ } else {
+ vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
+ }
+ dst_row += dest_stride;
+ sy4 += delta;
+ }
+ start_x += 8;
+ } while (start_x < block_start_x + block_width);
+ dst += 8 * dest_stride;
+ start_y += 8;
+ } while (start_y < block_start_y + block_height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->warp = Warp_NEON</*is_compound=*/false>;
+ dsp->warp_compound = Warp_NEON</*is_compound=*/true>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void WarpInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void WarpInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/warp_neon.h b/src/dsp/arm/warp_neon.h
new file mode 100644
index 0000000..dbcaa23
--- /dev/null
+++ b/src/dsp/arm/warp_neon.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_WARP_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_WARP_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::warp. This function is not thread-safe.
+void WarpInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_Warp LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WarpCompound LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_WARP_NEON_H_
diff --git a/src/dsp/arm/weight_mask_neon.cc b/src/dsp/arm/weight_mask_neon.cc
new file mode 100644
index 0000000..49d3be0
--- /dev/null
+++ b/src/dsp/arm/weight_mask_neon.cc
@@ -0,0 +1,463 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/arm/weight_mask_neon.h"
+
+#include "src/dsp/weight_mask.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+constexpr int kRoundingBits8bpp = 4;
+
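+// Computes 8 mask values from 8 pairs of compound predictions. Each value is
+//   min(64, 38 + (RightShiftWithRounding(|p0 - p1|, kRoundingBits8bpp) >> 4))
+// and is inverted to 64 - mask when |mask_is_inverse| is set.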
+template <bool mask_is_inverse>
+inline void WeightMask8_NEON(const int16_t* prediction_0,
+ const int16_t* prediction_1, uint8_t* mask) {
+ const int16x8_t pred_0 = vld1q_s16(prediction_0);
+ const int16x8_t pred_1 = vld1q_s16(prediction_1);
+ const uint8x8_t difference_offset = vdup_n_u8(38);
+ const uint8x8_t mask_ceiling = vdup_n_u8(64);
+ const uint16x8_t difference = vrshrq_n_u16(
+ vreinterpretq_u16_s16(vabdq_s16(pred_0, pred_1)), kRoundingBits8bpp);
+ const uint8x8_t adjusted_difference =
+ vqadd_u8(vqshrn_n_u16(difference, 4), difference_offset);
+ const uint8x8_t mask_value = vmin_u8(adjusted_difference, mask_ceiling);
+ if (mask_is_inverse) {
+ const uint8x8_t inverted_mask_value = vsub_u8(mask_ceiling, mask_value);
+ vst1_u8(mask, inverted_mask_value);
+ } else {
+ vst1_u8(mask, mask_value);
+ }
+}
+
+#define WEIGHT8_WITHOUT_STRIDE \
+ WeightMask8_NEON<mask_is_inverse>(pred_0, pred_1, mask)
+
+#define WEIGHT8_AND_STRIDE \
+ WEIGHT8_WITHOUT_STRIDE; \
+ pred_0 += 8; \
+ pred_1 += 8; \
+ mask += mask_stride
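+// These macros blend one 8-pixel row. The prediction buffers are read
+// contiguously (advancing by the row width), while only |mask| advances by
+// |mask_stride|.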
+
+template <bool mask_is_inverse>
+void WeightMask8x8_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y = 0;
+ do {
+ WEIGHT8_AND_STRIDE;
+ } while (++y < 7);
+ WEIGHT8_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x16_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT8_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x32_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_WITHOUT_STRIDE;
+}
+
+#define WEIGHT16_WITHOUT_STRIDE \
+ WeightMask8_NEON<mask_is_inverse>(pred_0, pred_1, mask); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8)
+
+#define WEIGHT16_AND_STRIDE \
+ WEIGHT16_WITHOUT_STRIDE; \
+ pred_0 += 16; \
+ pred_1 += 16; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask16x8_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ } while (++y < 7);
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x16_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x32_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x64_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ } while (++y3 < 21);
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+#define WEIGHT32_WITHOUT_STRIDE \
+ WeightMask8_NEON<mask_is_inverse>(pred_0, pred_1, mask); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24)
+
+#define WEIGHT32_AND_STRIDE \
+ WEIGHT32_WITHOUT_STRIDE; \
+ pred_0 += 32; \
+ pred_1 += 32; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask32x8_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x16_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x32_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x64_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ } while (++y3 < 21);
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+#define WEIGHT64_WITHOUT_STRIDE \
+ WeightMask8_NEON<mask_is_inverse>(pred_0, pred_1, mask); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 32, pred_1 + 32, mask + 32); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 40, pred_1 + 40, mask + 40); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 48, pred_1 + 48, mask + 48); \
+ WeightMask8_NEON<mask_is_inverse>(pred_0 + 56, pred_1 + 56, mask + 56)
+
+#define WEIGHT64_AND_STRIDE \
+ WEIGHT64_WITHOUT_STRIDE; \
+ pred_0 += 64; \
+ pred_1 += 64; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask64x16_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x32_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x64_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y3 < 21);
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x128_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y3 < 42);
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x64_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+ do {
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+ } while (++y3 < 21);
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x128_NEON(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+ do {
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+ } while (++y3 < 42);
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+#define INIT_WEIGHT_MASK_8BPP(width, height, w_index, h_index) \
+ dsp->weight_mask[w_index][h_index][0] = \
+ WeightMask##width##x##height##_NEON<0>; \
+ dsp->weight_mask[w_index][h_index][1] = WeightMask##width##x##height##_NEON<1>
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ INIT_WEIGHT_MASK_8BPP(8, 8, 0, 0);
+ INIT_WEIGHT_MASK_8BPP(8, 16, 0, 1);
+ INIT_WEIGHT_MASK_8BPP(8, 32, 0, 2);
+ INIT_WEIGHT_MASK_8BPP(16, 8, 1, 0);
+ INIT_WEIGHT_MASK_8BPP(16, 16, 1, 1);
+ INIT_WEIGHT_MASK_8BPP(16, 32, 1, 2);
+ INIT_WEIGHT_MASK_8BPP(16, 64, 1, 3);
+ INIT_WEIGHT_MASK_8BPP(32, 8, 2, 0);
+ INIT_WEIGHT_MASK_8BPP(32, 16, 2, 1);
+ INIT_WEIGHT_MASK_8BPP(32, 32, 2, 2);
+ INIT_WEIGHT_MASK_8BPP(32, 64, 2, 3);
+ INIT_WEIGHT_MASK_8BPP(64, 16, 3, 1);
+ INIT_WEIGHT_MASK_8BPP(64, 32, 3, 2);
+ INIT_WEIGHT_MASK_8BPP(64, 64, 3, 3);
+ INIT_WEIGHT_MASK_8BPP(64, 128, 3, 4);
+ INIT_WEIGHT_MASK_8BPP(128, 64, 4, 3);
+ INIT_WEIGHT_MASK_8BPP(128, 128, 4, 4);
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void WeightMaskInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void WeightMaskInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/weight_mask_neon.h b/src/dsp/arm/weight_mask_neon.h
new file mode 100644
index 0000000..b4749ec
--- /dev/null
+++ b/src/dsp/arm/weight_mask_neon.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::weight_mask. This function is not thread-safe.
+void WeightMaskInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_8x8 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_8x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_8x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_16x8 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_16x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_16x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_16x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_32x8 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_32x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_32x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_32x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_64x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_64x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_64x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_64x128 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_128x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_128x128 LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_NEON_H_
diff --git a/src/dsp/average_blend.cc b/src/dsp/average_blend.cc
new file mode 100644
index 0000000..a59abb0
--- /dev/null
+++ b/src/dsp/average_blend.cc
@@ -0,0 +1,101 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/average_blend.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int bitdepth, typename Pixel>
+void AverageBlend_C(const void* prediction_0, const void* prediction_1,
+ const int width, const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ // 7.11.3.2 Rounding variables derivation process
+ // 2 * FILTER_BITS(7) - (InterRound0(3|5) + InterRound1(7))
+ constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4;
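+  // For 8bpp this reduces to
+  //   dst[x] = Clip3((pred_0[x] + pred_1[x] + 16) >> 5, 0, 255).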
+ using PredType =
+ typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+ const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+ const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+ auto* dst = static_cast<Pixel*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
+
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ // See warp.cc and convolve.cc for detailed prediction ranges.
+ int res = pred_0[x] + pred_1[x];
+ res -= (bitdepth == 8) ? 0 : kCompoundOffset + kCompoundOffset;
+ dst[x] = static_cast<Pixel>(
+ Clip3(RightShiftWithRounding(res, inter_post_round_bits + 1), 0,
+ (1 << bitdepth) - 1));
+ } while (++x < width);
+
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (++y < height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->average_blend = AverageBlend_C<8, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_AverageBlend
+ dsp->average_blend = AverageBlend_C<8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_AverageBlend
+ dsp->average_blend = AverageBlend_C<10, uint16_t>;
+#endif
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_AverageBlend
+ dsp->average_blend = AverageBlend_C<10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+} // namespace
+
+void AverageBlendInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/average_blend.h b/src/dsp/average_blend.h
new file mode 100644
index 0000000..02ecd09
--- /dev/null
+++ b/src/dsp/average_blend.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_AVERAGE_BLEND_H_
+#define LIBGAV1_SRC_DSP_AVERAGE_BLEND_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/average_blend_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/average_blend_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::average_blend. This function is not thread-safe.
+void AverageBlendInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_AVERAGE_BLEND_H_
diff --git a/src/dsp/cdef.cc b/src/dsp/cdef.cc
new file mode 100644
index 0000000..0b50517
--- /dev/null
+++ b/src/dsp/cdef.cc
@@ -0,0 +1,306 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// Silence unused function warnings when CdefDirection_C is obviated.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp8bpp_CdefDirection) || \
+ (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefDirection))
+constexpr int16_t kDivisionTable[] = {840, 420, 280, 210, 168, 140, 120, 105};
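+// Each entry equals 840 / (i + 1); 840 is the least common multiple of 1
+// through 8, so normalizing a partial sum of i + 1 pixels reduces to an
+// integer multiply.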
+
+int32_t Square(int32_t x) { return x * x; }
+
+template <int bitdepth, typename Pixel>
+void CdefDirection_C(const void* const source, ptrdiff_t stride,
+ uint8_t* const direction, int* const variance) {
+ assert(direction != nullptr);
+ assert(variance != nullptr);
+ const auto* src = static_cast<const Pixel*>(source);
+ stride /= sizeof(Pixel);
+ int32_t cost[8] = {};
+ // |partial| does not have to be int32_t for 8bpp. int16_t will suffice. We
+ // use int32_t to keep it simple since |cost| will have to be int32_t.
+ int32_t partial[8][15] = {};
+ for (int i = 0; i < 8; ++i) {
+ for (int j = 0; j < 8; ++j) {
+ const int x = (src[j] >> (bitdepth - 8)) - 128;
+ partial[0][i + j] += x;
+ partial[1][i + j / 2] += x;
+ partial[2][i] += x;
+ partial[3][3 + i - j / 2] += x;
+ partial[4][7 + i - j] += x;
+ partial[5][3 - i / 2 + j] += x;
+ partial[6][j] += x;
+ partial[7][i / 2 + j] += x;
+ }
+ src += stride;
+ }
+ for (int i = 0; i < 8; ++i) {
+ cost[2] += Square(partial[2][i]);
+ cost[6] += Square(partial[6][i]);
+ }
+ cost[2] *= kDivisionTable[7];
+ cost[6] *= kDivisionTable[7];
+ for (int i = 0; i < 7; ++i) {
+ cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+ kDivisionTable[i];
+ cost[4] += (Square(partial[4][i]) + Square(partial[4][14 - i])) *
+ kDivisionTable[i];
+ }
+ cost[0] += Square(partial[0][7]) * kDivisionTable[7];
+ cost[4] += Square(partial[4][7]) * kDivisionTable[7];
+ for (int i = 1; i < 8; i += 2) {
+ for (int j = 0; j < 5; ++j) {
+ cost[i] += Square(partial[i][3 + j]);
+ }
+ cost[i] *= kDivisionTable[7];
+ for (int j = 0; j < 3; ++j) {
+ cost[i] += (Square(partial[i][j]) + Square(partial[i][10 - j])) *
+ kDivisionTable[2 * j + 1];
+ }
+ }
+ int32_t best_cost = 0;
+ *direction = 0;
+ for (int i = 0; i < 8; ++i) {
+ if (cost[i] > best_cost) {
+ best_cost = cost[i];
+ *direction = i;
+ }
+ }
+ *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
+ // !defined(LIBGAV1_Dsp8bpp_CdefDirection) ||
+ // (LIBGAV1_MAX_BITDEPTH >= 10 &&
+ // !defined(LIBGAV1_Dsp10bpp_CdefDirection))
+
+// Silence unused function warnings when CdefFilter_C is obviated.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp8bpp_CdefFilters) || \
+ (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefFilters))
+
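+// Constrain() maps the difference between a candidate pixel and the center
+// pixel to a bounded correction. For example, with threshold = 4 and
+// damping = 3, Constrain(3, 4, 3) == 3 (small differences pass through) while
+// Constrain(20, 4, 3) == 0 (large differences are treated as real edges and
+// ignored).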
+int Constrain(int diff, int threshold, int damping) {
+ assert(threshold != 0);
+ damping = std::max(0, damping - FloorLog2(threshold));
+ const int sign = (diff < 0) ? -1 : 1;
+ return sign *
+ Clip3(threshold - (std::abs(diff) >> damping), 0, std::abs(diff));
+}
+
+// Filters the source block. It does not check whether a candidate pixel lies
+// inside the frame; instead, it requires the source input to be padded with a
+// constant large value (kCdefLargeValue) at the frame boundary.
+template <int block_width, int bitdepth, typename Pixel,
+ bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_C(const uint16_t* src, const ptrdiff_t src_stride,
+ const int block_height, const int primary_strength,
+ const int secondary_strength, const int damping,
+ const int direction, void* const dest,
+ const ptrdiff_t dest_stride) {
+ static_assert(block_width == 4 || block_width == 8, "Invalid CDEF width.");
+ static_assert(enable_primary || enable_secondary, "");
+ assert(block_height == 4 || block_height == 8);
+ assert(direction >= 0 && direction <= 7);
+ constexpr int coeff_shift = bitdepth - 8;
+ // Section 5.9.19. CDEF params syntax.
+ assert(primary_strength >= 0 && primary_strength <= 15 << coeff_shift);
+ assert(secondary_strength >= 0 && secondary_strength <= 4 << coeff_shift &&
+ secondary_strength != 3 << coeff_shift);
+ assert(primary_strength != 0 || secondary_strength != 0);
+ // damping is decreased by 1 for chroma.
+ assert((damping >= 3 && damping <= 6 + coeff_shift) ||
+ (damping >= 2 && damping <= 5 + coeff_shift));
+  // When only primary_strength or secondary_strength is non-zero, the number
+  // of pixels inspected (4 for primary_strength, 8 for secondary_strength) and
+  // the taps used do not exceed the amount the sum is descaled by (16), so we
+  // can skip tracking and clipping to the minimum and maximum value observed.
+ constexpr bool clipping_required = enable_primary && enable_secondary;
+ static constexpr int kCdefSecondaryTaps[2] = {kCdefSecondaryTap0,
+ kCdefSecondaryTap1};
+ auto* dst = static_cast<Pixel*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
+ int y = block_height;
+ do {
+ int x = 0;
+ do {
+ int16_t sum = 0;
+ const uint16_t pixel_value = src[x];
+ uint16_t max_value = pixel_value;
+ uint16_t min_value = pixel_value;
+ for (int k = 0; k < 2; ++k) {
+ static constexpr int signs[] = {-1, 1};
+ for (const int& sign : signs) {
+ if (enable_primary) {
+ const int dy = sign * kCdefDirections[direction][k][0];
+ const int dx = sign * kCdefDirections[direction][k][1];
+ const uint16_t value = src[dy * src_stride + dx + x];
+ // Note: the summation can ignore the condition check in SIMD
+ // implementation, because Constrain() will return 0 when
+ // value == kCdefLargeValue.
+ if (value != kCdefLargeValue) {
+ sum += Constrain(value - pixel_value, primary_strength, damping) *
+ kCdefPrimaryTaps[(primary_strength >> coeff_shift) & 1][k];
+ if (clipping_required) {
+ max_value = std::max(value, max_value);
+ min_value = std::min(value, min_value);
+ }
+ }
+ }
+
+ if (enable_secondary) {
+ static constexpr int offsets[] = {-2, 2};
+ for (const int& offset : offsets) {
+ const int dy = sign * kCdefDirections[direction + offset][k][0];
+ const int dx = sign * kCdefDirections[direction + offset][k][1];
+ const uint16_t value = src[dy * src_stride + dx + x];
+ // Note: the summation can ignore the condition check in SIMD
+ // implementation.
+ if (value != kCdefLargeValue) {
+ sum += Constrain(value - pixel_value, secondary_strength,
+ damping) *
+ kCdefSecondaryTaps[k];
+ if (clipping_required) {
+ max_value = std::max(value, max_value);
+ min_value = std::min(value, min_value);
+ }
+ }
+ }
+ }
+ }
+ }
+
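+      // Divide the accumulated correction by 16, rounding half away from
+      // zero, e.g. sum == 20 gives an offset of 1 and sum == -20 gives -1.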
+ const int offset = (8 + sum - (sum < 0)) >> 4;
+ if (clipping_required) {
+ dst[x] = static_cast<Pixel>(
+ Clip3(pixel_value + offset, min_value, max_value));
+ } else {
+ dst[x] = static_cast<Pixel>(pixel_value + offset);
+ }
+ } while (++x < block_width);
+
+ src += src_stride;
+ dst += dst_stride;
+ } while (--y != 0);
+}
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
+ // !defined(LIBGAV1_Dsp8bpp_CdefFilters) ||
+ // (LIBGAV1_MAX_BITDEPTH >= 10 &&
+ // !defined(LIBGAV1_Dsp10bpp_CdefFilters))
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->cdef_direction = CdefDirection_C<8, uint8_t>;
+ dsp->cdef_filters[0][0] = CdefFilter_C<4, 8, uint8_t>;
+ dsp->cdef_filters[0][1] = CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] =
+ CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_C<8, 8, uint8_t>;
+ dsp->cdef_filters[1][1] = CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] =
+ CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/false>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_CdefDirection
+ dsp->cdef_direction = CdefDirection_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_CdefFilters
+ dsp->cdef_filters[0][0] = CdefFilter_C<4, 8, uint8_t>;
+ dsp->cdef_filters[0][1] = CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] =
+ CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_C<8, 8, uint8_t>;
+ dsp->cdef_filters[1][1] = CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] =
+ CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/false>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->cdef_direction = CdefDirection_C<10, uint16_t>;
+ dsp->cdef_filters[0][0] = CdefFilter_C<4, 10, uint16_t>;
+ dsp->cdef_filters[0][1] =
+ CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] =
+ CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_C<8, 10, uint16_t>;
+ dsp->cdef_filters[1][1] =
+ CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] =
+ CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/false>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_CdefDirection
+ dsp->cdef_direction = CdefDirection_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_CdefFilters
+ dsp->cdef_filters[0][0] = CdefFilter_C<4, 10, uint16_t>;
+ dsp->cdef_filters[0][1] =
+ CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] =
+ CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_C<8, 10, uint16_t>;
+ dsp->cdef_filters[1][1] =
+ CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] =
+ CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/false>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+} // namespace
+
+void CdefInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/cdef.h b/src/dsp/cdef.h
new file mode 100644
index 0000000..2d70d2c
--- /dev/null
+++ b/src/dsp/cdef.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_CDEF_H_
+#define LIBGAV1_SRC_DSP_CDEF_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/cdef_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/cdef_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
+void CdefInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_CDEF_H_
diff --git a/src/dsp/cdef.inc b/src/dsp/cdef.inc
new file mode 100644
index 0000000..c1a3136
--- /dev/null
+++ b/src/dsp/cdef.inc
@@ -0,0 +1,29 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants used for cdef implementations.
+// This will be included inside an anonymous namespace on files where these are
+// necessary.
+
+const int8_t (*const kCdefDirections)[2][2] = kCdefDirectionsPadded + 2;
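+// The +2 recenters the padded table so that kCdefDirections[direction + offset]
+// is valid for direction in [0, 7] and offset in {-2, 0, 2} without masking.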
+
+// Mirror values and pad to 16 elements.
+alignas(16) constexpr uint32_t kCdefDivisionTable[] = {
+ 840, 420, 280, 210, 168, 140, 120, 105,
+ 120, 140, 168, 210, 280, 420, 840, 0};
+
+// Used when calculating odd |cost[x]| values to mask off unwanted elements.
+// Holds elements 1 3 5 X 5 3 1 X
+alignas(16) constexpr uint32_t kCdefDivisionTableOdd[] = {420, 210, 140, 0,
+ 140, 210, 420, 0};
diff --git a/src/dsp/common.h b/src/dsp/common.h
new file mode 100644
index 0000000..d614a81
--- /dev/null
+++ b/src/dsp/common.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_COMMON_H_
+#define LIBGAV1_SRC_DSP_COMMON_H_
+
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+enum { kSgrStride = kRestorationUnitWidth + 32 }; // anonymous enum
+
+// Self guided projection filter.
+struct SgrProjInfo {
+ int index;
+ int multiplier[2];
+};
+
+struct WienerInfo {
+ static const int kVertical = 0;
+ static const int kHorizontal = 1;
+ int16_t number_leading_zero_coefficients[2];
+ alignas(kMaxAlignment) int16_t filter[2][(kWienerFilterTaps + 1) / 2];
+};
+
+struct RestorationUnitInfo : public MaxAlignedAllocable {
+ LoopRestorationType type;
+ SgrProjInfo sgr_proj_info;
+ WienerInfo wiener_info;
+};
+
+struct SgrBuffer {
+ alignas(kMaxAlignment) uint16_t sum3[4 * kSgrStride];
+ alignas(kMaxAlignment) uint16_t sum5[5 * kSgrStride];
+ alignas(kMaxAlignment) uint32_t square_sum3[4 * kSgrStride];
+ alignas(kMaxAlignment) uint32_t square_sum5[5 * kSgrStride];
+ alignas(kMaxAlignment) uint16_t ma343[4 * kRestorationUnitWidth];
+ alignas(kMaxAlignment) uint16_t ma444[3 * kRestorationUnitWidth];
+ alignas(kMaxAlignment) uint16_t ma565[2 * kRestorationUnitWidth];
+ alignas(kMaxAlignment) uint32_t b343[4 * kRestorationUnitWidth];
+ alignas(kMaxAlignment) uint32_t b444[3 * kRestorationUnitWidth];
+ alignas(kMaxAlignment) uint32_t b565[2 * kRestorationUnitWidth];
+  // The following 2 buffers are only used by the C functions. Since SgrBuffer
+  // is smaller than |wiener_buffer| in RestorationBuffer, which is a union,
+  // it's OK to always keep the following 2 buffers.
+ alignas(kMaxAlignment) uint8_t ma[kSgrStride]; // [0, 255]
+ // b is less than 2^16 for 8-bit. However, making it a template slows down the
+ // C function by 5%. So b is fixed to 32-bit.
+ alignas(kMaxAlignment) uint32_t b[kSgrStride];
+};
+
+union RestorationBuffer {
+ // For self-guided filter.
+ SgrBuffer sgr_buffer;
+ // For wiener filter.
+ // The array |intermediate| in Section 7.17.4, the intermediate results
+ // between the horizontal and vertical filters.
+ alignas(kMaxAlignment) int16_t
+ wiener_buffer[(kRestorationUnitHeight + kWienerFilterTaps - 1) *
+ kRestorationUnitWidth];
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_COMMON_H_
diff --git a/src/dsp/constants.cc b/src/dsp/constants.cc
new file mode 100644
index 0000000..0099ca3
--- /dev/null
+++ b/src/dsp/constants.cc
@@ -0,0 +1,103 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/constants.h"
+
+#include <cstdint>
+
+namespace libgav1 {
+
+// Each set of 7 taps is padded with a 0 to easily align and pack into the high
+// and low 8 bytes. This way, we can load 16 at a time to fit mulhi and mullo.
+const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8] = {
+ {{-6, 10, 0, 0, 0, 12, 0, 0},
+ {-5, 2, 10, 0, 0, 9, 0, 0},
+ {-3, 1, 1, 10, 0, 7, 0, 0},
+ {-3, 1, 1, 2, 10, 5, 0, 0},
+ {-4, 6, 0, 0, 0, 2, 12, 0},
+ {-3, 2, 6, 0, 0, 2, 9, 0},
+ {-3, 2, 2, 6, 0, 2, 7, 0},
+ {-3, 1, 2, 2, 6, 3, 5, 0}},
+ {{-10, 16, 0, 0, 0, 10, 0, 0},
+ {-6, 0, 16, 0, 0, 6, 0, 0},
+ {-4, 0, 0, 16, 0, 4, 0, 0},
+ {-2, 0, 0, 0, 16, 2, 0, 0},
+ {-10, 16, 0, 0, 0, 0, 10, 0},
+ {-6, 0, 16, 0, 0, 0, 6, 0},
+ {-4, 0, 0, 16, 0, 0, 4, 0},
+ {-2, 0, 0, 0, 16, 0, 2, 0}},
+ {{-8, 8, 0, 0, 0, 16, 0, 0},
+ {-8, 0, 8, 0, 0, 16, 0, 0},
+ {-8, 0, 0, 8, 0, 16, 0, 0},
+ {-8, 0, 0, 0, 8, 16, 0, 0},
+ {-4, 4, 0, 0, 0, 0, 16, 0},
+ {-4, 0, 4, 0, 0, 0, 16, 0},
+ {-4, 0, 0, 4, 0, 0, 16, 0},
+ {-4, 0, 0, 0, 4, 0, 16, 0}},
+ {{-2, 8, 0, 0, 0, 10, 0, 0},
+ {-1, 3, 8, 0, 0, 6, 0, 0},
+ {-1, 2, 3, 8, 0, 4, 0, 0},
+ {0, 1, 2, 3, 8, 2, 0, 0},
+ {-1, 4, 0, 0, 0, 3, 10, 0},
+ {-1, 3, 4, 0, 0, 4, 6, 0},
+ {-1, 2, 3, 4, 0, 4, 4, 0},
+ {-1, 2, 2, 3, 4, 3, 3, 0}},
+ {{-12, 14, 0, 0, 0, 14, 0, 0},
+ {-10, 0, 14, 0, 0, 12, 0, 0},
+ {-9, 0, 0, 14, 0, 11, 0, 0},
+ {-8, 0, 0, 0, 14, 10, 0, 0},
+ {-10, 12, 0, 0, 0, 0, 14, 0},
+ {-9, 1, 12, 0, 0, 0, 12, 0},
+ {-8, 0, 0, 12, 0, 1, 11, 0},
+ {-7, 0, 0, 1, 12, 1, 9, 0}}};
+
+// A lookup table replacing the calculation of the variable s in Section 7.17.3
+// (Box filter process). The first index is sgr_proj_index (the lr_sgr_set
+// syntax element in the Spec, saved in the sgr_proj_info.index field of a
+// RestorationUnitInfo struct). The second index is pass (0 or 1).
+//
+// const uint8_t scale = kSgrProjParams[sgr_proj_index][pass * 2 + 1];
+// const uint32_t n2_with_scale = n * n * scale;
+// const uint32_t s =
+// ((1 << kSgrProjScaleBits) + (n2_with_scale >> 1)) / n2_with_scale;
+// 0 is an invalid value, corresponding to radius = 0, where the filter is
+// skipped.
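+//
+// Worked example for sgr_proj_index 0, pass 0 (assuming kSgrProjParams[0] is
+// {2, 12, 1, 4}, i.e. the spec's Sgr_Params[0], and kSgrProjScaleBits == 20):
+// radius 2 gives n = 25, so n2_with_scale = 25 * 25 * 12 = 7500 and
+// s = ((1 << 20) + 3750) / 7500 = 140, the first entry below. Pass 1 uses
+// radius 1, so n = 9, n2_with_scale = 9 * 9 * 4 = 324 and s = 3236.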
+const uint16_t kSgrScaleParameter[16][2] = {
+ {140, 3236}, {112, 2158}, {93, 1618}, {80, 1438}, {70, 1295}, {58, 1177},
+ {47, 1079}, {37, 996}, {30, 925}, {25, 863}, {0, 2589}, {0, 1618},
+ {0, 1177}, {0, 925}, {56, 0}, {22, 0},
+};
+
+const uint8_t kCdefPrimaryTaps[2][2] = {{4, 2}, {3, 3}};
+
+// This is Cdef_Directions (section 7.15.3) with 2 padding entries at the
+// beginning and end of the table. The cdef direction range is [0, 7] and the
+// first index is offset +/-2. This removes the need to constrain the first
+// index to the same range using e.g., & 7.
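+// For example, CdefFilter_C reads kCdefDirections[direction + offset] with
+// direction in [0, 7] and offset in {-2, 0, 2}; direction 7 with offset +2
+// resolves to the trailing padding entry, which duplicates
+// Cdef_Directions[(7 + 2) & 7] == Cdef_Directions[1].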
+const int8_t kCdefDirectionsPadded[12][2][2] = {
+ {{1, 0}, {2, 0}}, // Padding: Cdef_Directions[6]
+ {{1, 0}, {2, -1}}, // Padding: Cdef_Directions[7]
+ {{-1, 1}, {-2, 2}}, // Begin Cdef_Directions
+ {{0, 1}, {-1, 2}}, //
+ {{0, 1}, {0, 2}}, //
+ {{0, 1}, {1, 2}}, //
+ {{1, 1}, {2, 2}}, //
+ {{1, 0}, {2, 1}}, //
+ {{1, 0}, {2, 0}}, //
+ {{1, 0}, {2, -1}}, // End Cdef_Directions
+ {{-1, 1}, {-2, 2}}, // Padding: Cdef_Directions[0]
+ {{0, 1}, {-1, 2}}, // Padding: Cdef_Directions[1]
+};
+
+} // namespace libgav1
diff --git a/src/dsp/constants.h b/src/dsp/constants.h
new file mode 100644
index 0000000..7c1b62c
--- /dev/null
+++ b/src/dsp/constants.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_CONSTANTS_H_
+#define LIBGAV1_SRC_DSP_CONSTANTS_H_
+
+// This file contains DSP related constants that have a direct relationship with
+// a DSP component.
+
+#include <cstdint>
+
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+enum {
+ // Documentation variables.
+ kBitdepth8 = 8,
+ kBitdepth10 = 10,
+ kBitdepth12 = 12,
+ // Weights are quadratic from '1' to '1 / block_size', scaled by
+ // 2^kSmoothWeightScale.
+ kSmoothWeightScale = 8,
+ kCflLumaBufferStride = 32,
+ // InterRound0, Section 7.11.3.2.
+ kInterRoundBitsHorizontal = 3, // 8 & 10-bit.
+ kInterRoundBitsHorizontal12bpp = 5,
+ kInterRoundBitsCompoundVertical = 7, // 8, 10 & 12-bit compound prediction.
+ kInterRoundBitsVertical = 11, // 8 & 10-bit, single prediction.
+ kInterRoundBitsVertical12bpp = 9,
+ // Offset applied to 10bpp and 12bpp predictors to allow storing them in
+ // uint16_t. Removed before blending.
+ kCompoundOffset = (1 << 14) + (1 << 13),
+ kCdefSecondaryTap0 = 2,
+ kCdefSecondaryTap1 = 1,
+}; // anonymous enum
+
+extern const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8];
+
+// Values in this enum can be derived as the sum of subsampling_x and
+// subsampling_y (since the subsampling_x == 0 && subsampling_y == 1 case is
+// never allowed by the bitstream).
+enum SubsamplingType : uint8_t {
+ kSubsamplingType444, // subsampling_x = 0, subsampling_y = 0.
+ kSubsamplingType422, // subsampling_x = 1, subsampling_y = 0.
+ kSubsamplingType420, // subsampling_x = 1, subsampling_y = 1.
+ kNumSubsamplingTypes
+};
+
+extern const uint16_t kSgrScaleParameter[16][2];
+
+extern const uint8_t kCdefPrimaryTaps[2][2];
+
+extern const int8_t kCdefDirectionsPadded[12][2][2];
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_CONSTANTS_H_
diff --git a/src/dsp/convolve.cc b/src/dsp/convolve.cc
new file mode 100644
index 0000000..8c6f68f
--- /dev/null
+++ b/src/dsp/convolve.cc
@@ -0,0 +1,876 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kHorizontalOffset = 3;
+constexpr int kVerticalOffset = 3;
+
+// Compound prediction output ranges from ConvolveTest.ShowRange.
+// Bitdepth: 8 Input range: [ 0, 255]
+// intermediate range: [ -7140, 23460]
+// first pass output range: [ -1785, 5865]
+// intermediate range: [ -328440, 589560]
+// second pass output range: [ 0, 255]
+// compound second pass output range: [ -5132, 9212]
+//
+// Bitdepth: 10 Input range: [ 0, 1023]
+// intermediate range: [ -28644, 94116]
+// first pass output range: [ -7161, 23529]
+// intermediate range: [-1317624, 2365176]
+// second pass output range: [ 0, 1023]
+// compound second pass output range: [ 3988, 61532]
+//
+// Bitdepth: 12 Input range: [ 0, 4095]
+// intermediate range: [ -114660, 376740]
+// first pass output range: [ -7166, 23546]
+// intermediate range: [-1318560, 2366880]
+// second pass output range: [ 0, 4095]
+// compound second pass output range: [ 3974, 61559]
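+//
+// The 10-bit and 12-bit compound ranges above already include kCompoundOffset
+// (24576), which is what allows the compound output to be stored in uint16_t;
+// the 8-bit compound range has no offset applied and fits in int16_t.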
+
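+// |subpixel_x|, |subpixel_y|, |step_x| and |step_y| are expressed in
+// 1/1024th-pel units (kScaleSubPixelBits fractional bits); the 4-bit filter
+// phase is taken from the upper fractional bits via (p >> 6) & kSubPixelMask.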
+template <int bitdepth, typename Pixel>
+void ConvolveScale2D_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index, const int subpixel_x,
+ const int subpixel_y, const int step_x, const int step_y,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ constexpr int kRoundBitsVertical =
+ (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
+ const int intermediate_height =
+ (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+ kScaleSubPixelBits) +
+ kSubPixelTaps;
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+ (2 * kMaxSuperBlockSizeInPixels + 8)];
+ const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+ const int max_pixel_value = (1 << bitdepth) - 1;
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [4, 5].
+ // Similarly for height.
+ int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ int16_t* intermediate = intermediate_result;
+ const auto* src = static_cast<const Pixel*>(reference);
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ auto* dest = static_cast<Pixel*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ // Note: assume the input src is already aligned to the correct start
+ // position.
+ int y = 0;
+ do {
+ int p = subpixel_x;
+ int x = 0;
+ do {
+ int sum = 0;
+ const Pixel* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
+ const int filter_id = (p >> 6) & kSubPixelMask;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src_x[k];
+ }
+ intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+ p += step_x;
+ } while (++x < width);
+
+ src += src_stride;
+ intermediate += intermediate_stride;
+ } while (++y < intermediate_height);
+
+ // Vertical filter.
+ filter_index = GetFilterIndex(vertical_filter_index, height);
+ intermediate = intermediate_result;
+ int p = subpixel_y & 1023;
+ y = 0;
+ do {
+ const int filter_id = (p >> 6) & kSubPixelMask;
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum +=
+ kHalfSubPixelFilters[filter_index][filter_id][k] *
+ intermediate[((p >> kScaleSubPixelBits) + k) * intermediate_stride +
+ x];
+ }
+ dest[x] = Clip3(RightShiftWithRounding(sum, kRoundBitsVertical - 1), 0,
+ max_pixel_value);
+ } while (++x < width);
+
+ dest += dest_stride;
+ p += step_y;
+ } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundScale2D_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int subpixel_x, const int subpixel_y,
+ const int step_x, const int step_y,
+ const int width, const int height,
+ void* prediction, const ptrdiff_t pred_stride) {
+ // All compound functions output to the predictor buffer with |pred_stride|
+ // equal to |width|.
+ assert(pred_stride == width);
+ // Compound functions start at 4x4.
+ assert(width >= 4 && height >= 4);
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ constexpr int kRoundBitsVertical = kInterRoundBitsCompoundVertical;
+ const int intermediate_height =
+ (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+ kScaleSubPixelBits) +
+ kSubPixelTaps;
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+ (2 * kMaxSuperBlockSizeInPixels + 8)];
+ const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [4, 5].
+ // Similarly for height.
+ int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ int16_t* intermediate = intermediate_result;
+ const auto* src = static_cast<const Pixel*>(reference);
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ auto* dest = static_cast<uint16_t*>(prediction);
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ // Note: assume the input src is already aligned to the correct start
+ // position.
+ int y = 0;
+ do {
+ int p = subpixel_x;
+ int x = 0;
+ do {
+ int sum = 0;
+ const Pixel* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
+ const int filter_id = (p >> 6) & kSubPixelMask;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src_x[k];
+ }
+ intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+ p += step_x;
+ } while (++x < width);
+
+ src += src_stride;
+ intermediate += intermediate_stride;
+ } while (++y < intermediate_height);
+
+ // Vertical filter.
+ filter_index = GetFilterIndex(vertical_filter_index, height);
+ intermediate = intermediate_result;
+ int p = subpixel_y & 1023;
+ y = 0;
+ do {
+ const int filter_id = (p >> 6) & kSubPixelMask;
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum +=
+ kHalfSubPixelFilters[filter_index][filter_id][k] *
+ intermediate[((p >> kScaleSubPixelBits) + k) * intermediate_stride +
+ x];
+ }
+ sum = RightShiftWithRounding(sum, kRoundBitsVertical - 1);
+ sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+ dest[x] = sum;
+ } while (++x < width);
+
+ dest += pred_stride;
+ p += step_y;
+ } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCompound2D_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ // All compound functions output to the predictor buffer with |pred_stride|
+ // equal to |width|.
+ assert(pred_stride == width);
+ // Compound functions start at 4x4.
+ assert(width >= 4 && height >= 4);
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ constexpr int kRoundBitsVertical = kInterRoundBitsCompoundVertical;
+ const int intermediate_height = height + kSubPixelTaps - 1;
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+ const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [4, 5].
+ // Similarly for height.
+ int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ int16_t* intermediate = intermediate_result;
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ const auto* src = static_cast<const Pixel*>(reference) -
+ kVerticalOffset * src_stride - kHorizontalOffset;
+ auto* dest = static_cast<uint16_t*>(prediction);
+
+ // If |horizontal_filter_id| == 0 then ConvolveVertical() should be called.
+ assert(horizontal_filter_id != 0);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+ src[x + k];
+ }
+ intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+ } while (++x < width);
+
+ src += src_stride;
+ intermediate += intermediate_stride;
+ } while (++y < intermediate_height);
+
+ // Vertical filter.
+ filter_index = GetFilterIndex(vertical_filter_index, height);
+ intermediate = intermediate_result;
+ // If |vertical_filter_id| == 0 then ConvolveHorizontal() should be called.
+ assert(vertical_filter_id != 0);
+ y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+ intermediate[k * intermediate_stride + x];
+ }
+ sum = RightShiftWithRounding(sum, kRoundBitsVertical - 1);
+ sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+ dest[x] = sum;
+ } while (++x < width);
+
+ dest += pred_stride;
+ intermediate += intermediate_stride;
+ } while (++y < height);
+}
+
+// This function is a simplified version of ConvolveCompound2D_C.
+// It is called in single prediction mode, where both horizontal and vertical
+// filtering are required.
+// The output is the single prediction of the block, clipped to the valid
+// pixel range.
+template <int bitdepth, typename Pixel>
+void Convolve2D_C(const void* const reference, const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id, const int vertical_filter_id,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ constexpr int kRoundBitsVertical =
+ (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
+ const int intermediate_height = height + kSubPixelTaps - 1;
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+ const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+ const int max_pixel_value = (1 << bitdepth) - 1;
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [4, 5].
+ // Similarly for height.
+ int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ int16_t* intermediate = intermediate_result;
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ const auto* src = static_cast<const Pixel*>(reference) -
+ kVerticalOffset * src_stride - kHorizontalOffset;
+ auto* dest = static_cast<Pixel*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+ // If |horizontal_filter_id| == 0 then ConvolveVertical() should be called.
+ assert(horizontal_filter_id != 0);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+ src[x + k];
+ }
+ intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+ } while (++x < width);
+
+ src += src_stride;
+ intermediate += intermediate_stride;
+ } while (++y < intermediate_height);
+
+ // Vertical filter.
+ filter_index = GetFilterIndex(vertical_filter_index, height);
+ intermediate = intermediate_result;
+ // If |vertical_filter_id| == 0 then ConvolveHorizontal() should be called.
+ assert(vertical_filter_id != 0);
+ y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+ intermediate[k * intermediate_stride + x];
+ }
+ dest[x] = Clip3(RightShiftWithRounding(sum, kRoundBitsVertical - 1), 0,
+ max_pixel_value);
+ } while (++x < width);
+
+ dest += dest_stride;
+ intermediate += intermediate_stride;
+ } while (++y < height);
+}
+
+// This function is a simplified version of Convolve2D_C.
+// It is called in single prediction mode, where only horizontal filtering is
+// required.
+// The output is the single prediction of the block, clipped to the valid
+// pixel range.
+template <int bitdepth, typename Pixel>
+void ConvolveHorizontal_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int /*vertical_filter_index*/,
+ const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int bits = kFilterBits - kRoundBitsHorizontal;
+ const auto* src = static_cast<const Pixel*>(reference) - kHorizontalOffset;
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ auto* dest = static_cast<Pixel*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+ const int max_pixel_value = (1 << bitdepth) - 1;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+ src[x + k];
+ }
+ sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+ dest[x] = Clip3(RightShiftWithRounding(sum, bits), 0, max_pixel_value);
+ } while (++x < width);
+
+ src += src_stride;
+ dest += dest_stride;
+ } while (++y < height);
+}
+
+// This function is a simplified version of Convolve2D_C.
+// It is called in single prediction mode, where only vertical filtering is
+// required.
+// The output is the single prediction of the block, clipped to the valid
+// pixel range.
+template <int bitdepth, typename Pixel>
+void ConvolveVertical_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int vertical_filter_index,
+ const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ const auto* src =
+ static_cast<const Pixel*>(reference) - kVerticalOffset * src_stride;
+ auto* dest = static_cast<Pixel*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+ // Copy filters must call ConvolveCopy().
+ assert(vertical_filter_id != 0);
+
+ const int max_pixel_value = (1 << bitdepth) - 1;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+ src[k * src_stride + x];
+ }
+ dest[x] = Clip3(RightShiftWithRounding(sum, kFilterBits - 1), 0,
+ max_pixel_value);
+ } while (++x < width);
+
+ src += src_stride;
+ dest += dest_stride;
+ } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCopy_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
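+  // Both strides are byte strides, so the copy can operate on byte pointers
+  // with a row size of width * sizeof(Pixel), regardless of bitdepth.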
+ int y = 0;
+ do {
+ memcpy(dest, src, width * sizeof(Pixel));
+ src += reference_stride;
+ dest += pred_stride;
+ } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundCopy_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ // All compound functions output to the predictor buffer with |pred_stride|
+ // equal to |width|.
+ assert(pred_stride == width);
+ // Compound functions start at 4x4.
+ assert(width >= 4 && height >= 4);
+ constexpr int kRoundBitsVertical =
+ ((bitdepth == 12) ? kInterRoundBitsVertical12bpp
+ : kInterRoundBitsVertical) -
+ kInterRoundBitsCompoundVertical;
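+  // Note: for 10-bit and 12-bit the bias below, once left-shifted by
+  // |kRoundBitsVertical| (4 and 2 respectively), equals kCompoundOffset
+  // (24576), matching the offset added by the other compound functions.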
+ const auto* src = static_cast<const Pixel*>(reference);
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ auto* dest = static_cast<uint16_t*>(prediction);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = (bitdepth == 8) ? 0 : ((1 << bitdepth) + (1 << (bitdepth - 1)));
+ sum += src[x];
+ dest[x] = sum << kRoundBitsVertical;
+ } while (++x < width);
+ src += src_stride;
+ dest += pred_stride;
+ } while (++y < height);
+}
+
+// This function is a simplified version of ConvolveCompound2D_C.
+// It is called in compound prediction mode, where only horizontal filtering
+// is required.
+// The output is not clipped to the valid pixel range; it will be blended with
+// another predictor to generate the final prediction of the block.
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundHorizontal_C(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int horizontal_filter_index, const int /*vertical_filter_index*/,
+ const int horizontal_filter_id, const int /*vertical_filter_id*/,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ // All compound functions output to the predictor buffer with |pred_stride|
+ // equal to |width|.
+ assert(pred_stride == width);
+ // Compound functions start at 4x4.
+ assert(width >= 4 && height >= 4);
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const auto* src = static_cast<const Pixel*>(reference) - kHorizontalOffset;
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ auto* dest = static_cast<uint16_t*>(prediction);
+ // Copy filters must call ConvolveCopy().
+ assert(horizontal_filter_id != 0);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+ src[x + k];
+ }
+ sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+ sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+ dest[x] = sum;
+ } while (++x < width);
+
+ src += src_stride;
+ dest += pred_stride;
+ } while (++y < height);
+}
+
+// This function is a simplified version of ConvolveCompound2D_C.
+// It is called in compound prediction mode, where only vertical filtering is
+// required.
+// The output is not clipped to the valid pixel range; it will be blended with
+// another predictor to generate the final prediction of the block.
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundVertical_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int vertical_filter_index,
+ const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ // All compound functions output to the predictor buffer with |pred_stride|
+ // equal to |width|.
+ assert(pred_stride == width);
+ // Compound functions start at 4x4.
+ assert(width >= 4 && height >= 4);
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ const auto* src =
+ static_cast<const Pixel*>(reference) - kVerticalOffset * src_stride;
+ auto* dest = static_cast<uint16_t*>(prediction);
+ // Copy filters must call ConvolveCopy().
+ assert(vertical_filter_id != 0);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+ src[k * src_stride + x];
+ }
+ sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+ sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+ dest[x] = sum;
+ } while (++x < width);
+ src += src_stride;
+ dest += pred_stride;
+ } while (++y < height);
+}
+
+// This function is used when intra block copy is present.
+// It is called in single prediction mode for the U/V planes, where the
+// reference block comes from the current frame and both horizontal and
+// vertical filtering are required.
+// The output is the single prediction of the block, clipped to the valid
+// pixel range.
+template <int bitdepth, typename Pixel>
+void ConvolveIntraBlockCopy2D_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/,
+ const int width, const int height,
+ void* prediction, const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const Pixel*>(reference);
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ auto* dest = static_cast<Pixel*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+ const int intermediate_height = height + 1;
+ uint16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + 1)];
+ uint16_t* intermediate = intermediate_result;
+  // Note: vertical access to row height + 1 is allowed. Because this function
+  // is only used for the u/v planes of intra block copy, such access is
+  // guaranteed to be within the prediction block.
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ intermediate[x] = src[x] + src[x + 1];
+ } while (++x < width);
+
+ src += src_stride;
+ intermediate += width;
+ } while (++y < intermediate_height);
+
+ intermediate = intermediate_result;
+ y = 0;
+ do {
+ int x = 0;
+ do {
+ dest[x] =
+ RightShiftWithRounding(intermediate[x] + intermediate[x + width], 2);
+ } while (++x < width);
+
+ intermediate += width;
+ dest += dest_stride;
+ } while (++y < height);
+}
+
+// This function is used when intra block copy is present.
+// It is called in single prediction mode for the U/V planes, where the
+// reference block comes from the current frame and only horizontal or
+// vertical filtering is required.
+// The output is the single prediction of the block, clipped to the valid
+// pixel range.
+// The filtering of intra block copy is simply the average of the current and
+// the next pixel.
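+// For example, neighboring values 10 and 13 produce
+// RightShiftWithRounding(10 + 13, 1) == 12.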
+template <int bitdepth, typename Pixel, bool is_horizontal>
+void ConvolveIntraBlockCopy1D_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/,
+ const int width, const int height,
+ void* prediction, const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const Pixel*>(reference);
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ auto* dest = static_cast<Pixel*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+ const ptrdiff_t offset = is_horizontal ? 1 : src_stride;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ dest[x] = RightShiftWithRounding(src[x] + src[x + offset], 1);
+ } while (++x < width);
+
+ src += src_stride;
+ dest += dest_stride;
+ } while (++y < height);
+}
+
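+// The convolve table filled in below is indexed as
+// convolve[intra_block_copy][is_compound][has_vertical_filter]
+//         [has_horizontal_filter]. The compound intra-block-copy entries stay
+// nullptr because intra block copy is always a single (non-compound)
+// prediction.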
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->convolve[0][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<8, uint8_t>;
+ dsp->convolve[0][0][1][0] = ConvolveVertical_C<8, uint8_t>;
+ dsp->convolve[0][0][1][1] = Convolve2D_C<8, uint8_t>;
+
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<8, uint8_t>;
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<8, uint8_t>;
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<8, uint8_t>;
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<8, uint8_t>;
+
+ dsp->convolve[1][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+ dsp->convolve[1][0][0][1] =
+ ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/true>;
+ dsp->convolve[1][0][1][0] =
+ ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/false>;
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<8, uint8_t>;
+
+ dsp->convolve[1][1][0][0] = nullptr;
+ dsp->convolve[1][1][0][1] = nullptr;
+ dsp->convolve[1][1][1][0] = nullptr;
+ dsp->convolve[1][1][1][1] = nullptr;
+
+ dsp->convolve_scale[0] = ConvolveScale2D_C<8, uint8_t>;
+ dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<8, uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCopy
+ dsp->convolve[0][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical
+ dsp->convolve[0][0][1][0] = ConvolveVertical_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Convolve2D
+ dsp->convolve[0][0][1][1] = Convolve2D_C<8, uint8_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundCopy
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompound2D
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<8, uint8_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy
+ dsp->convolve[1][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyHorizontal
+ dsp->convolve[1][0][0][1] =
+ ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/true>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyVertical
+ dsp->convolve[1][0][1][0] =
+ ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy2D
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<8, uint8_t>;
+#endif
+
+ dsp->convolve[1][1][0][0] = nullptr;
+ dsp->convolve[1][1][0][1] = nullptr;
+ dsp->convolve[1][1][1][0] = nullptr;
+ dsp->convolve[1][1][1][1] = nullptr;
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveScale2D
+ dsp->convolve_scale[0] = ConvolveScale2D_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D
+ dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->convolve[0][0][0][0] = ConvolveCopy_C<10, uint16_t>;
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<10, uint16_t>;
+ dsp->convolve[0][0][1][0] = ConvolveVertical_C<10, uint16_t>;
+ dsp->convolve[0][0][1][1] = Convolve2D_C<10, uint16_t>;
+
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<10, uint16_t>;
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<10, uint16_t>;
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<10, uint16_t>;
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<10, uint16_t>;
+
+ dsp->convolve[1][0][0][0] = ConvolveCopy_C<10, uint16_t>;
+ dsp->convolve[1][0][0][1] =
+ ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/true>;
+ dsp->convolve[1][0][1][0] =
+ ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/false>;
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<10, uint16_t>;
+
+ dsp->convolve[1][1][0][0] = nullptr;
+ dsp->convolve[1][1][0][1] = nullptr;
+ dsp->convolve[1][1][1][0] = nullptr;
+ dsp->convolve[1][1][1][1] = nullptr;
+
+ dsp->convolve_scale[0] = ConvolveScale2D_C<10, uint16_t>;
+ dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<10, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCopy
+ dsp->convolve[0][0][0][0] = ConvolveCopy_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveHorizontal
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveVertical
+ dsp->convolve[0][0][1][0] = ConvolveVertical_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Convolve2D
+ dsp->convolve[0][0][1][1] = Convolve2D_C<10, uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundCopy
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundHorizontal
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundVertical
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompound2D
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<10, uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlockCopy
+ dsp->convolve[1][0][0][0] = ConvolveCopy_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlockHorizontal
+ dsp->convolve[1][0][0][1] =
+ ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/true>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlockVertical
+ dsp->convolve[1][0][1][0] =
+ ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlock2D
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<10, uint16_t>;
+#endif
+
+ dsp->convolve[1][1][0][0] = nullptr;
+ dsp->convolve[1][1][0][1] = nullptr;
+ dsp->convolve[1][1][1][0] = nullptr;
+ dsp->convolve[1][1][1][1] = nullptr;
+
+#ifndef LIBGAV1_Dsp10bpp_ConvolveScale2D
+ dsp->convolve_scale[0] = ConvolveScale2D_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundScale2D
+ dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+} // namespace
+
+void ConvolveInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/convolve.h b/src/dsp/convolve.h
new file mode 100644
index 0000000..5bc0bad
--- /dev/null
+++ b/src/dsp/convolve.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_CONVOLVE_H_
+#define LIBGAV1_SRC_DSP_CONVOLVE_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/convolve_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc.
+// The order of includes is important, as each header tests for a superior
+// version before setting the base.
+// clang-format off
+#include "src/dsp/x86/convolve_avx2.h"
+#include "src/dsp/x86/convolve_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve and Dsp::convolve_scale. This function is not
+// thread-safe.
+void ConvolveInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_CONVOLVE_H_
diff --git a/src/dsp/convolve.inc b/src/dsp/convolve.inc
new file mode 100644
index 0000000..140648b
--- /dev/null
+++ b/src/dsp/convolve.inc
@@ -0,0 +1,50 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants and utility functions used for convolve implementations.
+// This will be included inside an anonymous namespace on files where these are
+// necessary.
+
+int GetNumTapsInFilter(const int filter_index) {
+ if (filter_index < 2) {
+ // Despite the names these only use 6 taps.
+ // kInterpolationFilterEightTap
+ // kInterpolationFilterEightTapSmooth
+ return 6;
+ }
+
+ if (filter_index == 2) {
+ // kInterpolationFilterEightTapSharp
+ return 8;
+ }
+
+ if (filter_index == 3) {
+ // kInterpolationFilterBilinear
+ return 2;
+ }
+
+ assert(filter_index > 3);
+ // For small sizes (width/height <= 4) the large filters are replaced with 4
+ // tap options.
+ // If the original filters were |kInterpolationFilterEightTap| or
+ // |kInterpolationFilterEightTapSharp| then it becomes
+ // |kInterpolationFilterSwitchable|.
+ // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
+ // tap filter.
+ return 4;
+}
+
+constexpr int kIntermediateStride = kMaxSuperBlockSizeInPixels;
+constexpr int kHorizontalOffset = 3;
+constexpr int kFilterIndexShift = 6;
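+
+// A minimal usage sketch (illustrative only; not part of the upstream code):
+// a convolve implementation typically derives its loop bounds from the filter
+// index before filtering. GetFilterIndex() is assumed here for illustration.
+//   const int filter_index = GetFilterIndex(horizontal_filter_id, width);
+//   const int num_taps = GetNumTapsInFilter(filter_index);  // 2, 4, 6 or 8.
+//   // Leftmost tap relative to the current position (8 taps -> 3 pixels).
+//   const int ref_start_x = x - (num_taps / 2 - 1);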
diff --git a/src/dsp/distance_weighted_blend.cc b/src/dsp/distance_weighted_blend.cc
new file mode 100644
index 0000000..a035fbe
--- /dev/null
+++ b/src/dsp/distance_weighted_blend.cc
@@ -0,0 +1,101 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/distance_weighted_blend.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int bitdepth, typename Pixel>
+void DistanceWeightedBlend_C(const void* prediction_0, const void* prediction_1,
+ const uint8_t weight_0, const uint8_t weight_1,
+ const int width, const int height,
+ void* const dest, const ptrdiff_t dest_stride) {
+ // 7.11.3.2 Rounding variables derivation process
+ // 2 * FILTER_BITS(7) - (InterRound0(3|5) + InterRound1(7))
+ constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4;
+ using PredType =
+ typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+ const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+ const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+ auto* dst = static_cast<Pixel*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
+
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ // See warp.cc and convolve.cc for detailed prediction ranges.
+ // weight_0 + weight_1 = 16.
+ int res = pred_0[x] * weight_0 + pred_1[x] * weight_1;
+ res -= (bitdepth == 8) ? 0 : kCompoundOffset * 16;
+ dst[x] = static_cast<Pixel>(
+ Clip3(RightShiftWithRounding(res, inter_post_round_bits + 4), 0,
+ (1 << bitdepth) - 1));
+ } while (++x < width);
+
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (++y < height);
+}
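+
+// A worked example for bitdepth 8 (illustrative values): with weight_0 = 9,
+// weight_1 = 7, pred_0[x] = 2048 and pred_1[x] = 1024:
+//   res = 2048 * 9 + 1024 * 7 = 25600
+// inter_post_round_bits + 4 == 8, so
+//   dst[x] = Clip3((25600 + 128) >> 8, 0, 255) = Clip3(100, 0, 255) = 100.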
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->distance_weighted_blend = DistanceWeightedBlend_C<8, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_DistanceWeightedBlend
+ dsp->distance_weighted_blend = DistanceWeightedBlend_C<8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->distance_weighted_blend = DistanceWeightedBlend_C<10, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_DistanceWeightedBlend
+ dsp->distance_weighted_blend = DistanceWeightedBlend_C<10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+} // namespace
+
+void DistanceWeightedBlendInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/distance_weighted_blend.h b/src/dsp/distance_weighted_blend.h
new file mode 100644
index 0000000..1a782b6
--- /dev/null
+++ b/src/dsp/distance_weighted_blend.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_DISTANCE_WEIGHTED_BLEND_H_
+#define LIBGAV1_SRC_DSP_DISTANCE_WEIGHTED_BLEND_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/distance_weighted_blend_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc.
+// The order of includes is important, as each header tests for a superior
+// version before setting the base.
+// clang-format off
+#include "src/dsp/x86/distance_weighted_blend_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::distance_weighted_blend. This function is not thread-safe.
+void DistanceWeightedBlendInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_DISTANCE_WEIGHTED_BLEND_H_
diff --git a/src/dsp/dsp.cc b/src/dsp/dsp.cc
new file mode 100644
index 0000000..5b54c4e
--- /dev/null
+++ b/src/dsp/dsp.cc
@@ -0,0 +1,150 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/dsp.h"
+
+#include <mutex> // NOLINT (unapproved c++11 header)
+
+#include "src/dsp/arm/weight_mask_neon.h"
+#include "src/dsp/average_blend.h"
+#include "src/dsp/cdef.h"
+#include "src/dsp/convolve.h"
+#include "src/dsp/distance_weighted_blend.h"
+#include "src/dsp/film_grain.h"
+#include "src/dsp/intra_edge.h"
+#include "src/dsp/intrapred.h"
+#include "src/dsp/inverse_transform.h"
+#include "src/dsp/loop_filter.h"
+#include "src/dsp/loop_restoration.h"
+#include "src/dsp/mask_blend.h"
+#include "src/dsp/motion_field_projection.h"
+#include "src/dsp/motion_vector_search.h"
+#include "src/dsp/obmc.h"
+#include "src/dsp/super_res.h"
+#include "src/dsp/warp.h"
+#include "src/dsp/weight_mask.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp_internal {
+
+dsp::Dsp* GetWritableDspTable(int bitdepth) {
+ switch (bitdepth) {
+ case 8: {
+ static dsp::Dsp dsp_8bpp;
+ return &dsp_8bpp;
+ }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ case 10: {
+ static dsp::Dsp dsp_10bpp;
+ return &dsp_10bpp;
+ }
+#endif
+ }
+ return nullptr;
+}
+
+} // namespace dsp_internal
+
+namespace dsp {
+
+void DspInit() {
+ static std::once_flag once;
+ std::call_once(once, []() {
+ AverageBlendInit_C();
+ CdefInit_C();
+ ConvolveInit_C();
+ DistanceWeightedBlendInit_C();
+ FilmGrainInit_C();
+ IntraEdgeInit_C();
+ IntraPredInit_C();
+ InverseTransformInit_C();
+ LoopFilterInit_C();
+ LoopRestorationInit_C();
+ MaskBlendInit_C();
+ MotionFieldProjectionInit_C();
+ MotionVectorSearchInit_C();
+ ObmcInit_C();
+ SuperResInit_C();
+ WarpInit_C();
+ WeightMaskInit_C();
+#if LIBGAV1_ENABLE_SSE4_1 || LIBGAV1_ENABLE_AVX2
+ const uint32_t cpu_features = GetCpuInfo();
+#if LIBGAV1_ENABLE_SSE4_1
+ if ((cpu_features & kSSE4_1) != 0) {
+ AverageBlendInit_SSE4_1();
+ CdefInit_SSE4_1();
+ ConvolveInit_SSE4_1();
+ DistanceWeightedBlendInit_SSE4_1();
+ IntraEdgeInit_SSE4_1();
+ IntraPredInit_SSE4_1();
+ IntraPredCflInit_SSE4_1();
+ IntraPredSmoothInit_SSE4_1();
+ InverseTransformInit_SSE4_1();
+ LoopFilterInit_SSE4_1();
+ LoopRestorationInit_SSE4_1();
+ MaskBlendInit_SSE4_1();
+ MotionFieldProjectionInit_SSE4_1();
+ MotionVectorSearchInit_SSE4_1();
+ ObmcInit_SSE4_1();
+ SuperResInit_SSE4_1();
+ WarpInit_SSE4_1();
+ WeightMaskInit_SSE4_1();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ LoopRestorationInit10bpp_SSE4_1();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ }
+#endif // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_AVX2
+ if ((cpu_features & kAVX2) != 0) {
+ ConvolveInit_AVX2();
+ LoopRestorationInit_AVX2();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ LoopRestorationInit10bpp_AVX2();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ }
+#endif // LIBGAV1_ENABLE_AVX2
+#endif // LIBGAV1_ENABLE_SSE4_1 || LIBGAV1_ENABLE_AVX2
+#if LIBGAV1_ENABLE_NEON
+ AverageBlendInit_NEON();
+ CdefInit_NEON();
+ ConvolveInit_NEON();
+ DistanceWeightedBlendInit_NEON();
+ FilmGrainInit_NEON();
+ IntraEdgeInit_NEON();
+ IntraPredCflInit_NEON();
+ IntraPredDirectionalInit_NEON();
+ IntraPredFilterIntraInit_NEON();
+ IntraPredInit_NEON();
+ IntraPredSmoothInit_NEON();
+ InverseTransformInit_NEON();
+ LoopFilterInit_NEON();
+ LoopRestorationInit_NEON();
+ MaskBlendInit_NEON();
+ MotionFieldProjectionInit_NEON();
+ MotionVectorSearchInit_NEON();
+ ObmcInit_NEON();
+ SuperResInit_NEON();
+ WarpInit_NEON();
+ WeightMaskInit_NEON();
+#endif // LIBGAV1_ENABLE_NEON
+ });
+}
+
+const Dsp* GetDspTable(int bitdepth) {
+ return dsp_internal::GetWritableDspTable(bitdepth);
+}
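+
+// A minimal caller-side sketch (argument values are placeholders): DspInit()
+// must run before any table lookup; afterwards the per-bitdepth table can be
+// fetched once and its entries invoked directly.
+//   DspInit();
+//   const Dsp* const dsp = GetDspTable(8);
+//   assert(dsp != nullptr);
+//   // convolve[0][0][0][0] is the ConvolveCopy entry (no sub-pixel filter,
+//   // single prediction); see ConvolveInit_C().
+//   dsp->convolve[0][0][0][0](reference, reference_stride, 0, 0, 0, 0,
+//                             width, height, prediction, prediction_stride);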
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h
new file mode 100644
index 0000000..fcbac3a
--- /dev/null
+++ b/src/dsp/dsp.h
@@ -0,0 +1,910 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_DSP_H_
+#define LIBGAV1_SRC_DSP_DSP_H_
+
+#include <cstddef> // ptrdiff_t
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/cpu.h"
+#include "src/utils/reference_info.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+
+#if !defined(LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS)
+#define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS 0
+#endif
+
+enum IntraPredictor : uint8_t {
+ kIntraPredictorDcFill,
+ kIntraPredictorDcTop,
+ kIntraPredictorDcLeft,
+ kIntraPredictorDc,
+ kIntraPredictorVertical,
+ kIntraPredictorHorizontal,
+ kIntraPredictorPaeth,
+ kIntraPredictorSmooth,
+ kIntraPredictorSmoothVertical,
+ kIntraPredictorSmoothHorizontal,
+ kNumIntraPredictors
+};
+
+// List of valid 1D transforms.
+enum Transform1D : uint8_t {
+ k1DTransformDct, // Discrete Cosine Transform.
+ k1DTransformAdst, // Asymmetric Discrete Sine Transform.
+ k1DTransformIdentity,
+ k1DTransformWht, // Walsh Hadamard Transform.
+ kNum1DTransforms
+};
+
+// List of valid 1D transform sizes. Not all transforms may be available for all
+// the sizes.
+enum TransformSize1D : uint8_t {
+ k1DTransformSize4,
+ k1DTransformSize8,
+ k1DTransformSize16,
+ k1DTransformSize32,
+ k1DTransformSize64,
+ kNum1DTransformSizes
+};
+
+// The maximum width of the loop filter; fewer pixels may be filtered depending
+// on strength thresholds.
+enum LoopFilterSize : uint8_t {
+ kLoopFilterSize4,
+ kLoopFilterSize6,
+ kLoopFilterSize8,
+ kLoopFilterSize14,
+ kNumLoopFilterSizes
+};
+
+enum : uint8_t {
+ kRow = 0,
+ kColumn = 1,
+};
+
+//------------------------------------------------------------------------------
+// ToString()
+//
+// These functions are meant to be used only in debug logging and within tests.
+// They are defined inline to avoid including the strings in the release
+// library when logging is disabled; unreferenced functions will not be added to
+// any object file in that case.
+
+inline const char* ToString(const IntraPredictor predictor) {
+ switch (predictor) {
+ case kIntraPredictorDcFill:
+ return "kIntraPredictorDcFill";
+ case kIntraPredictorDcTop:
+ return "kIntraPredictorDcTop";
+ case kIntraPredictorDcLeft:
+ return "kIntraPredictorDcLeft";
+ case kIntraPredictorDc:
+ return "kIntraPredictorDc";
+ case kIntraPredictorVertical:
+ return "kIntraPredictorVertical";
+ case kIntraPredictorHorizontal:
+ return "kIntraPredictorHorizontal";
+ case kIntraPredictorPaeth:
+ return "kIntraPredictorPaeth";
+ case kIntraPredictorSmooth:
+ return "kIntraPredictorSmooth";
+ case kIntraPredictorSmoothVertical:
+ return "kIntraPredictorSmoothVertical";
+ case kIntraPredictorSmoothHorizontal:
+ return "kIntraPredictorSmoothHorizontal";
+ case kNumIntraPredictors:
+ return "kNumIntraPredictors";
+ }
+ abort();
+}
+
+inline const char* ToString(const Transform1D transform) {
+ switch (transform) {
+ case k1DTransformDct:
+ return "k1DTransformDct";
+ case k1DTransformAdst:
+ return "k1DTransformAdst";
+ case k1DTransformIdentity:
+ return "k1DTransformIdentity";
+ case k1DTransformWht:
+ return "k1DTransformWht";
+ case kNum1DTransforms:
+ return "kNum1DTransforms";
+ }
+ abort();
+}
+
+inline const char* ToString(const TransformSize1D transform_size) {
+ switch (transform_size) {
+ case k1DTransformSize4:
+ return "k1DTransformSize4";
+ case k1DTransformSize8:
+ return "k1DTransformSize8";
+ case k1DTransformSize16:
+ return "k1DTransformSize16";
+ case k1DTransformSize32:
+ return "k1DTransformSize32";
+ case k1DTransformSize64:
+ return "k1DTransformSize64";
+ case kNum1DTransformSizes:
+ return "kNum1DTransformSizes";
+ }
+ abort();
+}
+
+inline const char* ToString(const LoopFilterSize filter_size) {
+ switch (filter_size) {
+ case kLoopFilterSize4:
+ return "kLoopFilterSize4";
+ case kLoopFilterSize6:
+ return "kLoopFilterSize6";
+ case kLoopFilterSize8:
+ return "kLoopFilterSize8";
+ case kLoopFilterSize14:
+ return "kLoopFilterSize14";
+ case kNumLoopFilterSizes:
+ return "kNumLoopFilterSizes";
+ }
+ abort();
+}
+
+inline const char* ToString(const LoopFilterType filter_type) {
+ switch (filter_type) {
+ case kLoopFilterTypeVertical:
+ return "kLoopFilterTypeVertical";
+ case kLoopFilterTypeHorizontal:
+ return "kLoopFilterTypeHorizontal";
+ case kNumLoopFilterTypes:
+ return "kNumLoopFilterTypes";
+ }
+ abort();
+}
+
+//------------------------------------------------------------------------------
+// Intra predictors. Section 7.11.2.
+// These require access to one or both of the top row and left column. Some may
+// access the top-left (top[-1]), top-right (top[width+N]), bottom-left
+// (left[height+N]) or upper-left (left[-1]).
+
+// Intra predictor function signature. Sections 7.11.2.2, 7.11.2.4 (#10,#11),
+// 7.11.2.5, 7.11.2.6.
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
+// the row above |dst|. |left| is an aligned vector of the column to the left
+// of |dst|. top-left and bottom-left may be accessed.
+using IntraPredictorFunc = void (*)(void* dst, ptrdiff_t stride,
+ const void* top, const void* left);
+using IntraPredictorFuncs =
+ IntraPredictorFunc[kNumTransformSizes][kNumIntraPredictors];
+
+// Directional intra predictor function signature, zone 1 (0 < angle < 90).
+// Section 7.11.2.4 (#7).
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
+// the row above |dst|. |width| and |height| give the dimensions of the block.
+// |xstep| is the scaled starting index to |top| from
+// kDirectionalIntraPredictorDerivative. |upsampled_top| indicates whether
+// |top| has been upsampled as described in '7.11.2.11. Intra edge upsample
+// process'. This can occur in cases with |width| + |height| <= 16. top-right
+// is accessed.
+using DirectionalIntraPredictorZone1Func = void (*)(void* dst, ptrdiff_t stride,
+ const void* top, int width,
+ int height, int xstep,
+ bool upsampled_top);
+
+// Directional intra predictor function signature, zone 2 (90 < angle < 180).
+// Section 7.11.2.4 (#8).
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
+// the row above |dst|. |left| is an aligned vector of the column to the left of
+// |dst|. |width| and |height| give the dimensions of the block. |xstep| and
+// |ystep| are the scaled starting index to |top| and |left|, respectively,
+// from kDirectionalIntraPredictorDerivative. |upsampled_top| and
+// |upsampled_left| indicate whether |top| and |left| have been upsampled as
+// described in '7.11.2.11. Intra edge upsample process'. This can occur in
+// cases with |width| + |height| <= 16. top-left and upper-left are accessed,
+// up to [-2] in each if |upsampled_top/left| are set.
+using DirectionalIntraPredictorZone2Func = void (*)(
+ void* dst, ptrdiff_t stride, const void* top, const void* left, int width,
+ int height, int xstep, int ystep, bool upsampled_top, bool upsampled_left);
+
+// Directional intra predictor function signature, zone 3 (180 < angle < 270).
+// Section 7.11.2.4 (#9).
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |left| is an aligned vector of the
+// column to the left of |dst|. |width| and |height| give the dimensions of the
+// block. |ystep| is the scaled starting index to |left| from
+// kDirectionalIntraPredictorDerivative. |upsampled_left| indicates whether
+// |left| has been upsampled as described in '7.11.2.11. Intra edge upsample
+// process'. This can occur in cases with |width| + |height| <= 16. bottom-left
+// is accessed.
+using DirectionalIntraPredictorZone3Func = void (*)(void* dst, ptrdiff_t stride,
+ const void* left, int width,
+ int height, int ystep,
+ bool upsampled_left);
+
+// Filter intra predictor function signature. Section 7.11.2.3.
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
+// the row above |dst|. |left| is an aligned vector of the column to the left
+// of |dst|. |width| and |height| are the size of the block in pixels.
+using FilterIntraPredictorFunc = void (*)(void* dst, ptrdiff_t stride,
+ const void* top, const void* left,
+ FilterIntraPredictor pred, int width,
+ int height);
+
+//------------------------------------------------------------------------------
+// Chroma from Luma (Cfl) prediction. Section 7.11.5.
+
+// Chroma from Luma (Cfl) intra prediction function signature. |dst| is an
+// unaligned pointer to the output block. Pixel size is determined by bitdepth
+// with |stride| given in bytes. |luma| contains subsampled luma pixels with 3
+// fractional bits of precision. |alpha| is the signed Cfl alpha value for the
+// appropriate plane.
+using CflIntraPredictorFunc = void (*)(
+ void* dst, ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], int alpha);
+using CflIntraPredictorFuncs = CflIntraPredictorFunc[kNumTransformSizes];
+
+// Chroma from Luma (Cfl) subsampler function signature. |luma| is an unaligned
+// pointer to the output block. |src| is an unaligned pointer to the input
+// block. Pixel size is determined by bitdepth with |stride| given in bytes.
+using CflSubsamplerFunc =
+ void (*)(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ int max_luma_width, int max_luma_height, const void* source,
+ ptrdiff_t stride);
+using CflSubsamplerFuncs =
+ CflSubsamplerFunc[kNumTransformSizes][kNumSubsamplingTypes];
+
+//------------------------------------------------------------------------------
+// Intra Edge Filtering and Upsampling. Step 4 in section 7.11.2.4.
+
+// Intra edge filter function signature. |buffer| is a pointer to the top_row or
+// left_column that needs to be filtered. Typically the -1'th index of |top_row|
+// and |left_column| need to be filtered as well, so the caller can merely pass
+// the |buffer| as top_row[-1] or left_column[-1]. Pixel size is determined by
+// bitdepth. |size| is the number of pixels to be filtered. |strength| is the
+// filter strength. Section 7.11.2.12 in the spec.
+using IntraEdgeFilterFunc = void (*)(void* buffer, int size, int strength);
+
+// Intra edge upsampler function signature. |buffer| is a pointer to the top_row
+// or left_column that needs to be upsampled. Pixel size is determined by
+// bitdepth. |size| is the number of pixels to be upsampled; valid values are:
+// 4, 8, 12, 16. This function needs access to negative indices -1 and -2 of
+// the |buffer|. Section 7.11.2.11 in the spec.
+using IntraEdgeUpsamplerFunc = void (*)(void* buffer, int size);
+
+//------------------------------------------------------------------------------
+// Inverse transform add function signature.
+//
+// Steps 2 and 3 of section 7.12.3 (contains the implementation of section
+// 7.13.3).
+// Apply the inverse transforms and add the residual to the destination frame
+// for the transform type and block size |tx_size| starting at position
+// |start_x| and |start_y|. |dst_frame| is a pointer to an Array2D.
+// |adjusted_tx_height| is the number of rows to process based on the non-zero
+// coefficient count in the block. It will be 1 (non-zero coefficient count ==
+// 1), 4 or a multiple of 8 up to 32 or the original transform height,
+// whichever is less.
+using InverseTransformAddFunc = void (*)(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame);
+// The final dimension holds row and column transforms indexed with kRow and
+// kColumn.
+using InverseTransformAddFuncs =
+ InverseTransformAddFunc[kNum1DTransforms][kNum1DTransformSizes][2];
+
+//------------------------------------------------------------------------------
+// Post processing.
+
+// Loop filter function signature. Section 7.14.
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes.
+using LoopFilterFunc = void (*)(void* dst, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+using LoopFilterFuncs =
+ LoopFilterFunc[kNumLoopFilterSizes][kNumLoopFilterTypes];
+
+// Cdef direction function signature. Section 7.15.2.
+// |src| is a pointer to the source block. Pixel size is determined by bitdepth
+// with |stride| given in bytes. |direction| and |variance| are output
+// parameters and must not be nullptr.
+using CdefDirectionFunc = void (*)(const void* src, ptrdiff_t stride,
+ uint8_t* direction, int* variance);
+
+// Cdef filtering function signature. Section 7.15.3.
+// |source| is a pointer to the input block padded with kCdefLargeValue if at a
+// frame border. |source_stride| is given in units of uint16_t.
+// |block_height| is the height of the input block; the block width is selected
+// via the index into CdefFilteringFuncs below.
+// |primary_strength|, |secondary_strength|, and |damping| are Cdef filtering
+// parameters.
+// |direction| is the filtering direction.
+// |dest| is the output buffer. |dest_stride| is given in bytes.
+using CdefFilteringFunc = void (*)(const uint16_t* source,
+ ptrdiff_t source_stride, int block_height,
+ int primary_strength, int secondary_strength,
+ int damping, int direction, void* dest,
+ ptrdiff_t dest_stride);
+
+// The first index is block width: [0]: 4, [1]: 8. The second is based on
+// non-zero strengths: [0]: |primary_strength| and |secondary_strength|, [1]:
+// |primary_strength| only, [2]: |secondary_strength| only.
+using CdefFilteringFuncs = CdefFilteringFunc[2][3];
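+
+// Selection sketch (the condition names are illustrative; the actual choice is
+// made by the caller during post filtering):
+//   const int width_index = (block_width == 8) ? 1 : 0;
+//   const int strength_index =
+//       (secondary_strength == 0) ? 1 : (primary_strength == 0) ? 2 : 0;
+//   dsp->cdef_filters[width_index][strength_index](
+//       source, source_stride, block_height, primary_strength,
+//       secondary_strength, damping, direction, dest, dest_stride);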
+
+// Upscaling coefficients function signature. Section 7.16.
+// This is an auxiliary function for SIMD optimizations and has no corresponding
+// C function. Different SIMD versions may have different outputs. So it must
+// pair with the corresponding version of SuperResFunc.
+// |upscaled_width| is the width of the output frame.
+// |step| is the number of subpixels to move the kernel for the next destination
+// pixel.
+// |initial_subpixel_x| is a base offset from which |step| increments.
+// |coefficients| is the upscale filter used by each pixel in a row.
+using SuperResCoefficientsFunc = void (*)(int upscaled_width,
+ int initial_subpixel_x, int step,
+ void* coefficients);
+
+// Upscaling process function signature. Section 7.16.
+// |coefficients| is the upscale filter used by each pixel in a row. It is not
+// used by the C function.
+// |source| is the input frame buffer. It will be line extended.
+// |dest| is the output buffer.
+// |stride| is given in pixels, and shared by |source| and |dest|.
+// |height| is the height of the block to be processed.
+// |downscaled_width| is the width of the input frame.
+// |upscaled_width| is the width of the output frame.
+// |step| is the number of subpixels to move the kernel for the next destination
+// pixel.
+// |initial_subpixel_x| is a base offset from which |step| increments.
+using SuperResFunc = void (*)(const void* coefficients, void* source,
+ ptrdiff_t stride, int height,
+ int downscaled_width, int upscaled_width,
+ int initial_subpixel_x, int step, void* dest);
+
+// Loop restoration function signature. Sections 7.16, 7.17.
+// |restoration_info| contains loop restoration information, such as filter
+// type, strength.
+// |source| is the input frame buffer, which is deblocked and cdef filtered.
+// |top_border| and |bottom_border| are the top and bottom borders.
+// |dest| is the output.
+// |stride| is given in pixels, and shared by |source|, |top_border|,
+// |bottom_border| and |dest|.
+// |restoration_buffer| contains buffers required for self guided filter and
+// wiener filter. They must be initialized before calling.
+using LoopRestorationFunc = void (*)(
+ const RestorationUnitInfo& restoration_info, const void* source,
+ const void* top_border, const void* bottom_border, ptrdiff_t stride,
+ int width, int height, RestorationBuffer* restoration_buffer, void* dest);
+
+// Index 0 is Wiener Filter.
+// Index 1 is Self Guided Restoration Filter.
+// This can be accessed as LoopRestorationType - 2.
+using LoopRestorationFuncs = LoopRestorationFunc[2];
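+
+// Indexing sketch (the |type| field name is assumed for illustration): since
+// the first two LoopRestorationType values have no entry here, a Wiener unit
+// selects index 0 and a self guided unit selects index 1:
+//   const LoopRestorationFunc func =
+//       dsp->loop_restorations[restoration_info.type - 2];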
+
+// Convolve function signature. Section 7.11.3.4.
+// This function applies a horizontal filter followed by a vertical filter.
+// |reference| is the input block (reference frame buffer). |reference_stride|
+// is the corresponding frame stride.
+// |vertical_filter_index|/|horizontal_filter_index| is the index to
+// retrieve the type of filter to be applied for vertical/horizontal direction
+// from the filter lookup table 'kSubPixelFilters'.
+// |horizontal_filter_id| and |vertical_filter_id| are the filter ids.
+// |width| and |height| are width and height of the block to be filtered.
+// |ref_last_x| and |ref_last_y| are the last pixel of the reference frame in
+// x/y direction.
+// |prediction| is the output block (output frame buffer).
+// Rounding precision is derived from the function being called. For horizontal
+// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
+// used. For compound vertical filtering kInterRoundBitsCompoundVertical will be
+// used. Otherwise kInterRoundBitsVertical & kInterRoundBitsVertical12bpp will
+// be used.
+using ConvolveFunc = void (*)(const void* reference, ptrdiff_t reference_stride,
+ int horizontal_filter_index,
+ int vertical_filter_index,
+ int horizontal_filter_id, int vertical_filter_id,
+ int width, int height, void* prediction,
+ ptrdiff_t pred_stride);
+
+// Convolve functions signature. Each points to one convolve function with
+// a specific setting:
+// ConvolveFunc[is_intra_block_copy][is_compound][has_vertical_filter]
+// [has_horizontal_filter].
+// If is_compound is false, the prediction is clipped to Pixel.
+// If is_compound is true, the range of prediction is:
+// 8bpp: [-5132, 9212] (int16_t)
+// 10bpp: [ 3988, 61532] (uint16_t)
+// 12bpp: [ 3974, 61559] (uint16_t)
+// See src/dsp/convolve.cc
+using ConvolveFuncs = ConvolveFunc[2][2][2][2];
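+
+// Indexing sketch (the boolean names are illustrative; the actual selection is
+// made during tile reconstruction): each dimension answers one question about
+// the prediction, e.g. whether a sub-pixel filter is applied in a direction:
+//   const ConvolveFunc func =
+//       dsp->convolve[is_intra_block_copy][is_compound]
+//                    [has_vertical_filter][has_horizontal_filter];
+// The Init functions leave the intra block copy + compound entries as nullptr,
+// so that combination is never dispatched.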
+
+// Convolve + scale function signature. Section 7.11.3.4.
+// This function applies a horizontal filter followed by a vertical filter.
+// |reference| is the input block (reference frame buffer). |reference_stride|
+// is the corresponding frame stride.
+// |vertical_filter_index|/|horizontal_filter_index| is the index to
+// retrieve the type of filter to be applied for vertical/horizontal direction
+// from the filter lookup table 'kSubPixelFilters'.
+// |subpixel_x| and |subpixel_y| are starting positions in units of 1/1024.
+// |step_x| and |step_y| are step sizes in units of 1/1024 of a pixel.
+// |width| and |height| are width and height of the block to be filtered.
+// |ref_last_x| and |ref_last_y| are the last pixel of the reference frame in
+// x/y direction.
+// |prediction| is the output block (output frame buffer).
+// Rounding precision is derived from the function being called. For horizontal
+// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
+// used. For compound vertical filtering kInterRoundBitsCompoundVertical will be
+// used. Otherwise kInterRoundBitsVertical & kInterRoundBitsVertical12bpp will
+// be used.
+using ConvolveScaleFunc = void (*)(const void* reference,
+ ptrdiff_t reference_stride,
+ int horizontal_filter_index,
+ int vertical_filter_index, int subpixel_x,
+ int subpixel_y, int step_x, int step_y,
+ int width, int height, void* prediction,
+ ptrdiff_t pred_stride);
+
+// Convolve functions signature for scaling version.
+// 0: single predictor. 1: compound predictor.
+using ConvolveScaleFuncs = ConvolveScaleFunc[2];
+
+// Weight mask function signature. Section 7.11.3.12.
+// |prediction_0| is the first input block.
+// |prediction_1| is the second input block. Both blocks are int16_t* when
+// bitdepth == 8 and uint16_t* otherwise.
+// |width| and |height| are the prediction width and height.
+// The stride for the input buffers is equal to |width|.
+// The valid range of block size is [8x8, 128x128] for the luma plane.
+// |mask| is the output buffer. |mask_stride| is the output buffer stride.
+using WeightMaskFunc = void (*)(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride);
+
+// Weight mask functions signature. The dimensions (in order) are:
+// * Width index (4 => 0, 8 => 1, 16 => 2 and so on).
+// * Height index (4 => 0, 8 => 1, 16 => 2 and so on).
+// * mask_is_inverse.
+using WeightMaskFuncs = WeightMaskFunc[6][6][2];
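+
+// Index computation sketch (assuming a FloorLog2() helper and the power-of-two
+// mapping described above):
+//   const int width_index = FloorLog2(width) - 2;    // 8 -> 1, 128 -> 5.
+//   const int height_index = FloorLog2(height) - 2;
+//   dsp->weight_mask[width_index][height_index][mask_is_inverse](
+//       prediction_0, prediction_1, mask, mask_stride);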
+
+// Average blending function signature.
+// Two predictors are averaged to generate the output.
+// Input predictor values are int16_t when bitdepth == 8 and uint16_t
+// otherwise. The output is written in the actual range of Pixel values.
+// Average blending is in the bottom of Section 7.11.3.1 (COMPOUND_AVERAGE).
+// |prediction_0| is the first input block.
+// |prediction_1| is the second input block. Both blocks are int16_t* when
+// bitdepth == 8 and uint16_t* otherwise.
+// |width| and |height| are the same for the first and second input blocks.
+// The stride for the input buffers is equal to |width|.
+// The valid range of block size is [8x8, 128x128] for the luma plane.
+// |dest| is the output buffer. |dest_stride| is the output buffer stride.
+using AverageBlendFunc = void (*)(const void* prediction_0,
+ const void* prediction_1, int width,
+ int height, void* dest,
+ ptrdiff_t dest_stride);
+
+// Distance weighted blending function signature.
+// Weights are generated in Section 7.11.3.15.
+// Weighted blending is in the bottom of Section 7.11.3.1 (COMPOUND_DISTANCE).
+// This function takes two blocks (inter frame prediction) and produces a
+// weighted output.
+// |prediction_0| is the first input block.
+// |prediction_1| is the second input block. Both blocks are int16_t* when
+// bitdepth == 8 and uint16_t* otherwise.
+// |weight_0| is the weight for the first block. It is derived from the relative
+// distance of the first reference frame and the current frame.
+// |weight_1| is the weight for the second block. It is derived from the
+// relative distance of the second reference frame and the current frame.
+// |width| and |height| are the same for the first and second input blocks.
+// The stride for the input buffers is equal to |width|.
+// The valid range of block size is [8x8, 128x128] for the luma plane.
+// |dest| is the output buffer. |dest_stride| is the output buffer stride.
+using DistanceWeightedBlendFunc = void (*)(const void* prediction_0,
+ const void* prediction_1,
+ uint8_t weight_0, uint8_t weight_1,
+ int width, int height, void* dest,
+ ptrdiff_t dest_stride);
+
+// Mask blending function signature. Section 7.11.3.14.
+// This function takes two blocks and produces a blended output stored into the
+// output block |dest|. The blending is a weighted average process, controlled
+// by values of the mask.
+// |prediction_0| is the first input block. When prediction mode is inter_intra
+// (or wedge_inter_intra), this refers to the inter frame prediction. It is
+// int16_t* when bitdepth == 8 and uint16_t* otherwise.
+// The stride for |prediction_0| is equal to |width|.
+// |prediction_1| is the second input block. When prediction mode is inter_intra
+// (or wedge_inter_intra), this refers to the intra frame prediction and uses
+// Pixel values. It is only used for intra frame prediction when bitdepth >= 10.
+// It is int16_t* when bitdepth == 8 and uint16_t* otherwise.
+// |prediction_stride_1| is the stride, given in units of [u]int16_t. When
+// |is_inter_intra| is false (compound prediction) then |prediction_stride_1| is
+// equal to |width|.
+// |mask| is an integer array, whose value indicates the weight of the blending.
+// |mask_stride| is corresponding stride.
+// |width|, |height| are the same for both input blocks.
+// If it's inter_intra (or wedge_inter_intra), the valid range of block size is
+// [8x8, 32x32]. Otherwise (including difference weighted prediction and
+// compound average prediction), the valid range is [8x8, 128x128].
+// If there's subsampling, the corresponding width and height are halved for
+// chroma planes.
+// |subsampling_x|, |subsampling_y| are the subsampling factors.
+// |is_inter_intra| stands for the prediction mode. If it is true, one of the
+// prediction blocks is from intra prediction of current frame. Otherwise, two
+// prediction blocks are both inter frame predictions.
+// |is_wedge_inter_intra| indicates if the mask is for the wedge prediction.
+// |dest| is the output block.
+// |dest_stride| is the corresponding stride for dest.
+using MaskBlendFunc = void (*)(const void* prediction_0,
+ const void* prediction_1,
+ ptrdiff_t prediction_stride_1,
+ const uint8_t* mask, ptrdiff_t mask_stride,
+ int width, int height, void* dest,
+ ptrdiff_t dest_stride);
+
+// Mask blending functions signature. Each points to one function with
+// a specific setting:
+// MaskBlendFunc[subsampling_x + subsampling_y][is_inter_intra].
+using MaskBlendFuncs = MaskBlendFunc[3][2];
+
+// This function is similar to the MaskBlendFunc. It is only used when
+// |is_inter_intra| is true and |bitdepth| == 8.
+// |prediction_[01]| are Pixel values (uint8_t).
+// |prediction_1| is also the output buffer.
+using InterIntraMaskBlendFunc8bpp = void (*)(const uint8_t* prediction_0,
+ uint8_t* prediction_1,
+ ptrdiff_t prediction_stride_1,
+ const uint8_t* mask,
+ ptrdiff_t mask_stride, int width,
+ int height);
+
+// InterIntra8bpp mask blending functions signature. When is_wedge_inter_intra
+// is false, the function at index 0 must be used. Otherwise, the function at
+// index subsampling_x + subsampling_y must be used.
+using InterIntraMaskBlendFuncs8bpp = InterIntraMaskBlendFunc8bpp[3];
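+
+// Selection sketch, restating the rule above in code form:
+//   const int function_index =
+//       is_wedge_inter_intra ? subsampling_x + subsampling_y : 0;
+//   dsp->inter_intra_mask_blend_8bpp[function_index](
+//       prediction_0, prediction_1, prediction_stride_1, mask, mask_stride,
+//       width, height);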
+
+// Obmc (overlapped block motion compensation) blending function signature.
+// Section 7.11.3.10.
+// This function takes two blocks and produces a blended output stored into the
+// first input block. The blending is a weighted average process, controlled by
+// values of the mask.
+// Obmc is not a compound mode. It is different from other compound blending,
+// in terms of precision. The current block is computed using convolution with
+// clipping to the range of pixel values. Its above and left blocks are also
+// clipped. Therefore obmc blending process doesn't need to clip the output.
+// |prediction| is the first input block, which will be overwritten.
+// |prediction_stride| is the stride, given in bytes.
+// |width|, |height| are the same for both input blocks.
+// |obmc_prediction| is the second input block.
+// |obmc_prediction_stride| is its stride, given in bytes.
+using ObmcBlendFunc = void (*)(void* prediction, ptrdiff_t prediction_stride,
+ int width, int height,
+ const void* obmc_prediction,
+ ptrdiff_t obmc_prediction_stride);
+using ObmcBlendFuncs = ObmcBlendFunc[kNumObmcDirections];
+
+// Warp function signature. Section 7.11.3.5.
+// This function applies warp filtering for each 8x8 block inside the current
+// coding block. The filtering process is similar to 2d convolve filtering.
+// The horizontal filter is applied followed by the vertical filter.
+// The function has to calculate corresponding pixel positions before and
+// after warping.
+// |source| is the input reference frame buffer.
+// |source_stride|, |source_width|, |source_height| are corresponding frame
+// stride, width, and height. |source_stride| is given in bytes.
+// |warp_params| is the matrix of warp motion: warp_params[i] = mN.
+//        [x'     (m2 m3 m0   [x
+//    z .  y'  =   m4 m5 m1 *  y
+//        1]       m6 m7 1)    1]
+// |subsampling_x/y| is the current frame's plane subsampling factor.
+// |block_start_x| and |block_start_y| are the starting position of the current
+// coding block.
+// |block_width| and |block_height| are width and height of the current coding
+// block. |block_width| and |block_height| are at least 8.
+// |alpha|, |beta|, |gamma|, |delta| are valid warp parameters. See the
+// comments in the definition of struct GlobalMotion for the range of their
+// values.
+// |dest| is the output buffer of type Pixel. The output values are clipped to
+// Pixel values.
+// |dest_stride| is the stride, in units of bytes.
+// Rounding precision is derived from the function being called. For horizontal
+// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
+// used. For vertical filtering kInterRoundBitsVertical &
+// kInterRoundBitsVertical12bpp will be used.
+//
+// NOTE: WarpFunc assumes the source frame has left, right, top, and bottom
+// borders that extend the frame boundary pixels.
+// * The left and right borders must be at least 13 pixels wide. In addition,
+// Warp_NEON() may read up to 14 bytes after a row in the |source| buffer.
+// Therefore, there must be at least one extra padding byte after the right
+// border of the last row in the source buffer.
+// * The top and bottom borders must be at least 13 pixels high.
+using WarpFunc = void (*)(const void* source, ptrdiff_t source_stride,
+ int source_width, int source_height,
+ const int* warp_params, int subsampling_x,
+ int subsampling_y, int block_start_x,
+ int block_start_y, int block_width, int block_height,
+ int16_t alpha, int16_t beta, int16_t gamma,
+ int16_t delta, void* dest, ptrdiff_t dest_stride);
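+
+// Restating the pictorial matrix equation above in scalar form (expository
+// only; no new behavior):
+//   z  = m6 * x + m7 * y + 1
+//   x' = (m2 * x + m3 * y + m0) / z
+//   y' = (m4 * x + m5 * y + m1) / z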
+
+// Warp for compound predictions. Section 7.11.3.5.
+// Similar to WarpFunc, but |dest| is a uint16_t predictor buffer,
+// |dest_stride| is given in units of uint16_t and |inter_round_bits_vertical|
+// is always 7 (kInterRoundBitsCompoundVertical).
+// Rounding precision is derived from the function being called. For horizontal
+// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
+// used. For vertical filtering kInterRoundBitsCompoundVertical will be used.
+using WarpCompoundFunc = WarpFunc;
+
+constexpr int kNumAutoRegressionLags = 4;
+// Applies an auto-regressive filter to the white noise in |luma_grain_buffer|.
+// Section 7.18.3.3, second code block
+// |params| are parameters read from frame header, mainly providing
+// auto_regression_coeff_y for the filter and auto_regression_shift to right
+// shift the filter sum by. Note: This method assumes
+// params.auto_regression_coeff_lag is not 0. Do not call this method if
+// params.auto_regression_coeff_lag is 0.
+using LumaAutoRegressionFunc = void (*)(const FilmGrainParams& params,
+ void* luma_grain_buffer);
+// Function index is auto_regression_coeff_lag - 1.
+using LumaAutoRegressionFuncs =
+ LumaAutoRegressionFunc[kNumAutoRegressionLags - 1];
+
+// Applies an auto-regressive filter to the white noise in u_grain and v_grain.
+// Section 7.18.3.3, third code block
+// The |luma_grain_buffer| provides samples that are added to the autoregressive
+// sum when num_y_points > 0.
+// |u_grain_buffer| and |v_grain_buffer| point to the buffers of chroma noise
+// that were generated from the stored Gaussian sequence, and are overwritten
+// with the results of the autoregressive filter. |params| are parameters read
+// from frame header, mainly providing auto_regression_coeff_u and
+// auto_regression_coeff_v for each chroma plane's filter, and
+// auto_regression_shift to right shift the filter sums by.
+using ChromaAutoRegressionFunc = void (*)(const FilmGrainParams& params,
+ const void* luma_grain_buffer,
+ int subsampling_x, int subsampling_y,
+ void* u_grain_buffer,
+ void* v_grain_buffer);
+using ChromaAutoRegressionFuncs =
+ ChromaAutoRegressionFunc[/*use_luma*/ 2][kNumAutoRegressionLags];
+
+// Build an image-wide "stripe" of grain noise for every 32 rows in the image.
+// Section 7.18.3.5, first code block.
+// Each 32x32 luma block is copied at a random offset specified via
+// |grain_seed| from the grain template produced by autoregression, and the same
+// is done for chroma grains, subject to subsampling.
+// |width| and |height| are the dimensions of the overall image.
+// |noise_stripes_buffer| points to an Array2DView with one row for each stripe.
+// Because this function treats all planes identically and independently, it is
+// simplified to take one grain buffer at a time. This means duplicating some
+// random number generations, but that work can be reduced in other ways.
+using ConstructNoiseStripesFunc = void (*)(const void* grain_buffer,
+ int grain_seed, int width,
+ int height, int subsampling_x,
+ int subsampling_y,
+ void* noise_stripes_buffer);
+using ConstructNoiseStripesFuncs =
+ ConstructNoiseStripesFunc[/*overlap_flag*/ 2];
+
+// Compute the one or two overlap rows for each stripe copied to the noise
+// image.
+// Section 7.18.3.5, second code block. |width| and |height| are the
+// dimensions of the overall image. |noise_stripes_buffer| points to an
+// Array2DView with one row for each stripe. |noise_image_buffer| points to an
+// Array2D containing the allocated plane for this frame. Because this function
+// treats all planes identically and independently, it is simplified to take one
+// grain buffer at a time.
+using ConstructNoiseImageOverlapFunc =
+ void (*)(const void* noise_stripes_buffer, int width, int height,
+ int subsampling_x, int subsampling_y, void* noise_image_buffer);
+
+// Populate a scaling lookup table with interpolated values of a piecewise
+// linear function where values in |point_value| are mapped to the values in
+// |point_scaling|.
+// |num_points| can be between 0 and 15. When 0, the lookup table is set to
+// zero.
+// |point_value| and |point_scaling| have |num_points| valid elements.
+using InitializeScalingLutFunc = void (*)(
+ int num_points, const uint8_t point_value[], const uint8_t point_scaling[],
+ uint8_t scaling_lut[kScalingLookupTableSize]);
+
+// Blend noise with image. Section 7.18.3.5, third code block.
+// |width| is the width of each row, while |height| is how many rows to compute.
+// |start_height| is an offset for the noise image, to support multithreading.
+// |min_value|, |max_luma|, and |max_chroma| are computed by the caller of these
+// functions, according to the code in the spec.
+// |source_plane_y| and |source_plane_uv| are the plane buffers of the decoded
+// frame. They are blended with the film grain noise and written to
+// |dest_plane_y| and |dest_plane_uv| as final output for display.
+// source_plane_* and dest_plane_* may point to the same buffer, in which case
+// the film grain noise is added in place.
+// |scaling_lut_y| and |scaling_lut| represent a piecewise linear mapping from
+// the frame's raw pixel value to a scaling factor for the noise sample.
+// |scaling_shift| is applied as a right shift after scaling, so that scaling
+// down is possible. It is found in FilmGrainParams, but supplied directly to
+// BlendNoiseWithImageLumaFunc because it's the only member used.
+using BlendNoiseWithImageLumaFunc =
+ void (*)(const void* noise_image_ptr, int min_value, int max_value,
+ int scaling_shift, int width, int height, int start_height,
+ const uint8_t scaling_lut_y[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y,
+ void* dest_plane_y, ptrdiff_t dest_stride_y);
+
+using BlendNoiseWithImageChromaFunc = void (*)(
+ Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+ int min_value, int max_value, int width, int height, int start_height,
+ int subsampling_x, int subsampling_y,
+ const uint8_t scaling_lut[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv);
+
+using BlendNoiseWithImageChromaFuncs =
+ BlendNoiseWithImageChromaFunc[/*chroma_scaling_from_luma*/ 2];
+
+//------------------------------------------------------------------------------
+
+struct FilmGrainFuncs {
+ LumaAutoRegressionFuncs luma_auto_regression;
+ ChromaAutoRegressionFuncs chroma_auto_regression;
+ ConstructNoiseStripesFuncs construct_noise_stripes;
+ ConstructNoiseImageOverlapFunc construct_noise_image_overlap;
+ InitializeScalingLutFunc initialize_scaling_lut;
+ BlendNoiseWithImageLumaFunc blend_noise_luma;
+ BlendNoiseWithImageChromaFuncs blend_noise_chroma;
+};
+
+// Motion field projection function signature. Section 7.9.
+// |reference_info| provides reference information for motion field projection.
+// |reference_to_current_with_sign| is the precalculated reference frame id
+// distance from current frame.
+// |dst_sign| is -1 for LAST_FRAME and LAST2_FRAME, or 0 (1 in spec) for others.
+// |y8_start| and |y8_end| are the start and end 8x8 rows of the current tile.
+// |x8_start| and |x8_end| are the start and end 8x8 columns of the current
+// tile.
+// |motion_field| is the output which saves the projected motion field
+// information.
+using MotionFieldProjectionKernelFunc = void (*)(
+ const ReferenceInfo& reference_info, int reference_to_current_with_sign,
+ int dst_sign, int y8_start, int y8_end, int x8_start, int x8_end,
+ TemporalMotionField* motion_field);
+
+// Compound temporal motion vector projection function signature.
+// Section 7.9.3 and 7.10.2.10.
+// |temporal_mvs| is the set of temporal reference motion vectors.
+// |temporal_reference_offsets| specifies the number of frames covered by the
+// original motion vector.
+// |reference_offsets| specifies the number of frames to be covered by the
+// projected motion vector.
+// |count| is the number of the temporal motion vectors.
+// |candidate_mvs| is the set of projected motion vectors.
+using MvProjectionCompoundFunc = void (*)(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], int count,
+ CompoundMotionVector* candidate_mvs);
+
+// Single temporal motion vector projection function signature.
+// Section 7.9.3 and 7.10.2.10.
+// |temporal_mvs| is the set of temporal reference motion vectors.
+// |temporal_reference_offsets| specifies the number of frames covered by the
+// original motion vector.
+// |reference_offset| specifies the number of frames to be covered by the
+// projected motion vector.
+// |count| is the number of the temporal motion vectors.
+// |candidate_mvs| is the set of projected motion vectors.
+using MvProjectionSingleFunc = void (*)(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ int reference_offset, int count, MotionVector* candidate_mvs);
+
+struct Dsp {
+ AverageBlendFunc average_blend;
+ CdefDirectionFunc cdef_direction;
+ CdefFilteringFuncs cdef_filters;
+ CflIntraPredictorFuncs cfl_intra_predictors;
+ CflSubsamplerFuncs cfl_subsamplers;
+ ConvolveFuncs convolve;
+ ConvolveScaleFuncs convolve_scale;
+ DirectionalIntraPredictorZone1Func directional_intra_predictor_zone1;
+ DirectionalIntraPredictorZone2Func directional_intra_predictor_zone2;
+ DirectionalIntraPredictorZone3Func directional_intra_predictor_zone3;
+ DistanceWeightedBlendFunc distance_weighted_blend;
+ FilmGrainFuncs film_grain;
+ FilterIntraPredictorFunc filter_intra_predictor;
+ InterIntraMaskBlendFuncs8bpp inter_intra_mask_blend_8bpp;
+ IntraEdgeFilterFunc intra_edge_filter;
+ IntraEdgeUpsamplerFunc intra_edge_upsampler;
+ IntraPredictorFuncs intra_predictors;
+ InverseTransformAddFuncs inverse_transforms;
+ LoopFilterFuncs loop_filters;
+ LoopRestorationFuncs loop_restorations;
+ MaskBlendFuncs mask_blend;
+ MotionFieldProjectionKernelFunc motion_field_projection_kernel;
+ MvProjectionCompoundFunc mv_projection_compound[3];
+ MvProjectionSingleFunc mv_projection_single[3];
+ ObmcBlendFuncs obmc_blend;
+ SuperResCoefficientsFunc super_res_coefficients;
+ SuperResFunc super_res;
+ WarpCompoundFunc warp_compound;
+ WarpFunc warp;
+ WeightMaskFuncs weight_mask;
+};
+
+// Initializes function pointers based on build config and runtime
+// environment. Must be called once before first use. This function is
+// thread-safe.
+void DspInit();
+
+// Returns the appropriate Dsp table for |bitdepth| or nullptr if one doesn't
+// exist.
+const Dsp* GetDspTable(int bitdepth);
+
+} // namespace dsp
+
+namespace dsp_internal {
+
+// Visual Studio builds don't have a way to detect SSE4_1. Only exclude the C
+// functions if /arch:AVX2 is used across all sources.
+#if !LIBGAV1_TARGETING_AVX2 && \
+ (defined(_MSC_VER) || (defined(_M_IX86) || defined(_M_X64)))
+#undef LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS 1
+#endif
+
+// Returns true if a more highly optimized version of |func| is not defined for
+// the associated bitdepth or if it is forcibly enabled with
+// LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS. The define checked for |func| corresponds
+// to the LIBGAV1_Dsp<bitdepth>bpp_|func| define in the header file associated
+// with the module.
+// |func| is one of:
+// - FunctionName, e.g., SelfGuidedFilter.
+// - [sub-table-index1][...-indexN] e.g.,
+// TransformSize4x4_IntraPredictorDc. The indices correspond to enum values
+// used as lookups with leading 'k' removed.
+//
+// NEON support is the only extension available for ARM and it is always
+// required. Because of this restriction DSP_ENABLED_8BPP_NEON(func) is always
+// true and can be omitted.
+#define DSP_ENABLED_8BPP_AVX2(func) \
+ (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_AVX2)
+#define DSP_ENABLED_10BPP_AVX2(func) \
+ (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_AVX2)
+#define DSP_ENABLED_8BPP_SSE4_1(func) \
+ (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_SSE4_1)
+#define DSP_ENABLED_10BPP_SSE4_1(func) \
+ (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_SSE4_1)
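+
+// Usage sketch (the function and table entry shown are illustrative): a SIMD
+// Init function guards each assignment so it is compiled only when this SIMD
+// level is the most optimized version defined for that function, or when all
+// DSP functions are force-enabled:
+//   #if DSP_ENABLED_8BPP_SSE4_1(ConvolveHorizontal)
+//   dsp->convolve[0][0][0][1] = ConvolveHorizontal_SSE4_1;
+//   #endif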
+
+// Returns the appropriate Dsp table for |bitdepth| or nullptr if one doesn't
+// exist. This version is meant for use by test or dsp/*Init() functions only.
+dsp::Dsp* GetWritableDspTable(int bitdepth);
+
+} // namespace dsp_internal
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_DSP_H_
diff --git a/src/dsp/film_grain.cc b/src/dsp/film_grain.cc
new file mode 100644
index 0000000..41d1dd0
--- /dev/null
+++ b/src/dsp/film_grain.cc
@@ -0,0 +1,870 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/film_grain.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <new>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace film_grain {
+namespace {
+
+// Making this a template function prevents it from adding to code size when it
+// is not placed in the DSP table. Most functions in the dsp directory change
+// behavior by bitdepth, but because this one doesn't, it receives a dummy
+// parameter with one enforced value, ensuring only one copy is made.
+template <int singleton>
+void InitializeScalingLookupTable_C(
+ int num_points, const uint8_t point_value[], const uint8_t point_scaling[],
+ uint8_t scaling_lut[kScalingLookupTableSize]) {
+ static_assert(singleton == 0,
+ "Improper instantiation of InitializeScalingLookupTable_C. "
+ "There should be only one copy of this function.");
+ if (num_points == 0) {
+ memset(scaling_lut, 0, sizeof(scaling_lut[0]) * kScalingLookupTableSize);
+ return;
+ }
+ static_assert(sizeof(scaling_lut[0]) == 1, "");
+ memset(scaling_lut, point_scaling[0], point_value[0]);
+ for (int i = 0; i < num_points - 1; ++i) {
+ const int delta_y = point_scaling[i + 1] - point_scaling[i];
+ const int delta_x = point_value[i + 1] - point_value[i];
+ const int delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
+ for (int x = 0; x < delta_x; ++x) {
+ const int v = point_scaling[i] + ((x * delta + 32768) >> 16);
+ assert(v >= 0 && v <= UINT8_MAX);
+ scaling_lut[point_value[i] + x] = v;
+ }
+ }
+ const uint8_t last_point_value = point_value[num_points - 1];
+ memset(&scaling_lut[last_point_value], point_scaling[num_points - 1],
+ kScalingLookupTableSize - last_point_value);
+}
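+
+// In the loop above, |delta| is the segment slope delta_y / delta_x in 16.16
+// fixed point (65536 / delta_x rounded to nearest). As an illustration, the
+// two points (value 0, scaling 0) and (value 64, scaling 32) give
+// delta = 32 * 1024 = 32768, so entry x receives (x * 32768 + 32768) >> 16,
+// roughly x / 2, and entries 64 through 256 are filled with 32 by the final
+// memset.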
+
+// Section 7.18.3.5.
+// Performs a piecewise linear interpolation into the scaling table.
+template <int bitdepth>
+int ScaleLut(const uint8_t scaling_lut[kScalingLookupTableSize], int index) {
+ const int shift = bitdepth - 8;
+ const int quotient = index >> shift;
+ const int remainder = index - (quotient << shift);
+ if (bitdepth == 8) {
+ assert(quotient < kScalingLookupTableSize);
+ return scaling_lut[quotient];
+ }
+ assert(quotient + 1 < kScalingLookupTableSize);
+ const int start = scaling_lut[quotient];
+ const int end = scaling_lut[quotient + 1];
+ return start + RightShiftWithRounding((end - start) * remainder, shift);
+}
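+
+// With 10-bit input the shift above is 2, so index 1023 yields quotient 255
+// and remainder 3: the result is scaling_lut[255] moved 3/4 of the way toward
+// scaling_lut[256] (the 257th table entry exists for exactly this overflow
+// lookup). With 8-bit input the shift is 0 and the entry is returned directly.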
+
+// Applies an auto-regressive filter to the white noise in luma_grain.
+template <int bitdepth, typename GrainType>
+void ApplyAutoRegressiveFilterToLumaGrain_C(const FilmGrainParams& params,
+ void* luma_grain_buffer) {
+ auto* luma_grain = static_cast<GrainType*>(luma_grain_buffer);
+ const int grain_min = GetGrainMin<bitdepth>();
+ const int grain_max = GetGrainMax<bitdepth>();
+ const int auto_regression_coeff_lag = params.auto_regression_coeff_lag;
+ assert(auto_regression_coeff_lag > 0 && auto_regression_coeff_lag <= 3);
+ // A pictorial representation of the auto-regressive filter for various values
+ // of auto_regression_coeff_lag. The letter 'O' represents the current sample.
+ // (The filter always operates on the current sample with filter
+ // coefficient 1.) The letters 'X' represent the neighboring samples that the
+ // filter operates on.
+ //
+ // auto_regression_coeff_lag == 3:
+ // X X X X X X X
+ // X X X X X X X
+ // X X X X X X X
+ // X X X O
+ // auto_regression_coeff_lag == 2:
+ // X X X X X
+ // X X X X X
+ // X X O
+ // auto_regression_coeff_lag == 1:
+ // X X X
+ // X O
+ // auto_regression_coeff_lag == 0:
+ // O
+ //
+ // Note that if auto_regression_coeff_lag is 0, the filter is the identity
+ // filter and therefore can be skipped. This implementation assumes it is not
+ // called in that case.
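+  //
+  // The loops below walk these neighborhoods row by row, so a lag of L reads
+  // 2 * L * (L + 1) coefficients from auto_regression_coeff_y: 24 for lag 3,
+  // 12 for lag 2 and 4 for lag 1, matching the 'X' counts in the diagrams.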
+ const int shift = params.auto_regression_shift;
+ for (int y = kAutoRegressionBorder; y < kLumaHeight; ++y) {
+ for (int x = kAutoRegressionBorder; x < kLumaWidth - kAutoRegressionBorder;
+ ++x) {
+ int sum = 0;
+ int pos = 0;
+ int delta_row = -auto_regression_coeff_lag;
+ // The last iteration (delta_row == 0) is shorter and is handled
+ // separately.
+ do {
+ int delta_column = -auto_regression_coeff_lag;
+ do {
+ const int coeff = params.auto_regression_coeff_y[pos];
+ sum += luma_grain[(y + delta_row) * kLumaWidth + (x + delta_column)] *
+ coeff;
+ ++pos;
+ } while (++delta_column <= auto_regression_coeff_lag);
+ } while (++delta_row < 0);
+ // Last iteration: delta_row == 0.
+ {
+ int delta_column = -auto_regression_coeff_lag;
+ do {
+ const int coeff = params.auto_regression_coeff_y[pos];
+ sum += luma_grain[y * kLumaWidth + (x + delta_column)] * coeff;
+ ++pos;
+ } while (++delta_column < 0);
+ }
+ luma_grain[y * kLumaWidth + x] = Clip3(
+ luma_grain[y * kLumaWidth + x] + RightShiftWithRounding(sum, shift),
+ grain_min, grain_max);
+ }
+ }
+}
+
+template <int bitdepth, typename GrainType, int auto_regression_coeff_lag,
+ bool use_luma>
+void ApplyAutoRegressiveFilterToChromaGrains_C(const FilmGrainParams& params,
+ const void* luma_grain_buffer,
+ int subsampling_x,
+ int subsampling_y,
+ void* u_grain_buffer,
+ void* v_grain_buffer) {
+ static_assert(
+ auto_regression_coeff_lag >= 0 && auto_regression_coeff_lag <= 3,
+ "Unsupported autoregression lag for chroma.");
+ const auto* luma_grain = static_cast<const GrainType*>(luma_grain_buffer);
+ const int grain_min = GetGrainMin<bitdepth>();
+ const int grain_max = GetGrainMax<bitdepth>();
+ auto* u_grain = static_cast<GrainType*>(u_grain_buffer);
+ auto* v_grain = static_cast<GrainType*>(v_grain_buffer);
+ const int shift = params.auto_regression_shift;
+ const int chroma_height =
+ (subsampling_y == 0) ? kMaxChromaHeight : kMinChromaHeight;
+ const int chroma_width =
+ (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+ for (int y = kAutoRegressionBorder; y < chroma_height; ++y) {
+ const int luma_y =
+ ((y - kAutoRegressionBorder) << subsampling_y) + kAutoRegressionBorder;
+ for (int x = kAutoRegressionBorder;
+ x < chroma_width - kAutoRegressionBorder; ++x) {
+ int sum_u = 0;
+ int sum_v = 0;
+ int pos = 0;
+ int delta_row = -auto_regression_coeff_lag;
+ do {
+ int delta_column = -auto_regression_coeff_lag;
+ do {
+ if (delta_row == 0 && delta_column == 0) {
+ break;
+ }
+ const int coeff_u = params.auto_regression_coeff_u[pos];
+ const int coeff_v = params.auto_regression_coeff_v[pos];
+ sum_u +=
+ u_grain[(y + delta_row) * chroma_width + (x + delta_column)] *
+ coeff_u;
+ sum_v +=
+ v_grain[(y + delta_row) * chroma_width + (x + delta_column)] *
+ coeff_v;
+ ++pos;
+ } while (++delta_column <= auto_regression_coeff_lag);
+ } while (++delta_row <= 0);
+ if (use_luma) {
+ int luma = 0;
+ const int luma_x = ((x - kAutoRegressionBorder) << subsampling_x) +
+ kAutoRegressionBorder;
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ luma += luma_grain[(luma_y + i) * kLumaWidth + (luma_x + j)];
+ } while (++j <= subsampling_x);
+ } while (++i <= subsampling_y);
+ luma = SubsampledValue(luma, subsampling_x + subsampling_y);
+ const int coeff_u = params.auto_regression_coeff_u[pos];
+ const int coeff_v = params.auto_regression_coeff_v[pos];
+ sum_u += luma * coeff_u;
+ sum_v += luma * coeff_v;
+ }
+ u_grain[y * chroma_width + x] = Clip3(
+ u_grain[y * chroma_width + x] + RightShiftWithRounding(sum_u, shift),
+ grain_min, grain_max);
+ v_grain[y * chroma_width + x] = Clip3(
+ v_grain[y * chroma_width + x] + RightShiftWithRounding(sum_v, shift),
+ grain_min, grain_max);
+ }
+ }
+}
+
+// This implementation is for the condition overlap_flag == false.
+template <int bitdepth, typename GrainType>
+void ConstructNoiseStripes_C(const void* grain_buffer, int grain_seed,
+ int width, int height, int subsampling_x,
+ int subsampling_y, void* noise_stripes_buffer) {
+ auto* noise_stripes =
+ static_cast<Array2DView<GrainType>*>(noise_stripes_buffer);
+ const auto* grain = static_cast<const GrainType*>(grain_buffer);
+ const int half_width = DivideBy2(width + 1);
+ const int half_height = DivideBy2(height + 1);
+ assert(half_width > 0);
+ assert(half_height > 0);
+  static_assert(kLumaWidth == kMaxChromaWidth,
+                "kLumaWidth should be equal to kMaxChromaWidth");
+ const int grain_width =
+ (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+ const int plane_width = (width + subsampling_x) >> subsampling_x;
+ constexpr int kNoiseStripeHeight = 34;
+ int luma_num = 0;
+ int y = 0;
+ do {
+ GrainType* const noise_stripe = (*noise_stripes)[luma_num];
+ uint16_t seed = grain_seed;
+ seed ^= ((luma_num * 37 + 178) & 255) << 8;
+ seed ^= ((luma_num * 173 + 105) & 255);
+ int x = 0;
+ do {
+ const int rand = GetFilmGrainRandomNumber(8, &seed);
+ const int offset_x = rand >> 4;
+ const int offset_y = rand & 15;
+ const int plane_offset_x =
+ (subsampling_x != 0) ? 6 + offset_x : 9 + offset_x * 2;
+ const int plane_offset_y =
+ (subsampling_y != 0) ? 6 + offset_y : 9 + offset_y * 2;
+ int i = 0;
+ do {
+ // Section 7.18.3.5 says:
+ // noiseStripe[ lumaNum ][ 0 ] is 34 samples high and w samples
+ // wide (a few additional samples across are actually written to
+ // the array, but these are never read) ...
+ //
+ // Note: The warning in the parentheses also applies to
+ // noiseStripe[ lumaNum ][ 1 ] and noiseStripe[ lumaNum ][ 2 ].
+ //
+ // Writes beyond the width of each row could happen below. To
+ // prevent those writes, we clip the number of pixels to copy against
+ // the remaining width.
+ // TODO(petersonab): Allocate aligned stripes with extra width to cover
+ // the size of the final stripe block, then remove this call to min.
+ const int copy_size =
+ std::min(kNoiseStripeHeight >> subsampling_x,
+ plane_width - (x << (1 - subsampling_x)));
+ memcpy(&noise_stripe[i * plane_width + (x << (1 - subsampling_x))],
+ &grain[(plane_offset_y + i) * grain_width + plane_offset_x],
+ copy_size * sizeof(noise_stripe[0]));
+ } while (++i < (kNoiseStripeHeight >> subsampling_y));
+ x += 16;
+ } while (x < half_width);
+
+ ++luma_num;
+ y += 16;
+ } while (y < half_height);
+}
+
+// This implementation is for the condition overlap_flag == true.
+template <int bitdepth, typename GrainType>
+void ConstructNoiseStripesWithOverlap_C(const void* grain_buffer,
+ int grain_seed, int width, int height,
+ int subsampling_x, int subsampling_y,
+ void* noise_stripes_buffer) {
+ auto* noise_stripes =
+ static_cast<Array2DView<GrainType>*>(noise_stripes_buffer);
+ const auto* grain = static_cast<const GrainType*>(grain_buffer);
+ const int half_width = DivideBy2(width + 1);
+ const int half_height = DivideBy2(height + 1);
+ assert(half_width > 0);
+ assert(half_height > 0);
+  static_assert(kLumaWidth == kMaxChromaWidth,
+                "kLumaWidth should be equal to kMaxChromaWidth");
+ const int grain_width =
+ (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+ const int plane_width = (width + subsampling_x) >> subsampling_x;
+ constexpr int kNoiseStripeHeight = 34;
+ int luma_num = 0;
+ int y = 0;
+ do {
+ GrainType* const noise_stripe = (*noise_stripes)[luma_num];
+ uint16_t seed = grain_seed;
+ seed ^= ((luma_num * 37 + 178) & 255) << 8;
+ seed ^= ((luma_num * 173 + 105) & 255);
+ // Begin special iteration for x == 0.
+ const int rand = GetFilmGrainRandomNumber(8, &seed);
+ const int offset_x = rand >> 4;
+ const int offset_y = rand & 15;
+ const int plane_offset_x =
+ (subsampling_x != 0) ? 6 + offset_x : 9 + offset_x * 2;
+ const int plane_offset_y =
+ (subsampling_y != 0) ? 6 + offset_y : 9 + offset_y * 2;
+ // The overlap computation only occurs when x > 0, so it is omitted here.
+ int i = 0;
+ do {
+ // TODO(petersonab): Allocate aligned stripes with extra width to cover
+ // the size of the final stripe block, then remove this call to min.
+ const int copy_size =
+ std::min(kNoiseStripeHeight >> subsampling_x, plane_width);
+ memcpy(&noise_stripe[i * plane_width],
+ &grain[(plane_offset_y + i) * grain_width + plane_offset_x],
+ copy_size * sizeof(noise_stripe[0]));
+ } while (++i < (kNoiseStripeHeight >> subsampling_y));
+ // End special iteration for x == 0.
+ for (int x = 16; x < half_width; x += 16) {
+ const int rand = GetFilmGrainRandomNumber(8, &seed);
+ const int offset_x = rand >> 4;
+ const int offset_y = rand & 15;
+ const int plane_offset_x =
+ (subsampling_x != 0) ? 6 + offset_x : 9 + offset_x * 2;
+ const int plane_offset_y =
+ (subsampling_y != 0) ? 6 + offset_y : 9 + offset_y * 2;
+ int i = 0;
+ do {
+ int j = 0;
+ int grain_sample =
+ grain[(plane_offset_y + i) * grain_width + plane_offset_x];
+ // The first pixel(s) of each segment of the noise_stripe are subject to
+ // the "overlap" computation.
+ if (subsampling_x == 0) {
+ // Corresponds to the line in the spec:
+ // if (j < 2 && x > 0)
+ // j = 0
+ int old = noise_stripe[i * plane_width + x * 2];
+ grain_sample = old * 27 + grain_sample * 17;
+ grain_sample =
+ Clip3(RightShiftWithRounding(grain_sample, 5),
+ GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+ noise_stripe[i * plane_width + x * 2] = grain_sample;
+
+ // This check prevents overwriting for the iteration j = 1. The
+ // continue applies to the i-loop.
+ if (x * 2 + 1 >= plane_width) continue;
+ // j = 1
+ grain_sample =
+ grain[(plane_offset_y + i) * grain_width + plane_offset_x + 1];
+ old = noise_stripe[i * plane_width + x * 2 + 1];
+ grain_sample = old * 17 + grain_sample * 27;
+ grain_sample =
+ Clip3(RightShiftWithRounding(grain_sample, 5),
+ GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+ noise_stripe[i * plane_width + x * 2 + 1] = grain_sample;
+ j = 2;
+ } else {
+ // Corresponds to the line in the spec:
+ // if (j == 0 && x > 0)
+ const int old = noise_stripe[i * plane_width + x];
+ grain_sample = old * 23 + grain_sample * 22;
+ grain_sample =
+ Clip3(RightShiftWithRounding(grain_sample, 5),
+ GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+ noise_stripe[i * plane_width + x] = grain_sample;
+ j = 1;
+ }
+ // The following covers the rest of the loop over j as described in the
+ // spec.
+ //
+ // Section 7.18.3.5 says:
+ // noiseStripe[ lumaNum ][ 0 ] is 34 samples high and w samples
+ // wide (a few additional samples across are actually written to
+ // the array, but these are never read) ...
+ //
+ // Note: The warning in the parentheses also applies to
+ // noiseStripe[ lumaNum ][ 1 ] and noiseStripe[ lumaNum ][ 2 ].
+ //
+ // Writes beyond the width of each row could happen below. To
+ // prevent those writes, we clip the number of pixels to copy against
+ // the remaining width.
+ // TODO(petersonab): Allocate aligned stripes with extra width to cover
+ // the size of the final stripe block, then remove this call to min.
+ const int copy_size =
+ std::min(kNoiseStripeHeight >> subsampling_x,
+ plane_width - (x << (1 - subsampling_x))) -
+ j;
+ memcpy(&noise_stripe[i * plane_width + (x << (1 - subsampling_x)) + j],
+ &grain[(plane_offset_y + i) * grain_width + plane_offset_x + j],
+ copy_size * sizeof(noise_stripe[0]));
+ } while (++i < (kNoiseStripeHeight >> subsampling_y));
+ }
+
+ ++luma_num;
+ y += 16;
+ } while (y < half_height);
+}
+
+template <int bitdepth, typename GrainType>
+inline void WriteOverlapLine_C(const GrainType* noise_stripe_row,
+ const GrainType* noise_stripe_row_prev,
+ int plane_width, int grain_coeff, int old_coeff,
+ GrainType* noise_image_row) {
+ int x = 0;
+ do {
+ int grain = noise_stripe_row[x];
+ const int old = noise_stripe_row_prev[x];
+ grain = old * old_coeff + grain * grain_coeff;
+ grain = Clip3(RightShiftWithRounding(grain, 5), GetGrainMin<bitdepth>(),
+ GetGrainMax<bitdepth>());
+ noise_image_row[x] = grain;
+ } while (++x < plane_width);
+}
+
+template <int bitdepth, typename GrainType>
+void ConstructNoiseImageOverlap_C(const void* noise_stripes_buffer, int width,
+ int height, int subsampling_x,
+ int subsampling_y, void* noise_image_buffer) {
+ const auto* noise_stripes =
+ static_cast<const Array2DView<GrainType>*>(noise_stripes_buffer);
+ auto* noise_image = static_cast<Array2D<GrainType>*>(noise_image_buffer);
+ const int plane_width = (width + subsampling_x) >> subsampling_x;
+ const int plane_height = (height + subsampling_y) >> subsampling_y;
+ const int stripe_height = 32 >> subsampling_y;
+ const int stripe_mask = stripe_height - 1;
+ int y = stripe_height;
+ int luma_num = 1;
+ if (subsampling_y == 0) {
+ // Begin complete stripes section. This is when we are guaranteed to have
+ // two overlap rows in each stripe.
+ for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
+ const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+ const GrainType* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ // First overlap row.
+ WriteOverlapLine_C<bitdepth>(noise_stripe,
+ &noise_stripe_prev[32 * plane_width],
+ plane_width, 17, 27, (*noise_image)[y]);
+ // Second overlap row.
+ WriteOverlapLine_C<bitdepth>(&noise_stripe[plane_width],
+ &noise_stripe_prev[(32 + 1) * plane_width],
+ plane_width, 27, 17, (*noise_image)[y + 1]);
+ }
+ // End complete stripes section.
+
+ const int remaining_height = plane_height - y;
+ // Either one partial stripe remains (remaining_height > 0),
+    // OR the image is less than one stripe high (remaining_height < 0),
+ // OR all stripes are completed (remaining_height == 0).
+ if (remaining_height <= 0) {
+ return;
+ }
+ const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+ const GrainType* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ WriteOverlapLine_C<bitdepth>(noise_stripe,
+ &noise_stripe_prev[32 * plane_width],
+ plane_width, 17, 27, (*noise_image)[y]);
+
+ // Check if second overlap row is in the image.
+ if (remaining_height > 1) {
+ WriteOverlapLine_C<bitdepth>(&noise_stripe[plane_width],
+ &noise_stripe_prev[(32 + 1) * plane_width],
+ plane_width, 27, 17, (*noise_image)[y + 1]);
+ }
+ } else { // |subsampling_y| == 1
+ // No special checks needed for partial stripes, because if one exists, the
+ // first and only overlap row is guaranteed to exist.
+ for (; y < plane_height; ++luma_num, y += stripe_height) {
+ const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+ const GrainType* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ WriteOverlapLine_C<bitdepth>(noise_stripe,
+ &noise_stripe_prev[16 * plane_width],
+ plane_width, 22, 23, (*noise_image)[y]);
+ }
+ }
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageLuma_C(
+ const void* noise_image_ptr, int min_value, int max_luma, int scaling_shift,
+ int width, int height, int start_height,
+ const uint8_t scaling_lut_y[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y,
+ ptrdiff_t dest_stride_y) {
+ const auto* noise_image =
+ static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+ const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+ source_stride_y /= sizeof(Pixel);
+ auto* out_y = static_cast<Pixel*>(dest_plane_y);
+ dest_stride_y /= sizeof(Pixel);
+
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const int orig = in_y[y * source_stride_y + x];
+ int noise = noise_image[kPlaneY][y + start_height][x];
+ noise = RightShiftWithRounding(
+ ScaleLut<bitdepth>(scaling_lut_y, orig) * noise, scaling_shift);
+ out_y[y * dest_stride_y + x] = Clip3(orig + noise, min_value, max_luma);
+ } while (++x < width);
+ } while (++y < height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == false.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChroma_C(
+ Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+ int min_value, int max_chroma, int width, int height, int start_height,
+ int subsampling_x, int subsampling_y,
+ const uint8_t scaling_lut_uv[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+ const auto* noise_image =
+ static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+
+ const int chroma_width = (width + subsampling_x) >> subsampling_x;
+ const int chroma_height = (height + subsampling_y) >> subsampling_y;
+
+ const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+ source_stride_y /= sizeof(Pixel);
+ const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+ source_stride_uv /= sizeof(Pixel);
+ auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+ dest_stride_uv /= sizeof(Pixel);
+
+ const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
+ const int luma_multiplier =
+ (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
+ const int multiplier =
+ (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
+
+ const int scaling_shift = params.chroma_scaling;
+ start_height >>= subsampling_y;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const int luma_x = x << subsampling_x;
+ const int luma_y = y << subsampling_y;
+ const int luma_next_x = std::min(luma_x + 1, width - 1);
+ int average_luma;
+ if (subsampling_x != 0) {
+ average_luma = RightShiftWithRounding(
+ in_y[luma_y * source_stride_y + luma_x] +
+ in_y[luma_y * source_stride_y + luma_next_x],
+ 1);
+ } else {
+ average_luma = in_y[luma_y * source_stride_y + luma_x];
+ }
+ const int orig = in_uv[y * source_stride_uv + x];
+ const int combined = average_luma * luma_multiplier + orig * multiplier;
+ const int merged =
+ Clip3((combined >> 6) + LeftShift(offset, bitdepth - 8), 0,
+ (1 << bitdepth) - 1);
+ int noise = noise_image[plane][y + start_height][x];
+ noise = RightShiftWithRounding(
+ ScaleLut<bitdepth>(scaling_lut_uv, merged) * noise, scaling_shift);
+ out_uv[y * dest_stride_uv + x] =
+ Clip3(orig + noise, min_value, max_chroma);
+ } while (++x < chroma_width);
+ } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == true.
+// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChromaWithCfl_C(
+ Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+ int min_value, int max_chroma, int width, int height, int start_height,
+ int subsampling_x, int subsampling_y,
+ const uint8_t scaling_lut[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+ const auto* noise_image =
+ static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+ const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+ source_stride_y /= sizeof(Pixel);
+ const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+ source_stride_uv /= sizeof(Pixel);
+ auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+ dest_stride_uv /= sizeof(Pixel);
+
+ const int chroma_width = (width + subsampling_x) >> subsampling_x;
+ const int chroma_height = (height + subsampling_y) >> subsampling_y;
+ const int scaling_shift = params.chroma_scaling;
+ start_height >>= subsampling_y;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const int luma_x = x << subsampling_x;
+ const int luma_y = y << subsampling_y;
+ const int luma_next_x = std::min(luma_x + 1, width - 1);
+ int average_luma;
+ if (subsampling_x != 0) {
+ average_luma = RightShiftWithRounding(
+ in_y[luma_y * source_stride_y + luma_x] +
+ in_y[luma_y * source_stride_y + luma_next_x],
+ 1);
+ } else {
+ average_luma = in_y[luma_y * source_stride_y + luma_x];
+ }
+ const int orig_uv = in_uv[y * source_stride_uv + x];
+ int noise_uv = noise_image[plane][y + start_height][x];
+ noise_uv = RightShiftWithRounding(
+ ScaleLut<bitdepth>(scaling_lut, average_luma) * noise_uv,
+ scaling_shift);
+ out_uv[y * dest_stride_uv + x] =
+ Clip3(orig_uv + noise_uv, min_value, max_chroma);
+ } while (++x < chroma_width);
+ } while (++y < chroma_height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ // LumaAutoRegressionFunc
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<8, int8_t>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<8, int8_t>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<8, int8_t>;
+
+ // ChromaAutoRegressionFunc
+ // Chroma autoregression should never be called when lag is 0 and use_luma is
+ // false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 1, false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 2, false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 3, false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 0, true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 1, true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 2, true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 3, true>;
+
+ // ConstructNoiseStripesFunc
+ dsp->film_grain.construct_noise_stripes[0] =
+ ConstructNoiseStripes_C<8, int8_t>;
+ dsp->film_grain.construct_noise_stripes[1] =
+ ConstructNoiseStripesWithOverlap_C<8, int8_t>;
+
+ // ConstructNoiseImageOverlapFunc
+ dsp->film_grain.construct_noise_image_overlap =
+ ConstructNoiseImageOverlap_C<8, int8_t>;
+
+ // InitializeScalingLutFunc
+ dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_C<0>;
+
+ // BlendNoiseWithImageLumaFunc
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_C<8, int8_t, uint8_t>;
+
+ // BlendNoiseWithImageChromaFunc
+ dsp->film_grain.blend_noise_chroma[0] =
+ BlendNoiseWithImageChroma_C<8, int8_t, uint8_t>;
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_C<8, int8_t, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainAutoregressionLuma
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<8, int8_t>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<8, int8_t>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<8, int8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainAutoregressionChroma
+ // Chroma autoregression should never be called when lag is 0 and use_luma is
+ // false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 1, false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 2, false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 3, false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 0, true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 1, true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 2, true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 3, true>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainConstructNoiseStripes
+ dsp->film_grain.construct_noise_stripes[0] =
+ ConstructNoiseStripes_C<8, int8_t>;
+ dsp->film_grain.construct_noise_stripes[1] =
+ ConstructNoiseStripesWithOverlap_C<8, int8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainConstructNoiseImageOverlap
+ dsp->film_grain.construct_noise_image_overlap =
+ ConstructNoiseImageOverlap_C<8, int8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainInitializeScalingLutFunc
+ dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_C<0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_C<8, int8_t, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma
+ dsp->film_grain.blend_noise_chroma[0] =
+ BlendNoiseWithImageChroma_C<8, int8_t, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_C<8, int8_t, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+
+ // LumaAutoRegressionFunc
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<10, int16_t>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<10, int16_t>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<10, int16_t>;
+
+ // ChromaAutoRegressionFunc
+ // Chroma autoregression should never be called when lag is 0 and use_luma is
+ // false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 1, false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 2, false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 3, false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 0, true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 1, true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 2, true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 3, true>;
+
+ // ConstructNoiseStripesFunc
+ dsp->film_grain.construct_noise_stripes[0] =
+ ConstructNoiseStripes_C<10, int16_t>;
+ dsp->film_grain.construct_noise_stripes[1] =
+ ConstructNoiseStripesWithOverlap_C<10, int16_t>;
+
+ // ConstructNoiseImageOverlapFunc
+ dsp->film_grain.construct_noise_image_overlap =
+ ConstructNoiseImageOverlap_C<10, int16_t>;
+
+ // InitializeScalingLutFunc
+ dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_C<0>;
+
+ // BlendNoiseWithImageLumaFunc
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_C<10, int16_t, uint16_t>;
+
+ // BlendNoiseWithImageChromaFunc
+ dsp->film_grain.blend_noise_chroma[0] =
+ BlendNoiseWithImageChroma_C<10, int16_t, uint16_t>;
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_C<10, int16_t, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainAutoregressionLuma
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<10, int16_t>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<10, int16_t>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<10, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainAutoregressionChroma
+ // Chroma autoregression should never be called when lag is 0 and use_luma is
+ // false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 1, false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 2, false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 3, false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 0, true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 1, true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 2, true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 3, true>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainConstructNoiseStripes
+ dsp->film_grain.construct_noise_stripes[0] =
+ ConstructNoiseStripes_C<10, int16_t>;
+ dsp->film_grain.construct_noise_stripes[1] =
+ ConstructNoiseStripesWithOverlap_C<10, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainConstructNoiseImageOverlap
+ dsp->film_grain.construct_noise_image_overlap =
+ ConstructNoiseImageOverlap_C<10, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainInitializeScalingLutFunc
+ dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_C<0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_C<10, int16_t, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChroma
+ dsp->film_grain.blend_noise_chroma[0] =
+ BlendNoiseWithImageChroma_C<10, int16_t, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChromaWithCfl
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_C<10, int16_t, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+} // namespace film_grain
+
+void FilmGrainInit_C() {
+ film_grain::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ film_grain::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/film_grain.h b/src/dsp/film_grain.h
new file mode 100644
index 0000000..fe93270
--- /dev/null
+++ b/src/dsp/film_grain.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_FILM_GRAIN_H_
+#define LIBGAV1_SRC_DSP_FILM_GRAIN_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/film_grain_neon.h"
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::film_grain. This function is not thread-safe.
+void FilmGrainInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_FILM_GRAIN_H_
diff --git a/src/dsp/film_grain_common.h b/src/dsp/film_grain_common.h
new file mode 100644
index 0000000..64e3e8e
--- /dev/null
+++ b/src/dsp/film_grain_common.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
+#define LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "src/dsp/common.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+
+template <int bitdepth>
+int GetGrainMax() {
+ return (1 << (bitdepth - 1)) - 1;
+}
+
+template <int bitdepth>
+int GetGrainMin() {
+ return -(1 << (bitdepth - 1));
+}
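+
+// For example, GetGrainMax<8>() is 127 and GetGrainMin<8>() is -128, while the
+// 10-bit versions return 511 and -512: grain values cover the range of a
+// signed |bitdepth|-bit integer.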
+
+inline int GetFilmGrainRandomNumber(int bits, uint16_t* seed) {
+ uint16_t s = *seed;
+ uint16_t bit = (s ^ (s >> 1) ^ (s >> 3) ^ (s >> 12)) & 1;
+ s = (s >> 1) | (bit << 15);
+ *seed = s;
+ return s >> (16 - bits);
+}
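+
+// GetFilmGrainRandomNumber() updates a 16-bit linear-feedback shift register:
+// the feedback bit is the XOR of bits 0, 1, 3 and 12 of |seed|, the state is
+// shifted right with the feedback bit inserted at bit 15, and the top |bits|
+// bits of the new state are returned (e.g. bits == 8 returns the updated state
+// shifted right by 8).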
+
+enum {
+ kAutoRegressionBorder = 3,
+ // The width of the luma noise array.
+ kLumaWidth = 82,
+ // The height of the luma noise array.
+ kLumaHeight = 73,
+ // The two possible widths of the chroma noise array.
+ kMinChromaWidth = 44,
+ kMaxChromaWidth = 82,
+ // The two possible heights of the chroma noise array.
+ kMinChromaHeight = 38,
+ kMaxChromaHeight = 73,
+  // The scaling lookup table maps bytes to bytes, so it only uses 256
+  // elements, plus one to allow for overflow in 10-bit lookups.
+ kScalingLookupTableSize = 257,
+ // Padding is added to the scaling lookup table to permit overwrites by
+ // InitializeScalingLookupTable_NEON.
+ kScalingLookupTablePadding = 6,
+ // Padding is added to each row of the noise image to permit overreads by
+ // BlendNoiseWithImageLuma_NEON and overwrites by WriteOverlapLine8bpp_NEON.
+ kNoiseImagePadding = 7,
+ // Padding is added to the end of the |noise_stripes_| buffer to permit
+ // overreads by WriteOverlapLine8bpp_NEON.
+ kNoiseStripePadding = 7,
+}; // anonymous enum
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
diff --git a/src/dsp/intra_edge.cc b/src/dsp/intra_edge.cc
new file mode 100644
index 0000000..fe66db2
--- /dev/null
+++ b/src/dsp/intra_edge.cc
@@ -0,0 +1,115 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kKernelTaps = 5;
+constexpr int kKernels[3][kKernelTaps] = {
+ {0, 4, 8, 4, 0}, {0, 5, 6, 5, 0}, {2, 4, 4, 4, 2}};
+constexpr int kMaxUpsampleSize = 16;
+
+template <typename Pixel>
+void IntraEdgeFilter_C(void* buffer, int size, int strength) {
+ assert(strength > 0);
+ Pixel edge[129];
+ memcpy(edge, buffer, sizeof(edge[0]) * size);
+ auto* const dst_buffer = static_cast<Pixel*>(buffer);
+ const int kernel_index = strength - 1;
+ for (int i = 1; i < size; ++i) {
+ int sum = 0;
+ for (int j = 0; j < kKernelTaps; ++j) {
+ const int k = Clip3(i + j - 2, 0, size - 1);
+ sum += kKernels[kernel_index][j] * edge[k];
+ }
+ dst_buffer[i] = RightShiftWithRounding(sum, 4);
+ }
+}
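+
+// Each filter kernel above sums to 16, so the shift by 4 in IntraEdgeFilter_C
+// keeps the filtered edge in the original pixel range; strength 1
+// ({0, 4, 8, 4, 0}) reduces to a [1 2 1] / 4 smoothing, while strength 3
+// ({2, 4, 4, 4, 2}) spreads weight across all five taps.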
+
+template <int bitdepth, typename Pixel>
+void IntraEdgeUpsampler_C(void* buffer, int size) {
+ assert(size % 4 == 0 && size <= kMaxUpsampleSize);
+ auto* const pixel_buffer = static_cast<Pixel*>(buffer);
+ Pixel temp[kMaxUpsampleSize + 3];
+ temp[0] = temp[1] = pixel_buffer[-1];
+ memcpy(temp + 2, pixel_buffer, sizeof(temp[0]) * size);
+ temp[size + 2] = pixel_buffer[size - 1];
+
+ pixel_buffer[-2] = temp[0];
+ for (int i = 0; i < size; ++i) {
+ const int sum =
+ -temp[i] + (9 * temp[i + 1]) + (9 * temp[i + 2]) - temp[i + 3];
+ pixel_buffer[2 * i - 1] =
+ Clip3(RightShiftWithRounding(sum, 4), 0, (1 << bitdepth) - 1);
+ pixel_buffer[2 * i] = temp[i + 2];
+ }
+}
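+
+// The upsampler above doubles the edge resolution with the 4-tap
+// (-1, 9, 9, -1) / 16 kernel: original samples are kept at even positions and
+// each new sample between them is interpolated from its four nearest
+// neighbors. For example, neighbors 10, 10, 20, 30 produce
+// (-10 + 90 + 180 - 30 + 8) >> 4 = 14, clipped to the pixel range.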
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->intra_edge_filter = IntraEdgeFilter_C<uint8_t>;
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<8, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeFilter
+ dsp->intra_edge_filter = IntraEdgeFilter_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeUpsampler
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>;
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<10, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_IntraEdgeFilter
+ dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_IntraEdgeUpsampler
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+} // namespace
+
+void IntraEdgeInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/intra_edge.h b/src/dsp/intra_edge.h
new file mode 100644
index 0000000..172ecbb
--- /dev/null
+++ b/src/dsp/intra_edge.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRA_EDGE_H_
+#define LIBGAV1_SRC_DSP_INTRA_EDGE_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intra_edge_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc.
+// The order of the includes is important, as each one tests for a superior
+// version before setting the base.
+// clang-format off
+#include "src/dsp/x86/intra_edge_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_edge_filter and Dsp::intra_edge_upsampler. This
+// function is not thread-safe.
+void IntraEdgeInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_INTRA_EDGE_H_
diff --git a/src/dsp/intrapred.cc b/src/dsp/intrapred.cc
new file mode 100644
index 0000000..4bcb580
--- /dev/null
+++ b/src/dsp/intrapred.cc
@@ -0,0 +1,2911 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring> // memset
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr TransformSize kTransformSizesLargerThan32x32[] = {
+ kTransformSize16x64, kTransformSize32x64, kTransformSize64x16,
+ kTransformSize64x32, kTransformSize64x64};
+
+template <int block_width, int block_height, typename Pixel>
+struct IntraPredFuncs_C {
+ IntraPredFuncs_C() = delete;
+
+ static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Vertical(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Horizontal(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Paeth(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Smooth(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void SmoothVertical(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void SmoothHorizontal(void* dest, ptrdiff_t stride,
+ const void* top_row, const void* left_column);
+};
+
+// Intra-predictors that require bitdepth.
+template <int block_width, int block_height, int bitdepth, typename Pixel>
+struct IntraPredBppFuncs_C {
+ IntraPredBppFuncs_C() = delete;
+
+ static void DcFill(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+};
+
+//------------------------------------------------------------------------------
+// IntraPredFuncs_C::DcPred
+
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::DcTop(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* /*left_column*/) {
+ int sum = block_width >> 1; // rounder
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ for (int x = 0; x < block_width; ++x) sum += top[x];
+ const int dc = sum >> FloorLog2(block_width);
+
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int y = 0; y < block_height; ++y) {
+ Memset(dst, dc, block_width);
+ dst += stride;
+ }
+}
+
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::DcLeft(
+ void* const dest, ptrdiff_t stride, const void* /*top_row*/,
+ const void* const left_column) {
+ int sum = block_height >> 1; // rounder
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ for (int y = 0; y < block_height; ++y) sum += left[y];
+ const int dc = sum >> FloorLog2(block_height);
+
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int y = 0; y < block_height; ++y) {
+ Memset(dst, dc, block_width);
+ dst += stride;
+ }
+}
+
+// Note for square blocks the divide in the Dc() function reduces to a shift.
+// For rectangular block sizes the following multipliers can be used with the
+// corresponding shifts.
+// 8-bit
+//  1:2 (e.g., 4x8): scale = 0x5556
+// 1:4 (e.g., 4x16): scale = 0x3334
+// final_descale = 16
+// 10/12-bit
+// 1:2: scale = 0xaaab
+// 1:4: scale = 0x6667
+// final_descale = 17
+// Note these may be halved to the values used in 8-bit in all cases except
+// when bitdepth == 12 and block_width + block_height is divisible by 5 (as
+// opposed to 3).
+//
+// The calculation becomes:
+// (dc_sum >> intermediate_descale) * scale) >> final_descale
+// where intermediate_descale is:
+// sum = block_width + block_height
+// intermediate_descale =
+// (sum <= 20) ? 2 : (sum <= 40) ? 3 : (sum <= 80) ? 4 : 5
+//
+// The constants (multiplier and shifts) for a given block size are obtained
+// as follows:
+// - Let sum = block width + block height
+// - Shift 'sum' right until we reach an odd number
+// - Let the number of shifts for that block size be called
+//   'intermediate_descale' and let the odd number be 'd' (d has only 2
+//   possible values: d = 3 for a 1:2 rectangular block and d = 5 for a 1:4
+//   rectangular block).
+// - Find multipliers by dividing by 'd' using "Algorithm 1" in:
+// http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632
+// by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd
+// shift will be 16, regardless of the block size.
+// TODO(jzern): the base implementation could be updated to use this method.
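+//
+// Worked example for an 8-bit 4x8 block: sum = 12 becomes the odd number 3
+// after 2 shifts, so intermediate_descale = 2, d = 3 and scale = 0x5556
+// (roughly 2^16 / 3). The division then becomes
+//   dc = ((dc_sum >> 2) * 0x5556) >> 16
+// which reproduces dc_sum / 12 for every sum that can occur at this bitdepth.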
+
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Dc(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* const left_column) {
+ const int divisor = block_width + block_height;
+ int sum = divisor >> 1; // rounder
+
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ for (int x = 0; x < block_width; ++x) sum += top[x];
+ for (int y = 0; y < block_height; ++y) sum += left[y];
+
+ const int dc = sum / divisor;
+
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int y = 0; y < block_height; ++y) {
+ Memset(dst, dc, block_width);
+ dst += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// IntraPredFuncs_C directional predictors
+
+// IntraPredFuncs_C::Vertical -- apply top row vertically
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Vertical(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* /*left_column*/) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < block_height; ++y) {
+ memcpy(dst, top_row, block_width * sizeof(Pixel));
+ dst += stride;
+ }
+}
+
+// IntraPredFuncs_C::Horizontal -- apply left column horizontally
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Horizontal(
+ void* const dest, ptrdiff_t stride, const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int y = 0; y < block_height; ++y) {
+ Memset(dst, left[y], block_width);
+ dst += stride;
+ }
+}
+
+template <typename Pixel>
+inline Pixel Average(Pixel a, Pixel b) {
+ return static_cast<Pixel>((a + b + 1) >> 1);
+}
+
+template <typename Pixel>
+inline Pixel Average(Pixel a, Pixel b, Pixel c) {
+ return static_cast<Pixel>((a + 2 * b + c + 2) >> 2);
+}
+
+// IntraPredFuncs_C::Paeth
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Paeth(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ const Pixel top_left = top[-1];
+ const int top_left_x2 = top_left + top_left;
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ for (int y = 0; y < block_height; ++y) {
+ const int left_pixel = left[y];
+ for (int x = 0; x < block_width; ++x) {
+ // The Paeth filter selects the value closest to:
+ // top[x] + left[y] - top_left
+ // To calculate the absolute distance for the left value this would be:
+ // abs((top[x] + left[y] - top_left) - left[y])
+ // or, because left[y] cancels out:
+ // abs(top[x] - top_left)
+ const int left_dist = std::abs(top[x] - top_left);
+ const int top_dist = std::abs(left_pixel - top_left);
+ const int top_left_dist = std::abs(top[x] + left_pixel - top_left_x2);
+
+ // Select the closest value to the initial estimate of 'T + L - TL'.
+ if (left_dist <= top_dist && left_dist <= top_left_dist) {
+ dst[x] = left_pixel;
+ } else if (top_dist <= top_left_dist) {
+ dst[x] = top[x];
+ } else {
+ dst[x] = top_left;
+ }
+ }
+ dst += stride;
+ }
+}
+
+constexpr uint8_t kSmoothWeights[] = {
+ // block dimension = 4
+ 255, 149, 85, 64,
+ // block dimension = 8
+ 255, 197, 146, 105, 73, 50, 37, 32,
+ // block dimension = 16
+ 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+ // block dimension = 32
+ 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+ 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+ // block dimension = 64
+ 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+ 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
+ 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
+ 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4};
+
+// IntraPredFuncs_C::Smooth
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Smooth(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ const Pixel top_right = top[block_width - 1];
+ const Pixel bottom_left = left[block_height - 1];
+ static_assert(
+ block_width >= 4 && block_height >= 4,
+ "Weights for smooth predictor undefined for block width/height < 4");
+ const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
+ const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
+ const uint16_t scale_value = (1 << kSmoothWeightScale);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ assert(scale_value >= weights_y[y] && scale_value >= weights_x[x]);
+ uint32_t pred = weights_y[y] * top[x];
+ pred += weights_x[x] * left[y];
+ pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
+ pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
+ // The maximum value of pred with the rounder is 2^9 * (2^bitdepth - 1)
+ // + 256. With the descale there's no need for saturation.
+ dst[x] = static_cast<Pixel>(
+ RightShiftWithRounding(pred, kSmoothWeightScale + 1));
+ }
+ dst += stride;
+ }
+}
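+
+// In Smooth() the four weights applied to top[x], left[y], bottom_left and
+// top_right always sum to 2 << kSmoothWeightScale, so the shift by
+// kSmoothWeightScale + 1 returns the prediction to the pixel range. Assuming
+// kSmoothWeightScale is 8 (consistent with the 2^9 bound noted above), the
+// top-left pixel of a 4x4 block weights top[0] and left[0] by 255 each and
+// bottom_left and top_right by 1 each, out of a total of 512.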
+
+// IntraPredFuncs_C::SmoothVertical
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::SmoothVertical(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ const Pixel bottom_left = left[block_height - 1];
+ static_assert(block_height >= 4,
+ "Weights for smooth predictor undefined for block height < 4");
+ const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
+ const uint16_t scale_value = (1 << kSmoothWeightScale);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ assert(scale_value >= weights_y[y]);
+ uint32_t pred = weights_y[y] * top[x];
+ pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
+ dst[x] =
+ static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
+ }
+ dst += stride;
+ }
+}
+
+// IntraPredFuncs_C::SmoothHorizontal
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::SmoothHorizontal(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ const Pixel top_right = top[block_width - 1];
+ static_assert(block_width >= 4,
+ "Weights for smooth predictor undefined for block width < 4");
+ const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
+ const uint16_t scale_value = (1 << kSmoothWeightScale);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ assert(scale_value >= weights_x[x]);
+ uint32_t pred = weights_x[x] * left[y];
+ pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
+ dst[x] =
+ static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
+ }
+ dst += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// IntraPredBppFuncs_C
+template <int fill, typename Pixel>
+inline void DcFill_C(void* const dest, ptrdiff_t stride, const int block_width,
+ const int block_height) {
+ static_assert(sizeof(Pixel) == 1 || sizeof(Pixel) == 2,
+ "Only 1 & 2 byte pixels are supported");
+
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int y = 0; y < block_height; ++y) {
+ Memset(dst, fill, block_width);
+ dst += stride;
+ }
+}
+
+template <int block_width, int block_height, int bitdepth, typename Pixel>
+void IntraPredBppFuncs_C<block_width, block_height, bitdepth, Pixel>::DcFill(
+ void* const dest, ptrdiff_t stride, const void* /*top_row*/,
+ const void* /*left_column*/) {
+ DcFill_C<0x80 << (bitdepth - 8), Pixel>(dest, stride, block_width,
+ block_height);
+}
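+
+// DcFill writes the mid-range value for the bit depth, 0x80 << (bitdepth - 8):
+// 128 at 8bpp and 512 at 10bpp, filled one row at a time with Memset().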
+
+//------------------------------------------------------------------------------
+// FilterIntraPredictor_C
+
+template <int bitdepth, typename Pixel>
+void FilterIntraPredictor_C(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column,
+ const FilterIntraPredictor pred, const int width,
+ const int height) {
+ const int kMaxPixel = (1 << bitdepth) - 1;
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+
+ assert(width <= 32 && height <= 32);
+
+ Pixel buffer[3][33]; // Caches the two output rows plus the top/left edges.
+ memcpy(buffer[0], &top[-1], (width + 1) * sizeof(top[0]));
+
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ int row0 = 0, row2 = 2;
+ int ystep = 1;
+ int y = 0;
+ do {
+ buffer[1][0] = left[y];
+ buffer[row2][0] = left[y + 1];
+ int x = 1;
+ do {
+ const Pixel p0 = buffer[row0][x - 1]; // top-left
+ const Pixel p1 = buffer[row0][x + 0]; // top 0
+ const Pixel p2 = buffer[row0][x + 1]; // top 1
+ const Pixel p3 = buffer[row0][x + 2]; // top 2
+ const Pixel p4 = buffer[row0][x + 3]; // top 3
+ const Pixel p5 = buffer[1][x - 1]; // left 0
+ const Pixel p6 = buffer[row2][x - 1]; // left 1
+ for (int i = 0; i < 8; ++i) {
+ const int xoffset = i & 0x03;
+ const int yoffset = (i >> 2) * ystep;
+ const int value = kFilterIntraTaps[pred][i][0] * p0 +
+ kFilterIntraTaps[pred][i][1] * p1 +
+ kFilterIntraTaps[pred][i][2] * p2 +
+ kFilterIntraTaps[pred][i][3] * p3 +
+ kFilterIntraTaps[pred][i][4] * p4 +
+ kFilterIntraTaps[pred][i][5] * p5 +
+ kFilterIntraTaps[pred][i][6] * p6;
+ buffer[1 + yoffset][x + xoffset] = static_cast<Pixel>(
+ Clip3(RightShiftWithRounding(value, 4), 0, kMaxPixel));
+ }
+ x += 4;
+ } while (x < width);
+ memcpy(dst, &buffer[1][1], width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, &buffer[row2][1], width * sizeof(dst[0]));
+ dst += stride;
+
+ // The second output row becomes the top row for the next pass.
+ row0 ^= 2;
+ row2 ^= 2;
+ ystep = -ystep;
+ y += 2;
+ } while (y < height);
+}
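+
+// Flow of the recursive filter above: each inner step produces a 4x2 patch
+// from seven neighbors (p0..p6) using the eight tap sets in
+// kFilterIntraTaps[pred]. The three-row |buffer| works as a ping-pong: once a
+// pair of rows has been copied out, row0/row2 (and the sign of ystep) are
+// toggled so the lower of the two computed rows becomes the top reference for
+// the next pair.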
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor_C
+
+// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive.
+// |alpha| can be -16 to 16 (inclusive).
+template <int block_width, int block_height, int bitdepth, typename Pixel>
+void CflIntraPredictor_C(
+ void* const dest, ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<Pixel*>(dest);
+ const int dc = dst[0];
+ stride /= sizeof(Pixel);
+ const int max_value = (1 << bitdepth) - 1;
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ assert(luma[y][x] >= -(((1 << bitdepth) - 1) << 3));
+ assert(luma[y][x] <= ((1 << bitdepth) - 1) << 3);
+ dst[x] = Clip3(dc + RightShiftWithRoundingSigned(alpha * luma[y][x], 6),
+ 0, max_value);
+ }
+ dst += stride;
+ }
+}
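+
+// A worked example of the CfL blend above with hypothetical 8bpp values:
+// dc = dst[0] = 100, alpha = 8 and luma[y][x] = 96 give
+// RightShiftWithRoundingSigned(8 * 96, 6) == 12, so
+// dst[x] = Clip3(112, 0, 255) == 112. A negative |alpha| or |luma| pulls the
+// result below the DC value instead.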
+
+//------------------------------------------------------------------------------
+// CflSubsampler_C
+
+template <int block_width, int block_height, int bitdepth, typename Pixel,
+ int subsampling_x, int subsampling_y>
+void CflSubsampler_C(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const auto* src = static_cast<const Pixel*>(source);
+ stride /= sizeof(Pixel);
+ int sum = 0;
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ const ptrdiff_t luma_x =
+ std::min(x << subsampling_x, max_luma_width - (1 << subsampling_x));
+ const ptrdiff_t luma_x_next = luma_x + stride;
+ luma[y][x] =
+ (src[luma_x] + ((subsampling_x != 0) ? src[luma_x + 1] : 0) +
+ ((subsampling_y != 0) ? (src[luma_x_next] + src[luma_x_next + 1])
+ : 0))
+ << (3 - subsampling_x - subsampling_y);
+ sum += luma[y][x];
+ }
+ if ((y << subsampling_y) < (max_luma_height - (1 << subsampling_y))) {
+ src += stride << subsampling_y;
+ }
+ }
+ const int average = RightShiftWithRounding(
+ sum, FloorLog2(block_width) + FloorLog2(block_height));
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ luma[y][x] -= average;
+ }
+ }
+}
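+
+// Note on the scaling above: each entry is shifted left by
+// (3 - subsampling_x - subsampling_y), so 4:2:0 (4 source pixels, shift 1),
+// 4:2:2 (2 pixels, shift 2) and 4:4:4 (1 pixel, shift 3) all end up on the
+// same "luma << 3" scale before the block average is subtracted.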
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone1_C(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const int width, const int height,
+ const int xstep,
+ const bool upsampled_top) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ assert(xstep > 0);
+
+ // If xstep == 64 then |shift| always evaluates to 0 which sets |val| to
+ // |top[top_base_x]|. This corresponds to a 45 degree prediction.
+ if (xstep == 64) {
+ // 7.11.2.10. Intra edge upsample selection process
+ // if ( d <= 0 || d >= 40 ) useUpsample = 0
+ // For |upsampled_top| the delta is |predictor_angle - 90|. Since the
+ // |predictor_angle| is 45 the delta is also 45.
+ assert(!upsampled_top);
+ const Pixel* top_ptr = top + 1;
+ for (int y = 0; y < height; ++y, dst += stride, ++top_ptr) {
+ memcpy(dst, top_ptr, sizeof(*top_ptr) * width);
+ }
+ return;
+ }
+
+ const int upsample_shift = static_cast<int>(upsampled_top);
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+ const int scale_bits = 6 - upsample_shift;
+ const int base_step = 1 << upsample_shift;
+ int top_x = xstep;
+ int y = 0;
+ do {
+ int top_base_x = top_x >> scale_bits;
+
+ if (top_base_x >= max_base_x) {
+ for (int i = y; i < height; ++i) {
+ Memset(dst, top[max_base_x], width);
+ dst += stride;
+ }
+ return;
+ }
+
+ const int shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+ int x = 0;
+ do {
+ if (top_base_x >= max_base_x) {
+ Memset(dst + x, top[max_base_x], width - x);
+ break;
+ }
+
+ const int val =
+ top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
+ dst[x] = RightShiftWithRounding(val, 5);
+ top_base_x += base_step;
+ } while (++x < width);
+
+ dst += stride;
+ top_x += xstep;
+ } while (++y < height);
+}
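+
+// Fixed-point walk in Zone1 above, assuming no upsampling and a hypothetical
+// xstep of 80: for row y, top_x = 80 * (y + 1), top_base_x = top_x >> 6, and
+// shift = (top_x & 0x3F) >> 1 is a 0..31 weight used to interpolate between
+// top[top_base_x] and top[top_base_x + 1] before the >> 5 descale.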
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone2_C(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep,
+ const bool upsampled_top,
+ const bool upsampled_left) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ assert(xstep > 0);
+ assert(ystep > 0);
+
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int scale_bits_x = 6 - upsample_top_shift;
+ const int scale_bits_y = 6 - upsample_left_shift;
+ const int min_base_x = -(1 << upsample_top_shift);
+ const int base_step_x = 1 << upsample_top_shift;
+ int y = 0;
+ int top_x = -xstep;
+ do {
+ int top_base_x = top_x >> scale_bits_x;
+ int left_y = (y << 6) - ystep;
+ int x = 0;
+ do {
+ int val;
+ if (top_base_x >= min_base_x) {
+ const int shift = ((top_x * (1 << upsample_top_shift)) & 0x3F) >> 1;
+ val = top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
+ } else {
+ // Note this assumes an arithmetic shift to handle negative values.
+ const int left_base_y = left_y >> scale_bits_y;
+ const int shift = ((left_y * (1 << upsample_left_shift)) & 0x3F) >> 1;
+ assert(left_base_y >= -(1 << upsample_left_shift));
+ val = left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
+ }
+ dst[x] = RightShiftWithRounding(val, 5);
+ top_base_x += base_step_x;
+ left_y -= ystep;
+ } while (++x < width);
+
+ top_x -= xstep;
+ dst += stride;
+ } while (++y < height);
+}
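+
+// Zone2 above selects the reference per pixel: while the projected column
+// top_base_x is still >= min_base_x the top row is interpolated; once it
+// falls below, the same 2-tap scheme is applied down the left column via
+// left_y / left_base_y, which is why an arithmetic right shift is assumed for
+// the negative values.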
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone3_C(void* const dest, ptrdiff_t stride,
+ const void* const left_column,
+ const int width, const int height,
+ const int ystep,
+ const bool upsampled_left) {
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ stride /= sizeof(Pixel);
+
+ assert(ystep > 0);
+
+ const int upsample_shift = static_cast<int>(upsampled_left);
+ const int scale_bits = 6 - upsample_shift;
+ const int base_step = 1 << upsample_shift;
+ // Zone3 never runs out of left_column values.
+ assert((width + height - 1) << upsample_shift > // max_base_y
+ ((ystep * width) >> scale_bits) +
+ base_step * (height - 1)); // left_base_y
+
+ int left_y = ystep;
+ int x = 0;
+ do {
+ auto* dst = static_cast<Pixel*>(dest);
+
+ int left_base_y = left_y >> scale_bits;
+ int y = 0;
+ do {
+ const int shift = ((left_y << upsample_shift) & 0x3F) >> 1;
+ const int val =
+ left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
+ dst[x] = RightShiftWithRounding(val, 5);
+ dst += stride;
+ left_base_y += base_step;
+ } while (++y < height);
+
+ left_y += ystep;
+ } while (++x < width);
+}
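+
+// Zone3 above walks column-major: the outer loop fixes x, and each step down
+// a column advances left_base_y by base_step and blends left[left_base_y]
+// with left[left_base_y + 1]. The assert before the loops guarantees the left
+// column never runs out, so no clamping path is needed (unlike Zone1).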
+
+//------------------------------------------------------------------------------
+
+template <typename Pixel>
+struct IntraPredDefs {
+ IntraPredDefs() = delete;
+
+ using _4x4 = IntraPredFuncs_C<4, 4, Pixel>;
+ using _4x8 = IntraPredFuncs_C<4, 8, Pixel>;
+ using _4x16 = IntraPredFuncs_C<4, 16, Pixel>;
+ using _8x4 = IntraPredFuncs_C<8, 4, Pixel>;
+ using _8x8 = IntraPredFuncs_C<8, 8, Pixel>;
+ using _8x16 = IntraPredFuncs_C<8, 16, Pixel>;
+ using _8x32 = IntraPredFuncs_C<8, 32, Pixel>;
+ using _16x4 = IntraPredFuncs_C<16, 4, Pixel>;
+ using _16x8 = IntraPredFuncs_C<16, 8, Pixel>;
+ using _16x16 = IntraPredFuncs_C<16, 16, Pixel>;
+ using _16x32 = IntraPredFuncs_C<16, 32, Pixel>;
+ using _16x64 = IntraPredFuncs_C<16, 64, Pixel>;
+ using _32x8 = IntraPredFuncs_C<32, 8, Pixel>;
+ using _32x16 = IntraPredFuncs_C<32, 16, Pixel>;
+ using _32x32 = IntraPredFuncs_C<32, 32, Pixel>;
+ using _32x64 = IntraPredFuncs_C<32, 64, Pixel>;
+ using _64x16 = IntraPredFuncs_C<64, 16, Pixel>;
+ using _64x32 = IntraPredFuncs_C<64, 32, Pixel>;
+ using _64x64 = IntraPredFuncs_C<64, 64, Pixel>;
+};
+
+template <int bitdepth, typename Pixel>
+struct IntraPredBppDefs {
+ IntraPredBppDefs() = delete;
+
+ using _4x4 = IntraPredBppFuncs_C<4, 4, bitdepth, Pixel>;
+ using _4x8 = IntraPredBppFuncs_C<4, 8, bitdepth, Pixel>;
+ using _4x16 = IntraPredBppFuncs_C<4, 16, bitdepth, Pixel>;
+ using _8x4 = IntraPredBppFuncs_C<8, 4, bitdepth, Pixel>;
+ using _8x8 = IntraPredBppFuncs_C<8, 8, bitdepth, Pixel>;
+ using _8x16 = IntraPredBppFuncs_C<8, 16, bitdepth, Pixel>;
+ using _8x32 = IntraPredBppFuncs_C<8, 32, bitdepth, Pixel>;
+ using _16x4 = IntraPredBppFuncs_C<16, 4, bitdepth, Pixel>;
+ using _16x8 = IntraPredBppFuncs_C<16, 8, bitdepth, Pixel>;
+ using _16x16 = IntraPredBppFuncs_C<16, 16, bitdepth, Pixel>;
+ using _16x32 = IntraPredBppFuncs_C<16, 32, bitdepth, Pixel>;
+ using _16x64 = IntraPredBppFuncs_C<16, 64, bitdepth, Pixel>;
+ using _32x8 = IntraPredBppFuncs_C<32, 8, bitdepth, Pixel>;
+ using _32x16 = IntraPredBppFuncs_C<32, 16, bitdepth, Pixel>;
+ using _32x32 = IntraPredBppFuncs_C<32, 32, bitdepth, Pixel>;
+ using _32x64 = IntraPredBppFuncs_C<32, 64, bitdepth, Pixel>;
+ using _64x16 = IntraPredBppFuncs_C<64, 16, bitdepth, Pixel>;
+ using _64x32 = IntraPredBppFuncs_C<64, 32, bitdepth, Pixel>;
+ using _64x64 = IntraPredBppFuncs_C<64, 64, bitdepth, Pixel>;
+};
+
+using Defs = IntraPredDefs<uint8_t>;
+using Defs8bpp = IntraPredBppDefs<8, uint8_t>;
+
+// Initializes dsp entries for kTransformSize|W|x|H| from |DEFS|/|DEFSBPP| of
+// the same size.
+#define INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, W, H) \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDcFill] = \
+ DEFSBPP::_##W##x##H::DcFill; \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDcTop] = \
+ DEFS::_##W##x##H::DcTop; \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDcLeft] = \
+ DEFS::_##W##x##H::DcLeft; \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDc] = \
+ DEFS::_##W##x##H::Dc; \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorVertical] = \
+ DEFS::_##W##x##H::Vertical; \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorHorizontal] = \
+ DEFS::_##W##x##H::Horizontal; \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorPaeth] = \
+ DEFS::_##W##x##H::Paeth; \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorSmooth] = \
+ DEFS::_##W##x##H::Smooth; \
+ dsp->intra_predictors[kTransformSize##W##x##H] \
+ [kIntraPredictorSmoothVertical] = \
+ DEFS::_##W##x##H::SmoothVertical; \
+ dsp->intra_predictors[kTransformSize##W##x##H] \
+ [kIntraPredictorSmoothHorizontal] = \
+ DEFS::_##W##x##H::SmoothHorizontal
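+
+// For reference, INIT_INTRAPREDICTORS_WxH(Defs, Defs8bpp, 4, 4) expands to
+// assignments of the form
+//   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+//       Defs::_4x4::Dc;
+// with DcFill taken from Defs8bpp, the only bitdepth-dependent entry.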
+
+#define INIT_INTRAPREDICTORS(DEFS, DEFSBPP) \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 4); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 8); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 16); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 4); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 8); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 16); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 32); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 4); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 8); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 16); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 32); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 64); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 8); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 16); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 32); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 64); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 16); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 32); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 64)
+
+#define INIT_CFL_INTRAPREDICTOR_WxH(W, H, BITDEPTH, PIXEL) \
+ dsp->cfl_intra_predictors[kTransformSize##W##x##H] = \
+ CflIntraPredictor_C<W, H, BITDEPTH, PIXEL>; \
+ dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType444] = \
+ CflSubsampler_C<W, H, BITDEPTH, PIXEL, 0, 0>; \
+ dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType422] = \
+ CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 0>; \
+ dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType420] = \
+ CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 1>
+
+#define INIT_CFL_INTRAPREDICTORS(BITDEPTH, PIXEL) \
+ INIT_CFL_INTRAPREDICTOR_WxH(4, 4, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(4, 8, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(4, 16, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(8, 4, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(8, 8, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(8, 16, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(8, 32, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(16, 4, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(16, 8, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(16, 16, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(16, 32, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(32, 8, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(32, 16, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(32, 32, BITDEPTH, PIXEL)
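+
+// INIT_CFL_INTRAPREDICTORS stops at 32x32 because CfL is only defined for
+// transform sizes whose larger dimension is <= 32; the remaining entries are
+// set to nullptr at the end of Init8bpp() below.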
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_INTRAPREDICTORS(Defs, Defs8bpp);
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint8_t>;
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint8_t>;
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint8_t>;
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>;
+ INIT_CFL_INTRAPREDICTORS(8, uint8_t);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
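+// In this configuration a C function is registered only when the matching
+// LIBGAV1_Dsp8bpp_* macro is not defined; a defined macro indicates that a
+// specialized (typically SIMD) version will be installed instead, so the C
+// fallback can be left out of the binary.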
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
+ Defs8bpp::_4x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ Defs::_4x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ Defs::_4x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] = Defs::_4x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+ Defs::_4x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+ Defs::_4x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+ Defs::_4x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+ Defs::_4x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+ Defs::_4x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+ Defs::_4x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] =
+ Defs8bpp::_4x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+ Defs::_4x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+ Defs::_4x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] = Defs::_4x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+ Defs::_4x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ Defs::_4x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+ Defs::_4x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+ Defs::_4x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+ Defs::_4x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+ Defs::_4x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] =
+ Defs8bpp::_4x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+ Defs::_4x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+ Defs::_4x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+ Defs::_4x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
+ Defs::_4x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ Defs::_4x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+ Defs::_4x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+ Defs::_4x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+ Defs::_4x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_4x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] =
+ Defs8bpp::_8x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+ Defs::_8x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+ Defs::_8x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] = Defs::_8x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
+ Defs::_8x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+ Defs::_8x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+ Defs::_8x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+ Defs::_8x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+ Defs::_8x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+ Defs::_8x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] =
+ Defs8bpp::_8x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+ Defs::_8x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+ Defs::_8x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] = Defs::_8x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
+ Defs::_8x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ Defs::_8x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+ Defs::_8x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+ Defs::_8x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+ Defs::_8x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+ Defs::_8x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] =
+ Defs8bpp::_8x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+ Defs::_8x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+ Defs::_8x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+ Defs::_8x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
+ Defs::_8x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+ Defs::_8x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+ Defs::_8x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+ Defs::_8x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+ Defs::_8x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_8x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] =
+ Defs8bpp::_8x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+ Defs::_8x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+ Defs::_8x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+ Defs::_8x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
+ Defs::_8x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ Defs::_8x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+ Defs::_8x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+ Defs::_8x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+ Defs::_8x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+ Defs::_8x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] =
+ Defs8bpp::_16x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+ Defs::_16x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+ Defs::_16x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+ Defs::_16x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
+ Defs::_16x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+ Defs::_16x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+ Defs::_16x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+ Defs::_16x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+ Defs::_16x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] =
+ Defs8bpp::_16x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+ Defs::_16x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+ Defs::_16x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+ Defs::_16x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
+ Defs::_16x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ Defs::_16x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+ Defs::_16x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+ Defs::_16x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+ Defs::_16x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] =
+ Defs8bpp::_16x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+ Defs::_16x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+ Defs::_16x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+ Defs::_16x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
+ Defs::_16x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+ Defs::_16x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+ Defs::_16x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+ Defs::_16x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+ Defs::_16x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] =
+ Defs8bpp::_16x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+ Defs::_16x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+ Defs::_16x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+ Defs::_16x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
+ Defs::_16x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+ Defs::_16x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+ Defs::_16x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+ Defs::_16x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+ Defs::_16x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] =
+ Defs8bpp::_16x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+ Defs::_16x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+ Defs::_16x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+ Defs::_16x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
+ Defs::_16x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+ Defs::_16x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+ Defs::_16x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+ Defs::_16x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+ Defs::_16x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] =
+ Defs8bpp::_32x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+ Defs::_32x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+ Defs::_32x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+ Defs::_32x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
+ Defs::_32x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+ Defs::_32x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+ Defs::_32x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+ Defs::_32x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+ Defs::_32x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+ Defs::_32x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] =
+ Defs8bpp::_32x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+ Defs::_32x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+ Defs::_32x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+ Defs::_32x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
+ Defs::_32x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+ Defs::_32x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+ Defs::_32x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+ Defs::_32x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+ Defs::_32x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_32x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] =
+ Defs8bpp::_32x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+ Defs::_32x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+ Defs::_32x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+ Defs::_32x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
+ Defs::_32x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+ Defs::_32x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+ Defs::_32x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+ Defs::_32x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+ Defs::_32x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+ Defs::_32x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] =
+ Defs8bpp::_32x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+ Defs::_32x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+ Defs::_32x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+ Defs::_32x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
+ Defs::_32x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ Defs::_32x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+ Defs::_32x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+ Defs::_32x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+ Defs::_32x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+ Defs::_32x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] =
+ Defs8bpp::_64x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+ Defs::_64x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+ Defs::_64x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+ Defs::_64x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
+ Defs::_64x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+ Defs::_64x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+ Defs::_64x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+ Defs::_64x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+ Defs::_64x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_64x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] =
+ Defs8bpp::_64x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+ Defs::_64x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+ Defs::_64x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+ Defs::_64x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
+ Defs::_64x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+ Defs::_64x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+ Defs::_64x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+ Defs::_64x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+ Defs::_64x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+ Defs::_64x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] =
+ Defs8bpp::_64x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+ Defs::_64x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+ Defs::_64x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+ Defs::_64x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
+ Defs::_64x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+ Defs::_64x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+ Defs::_64x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+ Defs::_64x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+ Defs::_64x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+ Defs::_64x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint8_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x4] =
+ CflIntraPredictor_C<4, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler_C<4, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
+ CflSubsampler_C<4, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler_C<4, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x8] =
+ CflIntraPredictor_C<4, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler_C<4, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
+ CflSubsampler_C<4, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler_C<4, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x16] =
+ CflIntraPredictor_C<4, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler_C<4, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
+ CflSubsampler_C<4, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler_C<4, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x4] =
+ CflIntraPredictor_C<8, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler_C<8, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
+ CflSubsampler_C<8, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler_C<8, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x8] =
+ CflIntraPredictor_C<8, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler_C<8, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
+ CflSubsampler_C<8, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler_C<8, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x16] =
+ CflIntraPredictor_C<8, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler_C<8, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
+ CflSubsampler_C<8, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler_C<8, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x32] =
+ CflIntraPredictor_C<8, 32, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler_C<8, 32, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
+ CflSubsampler_C<8, 32, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler_C<8, 32, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x4] =
+ CflIntraPredictor_C<16, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler_C<16, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
+ CflSubsampler_C<16, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler_C<16, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x8] =
+ CflIntraPredictor_C<16, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler_C<16, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
+ CflSubsampler_C<16, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler_C<16, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor_C<16, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler_C<16, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
+ CflSubsampler_C<16, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler_C<16, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor_C<16, 32, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler_C<16, 32, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
+ CflSubsampler_C<16, 32, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler_C<16, 32, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x8] =
+ CflIntraPredictor_C<32, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler_C<32, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
+ CflSubsampler_C<32, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler_C<32, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor_C<32, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler_C<32, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
+ CflSubsampler_C<32, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler_C<32, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor_C<32, 32, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler_C<32, 32, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
+ CflSubsampler_C<32, 32, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler_C<32, 32, 8, uint8_t, 1, 1>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ // Cfl predictors are available only for transform sizes with max(width,
+ // height) <= 32. Set all others to nullptr.
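+  // (Editorial note, not part of the upstream comment: for AV1 these are the
+  // sizes with a 64-sample dimension, i.e. 16x64, 64x16, 32x64, 64x32 and
+  // 64x64.)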
+ for (const auto i : kTransformSizesLargerThan32x32) {
+ dsp->cfl_intra_predictors[i] = nullptr;
+ for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+ dsp->cfl_subsamplers[i][j] = nullptr;
+ }
+ }
+} // NOLINT(readability/fn_size)
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using DefsHbd = IntraPredDefs<uint16_t>;
+using Defs10bpp = IntraPredBppDefs<10, uint16_t>;
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_INTRAPREDICTORS(DefsHbd, Defs10bpp);
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint16_t>;
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint16_t>;
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint16_t>;
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>;
+ INIT_CFL_INTRAPREDICTORS(10, uint16_t);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
+ Defs10bpp::_4x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ DefsHbd::_4x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ DefsHbd::_4x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+ DefsHbd::_4x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+ DefsHbd::_4x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+ DefsHbd::_4x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+ DefsHbd::_4x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+ DefsHbd::_4x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+ DefsHbd::_4x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_4x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] =
+ Defs10bpp::_4x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+ DefsHbd::_4x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+ DefsHbd::_4x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+ DefsHbd::_4x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+ DefsHbd::_4x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ DefsHbd::_4x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+ DefsHbd::_4x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+ DefsHbd::_4x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_4x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_4x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] =
+ Defs10bpp::_4x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+ DefsHbd::_4x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+ DefsHbd::_4x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+ DefsHbd::_4x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
+ DefsHbd::_4x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ DefsHbd::_4x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+ DefsHbd::_4x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+ DefsHbd::_4x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_4x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_4x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] =
+ Defs10bpp::_8x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+ DefsHbd::_8x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+ DefsHbd::_8x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+ DefsHbd::_8x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
+ DefsHbd::_8x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+ DefsHbd::_8x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+ DefsHbd::_8x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+ DefsHbd::_8x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] =
+ Defs10bpp::_8x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+ DefsHbd::_8x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+ DefsHbd::_8x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+ DefsHbd::_8x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
+ DefsHbd::_8x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ DefsHbd::_8x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+ DefsHbd::_8x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+ DefsHbd::_8x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] =
+ Defs10bpp::_8x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+ DefsHbd::_8x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+ DefsHbd::_8x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+ DefsHbd::_8x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
+ DefsHbd::_8x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+ DefsHbd::_8x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+ DefsHbd::_8x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+ DefsHbd::_8x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] =
+ Defs10bpp::_8x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+ DefsHbd::_8x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+ DefsHbd::_8x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+ DefsHbd::_8x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
+ DefsHbd::_8x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ DefsHbd::_8x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+ DefsHbd::_8x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+ DefsHbd::_8x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] =
+ Defs10bpp::_16x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+ DefsHbd::_16x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+ DefsHbd::_16x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+ DefsHbd::_16x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
+ DefsHbd::_16x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+ DefsHbd::_16x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+ DefsHbd::_16x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+ DefsHbd::_16x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] =
+ Defs10bpp::_16x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+ DefsHbd::_16x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+ DefsHbd::_16x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+ DefsHbd::_16x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
+ DefsHbd::_16x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ DefsHbd::_16x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+ DefsHbd::_16x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+ DefsHbd::_16x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] =
+ Defs10bpp::_16x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+ DefsHbd::_16x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+ DefsHbd::_16x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+ DefsHbd::_16x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
+ DefsHbd::_16x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+ DefsHbd::_16x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+ DefsHbd::_16x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+ DefsHbd::_16x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] =
+ Defs10bpp::_16x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+ DefsHbd::_16x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+ DefsHbd::_16x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+ DefsHbd::_16x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
+ DefsHbd::_16x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+ DefsHbd::_16x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+ DefsHbd::_16x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+ DefsHbd::_16x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] =
+ Defs10bpp::_16x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+ DefsHbd::_16x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+ DefsHbd::_16x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+ DefsHbd::_16x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
+ DefsHbd::_16x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+ DefsHbd::_16x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+ DefsHbd::_16x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+ DefsHbd::_16x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] =
+ Defs10bpp::_32x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+ DefsHbd::_32x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+ DefsHbd::_32x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+ DefsHbd::_32x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
+ DefsHbd::_32x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+ DefsHbd::_32x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+ DefsHbd::_32x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+ DefsHbd::_32x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] =
+ Defs10bpp::_32x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+ DefsHbd::_32x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+ DefsHbd::_32x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+ DefsHbd::_32x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
+ DefsHbd::_32x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+ DefsHbd::_32x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+ DefsHbd::_32x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+ DefsHbd::_32x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] =
+ Defs10bpp::_32x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+ DefsHbd::_32x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+ DefsHbd::_32x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+ DefsHbd::_32x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
+ DefsHbd::_32x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+ DefsHbd::_32x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+ DefsHbd::_32x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+ DefsHbd::_32x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] =
+ Defs10bpp::_32x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+ DefsHbd::_32x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+ DefsHbd::_32x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+ DefsHbd::_32x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
+ DefsHbd::_32x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ DefsHbd::_32x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+ DefsHbd::_32x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+ DefsHbd::_32x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] =
+ Defs10bpp::_64x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+ DefsHbd::_64x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+ DefsHbd::_64x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+ DefsHbd::_64x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
+ DefsHbd::_64x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+ DefsHbd::_64x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+ DefsHbd::_64x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+ DefsHbd::_64x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_64x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_64x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] =
+ Defs10bpp::_64x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+ DefsHbd::_64x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+ DefsHbd::_64x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+ DefsHbd::_64x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
+ DefsHbd::_64x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+ DefsHbd::_64x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+ DefsHbd::_64x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+ DefsHbd::_64x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_64x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_64x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] =
+ Defs10bpp::_64x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+ DefsHbd::_64x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+ DefsHbd::_64x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+ DefsHbd::_64x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
+ DefsHbd::_64x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+ DefsHbd::_64x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+ DefsHbd::_64x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+ DefsHbd::_64x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+ DefsHbd::_64x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_64x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_FilterIntraPredictor
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x4] =
+ CflIntraPredictor_C<4, 4, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler_C<4, 4, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
+ CflSubsampler_C<4, 4, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler_C<4, 4, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x8] =
+ CflIntraPredictor_C<4, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler_C<4, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
+ CflSubsampler_C<4, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler_C<4, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x16] =
+ CflIntraPredictor_C<4, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler_C<4, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
+ CflSubsampler_C<4, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler_C<4, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x4] =
+ CflIntraPredictor_C<8, 4, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler_C<8, 4, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
+ CflSubsampler_C<8, 4, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler_C<8, 4, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x8] =
+ CflIntraPredictor_C<8, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler_C<8, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
+ CflSubsampler_C<8, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler_C<8, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x16] =
+ CflIntraPredictor_C<8, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler_C<8, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
+ CflSubsampler_C<8, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler_C<8, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x32] =
+ CflIntraPredictor_C<8, 32, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler_C<8, 32, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
+ CflSubsampler_C<8, 32, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler_C<8, 32, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x4] =
+ CflIntraPredictor_C<16, 4, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler_C<16, 4, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
+ CflSubsampler_C<16, 4, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler_C<16, 4, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x8] =
+ CflIntraPredictor_C<16, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler_C<16, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
+ CflSubsampler_C<16, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler_C<16, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor_C<16, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler_C<16, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
+ CflSubsampler_C<16, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler_C<16, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor_C<16, 32, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler_C<16, 32, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
+ CflSubsampler_C<16, 32, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler_C<16, 32, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x8] =
+ CflIntraPredictor_C<32, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler_C<32, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
+ CflSubsampler_C<32, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler_C<32, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor_C<32, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler_C<32, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
+ CflSubsampler_C<32, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler_C<32, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor_C<32, 32, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler_C<32, 32, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
+ CflSubsampler_C<32, 32, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler_C<32, 32, 10, uint16_t, 1, 1>;
+#endif
+
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ // Cfl predictors are available only for transform sizes with max(width,
+ // height) <= 32. Set all others to nullptr.
+ for (const auto i : kTransformSizesLargerThan32x32) {
+ dsp->cfl_intra_predictors[i] = nullptr;
+ for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+ dsp->cfl_subsamplers[i][j] = nullptr;
+ }
+ }
+} // NOLINT(readability/fn_size)
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#undef INIT_CFL_INTRAPREDICTOR_WxH
+#undef INIT_CFL_INTRAPREDICTORS
+#undef INIT_INTRAPREDICTORS_WxH
+#undef INIT_INTRAPREDICTORS
+
+} // namespace
+
+void IntraPredInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/intrapred.h b/src/dsp/intrapred.h
new file mode 100644
index 0000000..c5286ef
--- /dev/null
+++ b/src/dsp/intrapred.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important because each header tests for a superior
+// version before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
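+
+// Editorial sketch (not part of the upstream source): when one of the headers
+// above provides an optimized implementation, it defines the corresponding
+// LIBGAV1_DspXXX macro, and the C fallback in intrapred.cc is then skipped:
+//   #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc
+//   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] = ...;  // C
+//   #endif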
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*,
+// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and
+// Dsp::filter_intra_predictor. This function is not thread-safe.
+void IntraPredInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_INTRAPRED_H_
diff --git a/src/dsp/inverse_transform.cc b/src/dsp/inverse_transform.cc
new file mode 100644
index 0000000..a03fad2
--- /dev/null
+++ b/src/dsp/inverse_transform.cc
@@ -0,0 +1,1636 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+constexpr uint8_t kTransformColumnShift = 4;
+
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#undef LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+#endif
+
+int32_t RangeCheckValue(int32_t value, int8_t range) {
+#if defined(LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK) && \
+ LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+ assert(range <= 32);
+ const int32_t min = -(1 << (range - 1));
+ const int32_t max = (1 << (range - 1)) - 1;
+ if (min > value || value > max) {
+ LIBGAV1_DLOG(ERROR, "coeff out of bit range, value: %d bit range %d\n",
+ value, range);
+ assert(min <= value && value <= max);
+ }
+#endif // LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+ static_cast<void>(range);
+ return value;
+}
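+
+// Editorial example (not part of the upstream source): with |range| = 16 the
+// check above accepts values in [-32768, 32767]. It compiles to a plain
+// pass-through unless LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK is defined to a
+// nonzero value.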
+
+template <typename Residual>
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_C(Residual* const dst, int a,
+ int b, int angle, bool flip,
+ int8_t range) {
+ // Note that we multiply in 32 bits and then add/subtract the products in 64
+ // bits. The 32-bit multiplications do not overflow. Please see the comment
+ // and assert() in Cos128().
+ const int64_t x = static_cast<int64_t>(dst[a] * Cos128(angle)) -
+ static_cast<int64_t>(dst[b] * Sin128(angle));
+ const int64_t y = static_cast<int64_t>(dst[a] * Sin128(angle)) +
+ static_cast<int64_t>(dst[b] * Cos128(angle));
+ // Section 7.13.2.1: It is a requirement of bitstream conformance that the
+ // values saved into the array T by this function are representable by a
+ // signed integer using |range| bits of precision.
+ dst[a] = RangeCheckValue(RightShiftWithRounding(flip ? y : x, 12), range);
+ dst[b] = RangeCheckValue(RightShiftWithRounding(flip ? x : y, 12), range);
+}
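+
+// Editorial note (not part of the upstream source): the butterfly above is the
+// planar rotation
+//   x = a * cos(angle * pi / 128) - b * sin(angle * pi / 128)
+//   y = a * sin(angle * pi / 128) + b * cos(angle * pi / 128)
+// where Cos128()/Sin128() are assumed to return the trigonometric values
+// scaled by 2^12, hence the rounding right-shift by 12 afterwards.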
+
+template <typename Residual>
+void ButterflyRotationFirstIsZero_C(Residual* const dst, int a, int b,
+ int angle, bool flip, int8_t range) {
+ // Note that we multiply in 32 bits and then add/subtract the products in 64
+ // bits. The 32-bit multiplications do not overflow. Please see the comment
+ // and assert() in Cos128().
+ const auto x = static_cast<int64_t>(dst[b] * -Sin128(angle));
+ const auto y = static_cast<int64_t>(dst[b] * Cos128(angle));
+ // Section 7.13.2.1: It is a requirement of bitstream conformance that the
+ // values saved into the array T by this function are representable by a
+ // signed integer using |range| bits of precision.
+ dst[a] = RangeCheckValue(RightShiftWithRounding(flip ? y : x, 12), range);
+ dst[b] = RangeCheckValue(RightShiftWithRounding(flip ? x : y, 12), range);
+}
+
+template <typename Residual>
+void ButterflyRotationSecondIsZero_C(Residual* const dst, int a, int b,
+ int angle, bool flip, int8_t range) {
+ // Note that we multiply in 32 bits and then add/subtract the products in 64
+ // bits. The 32-bit multiplications do not overflow. Please see the comment
+ // and assert() in Cos128().
+ const auto x = static_cast<int64_t>(dst[a] * Cos128(angle));
+ const auto y = static_cast<int64_t>(dst[a] * Sin128(angle));
+
+ // Section 7.13.2.1: It is a requirement of bitstream conformance that the
+ // values saved into the array T by this function are representable by a
+ // signed integer using |range| bits of precision.
+ dst[a] = RangeCheckValue(RightShiftWithRounding(flip ? y : x, 12), range);
+ dst[b] = RangeCheckValue(RightShiftWithRounding(flip ? x : y, 12), range);
+}
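+
+// Editorial note (not part of the upstream source): the two specializations
+// above are the same rotation with dst[a] (respectively dst[b]) known to be
+// zero, which removes two of the four multiplications.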
+
+template <typename Residual>
+void HadamardRotation_C(Residual* const dst, int a, int b, bool flip,
+ int8_t range) {
+ if (flip) std::swap(a, b);
+ --range;
+ // For Adst and Dct, the maximum possible value for range is 20. So min and
+ // max should always fit into int32_t.
+ const int32_t min = -(1 << range);
+ const int32_t max = (1 << range) - 1;
+ const int32_t x = dst[a] + dst[b];
+ const int32_t y = dst[a] - dst[b];
+ dst[a] = Clip3(x, min, max);
+ dst[b] = Clip3(y, min, max);
+}
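+
+// Editorial example (not part of the upstream source): with |range| = 8 the
+// sum/difference pair dst[a] + dst[b] and dst[a] - dst[b] is clamped to
+// [-128, 127] before being written back.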
+
+template <int bitdepth, typename Residual>
+void ClampIntermediate(Residual* const dst, int size) {
+ // If Residual is int16_t (which implies bitdepth is 8), we don't need to
+ // clip residual[i][j] to 16 bits.
+ if (sizeof(Residual) > 2) {
+ const Residual intermediate_clamp_max =
+ (1 << (std::max(bitdepth + 6, 16) - 1)) - 1;
+ const Residual intermediate_clamp_min = -intermediate_clamp_max - 1;
+ for (int j = 0; j < size; ++j) {
+ dst[j] = Clip3(dst[j], intermediate_clamp_min, intermediate_clamp_max);
+ }
+ }
+}
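+
+// Editorial example (not part of the upstream source): for bitdepth 10 the
+// clamp above is to [-32768, 32767] (max(10 + 6, 16) = 16 bits); for bitdepth
+// 12 it widens to [-131072, 131071] (18 bits).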
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
+// The value for index (i, j) is computed by bit-reversing j and interpreting
+// the result as an integer with bit-length i + 2.
+// For example, index (2, 3) is computed as follows:
+// * bitreverse(3) = bitreverse(..000011) = 110000...
+// * interpreted as an integer with bit-length 2 + 2 = 4, this is 1100 = 12.
+constexpr uint8_t kBitReverseLookup[kNum1DTransformSizes][64] = {
+ {0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2,
+ 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3,
+ 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3},
+ {0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5,
+ 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6,
+ 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7},
+ {0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+ 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+ 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+ 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15},
+ {0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+ 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+ 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+ 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31},
+ {0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+ 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+ 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+ 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63}};
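+
+// A few compile-time spot checks of the bit-reversal rule described above
+// (illustrative only). For instance, entry (2, 3) reverses the 4 low bits of 3
+// (0011 -> 1100), giving 12.
+static_assert(kBitReverseLookup[0][3] == 3, "");
+static_assert(kBitReverseLookup[2][3] == 12, "");
+static_assert(kBitReverseLookup[3][1] == 16, "");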
+
+template <typename Residual, int size_log2>
+void Dct_C(void* dest, int8_t range) {
+ static_assert(size_log2 >= 2 && size_log2 <= 6, "");
+ auto* const dst = static_cast<Residual*>(dest);
+ // stage 1.
+ const int size = 1 << size_log2;
+ Residual temp[size];
+ memcpy(temp, dst, sizeof(temp));
+ for (int i = 0; i < size; ++i) {
+ dst[i] = temp[kBitReverseLookup[size_log2 - 2][i]];
+ }
+ // stages 2-32 are dependent on the value of size_log2.
+ // stage 2.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 16; ++i) {
+ ButterflyRotation_C(dst, i + 32, 63 - i,
+ 63 - MultiplyBy4(kBitReverseLookup[2][i]), false,
+ range);
+ }
+ }
+ // stage 3
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 8; ++i) {
+ ButterflyRotation_C(dst, i + 16, 31 - i,
+ 6 + MultiplyBy8(kBitReverseLookup[1][7 - i]), false,
+ range);
+ }
+ }
+ // stage 4.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 16; ++i) {
+ HadamardRotation_C(dst, MultiplyBy2(i) + 32, MultiplyBy2(i) + 33,
+ static_cast<bool>(i & 1), range);
+ }
+ }
+ // stage 5.
+ if (size_log2 >= 4) {
+ for (int i = 0; i < 4; ++i) {
+ ButterflyRotation_C(dst, i + 8, 15 - i,
+ 12 + MultiplyBy16(kBitReverseLookup[0][3 - i]), false,
+ range);
+ }
+ }
+ // stage 6.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 8; ++i) {
+ HadamardRotation_C(dst, MultiplyBy2(i) + 16, MultiplyBy2(i) + 17,
+ static_cast<bool>(i & 1), range);
+ }
+ }
+ // stage 7.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ ButterflyRotation_C(
+ dst, 62 - MultiplyBy4(i) - j, MultiplyBy4(i) + j + 33,
+ 60 - MultiplyBy16(kBitReverseLookup[0][i]) + MultiplyBy64(j), true,
+ range);
+ }
+ }
+ }
+ // stage 8.
+ if (size_log2 >= 3) {
+ for (int i = 0; i < 2; ++i) {
+ ButterflyRotation_C(dst, i + 4, 7 - i, 56 - 32 * i, false, range);
+ }
+ }
+ // stage 9.
+ if (size_log2 >= 4) {
+ for (int i = 0; i < 4; ++i) {
+ HadamardRotation_C(dst, MultiplyBy2(i) + 8, MultiplyBy2(i) + 9,
+ static_cast<bool>(i & 1), range);
+ }
+ }
+ // stage 10.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ ButterflyRotation_C(
+ dst, 30 - MultiplyBy4(i) - j, MultiplyBy4(i) + j + 17,
+ 24 + MultiplyBy64(j) + MultiplyBy32(1 - i), true, range);
+ }
+ }
+ }
+ // stage 11.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 8; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ HadamardRotation_C(dst, MultiplyBy4(i) + j + 32,
+ MultiplyBy4(i) - j + 35, static_cast<bool>(i & 1),
+ range);
+ }
+ }
+ }
+ // stage 12.
+ for (int i = 0; i < 2; ++i) {
+ ButterflyRotation_C(dst, MultiplyBy2(i), MultiplyBy2(i) + 1, 32 + 16 * i,
+ i == 0, range);
+ }
+ // stage 13.
+ if (size_log2 >= 3) {
+ for (int i = 0; i < 2; ++i) {
+ HadamardRotation_C(dst, MultiplyBy2(i) + 4, MultiplyBy2(i) + 5,
+ /*flip=*/i != 0, range);
+ }
+ }
+ // stage 14.
+ if (size_log2 >= 4) {
+ for (int i = 0; i < 2; ++i) {
+ ButterflyRotation_C(dst, 14 - i, i + 9, 48 + 64 * i, true, range);
+ }
+ }
+ // stage 15.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ HadamardRotation_C(dst, MultiplyBy4(i) + j + 16,
+ MultiplyBy4(i) - j + 19, static_cast<bool>(i & 1),
+ range);
+ }
+ }
+ }
+ // stage 16.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ ButterflyRotation_C(
+ dst, 61 - MultiplyBy8(i) - j, MultiplyBy8(i) + j + 34,
+ 56 - MultiplyBy32(i) + MultiplyBy64(DivideBy2(j)), true, range);
+ }
+ }
+ }
+ // stage 17.
+ for (int i = 0; i < 2; ++i) {
+ HadamardRotation_C(dst, i, 3 - i, false, range);
+ }
+ // stage 18.
+ if (size_log2 >= 3) {
+ ButterflyRotation_C(dst, 6, 5, 32, true, range);
+ }
+ // stage 19.
+ if (size_log2 >= 4) {
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ HadamardRotation_C(dst, MultiplyBy4(i) + j + 8, MultiplyBy4(i) - j + 11,
+ /*flip=*/i != 0, range);
+ }
+ }
+ }
+ // stage 20.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 4; ++i) {
+ ButterflyRotation_C(dst, 29 - i, i + 18, 48 + 64 * DivideBy2(i), true,
+ range);
+ }
+ }
+ // stage 21.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ HadamardRotation_C(dst, MultiplyBy8(i) + j + 32,
+ MultiplyBy8(i) - j + 39, static_cast<bool>(i & 1),
+ range);
+ }
+ }
+ }
+ // stage 22.
+ if (size_log2 >= 3) {
+ for (int i = 0; i < 4; ++i) {
+ HadamardRotation_C(dst, i, 7 - i, false, range);
+ }
+ }
+ // stage 23.
+ if (size_log2 >= 4) {
+ for (int i = 0; i < 2; ++i) {
+ ButterflyRotation_C(dst, 13 - i, i + 10, 32, true, range);
+ }
+ }
+ // stage 24.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ HadamardRotation_C(dst, MultiplyBy8(i) + j + 16,
+ MultiplyBy8(i) - j + 23, i == 1, range);
+ }
+ }
+ }
+ // stage 25.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 8; ++i) {
+ ButterflyRotation_C(dst, 59 - i, i + 36, (i < 4) ? 48 : 112, true, range);
+ }
+ }
+ // stage 26.
+ if (size_log2 >= 4) {
+ for (int i = 0; i < 8; ++i) {
+ HadamardRotation_C(dst, i, 15 - i, false, range);
+ }
+ }
+ // stage 27.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 4; ++i) {
+ ButterflyRotation_C(dst, 27 - i, i + 20, 32, true, range);
+ }
+ }
+ // stage 28.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 8; ++i) {
+ HadamardRotation_C(dst, i + 32, 47 - i, false, range);
+ HadamardRotation_C(dst, i + 48, 63 - i, true, range);
+ }
+ }
+ // stage 29.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 16; ++i) {
+ HadamardRotation_C(dst, i, 31 - i, false, range);
+ }
+ }
+ // stage 30.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 8; ++i) {
+ ButterflyRotation_C(dst, 55 - i, i + 40, 32, true, range);
+ }
+ }
+ // stage 31.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 32; ++i) {
+ HadamardRotation_C(dst, i, 63 - i, false, range);
+ }
+ }
+}
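+
+// Note on the guards above: for size_log2 == 2 (the 4-point DCT) only stage 1
+// (the bit-reverse permutation) and stages 12 and 17 run; each larger
+// size_log2 enables the additional conditional stages.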
+
+template <int bitdepth, typename Residual, int size_log2>
+void DctDcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+ bool is_row) {
+ auto* const dst = static_cast<Residual*>(dest);
+
+ if (is_row && should_round) {
+ dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+ }
+
+ ButterflyRotationSecondIsZero_C(dst, 0, 1, 32, true, range);
+
+ if (is_row && row_shift > 0) {
+ dst[0] = RightShiftWithRounding(dst[0], row_shift);
+ }
+
+ ClampIntermediate<bitdepth, Residual>(dst, 1);
+
+ const int size = 1 << size_log2;
+ for (int i = 1; i < size; ++i) {
+ dst[i] = dst[0];
+ }
+}
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
+
+/*
+ * Row transform max range in bits for bitdepths 8/10/12: 28/30/32.
+ * Column transform max range in bits for bitdepths 8/10/12: 28/28/30.
+ */
+template <typename Residual>
+void Adst4_C(void* dest, int8_t range) {
+ auto* const dst = static_cast<Residual*>(dest);
+ if ((dst[0] | dst[1] | dst[2] | dst[3]) == 0) {
+ return;
+ }
+
+ // stage 1.
+ // Section 7.13.2.6: It is a requirement of bitstream conformance that all
+ // values stored in the s and x arrays by this process are representable by
+ // a signed integer using range + 12 bits of precision.
+ int32_t s[7];
+ s[0] = RangeCheckValue(kAdst4Multiplier[0] * dst[0], range + 12);
+ s[1] = RangeCheckValue(kAdst4Multiplier[1] * dst[0], range + 12);
+ s[2] = RangeCheckValue(kAdst4Multiplier[2] * dst[1], range + 12);
+ s[3] = RangeCheckValue(kAdst4Multiplier[3] * dst[2], range + 12);
+ s[4] = RangeCheckValue(kAdst4Multiplier[0] * dst[2], range + 12);
+ s[5] = RangeCheckValue(kAdst4Multiplier[1] * dst[3], range + 12);
+ s[6] = RangeCheckValue(kAdst4Multiplier[3] * dst[3], range + 12);
+ // stage 2.
+ // Section 7.13.2.6: It is a requirement of bitstream conformance that
+ // values stored in the variable a7 by this process are representable by a
+ // signed integer using range + 1 bits of precision.
+ const int32_t a7 = RangeCheckValue(dst[0] - dst[2], range + 1);
+ // Section 7.13.2.6: It is a requirement of bitstream conformance that
+ // values stored in the variable b7 by this process are representable by a
+ // signed integer using |range| bits of precision.
+ const int32_t b7 = RangeCheckValue(a7 + dst[3], range);
+ // stage 3.
+ s[0] = RangeCheckValue(s[0] + s[3], range + 12);
+ s[1] = RangeCheckValue(s[1] - s[4], range + 12);
+ s[3] = s[2];
+ s[2] = RangeCheckValue(kAdst4Multiplier[2] * b7, range + 12);
+ // stage 4.
+ s[0] = RangeCheckValue(s[0] + s[5], range + 12);
+ s[1] = RangeCheckValue(s[1] - s[6], range + 12);
+ // stages 5 and 6.
+ const int32_t x0 = RangeCheckValue(s[0] + s[3], range + 12);
+ const int32_t x1 = RangeCheckValue(s[1] + s[3], range + 12);
+ int32_t x3 = RangeCheckValue(s[0] + s[1], range + 12);
+ x3 = RangeCheckValue(x3 - s[3], range + 12);
+ int32_t dst_0 = RightShiftWithRounding(x0, 12);
+ int32_t dst_1 = RightShiftWithRounding(x1, 12);
+ int32_t dst_2 = RightShiftWithRounding(s[2], 12);
+ int32_t dst_3 = RightShiftWithRounding(x3, 12);
+ if (sizeof(Residual) == 2) {
+ // If the first argument to RightShiftWithRounding(..., 12) is only
+ // slightly smaller than 2^27 - 1 (e.g., 0x7fffe4e), adding 2^11 to it
+ // in RightShiftWithRounding(..., 12) will cause the function to return
+ // 0x8000, which cannot be represented as an int16_t. Change it to 0x7fff.
+ dst_0 -= (dst_0 == 0x8000);
+ dst_1 -= (dst_1 == 0x8000);
+ dst_3 -= (dst_3 == 0x8000);
+ }
+ dst[0] = dst_0;
+ dst[1] = dst_1;
+ dst[2] = dst_2;
+ dst[3] = dst_3;
+}
+
+template <int bitdepth, typename Residual>
+void Adst4DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+ bool is_row) {
+ auto* const dst = static_cast<Residual*>(dest);
+
+ if (is_row && should_round) {
+ dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+ }
+
+ // stage 1.
+ // Section 7.13.2.6: It is a requirement of bitstream conformance that all
+ // values stored in the s and x arrays by this process are representable by
+ // a signed integer using range + 12 bits of precision.
+ int32_t s[3];
+ s[0] = RangeCheckValue(kAdst4Multiplier[0] * dst[0], range + 12);
+ s[1] = RangeCheckValue(kAdst4Multiplier[1] * dst[0], range + 12);
+ s[2] = RangeCheckValue(kAdst4Multiplier[2] * dst[0], range + 12);
+ // stage 3.
+ // stage 4.
+ // stages 5 and 6.
+ int32_t dst_0 = RightShiftWithRounding(s[0], 12);
+ int32_t dst_1 = RightShiftWithRounding(s[1], 12);
+ int32_t dst_2 = RightShiftWithRounding(s[2], 12);
+ int32_t dst_3 =
+ RightShiftWithRounding(RangeCheckValue(s[0] + s[1], range + 12), 12);
+ if (sizeof(Residual) == 2) {
+ // If the first argument to RightShiftWithRounding(..., 12) is only
+ // slightly smaller than 2^27 - 1 (e.g., 0x7fffe4e), adding 2^11 to it
+ // in RightShiftWithRounding(..., 12) will cause the function to return
+ // 0x8000, which cannot be represented as an int16_t. Change it to 0x7fff.
+ dst_0 -= (dst_0 == 0x8000);
+ dst_1 -= (dst_1 == 0x8000);
+ dst_3 -= (dst_3 == 0x8000);
+ }
+ dst[0] = dst_0;
+ dst[1] = dst_1;
+ dst[2] = dst_2;
+ dst[3] = dst_3;
+
+ const int size = 4;
+ if (is_row && row_shift > 0) {
+ for (int j = 0; j < size; ++j) {
+ dst[j] = RightShiftWithRounding(dst[j], row_shift);
+ }
+ }
+
+ ClampIntermediate<bitdepth, Residual>(dst, 4);
+}
+
+template <typename Residual>
+void AdstInputPermutation(int32_t* const dst, const Residual* const src,
+ int n) {
+ assert(n == 8 || n == 16);
+ for (int i = 0; i < n; ++i) {
+ dst[i] = src[((i & 1) == 0) ? n - i - 1 : i - 1];
+ }
+}
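+
+// Note that the permutation above maps src[0] to dst[1] (the i == 1 case), so
+// after AdstInputPermutation() the DC coefficient is at index 1; the DC-only
+// paths below rely on this.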
+
+constexpr int8_t kAdstOutputPermutationLookup[16] = {
+ 0, 8, 12, 4, 6, 14, 10, 2, 3, 11, 15, 7, 5, 13, 9, 1};
+
+template <typename Residual>
+void AdstOutputPermutation(Residual* const dst, const int32_t* const src,
+ int n) {
+ assert(n == 8 || n == 16);
+ const auto shift = static_cast<int8_t>(n == 8);
+ for (int i = 0; i < n; ++i) {
+ const int8_t index = kAdstOutputPermutationLookup[i] >> shift;
+ int32_t dst_i = ((i & 1) == 0) ? src[index] : -src[index];
+ if (sizeof(Residual) == 2) {
+ // If i is odd and src[index] is -32768, dst_i will be 32768, which
+ // cannot be represented as an int16_t.
+ dst_i -= (dst_i == 0x8000);
+ }
+ dst[i] = dst_i;
+ }
+}
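+
+// For n == 16 the lookup above is used as is; for n == 8 each entry is halved
+// (|shift| is 1), so the first 8 entries yield the order 0, 4, 6, 2, 3, 7, 5,
+// 1.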
+
+template <typename Residual>
+void Adst8_C(void* dest, int8_t range) {
+ auto* const dst = static_cast<Residual*>(dest);
+ // stage 1.
+ int32_t temp[8];
+ AdstInputPermutation(temp, dst, 8);
+ // stage 2.
+ for (int i = 0; i < 4; ++i) {
+ ButterflyRotation_C(temp, MultiplyBy2(i), MultiplyBy2(i) + 1, 60 - 16 * i,
+ true, range);
+ }
+ // stage 3.
+ for (int i = 0; i < 4; ++i) {
+ HadamardRotation_C(temp, i, i + 4, false, range);
+ }
+ // stage 4.
+ for (int i = 0; i < 2; ++i) {
+ ButterflyRotation_C(temp, i * 3 + 4, i + 5, 48 - 32 * i, true, range);
+ }
+ // stage 5.
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ HadamardRotation_C(temp, i + MultiplyBy4(j), i + MultiplyBy4(j) + 2,
+ false, range);
+ }
+ }
+ // stage 6.
+ for (int i = 0; i < 2; ++i) {
+ ButterflyRotation_C(temp, MultiplyBy4(i) + 2, MultiplyBy4(i) + 3, 32, true,
+ range);
+ }
+ // stage 7.
+ AdstOutputPermutation(dst, temp, 8);
+}
+
+template <int bitdepth, typename Residual>
+void Adst8DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+ bool is_row) {
+ auto* const dst = static_cast<Residual*>(dest);
+
+ // stage 1.
+ int32_t temp[8];
+ // After the permutation, the dc value is in temp[1]. The rest are zero.
+ AdstInputPermutation(temp, dst, 8);
+
+ if (is_row && should_round) {
+ temp[1] = RightShiftWithRounding(temp[1] * kTransformRowMultiplier, 12);
+ }
+
+ // stage 2.
+ ButterflyRotationFirstIsZero_C(temp, 0, 1, 60, true, range);
+
+ // stage 3.
+ temp[4] = temp[0];
+ temp[5] = temp[1];
+
+ // stage 4.
+ ButterflyRotation_C(temp, 4, 5, 48, true, range);
+
+ // stage 5.
+ temp[2] = temp[0];
+ temp[3] = temp[1];
+ temp[6] = temp[4];
+ temp[7] = temp[5];
+
+ // stage 6.
+ ButterflyRotation_C(temp, 2, 3, 32, true, range);
+ ButterflyRotation_C(temp, 6, 7, 32, true, range);
+
+ // stage 7.
+ AdstOutputPermutation(dst, temp, 8);
+
+ const int size = 8;
+ if (is_row && row_shift > 0) {
+ for (int j = 0; j < size; ++j) {
+ dst[j] = RightShiftWithRounding(dst[j], row_shift);
+ }
+ }
+
+ ClampIntermediate<bitdepth, Residual>(dst, 8);
+}
+
+template <typename Residual>
+void Adst16_C(void* dest, int8_t range) {
+ auto* const dst = static_cast<Residual*>(dest);
+ // stage 1.
+ int32_t temp[16];
+ AdstInputPermutation(temp, dst, 16);
+ // stage 2.
+ for (int i = 0; i < 8; ++i) {
+ ButterflyRotation_C(temp, MultiplyBy2(i), MultiplyBy2(i) + 1, 62 - 8 * i,
+ true, range);
+ }
+ // stage 3.
+ for (int i = 0; i < 8; ++i) {
+ HadamardRotation_C(temp, i, i + 8, false, range);
+ }
+ // stage 4.
+ for (int i = 0; i < 2; ++i) {
+ ButterflyRotation_C(temp, MultiplyBy2(i) + 8, MultiplyBy2(i) + 9,
+ 56 - 32 * i, true, range);
+ ButterflyRotation_C(temp, MultiplyBy2(i) + 13, MultiplyBy2(i) + 12,
+ 8 + 32 * i, true, range);
+ }
+ // stage 5.
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ HadamardRotation_C(temp, i + MultiplyBy8(j), i + MultiplyBy8(j) + 4,
+ false, range);
+ }
+ }
+ // stage 6.
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ ButterflyRotation_C(temp, i * 3 + MultiplyBy8(j) + 4,
+ i + MultiplyBy8(j) + 5, 48 - 32 * i, true, range);
+ }
+ }
+ // stage 7.
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ HadamardRotation_C(temp, i + MultiplyBy4(j), i + MultiplyBy4(j) + 2,
+ false, range);
+ }
+ }
+ // stage 8.
+ for (int i = 0; i < 4; ++i) {
+ ButterflyRotation_C(temp, MultiplyBy4(i) + 2, MultiplyBy4(i) + 3, 32, true,
+ range);
+ }
+ // stage 9.
+ AdstOutputPermutation(dst, temp, 16);
+}
+
+template <int bitdepth, typename Residual>
+void Adst16DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+ bool is_row) {
+ auto* const dst = static_cast<Residual*>(dest);
+
+ // stage 1.
+ int32_t temp[16];
+ // After the permutation, the dc value is in temp[1]. The rest are zero.
+ AdstInputPermutation(temp, dst, 16);
+
+ if (is_row && should_round) {
+ temp[1] = RightShiftWithRounding(temp[1] * kTransformRowMultiplier, 12);
+ }
+
+ // stage 2.
+ ButterflyRotationFirstIsZero_C(temp, 0, 1, 62, true, range);
+
+ // stage 3.
+ temp[8] = temp[0];
+ temp[9] = temp[1];
+
+ // stage 4.
+ ButterflyRotation_C(temp, 8, 9, 56, true, range);
+
+ // stage 5.
+ temp[4] = temp[0];
+ temp[5] = temp[1];
+ temp[12] = temp[8];
+ temp[13] = temp[9];
+
+ // stage 6.
+ ButterflyRotation_C(temp, 4, 5, 48, true, range);
+ ButterflyRotation_C(temp, 12, 13, 48, true, range);
+
+ // stage 7.
+ temp[2] = temp[0];
+ temp[3] = temp[1];
+ temp[10] = temp[8];
+ temp[11] = temp[9];
+
+ temp[6] = temp[4];
+ temp[7] = temp[5];
+ temp[14] = temp[12];
+ temp[15] = temp[13];
+
+ // stage 8.
+ for (int i = 0; i < 4; ++i) {
+ ButterflyRotation_C(temp, MultiplyBy4(i) + 2, MultiplyBy4(i) + 3, 32, true,
+ range);
+ }
+
+ // stage 9.
+ AdstOutputPermutation(dst, temp, 16);
+
+ const int size = 16;
+ if (is_row && row_shift > 0) {
+ for (int j = 0; j < size; ++j) {
+ dst[j] = RightShiftWithRounding(dst[j], row_shift);
+ }
+ }
+
+ ClampIntermediate<bitdepth, Residual>(dst, 16);
+}
+
+//------------------------------------------------------------------------------
+// Identity Transforms.
+//
+// In the spec, the inverse identity transform is followed by a Round2() call:
+// The row transforms with i = 0..(h-1) are applied as follows:
+// ...
+// * Otherwise, invoke the inverse identity transform process specified in
+// section 7.13.2.15 with the input variable n equal to log2W.
+// * Set Residual[ i ][ j ] equal to Round2( T[ j ], rowShift )
+// for j = 0..(w-1).
+// ...
+// The column transforms with j = 0..(w-1) are applied as follows:
+// ...
+// * Otherwise, invoke the inverse identity transform process specified in
+// section 7.13.2.15 with the input variable n equal to log2H.
+// * Residual[ i ][ j ] is set equal to Round2( T[ i ], colShift )
+// for i = 0..(h-1).
+//
+// Therefore, we define the identity transform functions to perform both the
+// inverse identity transform and the Round2() call. This has two advantages:
+// 1. The outputs of the inverse identity transform do not need to be stored
+// in the Residual array. They can be stored in int32_t local variables,
+// which have a larger range if Residual is an int16_t array.
+// 2. The inverse identity transform and the Round2() call can be jointly
+// optimized.
+//
+// The identity transform functions have the following prototype:
+// void Identity_C(void* dest, int8_t shift);
+//
+// The |shift| parameter is the amount of shift for the Round2() call. For row
+// transforms, |shift| is 0, 1, or 2. For column transforms, |shift| is always
+// 4. Therefore, an identity transform function can detect whether it is being
+// invoked as a row transform or a column transform by checking whether |shift|
+// is equal to 4.
+//
+// Input Range
+//
+// The inputs of row transforms, stored in the 2D array Dequant, are
+// representable by a signed integer using 8 + BitDepth bits of precision:
+// f. Dequant[ i ][ j ] is set equal to
+// Clip3( - ( 1 << ( 7 + BitDepth ) ), ( 1 << ( 7 + BitDepth ) ) - 1, dq2 ).
+//
+// The inputs of column transforms are representable by a signed integer using
+// Max( BitDepth + 6, 16 ) bits of precision:
+// Set the variable colClampRange equal to Max( BitDepth + 6, 16 ).
+// ...
+// Between the row and column transforms, Residual[ i ][ j ] is set equal to
+// Clip3( - ( 1 << ( colClampRange - 1 ) ),
+// ( 1 << (colClampRange - 1 ) ) - 1,
+// Residual[ i ][ j ] )
+// for i = 0..(h-1), for j = 0..(w-1).
+//
+// Output Range
+//
+// The outputs of row transforms are representable by a signed integer using
+// 8 + BitDepth + 1 = 9 + BitDepth bits of precision, because the net effect
+// of the multiplicative factor of inverse identity transforms minus the
+// smallest row shift is an increase of at most one bit.
+//
+// Transform | Multiplicative factor | Smallest row | Net increase
+// width | (in bits) | shift | in bits
+// ---------------------------------------------------------------
+// 4 | sqrt(2) (0.5 bits) | 0 | +0.5
+// 8 | 2 (1 bit) | 0 | +1
+// 16 | 2*sqrt(2) (1.5 bits) | 1 | +0.5
+// 32 | 4 (2 bits) | 1 | +1
+//
+// If BitDepth is 8 and Residual is an int16_t array, to avoid truncation we
+// clip the outputs (which have 17 bits of precision) to the range of int16_t
+// before storing them in the Residual array. This clipping happens to be the
+// same as the required clipping after the row transform (see the spec quoted
+// above), so we remain compliant with the spec. (In this case,
+// TransformLoop_C() skips clipping the outputs of row transforms to avoid
+// duplication of effort.)
+//
+// The outputs of column transforms are representable by a signed integer using
+// Max( BitDepth + 6, 16 ) + 2 - 4 = Max( BitDepth + 4, 14 ) bits of precision,
+// because the multiplicative factor of inverse identity transforms is at most
+// 4 (2 bits) and |shift| is always 4.
+
+template <typename Residual>
+void Identity4Row_C(void* dest, int8_t shift) {
+ assert(shift == 0 || shift == 1);
+ auto* const dst = static_cast<Residual*>(dest);
+ // If |shift| is 0, |rounding| should be 1 << 11. If |shift| is 1, |rounding|
+ // should be (1 + (1 << 1)) << 11. The following expression works for both
+ // values of |shift|.
+ const int32_t rounding = (1 + (shift << 1)) << 11;
+ for (int i = 0; i < 4; ++i) {
+ // The intermediate value here will have to fit into an int32_t for it to be
+ // bitstream conformant. The multiplication is promoted to int32_t by
+ // defining kIdentity4Multiplier as int32_t.
+ int32_t dst_i = (dst[i] * kIdentity4Multiplier + rounding) >> (12 + shift);
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[i] = static_cast<Residual>(dst_i);
+ }
+}
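+
+// A worked example of the rounding above (illustrative only): with |shift| of
+// 0, an input of 1000 becomes (1000 * 5793 + 2048) >> 12 = 1414, roughly
+// 1000 * sqrt(2), matching the multiplicative factor listed in the table
+// further up.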
+
+template <typename Residual>
+void Identity4Column_C(void* dest, int8_t /*shift*/) {
+ auto* const dst = static_cast<Residual*>(dest);
+ const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
+ for (int i = 0; i < 4; ++i) {
+ // The intermediate value here will have to fit into an int32_t for it to be
+ // bitstream conformant. The multiplication is promoted to int32_t by
+ // defining kIdentity4Multiplier as int32_t.
+ dst[i] = static_cast<Residual>((dst[i] * kIdentity4Multiplier + rounding) >>
+ (12 + kTransformColumnShift));
+ }
+}
+
+template <int bitdepth, typename Residual>
+void Identity4DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+ int row_shift, bool is_row) {
+ auto* const dst = static_cast<Residual*>(dest);
+
+ if (is_row) {
+ if (should_round) {
+ dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+ }
+
+ const int32_t rounding = (1 + (row_shift << 1)) << 11;
+ int32_t dst_i =
+ (dst[0] * kIdentity4Multiplier + rounding) >> (12 + row_shift);
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[0] = static_cast<Residual>(dst_i);
+
+ ClampIntermediate<bitdepth, Residual>(dst, 1);
+ return;
+ }
+
+ const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
+ dst[0] = static_cast<Residual>((dst[0] * kIdentity4Multiplier + rounding) >>
+ (12 + kTransformColumnShift));
+}
+
+template <typename Residual>
+void Identity8Row_C(void* dest, int8_t shift) {
+ assert(shift == 0 || shift == 1 || shift == 2);
+ auto* const dst = static_cast<Residual*>(dest);
+ for (int i = 0; i < 8; ++i) {
+ int32_t dst_i = RightShiftWithRounding(MultiplyBy2(dst[i]), shift);
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[i] = static_cast<Residual>(dst_i);
+ }
+}
+
+template <typename Residual>
+void Identity8Column_C(void* dest, int8_t /*shift*/) {
+ auto* const dst = static_cast<Residual*>(dest);
+ for (int i = 0; i < 8; ++i) {
+ dst[i] = static_cast<Residual>(
+ RightShiftWithRounding(dst[i], kTransformColumnShift - 1));
+ }
+}
+
+template <int bitdepth, typename Residual>
+void Identity8DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+ int row_shift, bool is_row) {
+ auto* const dst = static_cast<Residual*>(dest);
+
+ if (is_row) {
+ if (should_round) {
+ dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+ }
+
+ int32_t dst_i = RightShiftWithRounding(MultiplyBy2(dst[0]), row_shift);
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[0] = static_cast<Residual>(dst_i);
+
+ // If Residual is int16_t (which implies bitdepth is 8), we don't need to
+ // clip residual[i][j] to 16 bits.
+ if (sizeof(Residual) > 2) {
+ const Residual intermediate_clamp_max =
+ (1 << (std::max(bitdepth + 6, 16) - 1)) - 1;
+ const Residual intermediate_clamp_min = -intermediate_clamp_max - 1;
+ dst[0] = Clip3(dst[0], intermediate_clamp_min, intermediate_clamp_max);
+ }
+ return;
+ }
+
+ dst[0] = static_cast<Residual>(
+ RightShiftWithRounding(dst[0], kTransformColumnShift - 1));
+}
+
+template <typename Residual>
+void Identity16Row_C(void* dest, int8_t shift) {
+ assert(shift == 1 || shift == 2);
+ auto* const dst = static_cast<Residual*>(dest);
+ const int32_t rounding = (1 + (1 << shift)) << 11;
+ for (int i = 0; i < 16; ++i) {
+ // The intermediate value here will have to fit into an int32_t for it to be
+ // bitstream conformant. The multiplication is promoted to int32_t by
+ // defining kIdentity16Multiplier as int32_t.
+ int32_t dst_i = (dst[i] * kIdentity16Multiplier + rounding) >> (12 + shift);
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[i] = static_cast<Residual>(dst_i);
+ }
+}
+
+template <typename Residual>
+void Identity16Column_C(void* dest, int8_t /*shift*/) {
+ auto* const dst = static_cast<Residual*>(dest);
+ const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
+ for (int i = 0; i < 16; ++i) {
+ // The intermediate value here will have to fit into an int32_t for it to be
+ // bitstream conformant. The multiplication is promoted to int32_t by
+ // defining kIdentity16Multiplier as int32_t.
+ dst[i] =
+ static_cast<Residual>((dst[i] * kIdentity16Multiplier + rounding) >>
+ (12 + kTransformColumnShift));
+ }
+}
+
+template <int bitdepth, typename Residual>
+void Identity16DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+ int row_shift, bool is_row) {
+ auto* const dst = static_cast<Residual*>(dest);
+
+ if (is_row) {
+ if (should_round) {
+ dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+ }
+
+ const int32_t rounding = (1 + (1 << row_shift)) << 11;
+ int32_t dst_i =
+ (dst[0] * kIdentity16Multiplier + rounding) >> (12 + row_shift);
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[0] = static_cast<Residual>(dst_i);
+
+ ClampIntermediate<bitdepth, Residual>(dst, 1);
+ return;
+ }
+
+ const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
+ dst[0] = static_cast<Residual>((dst[0] * kIdentity16Multiplier + rounding) >>
+ (12 + kTransformColumnShift));
+}
+
+template <typename Residual>
+void Identity32Row_C(void* dest, int8_t shift) {
+ assert(shift == 1 || shift == 2);
+ auto* const dst = static_cast<Residual*>(dest);
+ for (int i = 0; i < 32; ++i) {
+ int32_t dst_i = RightShiftWithRounding(MultiplyBy4(dst[i]), shift);
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[i] = static_cast<Residual>(dst_i);
+ }
+}
+
+template <typename Residual>
+void Identity32Column_C(void* dest, int8_t /*shift*/) {
+ auto* const dst = static_cast<Residual*>(dest);
+ for (int i = 0; i < 32; ++i) {
+ dst[i] = static_cast<Residual>(
+ RightShiftWithRounding(dst[i], kTransformColumnShift - 2));
+ }
+}
+
+template <int bitdepth, typename Residual>
+void Identity32DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+ int row_shift, bool is_row) {
+ auto* const dst = static_cast<Residual*>(dest);
+
+ if (is_row) {
+ if (should_round) {
+ dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+ }
+
+ int32_t dst_i = RightShiftWithRounding(MultiplyBy4(dst[0]), row_shift);
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[0] = static_cast<Residual>(dst_i);
+
+ ClampIntermediate<bitdepth, Residual>(dst, 1);
+ return;
+ }
+
+ dst[0] = static_cast<Residual>(
+ RightShiftWithRounding(dst[0], kTransformColumnShift - 2));
+}
+
+//------------------------------------------------------------------------------
+// Walsh Hadamard Transform.
+
+template <typename Residual>
+void Wht4_C(void* dest, int8_t shift) {
+ auto* const dst = static_cast<Residual*>(dest);
+ Residual temp[4];
+ temp[0] = dst[0] >> shift;
+ temp[2] = dst[1] >> shift;
+ temp[3] = dst[2] >> shift;
+ temp[1] = dst[3] >> shift;
+ temp[0] += temp[2];
+ temp[3] -= temp[1];
+ // This signed right shift must be an arithmetic shift.
+ Residual e = (temp[0] - temp[3]) >> 1;
+ dst[1] = e - temp[1];
+ dst[2] = e - temp[2];
+ dst[0] = temp[0] - dst[1];
+ dst[3] = temp[3] + dst[2];
+}
+
+template <int bitdepth, typename Residual>
+void Wht4DcOnly_C(void* dest, int8_t range, bool /*should_round*/,
+ int /*row_shift*/, bool /*is_row*/) {
+ auto* const dst = static_cast<Residual*>(dest);
+ const int shift = range;
+
+ Residual temp = dst[0] >> shift;
+ // This signed right shift must be an arithmetic shift.
+ Residual e = temp >> 1;
+ dst[0] = temp - e;
+ dst[1] = e;
+ dst[2] = e;
+ dst[3] = e;
+
+ ClampIntermediate<bitdepth, Residual>(dst, 4);
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loop
+
+using InverseTransform1DFunc = void (*)(void* dst, int8_t range);
+using InverseTransformDcOnlyFunc = void (*)(void* dest, int8_t range,
+ bool should_round, int row_shift,
+ bool is_row);
+
+template <int bitdepth, typename Residual, typename Pixel,
+ Transform1D transform1d_type,
+ InverseTransformDcOnlyFunc dconly_transform1d,
+ InverseTransform1DFunc transform1d_func, bool is_row>
+void TransformLoop_C(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ constexpr bool lossless = transform1d_type == k1DTransformWht;
+ constexpr bool is_identity = transform1d_type == k1DTransformIdentity;
+ // The transform size of the WHT is always 4x4. Setting tx_width and
+ // tx_height to the constant 4 for the WHT speeds the code up.
+ assert(!lossless || tx_size == kTransformSize4x4);
+ const int tx_width = lossless ? 4 : kTransformWidth[tx_size];
+ const int tx_height = lossless ? 4 : kTransformHeight[tx_size];
+ const int tx_width_log2 = kTransformWidthLog2[tx_size];
+ const int tx_height_log2 = kTransformHeightLog2[tx_size];
+ auto* frame = static_cast<Array2DView<Pixel>*>(dst_frame);
+
+ // Initially this points to the dequantized values. After the transforms are
+ // applied, this buffer contains the residual.
+ Array2DView<Residual> residual(tx_height, tx_width,
+ static_cast<Residual*>(src_buffer));
+
+ if (is_row) {
+ // Row transform.
+ const uint8_t row_shift = lossless ? 0 : kTransformRowShift[tx_size];
+ // This is the |range| parameter of the InverseTransform1DFunc. For lossy
+ // transforms, this will be equal to the clamping range.
+ const int8_t row_clamp_range = lossless ? 2 : (bitdepth + 8);
+ // If the width:height ratio of the transform size is 2:1 or 1:2, multiply
+ // the input to the row transform by 1 / sqrt(2), which is approximated by
+ // the fraction 2896 / 2^12.
+ const bool should_round = std::abs(tx_width_log2 - tx_height_log2) == 1;
+
+ if (adjusted_tx_height == 1) {
+ dconly_transform1d(residual[0], row_clamp_range, should_round, row_shift,
+ true);
+ return;
+ }
+
+ // Row transforms need to be done only up to 32 because the rest of the rows
+ // are always all zero if |tx_height| is 64. Otherwise, only process the rows
+ // that have non-zero coefficients.
+ for (int i = 0; i < adjusted_tx_height; ++i) {
+ // If lossless, the transform size is 4x4, so should_round is false.
+ if (!lossless && should_round) {
+ // The last 32 values of every row are always zero if |tx_width| is 64.
+ for (int j = 0; j < std::min(tx_width, 32); ++j) {
+ residual[i][j] = RightShiftWithRounding(
+ residual[i][j] * kTransformRowMultiplier, 12);
+ }
+ }
+ // For identity transform, |transform1d_func| also performs the
+ // Round2(T[j], rowShift) call in the spec.
+ transform1d_func(residual[i], is_identity ? row_shift : row_clamp_range);
+ if (!lossless && !is_identity && row_shift > 0) {
+ for (int j = 0; j < tx_width; ++j) {
+ residual[i][j] = RightShiftWithRounding(residual[i][j], row_shift);
+ }
+ }
+
+ ClampIntermediate<bitdepth, Residual>(residual[i], tx_width);
+ }
+ return;
+ }
+
+ assert(!is_row);
+ constexpr uint8_t column_shift = lossless ? 0 : kTransformColumnShift;
+ // This is the |range| parameter of the InverseTransform1DFunc. For lossy
+ // transforms, this will be equal to the clamping range.
+ const int8_t column_clamp_range = lossless ? 0 : std::max(bitdepth + 6, 16);
+ const bool flip_rows = transform1d_type == k1DTransformAdst &&
+ kTransformFlipRowsMask.Contains(tx_type);
+ const bool flip_columns =
+ !lossless && kTransformFlipColumnsMask.Contains(tx_type);
+ const int min_value = 0;
+ const int max_value = (1 << bitdepth) - 1;
+ // Note: 64 is the maximum size of a 1D transform buffer (the largest
+ // transform size is kTransformSize64x64).
+ Residual tx_buffer[64];
+ for (int j = 0; j < tx_width; ++j) {
+ const int flipped_j = flip_columns ? tx_width - j - 1 : j;
+ for (int i = 0; i < tx_height; ++i) {
+ tx_buffer[i] = residual[i][flipped_j];
+ }
+ if (adjusted_tx_height == 1) {
+ dconly_transform1d(tx_buffer, column_clamp_range, false, 0, false);
+ } else {
+ // For identity transform, |transform1d_func| also performs the
+ // Round2(T[i], colShift) call in the spec.
+ transform1d_func(tx_buffer,
+ is_identity ? column_shift : column_clamp_range);
+ }
+ const int x = start_x + j;
+ for (int i = 0; i < tx_height; ++i) {
+ const int y = start_y + i;
+ const int index = flip_rows ? tx_height - i - 1 : i;
+ Residual residual_value = tx_buffer[index];
+ if (!lossless && !is_identity) {
+ residual_value = RightShiftWithRounding(residual_value, column_shift);
+ }
+ (*frame)[y][x] =
+ Clip3((*frame)[y][x] + residual_value, min_value, max_value);
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+
+template <int bitdepth, typename Residual, typename Pixel>
+void InitAll(Dsp* const dsp) {
+ // Maximum transform size for Dct is 64.
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 2>, Dct_C<Residual, 2>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 2>, Dct_C<Residual, 2>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 3>, Dct_C<Residual, 3>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 3>, Dct_C<Residual, 3>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 4>, Dct_C<Residual, 4>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 4>, Dct_C<Residual, 4>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 5>, Dct_C<Residual, 5>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 5>, Dct_C<Residual, 5>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 6>, Dct_C<Residual, 6>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 6>, Dct_C<Residual, 6>,
+ /*is_row=*/false>;
+
+ // Maximum transform size for Adst is 16.
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
+ Adst4DcOnly_C<bitdepth, Residual>, Adst4_C<Residual>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
+ Adst4DcOnly_C<bitdepth, Residual>, Adst4_C<Residual>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
+ Adst8DcOnly_C<bitdepth, Residual>, Adst8_C<Residual>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
+ Adst8DcOnly_C<bitdepth, Residual>, Adst8_C<Residual>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
+ Adst16DcOnly_C<bitdepth, Residual>, Adst16_C<Residual>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
+ Adst16DcOnly_C<bitdepth, Residual>, Adst16_C<Residual>,
+ /*is_row=*/false>;
+
+ // Maximum transform size for Identity transform is 32.
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+ Identity4DcOnly_C<bitdepth, Residual>,
+ Identity4Row_C<Residual>, /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+ Identity4DcOnly_C<bitdepth, Residual>,
+ Identity4Column_C<Residual>, /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+ Identity8DcOnly_C<bitdepth, Residual>,
+ Identity8Row_C<Residual>, /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+ Identity8DcOnly_C<bitdepth, Residual>,
+ Identity8Column_C<Residual>, /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+ Identity16DcOnly_C<bitdepth, Residual>,
+ Identity16Row_C<Residual>, /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+ Identity16DcOnly_C<bitdepth, Residual>,
+ Identity16Column_C<Residual>, /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+ Identity32DcOnly_C<bitdepth, Residual>,
+ Identity32Row_C<Residual>, /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+ Identity32DcOnly_C<bitdepth, Residual>,
+ Identity32Column_C<Residual>, /*is_row=*/false>;
+
+ // Maximum transform size for Wht is 4.
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformWht,
+ Wht4DcOnly_C<bitdepth, Residual>, Wht4_C<Residual>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformWht,
+ Wht4DcOnly_C<bitdepth, Residual>, Wht4_C<Residual>,
+ /*is_row=*/false>;
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+ for (auto& inverse_transform_by_size : dsp->inverse_transforms) {
+ for (auto& inverse_transform : inverse_transform_by_size) {
+ inverse_transform[kRow] = nullptr;
+ inverse_transform[kColumn] = nullptr;
+ }
+ }
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ InitAll<8, int16_t, uint8_t>(dsp);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 2>, Dct_C<int16_t, 2>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 2>, Dct_C<int16_t, 2>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformDct
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 3>, Dct_C<int16_t, 3>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 3>, Dct_C<int16_t, 3>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformDct
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 4>, Dct_C<int16_t, 4>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 4>, Dct_C<int16_t, 4>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformDct
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 5>, Dct_C<int16_t, 5>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 5>, Dct_C<int16_t, 5>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize64_1DTransformDct
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 6>, Dct_C<int16_t, 6>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 6>, Dct_C<int16_t, 6>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformAdst
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
+ Adst4DcOnly_C<8, int16_t>, Adst4_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
+ Adst4DcOnly_C<8, int16_t>, Adst4_C<int16_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformAdst
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
+ Adst8DcOnly_C<8, int16_t>, Adst8_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
+ Adst8DcOnly_C<8, int16_t>, Adst8_C<int16_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformAdst
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
+ Adst16DcOnly_C<8, int16_t>, Adst16_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
+ Adst16DcOnly_C<8, int16_t>, Adst16_C<int16_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformIdentity
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+ Identity4DcOnly_C<8, int16_t>, Identity4Row_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+ Identity4DcOnly_C<8, int16_t>, Identity4Column_C<int16_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformIdentity
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+ Identity8DcOnly_C<8, int16_t>, Identity8Row_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+ Identity8DcOnly_C<8, int16_t>, Identity8Column_C<int16_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformIdentity
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+ Identity16DcOnly_C<8, int16_t>, Identity16Row_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+ Identity16DcOnly_C<8, int16_t>,
+ Identity16Column_C<int16_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+ Identity32DcOnly_C<8, int16_t>, Identity32Row_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+ Identity32DcOnly_C<8, int16_t>,
+ Identity32Column_C<int16_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformWht,
+ Wht4DcOnly_C<8, int16_t>, Wht4_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformWht,
+ Wht4DcOnly_C<8, int16_t>, Wht4_C<int16_t>,
+ /*is_row=*/false>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+ for (auto& inverse_transform_by_size : dsp->inverse_transforms) {
+ for (auto& inverse_transform : inverse_transform_by_size) {
+ inverse_transform[kRow] = nullptr;
+ inverse_transform[kColumn] = nullptr;
+ }
+ }
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ InitAll<10, int32_t, uint16_t>(dsp);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformDct
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 2>, Dct_C<int32_t, 2>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 2>, Dct_C<int32_t, 2>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformDct
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 3>, Dct_C<int32_t, 3>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 3>, Dct_C<int32_t, 3>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformDct
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 4>, Dct_C<int32_t, 4>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 4>, Dct_C<int32_t, 4>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize32_1DTransformDct
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 5>, Dct_C<int32_t, 5>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 5>, Dct_C<int32_t, 5>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize64_1DTransformDct
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 6>, Dct_C<int32_t, 6>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 6>, Dct_C<int32_t, 6>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformAdst
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+ Adst4DcOnly_C<10, int32_t>, Adst4_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+ Adst4DcOnly_C<10, int32_t>, Adst4_C<int32_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformAdst
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+ Adst8DcOnly_C<10, int32_t>, Adst8_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+ Adst8DcOnly_C<10, int32_t>, Adst8_C<int32_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformAdst
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+ Adst16DcOnly_C<10, int32_t>, Adst16_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+ Adst16DcOnly_C<10, int32_t>, Adst16_C<int32_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformIdentity
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+ Identity4DcOnly_C<10, int32_t>, Identity4Row_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+ Identity4DcOnly_C<10, int32_t>,
+ Identity4Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformIdentity
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+ Identity8DcOnly_C<10, int32_t>, Identity8Row_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+ Identity8DcOnly_C<10, int32_t>,
+ Identity8Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformIdentity
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+ Identity16DcOnly_C<10, int32_t>, Identity16Row_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+ Identity16DcOnly_C<10, int32_t>,
+ Identity16Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize32_1DTransformIdentity
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+ Identity32DcOnly_C<10, int32_t>, Identity32Row_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+ Identity32DcOnly_C<10, int32_t>,
+ Identity32Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformWht
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformWht,
+ Wht4DcOnly_C<10, int32_t>, Wht4_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformWht,
+ Wht4DcOnly_C<10, int32_t>, Wht4_C<int32_t>,
+ /*is_row=*/false>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+
+void InverseTransformInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+
+ // Local functions that may be unused depending on the optimizations
+ // available.
+ static_cast<void>(RangeCheckValue);
+ static_cast<void>(kBitReverseLookup);
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/inverse_transform.h b/src/dsp/inverse_transform.h
new file mode 100644
index 0000000..0916665
--- /dev/null
+++ b/src/dsp/inverse_transform.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INVERSE_TRANSFORM_H_
+#define LIBGAV1_SRC_DSP_INVERSE_TRANSFORM_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/inverse_transform_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/inverse_transform_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::inverse_transforms. This function is not thread-safe.
+void InverseTransformInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_INVERSE_TRANSFORM_H_
diff --git a/src/dsp/inverse_transform.inc b/src/dsp/inverse_transform.inc
new file mode 100644
index 0000000..55e68b6
--- /dev/null
+++ b/src/dsp/inverse_transform.inc
@@ -0,0 +1,64 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants and utility functions used for inverse transform implementations.
+// This will be included inside an anonymous namespace in the files where these
+// are necessary.
+
+// The value at index i is derived as: round(cos(pi * i / 128) * (1 << 12)).
+constexpr int16_t kCos128[65] = {
+ 4096, 4095, 4091, 4085, 4076, 4065, 4052, 4036, 4017, 3996, 3973,
+ 3948, 3920, 3889, 3857, 3822, 3784, 3745, 3703, 3659, 3612, 3564,
+ 3513, 3461, 3406, 3349, 3290, 3229, 3166, 3102, 3035, 2967, 2896,
+ 2824, 2751, 2675, 2598, 2520, 2440, 2359, 2276, 2191, 2106, 2019,
+ 1931, 1842, 1751, 1660, 1567, 1474, 1380, 1285, 1189, 1092, 995,
+ 897, 799, 700, 601, 501, 401, 301, 201, 101, 0};
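+// For example, kCos128[32] = round(cos(pi * 32 / 128) * 4096) =
+// round(0.7071... * 4096) = 2896, and kCos128[64] = round(cos(pi / 2) * 4096)
+// = 0; the table covers the first quadrant of a 12-bit fixed-point cosine.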
+
+inline int16_t Cos128(int angle) {
+ angle &= 0xff;
+
+ // If |angle| is 128, this function returns -4096 (= -2^12), which will
+ // cause the 32-bit multiplications in ButterflyRotation() to overflow if
+ // dst[a] or dst[b] is -2^19 (a possible corner case when |range| is 20):
+ //
+ // (-2^12) * (-2^19) = 2^31, which cannot be represented as an int32_t.
+ //
+ // Note: |range| is 20 when bitdepth is 12 and a row transform is performed.
+ //
+ // Assert that this angle is never used by DCT or ADST.
+ assert(angle != 128);
+ if (angle <= 64) return kCos128[angle];
+ if (angle <= 128) return -kCos128[128 - angle];
+ if (angle <= 192) return -kCos128[angle - 128];
+ return kCos128[256 - angle];
+}
+
+inline int16_t Sin128(int angle) { return Cos128(angle - 64); }
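+// Illustration only (a sketch, not part of the upstream transforms): the
+// Cos128()/Sin128() pair provides 12-bit fixed-point cos/sin values, so a
+// plain rotation built on them would look roughly like
+//
+//   void RotateSketch(int32_t x, int32_t y, int angle, int32_t* u,
+//                     int32_t* v) {
+//     // Assumes |x| and |y| are small enough for the products to fit in
+//     // 32 bits.
+//     const int32_t cos128 = Cos128(angle);
+//     const int32_t sin128 = Sin128(angle);
+//     *u = RightShiftWithRounding(x * cos128 - y * sin128, 12);
+//     *v = RightShiftWithRounding(x * sin128 + y * cos128, 12);
+//   }
+//
+// The ButterflyRotation() mentioned in the Cos128() comment above applies the
+// same kind of fixed-point rotation to pairs of transform coefficients.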
+
+// The value for index i is derived as:
+// round(sqrt(2) * sin(i * pi / 9) * 2 / 3 * (1 << 12)).
+constexpr int16_t kAdst4Multiplier[4] = {1321, 2482, 3344, 3803};
+
+constexpr uint8_t kTransformRowShift[kNumTransformSizes] = {
+ 0, 0, 1, 0, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2};
+
+constexpr bool kShouldRound[kNumTransformSizes] = {
+ false, true, false, true, false, true, false, false, true, false,
+ true, false, false, true, false, true, false, true, false};
+
+constexpr int16_t kIdentity4Multiplier /* round(2^12 * sqrt(2)) */ = 0x16A1;
+constexpr int16_t kIdentity4MultiplierFraction /* round(2^12 * (sqrt(2) - 1))*/
+ = 0x6A1;
+constexpr int16_t kIdentity16Multiplier /* 2 * round(2^12 * sqrt(2)) */ = 11586;
+constexpr int16_t kTransformRowMultiplier /* round(2^12 / sqrt(2)) */ = 2896;
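+// For reference: 0x16A1 = 5793 = round(4096 * sqrt(2)) and 0x6A1 = 1697 =
+// 5793 - 4096, i.e. kIdentity4MultiplierFraction is the fractional part of
+// kIdentity4Multiplier in 12-bit fixed point. Scaling by sqrt(2) can thus be
+// written either as round(x * 5793 / 4096) or as x + round(x * 1697 / 4096);
+// the two are identical for integer x.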
diff --git a/src/dsp/libgav1_dsp.cmake b/src/dsp/libgav1_dsp.cmake
new file mode 100644
index 0000000..960d5a7
--- /dev/null
+++ b/src/dsp/libgav1_dsp.cmake
@@ -0,0 +1,176 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_SRC_DSP_LIBGAV1_DSP_CMAKE_)
+ return()
+endif() # LIBGAV1_SRC_DSP_LIBGAV1_DSP_CMAKE_
+set(LIBGAV1_SRC_DSP_LIBGAV1_DSP_CMAKE_ 1)
+
+include("${libgav1_root}/cmake/libgav1_targets.cmake")
+
+list(APPEND libgav1_dsp_sources
+ "${libgav1_source}/dsp/average_blend.cc"
+ "${libgav1_source}/dsp/average_blend.h"
+ "${libgav1_source}/dsp/cdef.cc"
+ "${libgav1_source}/dsp/cdef.h"
+ "${libgav1_source}/dsp/cdef.inc"
+ "${libgav1_source}/dsp/common.h"
+ "${libgav1_source}/dsp/constants.cc"
+ "${libgav1_source}/dsp/constants.h"
+ "${libgav1_source}/dsp/convolve.cc"
+ "${libgav1_source}/dsp/convolve.h"
+ "${libgav1_source}/dsp/convolve.inc"
+ "${libgav1_source}/dsp/distance_weighted_blend.cc"
+ "${libgav1_source}/dsp/distance_weighted_blend.h"
+ "${libgav1_source}/dsp/dsp.cc"
+ "${libgav1_source}/dsp/dsp.h"
+ "${libgav1_source}/dsp/film_grain.cc"
+ "${libgav1_source}/dsp/film_grain.h"
+ "${libgav1_source}/dsp/film_grain_common.h"
+ "${libgav1_source}/dsp/intra_edge.cc"
+ "${libgav1_source}/dsp/intra_edge.h"
+ "${libgav1_source}/dsp/intrapred.cc"
+ "${libgav1_source}/dsp/intrapred.h"
+ "${libgav1_source}/dsp/inverse_transform.cc"
+ "${libgav1_source}/dsp/inverse_transform.h"
+ "${libgav1_source}/dsp/inverse_transform.inc"
+ "${libgav1_source}/dsp/loop_filter.cc"
+ "${libgav1_source}/dsp/loop_filter.h"
+ "${libgav1_source}/dsp/loop_restoration.cc"
+ "${libgav1_source}/dsp/loop_restoration.h"
+ "${libgav1_source}/dsp/mask_blend.cc"
+ "${libgav1_source}/dsp/mask_blend.h"
+ "${libgav1_source}/dsp/motion_field_projection.cc"
+ "${libgav1_source}/dsp/motion_field_projection.h"
+ "${libgav1_source}/dsp/motion_vector_search.cc"
+ "${libgav1_source}/dsp/motion_vector_search.h"
+ "${libgav1_source}/dsp/obmc.cc"
+ "${libgav1_source}/dsp/obmc.h"
+ "${libgav1_source}/dsp/obmc.inc"
+ "${libgav1_source}/dsp/super_res.cc"
+ "${libgav1_source}/dsp/super_res.h"
+ "${libgav1_source}/dsp/warp.cc"
+ "${libgav1_source}/dsp/warp.h"
+ "${libgav1_source}/dsp/weight_mask.cc"
+ "${libgav1_source}/dsp/weight_mask.h")
+
+list(APPEND libgav1_dsp_sources_avx2
+ ${libgav1_dsp_sources_avx2}
+ "${libgav1_source}/dsp/x86/convolve_avx2.cc"
+ "${libgav1_source}/dsp/x86/convolve_avx2.h"
+ "${libgav1_source}/dsp/x86/loop_restoration_10bit_avx2.cc"
+ "${libgav1_source}/dsp/x86/loop_restoration_avx2.cc"
+ "${libgav1_source}/dsp/x86/loop_restoration_avx2.h")
+
+list(APPEND libgav1_dsp_sources_neon
+ ${libgav1_dsp_sources_neon}
+ "${libgav1_source}/dsp/arm/average_blend_neon.cc"
+ "${libgav1_source}/dsp/arm/average_blend_neon.h"
+ "${libgav1_source}/dsp/arm/cdef_neon.cc"
+ "${libgav1_source}/dsp/arm/cdef_neon.h"
+ "${libgav1_source}/dsp/arm/common_neon.h"
+ "${libgav1_source}/dsp/arm/convolve_neon.cc"
+ "${libgav1_source}/dsp/arm/convolve_neon.h"
+ "${libgav1_source}/dsp/arm/distance_weighted_blend_neon.cc"
+ "${libgav1_source}/dsp/arm/distance_weighted_blend_neon.h"
+ "${libgav1_source}/dsp/arm/film_grain_neon.cc"
+ "${libgav1_source}/dsp/arm/film_grain_neon.h"
+ "${libgav1_source}/dsp/arm/intra_edge_neon.cc"
+ "${libgav1_source}/dsp/arm/intra_edge_neon.h"
+ "${libgav1_source}/dsp/arm/intrapred_cfl_neon.cc"
+ "${libgav1_source}/dsp/arm/intrapred_directional_neon.cc"
+ "${libgav1_source}/dsp/arm/intrapred_filter_intra_neon.cc"
+ "${libgav1_source}/dsp/arm/intrapred_neon.cc"
+ "${libgav1_source}/dsp/arm/intrapred_neon.h"
+ "${libgav1_source}/dsp/arm/intrapred_smooth_neon.cc"
+ "${libgav1_source}/dsp/arm/inverse_transform_neon.cc"
+ "${libgav1_source}/dsp/arm/inverse_transform_neon.h"
+ "${libgav1_source}/dsp/arm/loop_filter_neon.cc"
+ "${libgav1_source}/dsp/arm/loop_filter_neon.h"
+ "${libgav1_source}/dsp/arm/loop_restoration_neon.cc"
+ "${libgav1_source}/dsp/arm/loop_restoration_neon.h"
+ "${libgav1_source}/dsp/arm/mask_blend_neon.cc"
+ "${libgav1_source}/dsp/arm/mask_blend_neon.h"
+ "${libgav1_source}/dsp/arm/motion_field_projection_neon.cc"
+ "${libgav1_source}/dsp/arm/motion_field_projection_neon.h"
+ "${libgav1_source}/dsp/arm/motion_vector_search_neon.cc"
+ "${libgav1_source}/dsp/arm/motion_vector_search_neon.h"
+ "${libgav1_source}/dsp/arm/obmc_neon.cc"
+ "${libgav1_source}/dsp/arm/obmc_neon.h"
+ "${libgav1_source}/dsp/arm/super_res_neon.cc"
+ "${libgav1_source}/dsp/arm/super_res_neon.h"
+ "${libgav1_source}/dsp/arm/warp_neon.cc"
+ "${libgav1_source}/dsp/arm/warp_neon.h"
+ "${libgav1_source}/dsp/arm/weight_mask_neon.cc"
+ "${libgav1_source}/dsp/arm/weight_mask_neon.h")
+
+list(APPEND libgav1_dsp_sources_sse4
+ ${libgav1_dsp_sources_sse4}
+ "${libgav1_source}/dsp/x86/average_blend_sse4.cc"
+ "${libgav1_source}/dsp/x86/average_blend_sse4.h"
+ "${libgav1_source}/dsp/x86/common_sse4.h"
+ "${libgav1_source}/dsp/x86/cdef_sse4.cc"
+ "${libgav1_source}/dsp/x86/cdef_sse4.h"
+ "${libgav1_source}/dsp/x86/convolve_sse4.cc"
+ "${libgav1_source}/dsp/x86/convolve_sse4.h"
+ "${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.cc"
+ "${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.h"
+ "${libgav1_source}/dsp/x86/intra_edge_sse4.cc"
+ "${libgav1_source}/dsp/x86/intra_edge_sse4.h"
+ "${libgav1_source}/dsp/x86/intrapred_sse4.cc"
+ "${libgav1_source}/dsp/x86/intrapred_sse4.h"
+ "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.cc"
+ "${libgav1_source}/dsp/x86/intrapred_smooth_sse4.cc"
+ "${libgav1_source}/dsp/x86/inverse_transform_sse4.cc"
+ "${libgav1_source}/dsp/x86/inverse_transform_sse4.h"
+ "${libgav1_source}/dsp/x86/loop_filter_sse4.cc"
+ "${libgav1_source}/dsp/x86/loop_filter_sse4.h"
+ "${libgav1_source}/dsp/x86/loop_restoration_10bit_sse4.cc"
+ "${libgav1_source}/dsp/x86/loop_restoration_sse4.cc"
+ "${libgav1_source}/dsp/x86/loop_restoration_sse4.h"
+ "${libgav1_source}/dsp/x86/mask_blend_sse4.cc"
+ "${libgav1_source}/dsp/x86/mask_blend_sse4.h"
+ "${libgav1_source}/dsp/x86/motion_field_projection_sse4.cc"
+ "${libgav1_source}/dsp/x86/motion_field_projection_sse4.h"
+ "${libgav1_source}/dsp/x86/motion_vector_search_sse4.cc"
+ "${libgav1_source}/dsp/x86/motion_vector_search_sse4.h"
+ "${libgav1_source}/dsp/x86/obmc_sse4.cc"
+ "${libgav1_source}/dsp/x86/obmc_sse4.h"
+ "${libgav1_source}/dsp/x86/super_res_sse4.cc"
+ "${libgav1_source}/dsp/x86/super_res_sse4.h"
+ "${libgav1_source}/dsp/x86/transpose_sse4.h"
+ "${libgav1_source}/dsp/x86/warp_sse4.cc"
+ "${libgav1_source}/dsp/x86/warp_sse4.h"
+ "${libgav1_source}/dsp/x86/weight_mask_sse4.cc"
+ "${libgav1_source}/dsp/x86/weight_mask_sse4.h")
+
+macro(libgav1_add_dsp_targets)
+ unset(dsp_sources)
+ list(APPEND dsp_sources ${libgav1_dsp_sources}
+ ${libgav1_dsp_sources_neon}
+ ${libgav1_dsp_sources_avx2}
+ ${libgav1_dsp_sources_sse4})
+
+ libgav1_add_library(NAME
+ libgav1_dsp
+ TYPE
+ OBJECT
+ SOURCES
+ ${dsp_sources}
+ DEFINES
+ ${libgav1_defines}
+ $<$<CONFIG:Debug>:LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS>
+ INCLUDES
+ ${libgav1_include_paths})
+endmacro()
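+# Usage sketch (the actual call site lives outside this file): a consumer is
+# expected to include this file and invoke the macro, e.g.
+#
+#   include("${libgav1_source}/dsp/libgav1_dsp.cmake")
+#   libgav1_add_dsp_targets()
+#
+# which creates the libgav1_dsp OBJECT library from the C, NEON, SSE4 and AVX2
+# source lists above. The $<$<CONFIG:Debug>:...> generator expression makes
+# Debug builds define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS, so the Init*()
+# functions in the dsp sources register every C implementation unconditionally
+# (the #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS branches).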
diff --git a/src/dsp/loop_filter.cc b/src/dsp/loop_filter.cc
new file mode 100644
index 0000000..6cad97d
--- /dev/null
+++ b/src/dsp/loop_filter.cc
@@ -0,0 +1,616 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// 7.14.6.1.
+template <int bitdepth, typename Pixel>
+struct LoopFilterFuncs_C {
+ LoopFilterFuncs_C() = delete;
+
+ static constexpr int kMaxPixel = (1 << bitdepth) - 1;
+ static constexpr int kMinSignedPixel = -(1 << (bitdepth - 1));
+ static constexpr int kMaxSignedPixel = (1 << (bitdepth - 1)) - 1;
+ static constexpr int kFlatThresh = 1 << (bitdepth - 8);
+
+ static void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+};
+
+inline void AdjustThresholds(const int bitdepth, int* const outer_thresh,
+ int* const inner_thresh, int* const hev_thresh) {
+ *outer_thresh <<= bitdepth - 8;
+ *inner_thresh <<= bitdepth - 8;
+ *hev_thresh <<= bitdepth - 8;
+}
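+// For example, at bitdepth 10 the thresholds are shifted left by 2, so an
+// 8-bit-scale outer_thresh of 7 becomes 28; the thresholds are specified at
+// 8-bit scale and scaled up here to match the wider pixel range.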
+
+//------------------------------------------------------------------------------
+// 4-tap filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool NeedsFilter4(const Pixel* p, ptrdiff_t step, int outer_thresh,
+ int inner_thresh) {
+ const int p1 = p[-2 * step], p0 = p[-step];
+ const int q0 = p[0], q1 = p[step];
+ return std::abs(p1 - p0) <= inner_thresh &&
+ std::abs(q1 - q0) <= inner_thresh &&
+ std::abs(p0 - q0) * 2 + std::abs(p1 - q1) / 2 <= outer_thresh;
+}
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool Hev(const Pixel* p, ptrdiff_t step, int thresh) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ return (std::abs(p1 - p0) > thresh) || (std::abs(q1 - q0) > thresh);
+}
+
+// 7.14.6.3.
+// 4 pixels in, 2 pixels out.
+template <int bitdepth, typename Pixel>
+inline void Filter2_C(Pixel* p, ptrdiff_t step) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ const int min_signed_val =
+ LoopFilterFuncs_C<bitdepth, Pixel>::kMinSignedPixel;
+ const int max_signed_val =
+ LoopFilterFuncs_C<bitdepth, Pixel>::kMaxSignedPixel;
+ // 8bpp: [-893,892], 10bpp: [-3581,3580], 12bpp [-14333,14332]
+ const int a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+ // 8bpp: [-16,15], 10bpp: [-64,63], 12bpp: [-256,255]
+ const int a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+ const int a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+ const int max_unsigned_val = LoopFilterFuncs_C<bitdepth, Pixel>::kMaxPixel;
+ p[-step] = Clip3(p0 + a2, 0, max_unsigned_val);
+ p[0] = Clip3(q0 - a1, 0, max_unsigned_val);
+}
+
+// 7.14.6.3.
+// 4 pixels in, 4 pixels out.
+template <int bitdepth, typename Pixel>
+inline void Filter4_C(Pixel* p, ptrdiff_t step) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ const int a = 3 * (q0 - p0);
+ const int min_signed_val =
+ LoopFilterFuncs_C<bitdepth, Pixel>::kMinSignedPixel;
+ const int max_signed_val =
+ LoopFilterFuncs_C<bitdepth, Pixel>::kMaxSignedPixel;
+ const int a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+ const int a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+ const int a3 = (a1 + 1) >> 1;
+ const int max_unsigned_val = LoopFilterFuncs_C<bitdepth, Pixel>::kMaxPixel;
+ p[-2 * step] = Clip3(p1 + a3, 0, max_unsigned_val);
+ p[-1 * step] = Clip3(p0 + a2, 0, max_unsigned_val);
+ p[0 * step] = Clip3(q0 - a1, 0, max_unsigned_val);
+ p[1 * step] = Clip3(q1 - a3, 0, max_unsigned_val);
+}
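+// In Filter4_C above, a3 = (a1 + 1) >> 1 is roughly half of a1, so the outer
+// pixels p1/q1 receive about half the adjustment applied to the inner pair
+// p0/q0.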
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical4(void* dest, ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter4(dst, 1, outer_thresh, inner_thresh)) {
+ if (Hev(dst, 1, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, 1);
+ } else {
+ Filter4_C<bitdepth>(dst, 1);
+ }
+ }
+ dst += stride;
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal4(void* dest,
+ ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter4(dst, stride, outer_thresh, inner_thresh)) {
+ if (Hev(dst, stride, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, stride);
+ } else {
+ Filter4_C<bitdepth>(dst, stride);
+ }
+ }
+ ++dst;
+ }
+}
+
+//------------------------------------------------------------------------------
+// 5-tap (chroma) filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool NeedsFilter6(const Pixel* p, ptrdiff_t step, int outer_thresh,
+ int inner_thresh) {
+ const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+ return std::abs(p2 - p1) <= inner_thresh &&
+ std::abs(p1 - p0) <= inner_thresh &&
+ std::abs(q1 - q0) <= inner_thresh &&
+ std::abs(q2 - q1) <= inner_thresh &&
+ std::abs(p0 - q0) * 2 + std::abs(p1 - q1) / 2 <= outer_thresh;
+}
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool IsFlat3(const Pixel* p, ptrdiff_t step, int flat_thresh) {
+ const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+ return std::abs(p1 - p0) <= flat_thresh && std::abs(q1 - q0) <= flat_thresh &&
+ std::abs(p2 - p0) <= flat_thresh && std::abs(q2 - q0) <= flat_thresh;
+}
+
+template <typename Pixel>
+inline Pixel ApplyFilter6(int filter_value) {
+ return static_cast<Pixel>(RightShiftWithRounding(filter_value, 3));
+}
+
+// 7.14.6.4.
+// 6 pixels in, 4 pixels out.
+template <typename Pixel>
+inline void Filter6_C(Pixel* p, ptrdiff_t step) {
+ const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+ const int a1 = 2 * p1;
+ const int a0 = 2 * p0;
+ const int b0 = 2 * q0;
+ const int b1 = 2 * q1;
+ // The max is 8 * max_pixel + 4 for the rounder.
+ // 8bpp: 2044 (11 bits), 10bpp: 8188 (13 bits), 12bpp: 32764 (15 bits)
+ p[-2 * step] = ApplyFilter6<Pixel>(3 * p2 + a1 + a0 + q0);
+ p[-1 * step] = ApplyFilter6<Pixel>(p2 + a1 + a0 + b0 + q1);
+ p[0 * step] = ApplyFilter6<Pixel>(p1 + a0 + b0 + b1 + q2);
+ p[1 * step] = ApplyFilter6<Pixel>(p0 + b0 + b1 + 3 * q2);
+}
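+// In Filter6_C every output line's tap weights sum to 8, matching the shift
+// of 3 inside ApplyFilter6(), so each output is a weighted average of the six
+// input pixels. The same holds for Filter8_C below (weights also sum to 8)
+// and for Filter14_C (weights sum to 16, removed by ApplyFilter14()'s shift
+// of 4).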
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical6(void* dest, ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter6(dst, 1, outer_thresh, inner_thresh)) {
+ if (IsFlat3(dst, 1, flat_thresh)) {
+ Filter6_C(dst, 1);
+ } else if (Hev(dst, 1, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, 1);
+ } else {
+ Filter4_C<bitdepth>(dst, 1);
+ }
+ }
+ dst += stride;
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal6(void* dest,
+ ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter6(dst, stride, outer_thresh, inner_thresh)) {
+ if (IsFlat3(dst, stride, flat_thresh)) {
+ Filter6_C(dst, stride);
+ } else if (Hev(dst, stride, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, stride);
+ } else {
+ Filter4_C<bitdepth>(dst, stride);
+ }
+ }
+ ++dst;
+ }
+}
+
+//------------------------------------------------------------------------------
+// 7-tap filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool NeedsFilter8(const Pixel* p, ptrdiff_t step, int outer_thresh,
+ int inner_thresh) {
+ const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+ p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+ return std::abs(p3 - p2) <= inner_thresh &&
+ std::abs(p2 - p1) <= inner_thresh &&
+ std::abs(p1 - p0) <= inner_thresh &&
+ std::abs(q1 - q0) <= inner_thresh &&
+ std::abs(q2 - q1) <= inner_thresh &&
+ std::abs(q3 - q2) <= inner_thresh &&
+ std::abs(p0 - q0) * 2 + std::abs(p1 - q1) / 2 <= outer_thresh;
+}
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool IsFlat4(const Pixel* p, ptrdiff_t step, int flat_thresh) {
+ const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+ p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+ return std::abs(p1 - p0) <= flat_thresh && std::abs(q1 - q0) <= flat_thresh &&
+ std::abs(p2 - p0) <= flat_thresh && std::abs(q2 - q0) <= flat_thresh &&
+ std::abs(p3 - p0) <= flat_thresh && std::abs(q3 - q0) <= flat_thresh;
+}
+
+template <typename Pixel>
+inline Pixel ApplyFilter8(int filter_value) {
+ return static_cast<Pixel>(RightShiftWithRounding(filter_value, 3));
+}
+
+// 7.14.6.4.
+// 8 pixels in, 6 pixels out.
+template <typename Pixel>
+inline void Filter8_C(Pixel* p, ptrdiff_t step) {
+ const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+ p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+ // The max is 8 * max_pixel + 4 for the rounder.
+ // 8bpp: 2044 (11 bits), 10bpp: 8188 (13 bits), 12bpp: 32764 (15 bits)
+ p[-3 * step] = ApplyFilter8<Pixel>(3 * p3 + 2 * p2 + p1 + p0 + q0);
+ p[-2 * step] = ApplyFilter8<Pixel>(2 * p3 + p2 + 2 * p1 + p0 + q0 + q1);
+ p[-1 * step] = ApplyFilter8<Pixel>(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2);
+ p[0 * step] = ApplyFilter8<Pixel>(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3);
+ p[1 * step] = ApplyFilter8<Pixel>(p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3);
+ p[2 * step] = ApplyFilter8<Pixel>(p0 + q0 + q1 + 2 * q2 + 3 * q3);
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical8(void* dest, ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter8(dst, 1, outer_thresh, inner_thresh)) {
+ if (IsFlat4(dst, 1, flat_thresh)) {
+ Filter8_C(dst, 1);
+ } else if (Hev(dst, 1, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, 1);
+ } else {
+ Filter4_C<bitdepth>(dst, 1);
+ }
+ }
+ dst += stride;
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal8(void* dest,
+ ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter8(dst, stride, outer_thresh, inner_thresh)) {
+ if (IsFlat4(dst, stride, flat_thresh)) {
+ Filter8_C(dst, stride);
+ } else if (Hev(dst, stride, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, stride);
+ } else {
+ Filter4_C<bitdepth>(dst, stride);
+ }
+ }
+ ++dst;
+ }
+}
+
+//------------------------------------------------------------------------------
+// 13-tap filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool IsFlatOuter4(const Pixel* p, ptrdiff_t step, int flat_thresh) {
+ const int p6 = p[-7 * step], p5 = p[-6 * step], p4 = p[-5 * step],
+ p0 = p[-step];
+ const int q0 = p[0], q4 = p[4 * step], q5 = p[5 * step], q6 = p[6 * step];
+ return std::abs(p4 - p0) <= flat_thresh && std::abs(q4 - q0) <= flat_thresh &&
+ std::abs(p5 - p0) <= flat_thresh && std::abs(q5 - q0) <= flat_thresh &&
+ std::abs(p6 - p0) <= flat_thresh && std::abs(q6 - q0) <= flat_thresh;
+}
+
+template <typename Pixel>
+inline Pixel ApplyFilter14(int filter_value) {
+ return static_cast<Pixel>(RightShiftWithRounding(filter_value, 4));
+}
+
+// 7.14.6.4.
+// 14 pixels in, 12 pixels out.
+template <typename Pixel>
+inline void Filter14_C(Pixel* p, ptrdiff_t step) {
+ const int p6 = p[-7 * step], p5 = p[-6 * step], p4 = p[-5 * step],
+ p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+ p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step],
+ q4 = p[4 * step], q5 = p[5 * step], q6 = p[6 * step];
+ // The max is 16 * max_pixel + 8 for the rounder.
+ // 8bpp: 4088 (12 bits), 10bpp: 16376 (14 bits), 12bpp: 65528 (16 bits)
+ p[-6 * step] =
+ ApplyFilter14<Pixel>(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0);
+ p[-5 * step] = ApplyFilter14<Pixel>(p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 +
+ p1 + p0 + q0 + q1);
+ p[-4 * step] = ApplyFilter14<Pixel>(p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 +
+ p1 + p0 + q0 + q1 + q2);
+ p[-3 * step] = ApplyFilter14<Pixel>(p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 +
+ p1 * 2 + p0 + q0 + q1 + q2 + q3);
+ p[-2 * step] = ApplyFilter14<Pixel>(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 +
+ p0 * 2 + q0 + q1 + q2 + q3 + q4);
+ p[-1 * step] = ApplyFilter14<Pixel>(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
+ q0 * 2 + q1 + q2 + q3 + q4 + q5);
+ p[0 * step] = ApplyFilter14<Pixel>(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
+ q1 * 2 + q2 + q3 + q4 + q5 + q6);
+ p[1 * step] = ApplyFilter14<Pixel>(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
+ q2 * 2 + q3 + q4 + q5 + q6 * 2);
+ p[2 * step] = ApplyFilter14<Pixel>(p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 +
+ q3 * 2 + q4 + q5 + q6 * 3);
+ p[3 * step] = ApplyFilter14<Pixel>(p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 +
+ q4 * 2 + q5 + q6 * 4);
+ p[4 * step] = ApplyFilter14<Pixel>(p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 +
+ q5 * 2 + q6 * 5);
+ p[5 * step] =
+ ApplyFilter14<Pixel>(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7);
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical14(void* dest,
+ ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter8(dst, 1, outer_thresh, inner_thresh)) {
+ if (IsFlat4(dst, 1, flat_thresh)) {
+ if (IsFlatOuter4(dst, 1, flat_thresh)) {
+ Filter14_C(dst, 1);
+ } else {
+ Filter8_C(dst, 1);
+ }
+ } else if (Hev(dst, 1, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, 1);
+ } else {
+ Filter4_C<bitdepth>(dst, 1);
+ }
+ }
+ dst += stride;
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal14(void* dest,
+ ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter8(dst, stride, outer_thresh, inner_thresh)) {
+ if (IsFlat4(dst, stride, flat_thresh)) {
+ if (IsFlatOuter4(dst, stride, flat_thresh)) {
+ Filter14_C(dst, stride);
+ } else {
+ Filter8_C(dst, stride);
+ }
+ } else if (Hev(dst, stride, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, stride);
+ } else {
+ Filter4_C<bitdepth>(dst, stride);
+ }
+ }
+ ++dst;
+ }
+}
+
+using Defs8bpp = LoopFilterFuncs_C<8, uint8_t>;
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal4;
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical4;
+
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal6;
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical6;
+
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal8;
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical8;
+
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal14;
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical14;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal4;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical4;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal6;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical6;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal8;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical8;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal14;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical14;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using Defs10bpp = LoopFilterFuncs_C<10, uint16_t>;
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal4;
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical4;
+
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal6;
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical6;
+
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal8;
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical8;
+
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal14;
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical14;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal4;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical4;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal6;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical6;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal8;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical8;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal14;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical14;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+
+void LoopFilterInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+ // Local functions that may be unused depending on the optimizations
+ // available.
+ static_cast<void>(AdjustThresholds);
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/loop_filter.h b/src/dsp/loop_filter.h
new file mode 100644
index 0000000..1ddad71
--- /dev/null
+++ b/src/dsp/loop_filter.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_LOOP_FILTER_H_
+#define LIBGAV1_SRC_DSP_LOOP_FILTER_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/loop_filter_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/loop_filter_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_filters. This function is not thread-safe.
+void LoopFilterInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_LOOP_FILTER_H_
diff --git a/src/dsp/loop_restoration.cc b/src/dsp/loop_restoration.cc
new file mode 100644
index 0000000..0909df0
--- /dev/null
+++ b/src/dsp/loop_restoration.cc
@@ -0,0 +1,936 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Section 7.17.3.
+// a2: range [1, 256].
+// if (z >= 255)
+// a2 = 256;
+// else if (z == 0)
+// a2 = 1;
+// else
+// a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1);
+// ma = 256 - a2;
+alignas(16) const uint8_t kSgrMaLookup[256] = {
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, 15, 14,
+ 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7,
+ 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 0};
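+// Worked example: for z = 2 the formula above gives
+// a2 = ((2 << kSgrProjSgrBits) + 1) / 3 = 171 (with kSgrProjSgrBits == 8),
+// so ma = 256 - 171 = 85, which is entry 2 of the table.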
+
+namespace {
+
+template <int bitdepth, typename Pixel>
+inline void WienerHorizontal(const Pixel* source, const ptrdiff_t source_stride,
+ const int width, const int height,
+ const int16_t* const filter,
+ const int number_zero_coefficients,
+ int16_t** wiener_buffer) {
+ constexpr int kCenterTap = kWienerFilterTaps / 2;
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ constexpr int offset =
+ 1 << (bitdepth + kWienerFilterBits - kRoundBitsHorizontal - 1);
+ constexpr int limit = (offset << 2) - 1;
+ for (int y = 0; y < height; ++y) {
+ int x = 0;
+ do {
+ // sum fits into 16 bits only when bitdepth = 8.
+ int sum = 0;
+ for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
+ sum +=
+ filter[k] * (source[x + k] + source[x + kWienerFilterTaps - 1 - k]);
+ }
+ sum += filter[kCenterTap] * source[x + kCenterTap];
+ const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsHorizontal);
+ (*wiener_buffer)[x] = Clip3(rounded_sum, -offset, limit - offset);
+ } while (++x != width);
+ source += source_stride;
+ *wiener_buffer += width;
+ }
+}
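+// Because the Wiener filter is symmetric (filter[k] == filter[6 - k]; see the
+// coefficient-range note before WienerFilter_C() below), WienerHorizontal()
+// folds each mirrored tap pair into one multiply. For example, with
+// number_zero_coefficients == 1 only filter[1], filter[2] and the center tap
+// filter[3] are evaluated. WienerVertical() below applies the same folding
+// across rows.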
+
+template <int bitdepth, typename Pixel>
+inline void WienerVertical(const int16_t* wiener_buffer, const int width,
+ const int height, const int16_t* const filter,
+ const int number_zero_coefficients, void* const dest,
+ const ptrdiff_t dest_stride) {
+ constexpr int kCenterTap = kWienerFilterTaps / 2;
+ constexpr int kRoundBitsVertical =
+ (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
+ auto* dst = static_cast<Pixel*>(dest);
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ // sum needs 32 bits.
+ int sum = 0;
+ for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
+ sum += filter[k] *
+ (wiener_buffer[k * width + x] +
+ wiener_buffer[(kWienerFilterTaps - 1 - k) * width + x]);
+ }
+ sum += filter[kCenterTap] * wiener_buffer[kCenterTap * width + x];
+ const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical);
+ dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1));
+ } while (++x != width);
+ wiener_buffer += width;
+ dst += dest_stride;
+ } while (--y != 0);
+}
+
+// Note: bit range for wiener filter.
+// The Wiener filter process first applies horizontal filtering to the input
+// pixels, followed by rounding with predefined bits (dependent on bitdepth).
+// Then vertical filtering is applied, followed by rounding (dependent on
+// bitdepth).
+// The process is the same as convolution:
+// <input> --> <horizontal filter> --> <rounding 0> --> <vertical filter>
+// --> <rounding 1>
+// By design:
+// (a). horizontal/vertical filtering adds 7 bits to input.
+// (b). The output of first rounding fits into 16 bits.
+// (c). The output of second rounding fits into 16 bits.
+// If input bitdepth > 8, the accumulator of the horizontal filter is larger
+// than 16 bits and smaller than 32 bits.
+// The accumulator of the vertical filter is larger than 16 bits and smaller
+// than 32 bits.
+// Note: range of wiener filter coefficients.
+// Wiener filter coefficients are symmetric, and their sum is 1 (128).
+// The range of each coefficient:
+// filter[0] = filter[6], 4 bits, min = -5, max = 10.
+// filter[1] = filter[5], 5 bits, min = -23, max = 8.
+// filter[2] = filter[4], 6 bits, min = -17, max = 46.
+// filter[3] = 128 - 2 * (filter[0] + filter[1] + filter[2]).
+// The difference from libaom is that in libaom:
+// filter[3] = 0 - 2 * (filter[0] + filter[1] + filter[2]).
+// Thus in libaom's computation, an offset of 128 is needed for filter[3].
+template <int bitdepth, typename Pixel>
+void WienerFilter_C(const RestorationUnitInfo& restoration_info,
+ const void* const source, const void* const top_border,
+ const void* const bottom_border, const ptrdiff_t stride,
+ const int width, const int height,
+ RestorationBuffer* const restoration_buffer,
+ void* const dest) {
+ constexpr int kCenterTap = kWienerFilterTaps / 2;
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ int16_t* const wiener_buffer_org = restoration_buffer->wiener_buffer;
+
+ // horizontal filtering.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const int16_t* const filter_horizontal =
+ restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
+ const auto* src = static_cast<const Pixel*>(source) - kCenterTap;
+ const auto* top = static_cast<const Pixel*>(top_border) - kCenterTap;
+ const auto* bottom = static_cast<const Pixel*>(bottom_border) - kCenterTap;
+ auto* wiener_buffer = wiener_buffer_org + number_rows_to_skip * width;
+
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontal<bitdepth, Pixel>(top + (2 - height_extra) * stride, stride,
+ width, height_extra, filter_horizontal, 0,
+ &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+ filter_horizontal, 0, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(bottom, stride, width, height_extra,
+ filter_horizontal, 0, &wiener_buffer);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontal<bitdepth, Pixel>(top + (2 - height_extra) * stride, stride,
+ width, height_extra, filter_horizontal, 1,
+ &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+ filter_horizontal, 1, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(bottom, stride, width, height_extra,
+ filter_horizontal, 1, &wiener_buffer);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ WienerHorizontal<bitdepth, Pixel>(top + (2 - height_extra) * stride, stride,
+ width, height_extra, filter_horizontal, 2,
+ &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+ filter_horizontal, 2, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(bottom, stride, width, height_extra,
+ filter_horizontal, 2, &wiener_buffer);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontal<bitdepth, Pixel>(top + (2 - height_extra) * stride, stride,
+ width, height_extra, filter_horizontal, 3,
+ &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+ filter_horizontal, 3, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(bottom, stride, width, height_extra,
+ filter_horizontal, 3, &wiener_buffer);
+ }
+
+ // vertical filtering.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+ // Because the top row of |source| is a duplicate of the second row, and the
+ // bottom row of |source| is a duplicate of its above row, we can duplicate
+ // the top and bottom row of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer, wiener_buffer - width,
+ sizeof(*wiener_buffer) * width);
+ memcpy(wiener_buffer_org, wiener_buffer_org + width,
+ sizeof(*wiener_buffer) * width);
+ WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+ filter_vertical, 0, dest, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+ filter_vertical, 1, dest, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+ filter_vertical, 2, dest, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+ filter_vertical, 3, dest, stride);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+// When |height| is 1, |src_stride| may be set to an arbitrary value.
+template <typename Pixel, int size>
+LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
+ const int height, const int width,
+ uint16_t* const* sums,
+ uint32_t* const* square_sums) {
+ int y = height;
+ do {
+ uint32_t sum = 0;
+ uint32_t square_sum = 0;
+ for (int dx = 0; dx < size; ++dx) {
+ const Pixel source = src[dx];
+ sum += source;
+ square_sum += source * source;
+ }
+ (*sums)[0] = sum;
+ (*square_sums)[0] = square_sum;
+ int x = 1;
+ do {
+ const Pixel source0 = src[x - 1];
+ const Pixel source1 = src[x - 1 + size];
+ sum -= source0;
+ sum += source1;
+ square_sum -= source0 * source0;
+ square_sum += source1 * source1;
+ (*sums)[x] = sum;
+ (*square_sums)[x] = square_sum;
+ } while (++x != width);
+ src += src_stride;
+ ++sums;
+ ++square_sums;
+ } while (--y != 0);
+}
+
+// When |height| is 1, |src_stride| may be set to an arbitrary value.
+template <typename Pixel>
+LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
+ const int height, const int width,
+ uint16_t* const* sum3, uint16_t* const* sum5,
+ uint32_t* const* square_sum3,
+ uint32_t* const* square_sum5) {
+ int y = height;
+ do {
+ uint32_t sum = 0;
+ uint32_t square_sum = 0;
+ for (int dx = 0; dx < 4; ++dx) {
+ const Pixel source = src[dx];
+ sum += source;
+ square_sum += source * source;
+ }
+ int x = 0;
+ do {
+ const Pixel source0 = src[x];
+ const Pixel source1 = src[x + 4];
+ sum -= source0;
+ square_sum -= source0 * source0;
+ (*sum3)[x] = sum;
+ (*square_sum3)[x] = square_sum;
+ sum += source1;
+ square_sum += source1 * source1;
+ (*sum5)[x] = sum + source0;
+ (*square_sum5)[x] = square_sum + source0 * source0;
+ } while (++x != width);
+ src += src_stride;
+ ++sum3;
+ ++sum5;
+ ++square_sum3;
+ ++square_sum5;
+ } while (--y != 0);
+}
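+// The combined BoxSum() above keeps a running window of four pixels:
+// dropping the leftmost pixel gives the 3-wide sums (sum3/square_sum3), and
+// adding back both edge pixels gives the 5-wide sums (sum5/square_sum5), so
+// a single pass fills both sets of buffers.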
+
+template <int bitdepth, int n>
+inline void CalculateIntermediate(const uint32_t s, uint32_t a,
+ const uint32_t b, uint8_t* const ma_ptr,
+ uint32_t* const b_ptr) {
+  // a: before the shift, max is 25 * (2^(bitdepth) - 1) * (2^(bitdepth) - 1).
+  // Since max bitdepth = 12, max < 2^31. After the shift,
+  // a < 2^16 * n < 2^22 regardless of bitdepth.
+ a = RightShiftWithRounding(a, (bitdepth - 8) << 1);
+ // b: max is 25 * (2^(bitdepth) - 1). If bitdepth = 12, max < 2^19.
+ // d < 2^8 * n < 2^14 regardless of bitdepth
+ const uint32_t d = RightShiftWithRounding(b, bitdepth - 8);
+ // p: Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
+ // and p itself satisfies p < 2^14 * n^2 < 2^26.
+ // This bound on p is due to:
+ // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
+ // Note: Sometimes, in high bitdepth, we can end up with a*n < b*b.
+ // This is an artifact of rounding, and can only happen if all pixels
+ // are (almost) identical, so in this case we saturate to p=0.
+ const uint32_t p = (a * n < d * d) ? 0 : a * n - d * d;
+ // p * s < (2^14 * n^2) * round(2^20 / (n^2 * scale)) < 2^34 / scale <
+ // 2^32 as long as scale >= 4. So p * s fits into a uint32_t, and z < 2^12
+ // (this holds even after accounting for the rounding in s)
+ const uint32_t z = RightShiftWithRounding(p * s, kSgrProjScaleBits);
+ // ma: range [0, 255].
+ const uint32_t ma = kSgrMaLookup[std::min(z, 255u)];
+ const uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+ // ma < 2^8, b < 2^(bitdepth) * n,
+ // one_over_n = round(2^12 / n)
+ // => the product here is < 2^(20 + bitdepth) <= 2^32,
+ // and b is set to a value < 2^(8 + bitdepth).
+ // This holds even with the rounding in one_over_n and in the overall result,
+ // as long as ma is strictly less than 2^8.
+ const uint32_t b2 = ma * b * one_over_n;
+ *ma_ptr = ma;
+ *b_ptr = RightShiftWithRounding(b2, kSgrProjReciprocalBits);
+}
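+// Example of the one_over_n reciprocal above: the 5x5 pass uses n == 25, so
+// one_over_n = (4096 + 12) / 25 = 164 = round(2^12 / 25); the 3x3 pass uses
+// n == 9, giving (4096 + 4) / 9 = 455.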
+
+template <typename T>
+inline uint32_t Sum343(const T* const src) {
+ return 3 * (src[0] + src[2]) + 4 * src[1];
+}
+
+template <typename T>
+inline uint32_t Sum444(const T* const src) {
+ return 4 * (src[0] + src[1] + src[2]);
+}
+
+template <typename T>
+inline uint32_t Sum565(const T* const src) {
+ return 5 * (src[0] + src[2]) + 6 * src[1];
+}
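+// The 5-6-5 / 3-4-3 / 4-4-4 row weights are chosen so the total weight per
+// pixel is a power of two: pass 1 adds two 565 rows (2 * 16 = 32) or uses a
+// single row (16), matching the shift arguments of 5 and 4 passed to
+// CalculateFilteredOutput() below; pass 2 adds 343 + 444 + 343 rows
+// (10 + 12 + 10 = 32), matching a shift of 5.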
+
+template <int bitdepth>
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+ const int width, const uint32_t s, SgrBuffer* const sgr_buffer,
+ uint16_t* const ma565, uint32_t* const b565) {
+ int x = 0;
+ do {
+ uint32_t a = 0;
+ uint32_t b = 0;
+ for (int dy = 0; dy < 5; ++dy) {
+ a += square_sum5[dy][x];
+ b += sum5[dy][x];
+ }
+ CalculateIntermediate<bitdepth, 25>(s, a, b, sgr_buffer->ma + x,
+ sgr_buffer->b + x);
+ } while (++x != width + 2);
+ x = 0;
+ do {
+ ma565[x] = Sum565(sgr_buffer->ma + x);
+ b565[x] = Sum565(sgr_buffer->b + x);
+ } while (++x != width);
+}
+
+template <int bitdepth>
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const uint16_t* const sum3[3], const uint32_t* const square_sum3[3],
+ const int width, const uint32_t s, const bool calculate444,
+ SgrBuffer* const sgr_buffer, uint16_t* const ma343, uint32_t* const b343,
+ uint16_t* const ma444, uint32_t* const b444) {
+ int x = 0;
+ do {
+ uint32_t a = 0;
+ uint32_t b = 0;
+ for (int dy = 0; dy < 3; ++dy) {
+ a += square_sum3[dy][x];
+ b += sum3[dy][x];
+ }
+ CalculateIntermediate<bitdepth, 9>(s, a, b, sgr_buffer->ma + x,
+ sgr_buffer->b + x);
+ } while (++x != width + 2);
+ x = 0;
+ do {
+ ma343[x] = Sum343(sgr_buffer->ma + x);
+ b343[x] = Sum343(sgr_buffer->b + x);
+ } while (++x != width);
+ if (calculate444) {
+ x = 0;
+ do {
+ ma444[x] = Sum444(sgr_buffer->ma + x);
+ b444[x] = Sum444(sgr_buffer->b + x);
+ } while (++x != width);
+ }
+}
+
+template <typename Pixel>
+inline int CalculateFilteredOutput(const Pixel src, const uint32_t ma,
+ const uint32_t b, const int shift) {
+ const int32_t v = b - ma * src;
+ return RightShiftWithRounding(v,
+ kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <typename Pixel>
+inline void BoxFilterPass1Kernel(const Pixel src0, const Pixel src1,
+ const uint16_t* const ma565[2],
+ const uint32_t* const b565[2],
+ const ptrdiff_t x, int p[2]) {
+ p[0] = CalculateFilteredOutput<Pixel>(src0, ma565[0][x] + ma565[1][x],
+ b565[0][x] + b565[1][x], 5);
+ p[1] = CalculateFilteredOutput<Pixel>(src1, ma565[1][x], b565[1][x], 4);
+}
+
+template <typename Pixel>
+inline int BoxFilterPass2Kernel(const Pixel src, const uint16_t* const ma343[3],
+ const uint16_t* const ma444,
+ const uint32_t* const b343[3],
+ const uint32_t* const b444, const ptrdiff_t x) {
+ const uint32_t ma = ma343[0][x] + ma444[x] + ma343[2][x];
+ const uint32_t b = b343[0][x] + b444[x] + b343[2][x];
+ return CalculateFilteredOutput<Pixel>(src, ma, b, 5);
+}
+
+template <int bitdepth, typename Pixel>
+inline Pixel SelfGuidedFinal(const int src, const int v) {
+ // if radius_pass_0 == 0 and radius_pass_1 == 0, the range of v is:
+ // bits(u) + bits(w0/w1/w2) + 2 = bitdepth + 13.
+ // Then, range of s is bitdepth + 2. This is a rough estimation, taking the
+ // maximum value of each element.
+ const int s = src + RightShiftWithRounding(
+ v, kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ return static_cast<Pixel>(Clip3(s, 0, (1 << bitdepth) - 1));
+}
+
+template <int bitdepth, typename Pixel>
+inline Pixel SelfGuidedDoubleMultiplier(const int src, const int filter0,
+ const int filter1, const int16_t w0,
+ const int16_t w2) {
+ const int v = w0 * filter0 + w2 * filter1;
+ return SelfGuidedFinal<bitdepth, Pixel>(src, v);
+}
+
+template <int bitdepth, typename Pixel>
+inline Pixel SelfGuidedSingleMultiplier(const int src, const int filter,
+ const int16_t w0) {
+ const int v = w0 * filter;
+ return SelfGuidedFinal<bitdepth, Pixel>(src, v);
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterPass1(const Pixel* const src, const ptrdiff_t stride,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width,
+ const uint32_t scale, const int16_t w0,
+ SgrBuffer* const sgr_buffer,
+ uint16_t* const ma565[2], uint32_t* const b565[2],
+ Pixel* dst) {
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
+ ma565[1], b565[1]);
+ int x = 0;
+ do {
+ int p[2];
+ BoxFilterPass1Kernel<Pixel>(src[x], src[stride + x], ma565, b565, x, p);
+ dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p[0], w0);
+ dst[stride + x] =
+ SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[stride + x], p[1], w0);
+ } while (++x != width);
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterPass2(const Pixel* const src, const Pixel* const src0,
+ const int width, const uint16_t scale,
+ const int16_t w0, uint16_t* const sum3[4],
+ uint32_t* const square_sum3[4],
+ SgrBuffer* const sgr_buffer,
+ uint16_t* const ma343[4], uint16_t* const ma444[3],
+ uint32_t* const b343[4], uint32_t* const b444[3],
+ Pixel* dst) {
+ BoxSum<Pixel, 3>(src0, 0, 1, width + 2, sum3 + 2, square_sum3 + 2);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true,
+ sgr_buffer, ma343[2], b343[2], ma444[1],
+ b444[1]);
+ int x = 0;
+ do {
+ const int p =
+ BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
+ dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0);
+ } while (++x != width);
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilter(const Pixel* const src, const ptrdiff_t stride,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], const int width,
+ const uint16_t scales[2], const int16_t w0,
+ const int16_t w2, SgrBuffer* const sgr_buffer,
+ uint16_t* const ma343[4], uint16_t* const ma444[3],
+ uint16_t* const ma565[2], uint32_t* const b343[4],
+ uint32_t* const b444[3], uint32_t* const b565[2],
+ Pixel* dst) {
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
+ sgr_buffer, ma565[1], b565[1]);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], true,
+ sgr_buffer, ma343[2], b343[2], ma444[1],
+ b444[1]);
+ BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
+ true, sgr_buffer, ma343[3], b343[3], ma444[2],
+ b444[2]);
+ int x = 0;
+ do {
+ int p[2][2];
+ BoxFilterPass1Kernel<Pixel>(src[x], src[stride + x], ma565, b565, x, p[0]);
+ p[1][0] =
+ BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
+ p[1][1] = BoxFilterPass2Kernel<Pixel>(src[stride + x], ma343 + 1, ma444[1],
+ b343 + 1, b444[1], x);
+ dst[x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p[0][0],
+ p[1][0], w0, w2);
+ dst[stride + x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(
+ src[stride + x], p[0][1], p[1][1], w0, w2);
+ } while (++x != width);
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info,
+ const Pixel* src, const Pixel* const top_border,
+ const Pixel* bottom_border, const ptrdiff_t stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, Pixel* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const ptrdiff_t sum_stride = temp_stride + 8;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum<Pixel>(top_border, stride, 2, width + 2, sum3, sum5 + 1, square_sum3,
+ square_sum5 + 1);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ BoxSum<Pixel>(src, stride, 1, width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2,
+ square_sum5 + 3);
+ const Pixel* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSum<Pixel>(s, 0, 1, width + 2, sum3 + 3, sum5 + 4, square_sum3 + 3,
+ square_sum5 + 4);
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
+ sgr_buffer, ma565[0], b565[0]);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false,
+ sgr_buffer, ma343[0], b343[0], nullptr,
+ nullptr);
+ BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
+ true, sgr_buffer, ma343[1], b343[1], ma444[0],
+ b444[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxSum<Pixel>(src + 2 * stride, stride, 2, width + 2, sum3 + 2, sum5 + 3,
+ square_sum3 + 2, square_sum5 + 3);
+ BoxFilter<bitdepth, Pixel>(src + 3, stride, sum3, sum5, square_sum3,
+ square_sum5, width, scales, w0, w2, sgr_buffer,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const Pixel* sr;
+ ptrdiff_t s_stride;
+ if ((height & 1) == 0) {
+ sr = bottom_border;
+ s_stride = stride;
+ } else {
+ sr = src + 2 * stride;
+ s_stride = bottom_border - (src + 2 * stride);
+ }
+ BoxSum<Pixel>(sr, s_stride, 2, width + 2, sum3 + 2, sum5 + 3,
+ square_sum3 + 2, square_sum5 + 3);
+ BoxFilter<bitdepth, Pixel>(src + 3, stride, sum3, sum5, square_sum3,
+ square_sum5, width, scales, w0, w2, sgr_buffer,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxSum<Pixel>(bottom_border + stride, stride, 1, width + 2, sum3 + 2,
+ sum5 + 3, square_sum3 + 2, square_sum5 + 3);
+ sum5[4] = sum5[3];
+ square_sum5[4] = square_sum5[3];
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
+ sgr_buffer, ma565[1], b565[1]);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false,
+ sgr_buffer, ma343[2], b343[2], nullptr,
+ nullptr);
+ int x = 0;
+ do {
+ const int p0 = CalculateFilteredOutput<Pixel>(
+ src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5);
+ const int p1 = BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343,
+ b444[0], x);
+ dst[x] =
+ SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p0, p1, w0, w2);
+ } while (++x != width);
+ }
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const Pixel* src,
+ const Pixel* const top_border,
+ const Pixel* bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height, SgrBuffer* const sgr_buffer,
+ Pixel* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const ptrdiff_t sum_stride = temp_stride + 8;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<Pixel, 5>(top_border, stride, 2, width + 2, sum5 + 1, square_sum5 + 1);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ BoxSum<Pixel, 5>(src, stride, 1, width + 2, sum5 + 3, square_sum5 + 3);
+ const Pixel* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSum<Pixel, 5>(s, 0, 1, width + 2, sum5 + 4, square_sum5 + 4);
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
+ ma565[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxSum<Pixel, 5>(src + 2 * stride, stride, 2, width + 2, sum5 + 3,
+ square_sum5 + 3);
+ BoxFilterPass1<bitdepth, Pixel>(src + 3, stride, sum5, square_sum5, width,
+ scale, w0, sgr_buffer, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const Pixel* sr;
+ ptrdiff_t s_stride;
+ if ((height & 1) == 0) {
+ sr = bottom_border;
+ s_stride = stride;
+ } else {
+ sr = src + 2 * stride;
+ s_stride = bottom_border - (src + 2 * stride);
+ }
+ BoxSum<Pixel, 5>(sr, s_stride, 2, width + 2, sum5 + 3, square_sum5 + 3);
+ BoxFilterPass1<bitdepth, Pixel>(src + 3, stride, sum5, square_sum5, width,
+ scale, w0, sgr_buffer, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxSum<Pixel, 5>(bottom_border + stride, stride, 1, width + 2, sum5 + 3,
+ square_sum5 + 3);
+ sum5[4] = sum5[3];
+ square_sum5[4] = square_sum5[3];
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
+ ma565[1], b565[1]);
+ int x = 0;
+ do {
+ const int p = CalculateFilteredOutput<Pixel>(
+ src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5);
+ dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0);
+ } while (++x != width);
+ }
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const Pixel* src,
+ const Pixel* const top_border,
+ const Pixel* bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height, SgrBuffer* const sgr_buffer,
+ Pixel* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const ptrdiff_t sum_stride = temp_stride + 8;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<Pixel, 3>(top_border, stride, 2, width + 2, sum3, square_sum3);
+ BoxSum<Pixel, 3>(src, stride, 1, width + 2, sum3 + 2, square_sum3 + 2);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, false,
+ sgr_buffer, ma343[0], b343[0], nullptr,
+ nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const Pixel* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += stride;
+ }
+ BoxSum<Pixel, 3>(s, 0, 1, width + 2, sum3 + 2, square_sum3 + 2);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true,
+ sgr_buffer, ma343[1], b343[1], ma444[0],
+ b444[0]);
+
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2<bitdepth, Pixel>(src + 2, src + 2 * stride, width, scale, w0,
+ sum3, square_sum3, sgr_buffer, ma343, ma444,
+ b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ src += 2;
+ int y = std::min(height, 2);
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2<bitdepth, Pixel>(src, bottom_border, width, scale, w0, sum3,
+ square_sum3, sgr_buffer, ma343, ma444, b343,
+ b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
+
+template <int bitdepth, typename Pixel>
+void SelfGuidedFilter_C(const RestorationUnitInfo& restoration_info,
+ const void* const source, const void* const top_border,
+ const void* const bottom_border, const ptrdiff_t stride,
+ const int width, const int height,
+ RestorationBuffer* const restoration_buffer,
+ void* const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* src = static_cast<const Pixel*>(source);
+ const auto* top = static_cast<const Pixel*>(top_border);
+ const auto* bottom = static_cast<const Pixel*>(bottom_border);
+ auto* dst = static_cast<Pixel*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1<bitdepth, Pixel>(restoration_info, src - 3, top - 3,
+ bottom - 3, stride, width, height,
+ sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2<bitdepth, Pixel>(restoration_info, src - 2, top - 2,
+ bottom - 2, stride, width, height,
+ sgr_buffer, dst);
+ } else {
+ BoxFilterProcess<bitdepth, Pixel>(restoration_info, src - 3, top - 3,
+ bottom - 3, stride, width, height,
+ sgr_buffer, dst);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->loop_restorations[0] = WienerFilter_C<8, uint8_t>;
+ dsp->loop_restorations[1] = SelfGuidedFilter_C<8, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_WienerFilter
+ dsp->loop_restorations[0] = WienerFilter_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
+ dsp->loop_restorations[1] = SelfGuidedFilter_C<8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->loop_restorations[0] = WienerFilter_C<10, uint16_t>;
+ dsp->loop_restorations[1] = SelfGuidedFilter_C<10, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+ dsp->loop_restorations[0] = WienerFilter_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter
+ dsp->loop_restorations[1] = SelfGuidedFilter_C<10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+} // namespace
+
+void LoopRestorationInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/loop_restoration.h b/src/dsp/loop_restoration.h
new file mode 100644
index 0000000..de80926
--- /dev/null
+++ b/src/dsp/loop_restoration.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_LOOP_RESTORATION_H_
+#define LIBGAV1_SRC_DSP_LOOP_RESTORATION_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/loop_restoration_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/loop_restoration_avx2.h"
+#include "src/dsp/x86/loop_restoration_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+enum {
+ // Precision of a division table (mtable)
+ kSgrProjScaleBits = 20,
+ kSgrProjReciprocalBits = 12,
+ // Core self-guided restoration precision bits.
+ kSgrProjSgrBits = 8,
+  // Number of extra precision bits, relative to the source, in the generated
+  // values before projection.
+ kSgrProjRestoreBits = 4
+}; // anonymous enum
+
+extern const uint8_t kSgrMaLookup[256];
+
+// Initializes Dsp::loop_restorations. This function is not thread-safe.
+void LoopRestorationInit_C();
+
+template <typename T>
+void Circulate3PointersBy1(T* p[3]) {
+ T* const p0 = p[0];
+ p[0] = p[1];
+ p[1] = p[2];
+ p[2] = p0;
+}
+
+template <typename T>
+void Circulate4PointersBy2(T* p[4]) {
+ std::swap(p[0], p[2]);
+ std::swap(p[1], p[3]);
+}
+
+template <typename T>
+void Circulate5PointersBy2(T* p[5]) {
+ T* const p0 = p[0];
+ T* const p1 = p[1];
+ p[0] = p[2];
+ p[1] = p[3];
+ p[2] = p[4];
+ p[3] = p0;
+ p[4] = p1;
+}
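+
+// For example, starting from {p0, p1, p2, p3, p4}, Circulate5PointersBy2()
+// leaves the array as {p2, p3, p4, p0, p1}, i.e. a rotation by two positions
+// matching the two-row advance per iteration of the box filter loops.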
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_LOOP_RESTORATION_H_
diff --git a/src/dsp/mask_blend.cc b/src/dsp/mask_blend.cc
new file mode 100644
index 0000000..101c410
--- /dev/null
+++ b/src/dsp/mask_blend.cc
@@ -0,0 +1,207 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/mask_blend.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int subsampling_x, int subsampling_y>
+uint8_t GetMaskValue(const uint8_t* mask, const uint8_t* mask_next_row, int x) {
+ if ((subsampling_x | subsampling_y) == 0) {
+ return mask[x];
+ }
+ if (subsampling_x == 1 && subsampling_y == 0) {
+ return static_cast<uint8_t>(RightShiftWithRounding(
+ mask[MultiplyBy2(x)] + mask[MultiplyBy2(x) + 1], 1));
+ }
+ assert(subsampling_x == 1 && subsampling_y == 1);
+ return static_cast<uint8_t>(RightShiftWithRounding(
+ mask[MultiplyBy2(x)] + mask[MultiplyBy2(x) + 1] +
+ mask_next_row[MultiplyBy2(x)] + mask_next_row[MultiplyBy2(x) + 1],
+ 2));
+}
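+
+// For example, with 4:2:0 subsampling a 2x2 mask block of {20, 24, 28, 32}
+// yields RightShiftWithRounding(104, 2) = 26.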
+
+template <int bitdepth, typename Pixel, bool is_inter_intra, int subsampling_x,
+ int subsampling_y>
+void MaskBlend_C(const void* prediction_0, const void* prediction_1,
+ const ptrdiff_t prediction_stride_1, const uint8_t* mask,
+ const ptrdiff_t mask_stride, const int width, const int height,
+ void* dest, const ptrdiff_t dest_stride) {
+ static_assert(!(bitdepth == 8 && is_inter_intra), "");
+ assert(mask != nullptr);
+ using PredType =
+ typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+ const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+ const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+ auto* dst = static_cast<Pixel*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
+ constexpr int step_y = subsampling_y ? 2 : 1;
+ const uint8_t* mask_next_row = mask + mask_stride;
+ // 7.11.3.2 Rounding variables derivation process
+ // 2 * FILTER_BITS(7) - (InterRound0(3|5) + InterRound1(7))
+ constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4;
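+  // That is, 14 - (5 + 7) = 2 for 12bpp and 14 - (3 + 7) = 4 otherwise.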
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ const uint8_t mask_value =
+ GetMaskValue<subsampling_x, subsampling_y>(mask, mask_next_row, x);
+ if (is_inter_intra) {
+ dst[x] = static_cast<Pixel>(RightShiftWithRounding(
+ mask_value * pred_1[x] + (64 - mask_value) * pred_0[x], 6));
+ } else {
+ assert(prediction_stride_1 == width);
+ int res = (mask_value * pred_0[x] + (64 - mask_value) * pred_1[x]) >> 6;
+ res -= (bitdepth == 8) ? 0 : kCompoundOffset;
+ dst[x] = static_cast<Pixel>(
+ Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+ (1 << bitdepth) - 1));
+ }
+ }
+ dst += dst_stride;
+ mask += mask_stride * step_y;
+ mask_next_row += mask_stride * step_y;
+ pred_0 += width;
+ pred_1 += prediction_stride_1;
+ }
+}
+
+template <int subsampling_x, int subsampling_y>
+void InterIntraMaskBlend8bpp_C(const uint8_t* prediction_0,
+ uint8_t* prediction_1,
+ const ptrdiff_t prediction_stride_1,
+ const uint8_t* mask, const ptrdiff_t mask_stride,
+ const int width, const int height) {
+ assert(mask != nullptr);
+ constexpr int step_y = subsampling_y ? 2 : 1;
+ const uint8_t* mask_next_row = mask + mask_stride;
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ const uint8_t mask_value =
+ GetMaskValue<subsampling_x, subsampling_y>(mask, mask_next_row, x);
+ prediction_1[x] = static_cast<uint8_t>(RightShiftWithRounding(
+ mask_value * prediction_1[x] + (64 - mask_value) * prediction_0[x],
+ 6));
+ }
+ mask += mask_stride * step_y;
+ mask_next_row += mask_stride * step_y;
+ prediction_0 += width;
+ prediction_1 += prediction_stride_1;
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->mask_blend[0][0] = MaskBlend_C<8, uint8_t, false, 0, 0>;
+ dsp->mask_blend[1][0] = MaskBlend_C<8, uint8_t, false, 1, 0>;
+ dsp->mask_blend[2][0] = MaskBlend_C<8, uint8_t, false, 1, 1>;
+ // The is_inter_intra index of mask_blend[][] is replaced by
+ // inter_intra_mask_blend_8bpp[] in 8-bit.
+ dsp->mask_blend[0][1] = nullptr;
+ dsp->mask_blend[1][1] = nullptr;
+ dsp->mask_blend[2][1] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_C<0, 0>;
+ dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_C<1, 0>;
+ dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_C<1, 1>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend444
+ dsp->mask_blend[0][0] = MaskBlend_C<8, uint8_t, false, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend422
+ dsp->mask_blend[1][0] = MaskBlend_C<8, uint8_t, false, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend420
+ dsp->mask_blend[2][0] = MaskBlend_C<8, uint8_t, false, 1, 1>;
+#endif
+ // The is_inter_intra index of mask_blend[][] is replaced by
+ // inter_intra_mask_blend_8bpp[] in 8-bit.
+ dsp->mask_blend[0][1] = nullptr;
+ dsp->mask_blend[1][1] = nullptr;
+ dsp->mask_blend[2][1] = nullptr;
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444
+ dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_C<0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422
+ dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_C<1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420
+ dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_C<1, 1>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->mask_blend[0][0] = MaskBlend_C<10, uint16_t, false, 0, 0>;
+ dsp->mask_blend[1][0] = MaskBlend_C<10, uint16_t, false, 1, 0>;
+ dsp->mask_blend[2][0] = MaskBlend_C<10, uint16_t, false, 1, 1>;
+ dsp->mask_blend[0][1] = MaskBlend_C<10, uint16_t, true, 0, 0>;
+ dsp->mask_blend[1][1] = MaskBlend_C<10, uint16_t, true, 1, 0>;
+ dsp->mask_blend[2][1] = MaskBlend_C<10, uint16_t, true, 1, 1>;
+ // These are only used with 8-bit.
+ dsp->inter_intra_mask_blend_8bpp[0] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[1] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[2] = nullptr;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend444
+ dsp->mask_blend[0][0] = MaskBlend_C<10, uint16_t, false, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend422
+ dsp->mask_blend[1][0] = MaskBlend_C<10, uint16_t, false, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend420
+ dsp->mask_blend[2][0] = MaskBlend_C<10, uint16_t, false, 1, 1>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra444
+ dsp->mask_blend[0][1] = MaskBlend_C<10, uint16_t, true, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra422
+ dsp->mask_blend[1][1] = MaskBlend_C<10, uint16_t, true, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra420
+ dsp->mask_blend[2][1] = MaskBlend_C<10, uint16_t, true, 1, 1>;
+#endif
+ // These are only used with 8-bit.
+ dsp->inter_intra_mask_blend_8bpp[0] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[1] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[2] = nullptr;
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+} // namespace
+
+void MaskBlendInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/mask_blend.h b/src/dsp/mask_blend.h
new file mode 100644
index 0000000..41f5e5b
--- /dev/null
+++ b/src/dsp/mask_blend.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_MASK_BLEND_H_
+#define LIBGAV1_SRC_DSP_MASK_BLEND_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/mask_blend_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+// SSE4_1
+#include "src/dsp/x86/mask_blend_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mask_blend and Dsp::inter_intra_mask_blend_8bpp. This
+// function is not thread-safe.
+void MaskBlendInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_MASK_BLEND_H_
diff --git a/src/dsp/motion_field_projection.cc b/src/dsp/motion_field_projection.cc
new file mode 100644
index 0000000..b51ec8f
--- /dev/null
+++ b/src/dsp/motion_field_projection.cc
@@ -0,0 +1,138 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/reference_info.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Silence unused function warnings when MotionFieldProjectionKernel_C is
+// not used.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel) || \
+ (LIBGAV1_MAX_BITDEPTH >= 10 && \
+ !defined(LIBGAV1_Dsp10bpp_MotionFieldProjectionKernel))
+
+// 7.9.2.
+void MotionFieldProjectionKernel_C(const ReferenceInfo& reference_info,
+ int reference_to_current_with_sign,
+ int dst_sign, int y8_start, int y8_end,
+ int x8_start, int x8_end,
+ TemporalMotionField* motion_field) {
+ const ptrdiff_t stride = motion_field->mv.columns();
+  // The column range has to be extended by kProjectionMvMaxHorizontalOffset on
+  // either side, since coordinates in that extra range can still project to a
+  // position_x8 within [x8_start, x8_end).
+ const int adjusted_x8_start =
+ std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
+ const int adjusted_x8_end = std::min(
+ x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
+ const int8_t* const reference_offsets =
+ reference_info.relative_distance_to.data();
+ const bool* const skip_references = reference_info.skip_references.data();
+ const int16_t* const projection_divisions =
+ reference_info.projection_divisions.data();
+ const ReferenceFrameType* source_reference_types =
+ &reference_info.motion_field_reference_frame[y8_start][0];
+ const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
+ int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
+ MotionVector* dst_mv = motion_field->mv[y8_start];
+ assert(stride == motion_field->reference_offset.columns());
+ assert((y8_start & 7) == 0);
+
+ int y8 = y8_start;
+ do {
+ const int y8_floor = (y8 & ~7) - y8;
+ const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8);
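+    // |y8_floor| and |y8_ceiling| are relative to |y8| and confine the
+    // projected |position_y8| to the current group of 8 rows (64 luma rows),
+    // mirroring the x8 clamping below.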
+ int x8 = adjusted_x8_start;
+ do {
+ const int source_reference_type = source_reference_types[x8];
+ if (skip_references[source_reference_type]) continue;
+ MotionVector projection_mv;
+ // reference_to_current_with_sign could be 0.
+ GetMvProjection(mv[x8], reference_to_current_with_sign,
+ projection_divisions[source_reference_type],
+ &projection_mv);
+ // Do not update the motion vector if the block position is not valid or
+ // if position_x8 is outside the current range of x8_start and x8_end.
+ // Note that position_y8 will always be within the range of y8_start and
+ // y8_end.
+ const int position_y8 = Project(0, projection_mv.mv[0], dst_sign);
+ if (position_y8 < y8_floor || position_y8 >= y8_ceiling) continue;
+ const int x8_base = x8 & ~7;
+ const int x8_floor =
+ std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset);
+ const int x8_ceiling =
+ std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset);
+ const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign);
+ if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue;
+ dst_mv[position_y8 * stride + position_x8] = mv[x8];
+ dst_reference_offset[position_y8 * stride + position_x8] =
+ reference_offsets[source_reference_type];
+ } while (++x8 < adjusted_x8_end);
+ source_reference_types += stride;
+ mv += stride;
+ dst_reference_offset += stride;
+ dst_mv += stride;
+ } while (++y8 < y8_end);
+}
+
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
+ // !defined(LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel) ||
+ // (LIBGAV1_MAX_BITDEPTH >= 10 &&
+ // !defined(LIBGAV1_Dsp10bpp_MotionFieldProjectionKernel))
+
+void Init8bpp() {
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel)
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+ dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_C;
+#endif
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp10bpp_MotionFieldProjectionKernel)
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+ dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_C;
+#endif
+}
+#endif
+
+} // namespace
+
+void MotionFieldProjectionInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/motion_field_projection.h b/src/dsp/motion_field_projection.h
new file mode 100644
index 0000000..36de459
--- /dev/null
+++ b/src/dsp/motion_field_projection.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_MOTION_FIELD_PROJECTION_H_
+#define LIBGAV1_SRC_DSP_MOTION_FIELD_PROJECTION_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/motion_field_projection_neon.h"
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+// SSE4_1
+#include "src/dsp/x86/motion_field_projection_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::motion_field_projection_kernel. This function is not
+// thread-safe.
+void MotionFieldProjectionInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_MOTION_FIELD_PROJECTION_H_
diff --git a/src/dsp/motion_vector_search.cc b/src/dsp/motion_vector_search.cc
new file mode 100644
index 0000000..9402302
--- /dev/null
+++ b/src/dsp/motion_vector_search.cc
@@ -0,0 +1,211 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Silence unused function warnings when the C functions are not used.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp8bpp_MotionVectorSearch) || \
+ (LIBGAV1_MAX_BITDEPTH >= 10 && \
+ !defined(LIBGAV1_Dsp10bpp_MotionVectorSearch))
+
+void MvProjectionCompoundLowPrecision_C(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* const candidate_mvs) {
+  // To help the compiler, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ int index = 0;
+ do {
+ candidate_mvs[index].mv64 = 0;
+ for (int i = 0; i < 2; ++i) {
+      // The |offsets| non-zero check is usually true and could be skipped.
+ if (offsets[i] != 0) {
+ GetMvProjection(
+ temporal_mvs[index], offsets[i],
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index].mv[i]);
+ for (auto& mv : candidate_mvs[index].mv[i].mv) {
+ // The next line is equivalent to:
+ // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;
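+          // For example, mv = 5 -> 4 and mv = -5 -> -4.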
+ mv = (mv - (mv >> 15)) & ~1;
+ }
+ }
+ }
+ } while (++index < count);
+}
+
+void MvProjectionCompoundForceInteger_C(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* const candidate_mvs) {
+  // To help the compiler, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ int index = 0;
+ do {
+ candidate_mvs[index].mv64 = 0;
+ for (int i = 0; i < 2; ++i) {
+      // The |offsets| non-zero check is usually true and could be skipped.
+ if (offsets[i] != 0) {
+ GetMvProjection(
+ temporal_mvs[index], offsets[i],
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index].mv[i]);
+ for (auto& mv : candidate_mvs[index].mv[i].mv) {
+ // The next line is equivalent to:
+ // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7;
+ // const int sign = mv >> 15;
+ // mv = ApplySign(value, sign);
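+          // For example, mv = 5 -> 8, mv = -5 -> -8 and mv = 4 -> 0.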
+ mv = (mv + 3 - (mv >> 15)) & ~7;
+ }
+ }
+ }
+ } while (++index < count);
+}
+
+void MvProjectionCompoundHighPrecision_C(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* const candidate_mvs) {
+  // To help the compiler, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ int index = 0;
+ do {
+ candidate_mvs[index].mv64 = 0;
+ for (int i = 0; i < 2; ++i) {
+      // The |offsets| non-zero check is usually true and could be skipped.
+ if (offsets[i] != 0) {
+ GetMvProjection(
+ temporal_mvs[index], offsets[i],
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index].mv[i]);
+ }
+ }
+ } while (++index < count);
+}
+
+void MvProjectionSingleLowPrecision_C(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets, const int reference_offset,
+ const int count, MotionVector* const candidate_mvs) {
+ int index = 0;
+ do {
+ GetMvProjection(
+ temporal_mvs[index], reference_offset,
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index]);
+ for (auto& mv : candidate_mvs[index].mv) {
+ // The next line is equivalent to:
+ // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;
+ mv = (mv - (mv >> 15)) & ~1;
+ }
+ } while (++index < count);
+}
+
+void MvProjectionSingleForceInteger_C(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets, const int reference_offset,
+ const int count, MotionVector* const candidate_mvs) {
+ int index = 0;
+ do {
+ GetMvProjection(
+ temporal_mvs[index], reference_offset,
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index]);
+ for (auto& mv : candidate_mvs[index].mv) {
+ // The next line is equivalent to:
+ // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7;
+ // const int sign = mv >> 15;
+ // mv = ApplySign(value, sign);
+ mv = (mv + 3 - (mv >> 15)) & ~7;
+ }
+ } while (++index < count);
+}
+
+void MvProjectionSingleHighPrecision_C(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets, const int reference_offset,
+ const int count, MotionVector* const candidate_mvs) {
+ int index = 0;
+ do {
+ GetMvProjection(
+ temporal_mvs[index], reference_offset,
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index]);
+ } while (++index < count);
+}
+
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
+ // !defined(LIBGAV1_Dsp8bpp_MotionVectorSearch) ||
+ // (LIBGAV1_MAX_BITDEPTH >= 10 &&
+ // !defined(LIBGAV1_Dsp10bpp_MotionVectorSearch))
+
+void Init8bpp() {
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp8bpp_MotionVectorSearch)
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+ dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_C;
+ dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_C;
+ dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_C;
+ dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_C;
+ dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_C;
+ dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_C;
+#endif
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp10bpp_MotionVectorSearch)
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+ dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_C;
+ dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_C;
+ dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_C;
+ dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_C;
+ dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_C;
+ dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_C;
+#endif
+}
+#endif
+
+} // namespace
+
+void MotionVectorSearchInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/motion_vector_search.h b/src/dsp/motion_vector_search.h
new file mode 100644
index 0000000..ae16726
--- /dev/null
+++ b/src/dsp/motion_vector_search.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_MOTION_VECTOR_SEARCH_H_
+#define LIBGAV1_SRC_DSP_MOTION_VECTOR_SEARCH_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/motion_vector_search_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+// SSE4_1
+#include "src/dsp/x86/motion_vector_search_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This
+// function is not thread-safe.
+void MotionVectorSearchInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_MOTION_VECTOR_SEARCH_H_
diff --git a/src/dsp/obmc.cc b/src/dsp/obmc.cc
new file mode 100644
index 0000000..46d1b5b
--- /dev/null
+++ b/src/dsp/obmc.cc
@@ -0,0 +1,125 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
+// 7.11.3.10 (from top samples).
+template <typename Pixel>
+void OverlapBlendVertical_C(void* const prediction,
+ const ptrdiff_t prediction_stride, const int width,
+ const int height, const void* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<Pixel*>(prediction);
+ const ptrdiff_t pred_stride = prediction_stride / sizeof(Pixel);
+ const auto* obmc_pred = static_cast<const Pixel*>(obmc_prediction);
+ const ptrdiff_t obmc_pred_stride = obmc_prediction_stride / sizeof(Pixel);
+ const uint8_t* const mask = kObmcMask + height - 2;
+
+ for (int y = 0; y < height; ++y) {
+ const uint8_t mask_value = mask[y];
+ for (int x = 0; x < width; ++x) {
+ pred[x] = static_cast<Pixel>(RightShiftWithRounding(
+ mask_value * pred[x] + (64 - mask_value) * obmc_pred[x], 6));
+ }
+ pred += pred_stride;
+ obmc_pred += obmc_pred_stride;
+ }
+}
+
+// 7.11.3.10 (from left samples).
+template <typename Pixel>
+void OverlapBlendHorizontal_C(void* const prediction,
+ const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<Pixel*>(prediction);
+ const ptrdiff_t pred_stride = prediction_stride / sizeof(Pixel);
+ const auto* obmc_pred = static_cast<const Pixel*>(obmc_prediction);
+ const ptrdiff_t obmc_pred_stride = obmc_prediction_stride / sizeof(Pixel);
+ const uint8_t* const mask = kObmcMask + width - 2;
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ const uint8_t mask_value = mask[x];
+ pred[x] = static_cast<Pixel>(RightShiftWithRounding(
+ mask_value * pred[x] + (64 - mask_value) * obmc_pred[x], 6));
+ }
+ pred += pred_stride;
+ obmc_pred += obmc_pred_stride;
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint8_t>;
+ dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendHorizontal_C<uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_ObmcVertical
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ObmcHorizontal
+ dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendHorizontal_C<uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>;
+ dsp->obmc_blend[kObmcDirectionHorizontal] =
+ OverlapBlendHorizontal_C<uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_ObmcVertical
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ObmcHorizontal
+ dsp->obmc_blend[kObmcDirectionHorizontal] =
+ OverlapBlendHorizontal_C<uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+} // namespace
+
+void ObmcInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/obmc.h b/src/dsp/obmc.h
new file mode 100644
index 0000000..3b826c7
--- /dev/null
+++ b/src/dsp/obmc.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_OBMC_H_
+#define LIBGAV1_SRC_DSP_OBMC_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/obmc_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/obmc_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::obmc_blend. This function is not thread-safe.
+void ObmcInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_OBMC_H_
diff --git a/src/dsp/obmc.inc b/src/dsp/obmc.inc
new file mode 100644
index 0000000..001c6ee
--- /dev/null
+++ b/src/dsp/obmc.inc
@@ -0,0 +1,32 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants and utility functions used for overlap blend implementations.
+// This will be included inside an anonymous namespace on files where these are
+// necessary.
+
+// This is a flat array of masks for each block dimension from 2 to 32. The
+// starting index for each length is length-2.
+constexpr uint8_t kObmcMask[62] = {
+ // Obmc Mask 2
+ 45, 64,
+ // Obmc Mask 4
+ 39, 50, 59, 64,
+ // Obmc Mask 8
+ 36, 42, 48, 53, 57, 61, 64, 64,
+ // Obmc Mask 16
+ 34, 37, 40, 43, 46, 49, 52, 54, 56, 58, 60, 61, 64, 64, 64, 64,
+ // Obmc Mask 32
+ 33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58,
+ 59, 60, 60, 61, 62, 64, 64, 64, 64, 64, 64, 64, 64};
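+
+// For example, a blend over 8 rows or columns starts at kObmcMask + 8 - 2 and
+// reads the "Obmc Mask 8" entries {36, 42, 48, 53, 57, 61, 64, 64} above.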
diff --git a/src/dsp/super_res.cc b/src/dsp/super_res.cc
new file mode 100644
index 0000000..d041bd1
--- /dev/null
+++ b/src/dsp/super_res.cc
@@ -0,0 +1,109 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/super_res.h"
+
+#include <cassert>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int bitdepth, typename Pixel>
+void SuperRes_C(const void* /*coefficients*/, void* const source,
+ const ptrdiff_t stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const dest) {
+ assert(step <= 1 << kSuperResScaleBits);
+ auto* src = static_cast<Pixel*>(source) - DivideBy2(kSuperResFilterTaps);
+ auto* dst = static_cast<Pixel*>(dest);
+ int y = height;
+ do {
+ ExtendLine<Pixel>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+    // If the (original) upscaled_width is <= 9, the downscaled_width may be
+    // upscaled_width - 1 (i.e. 8, 9), and the two may become the same (i.e. 4)
+    // when subsampled via RightShiftWithRounding. This leads to an edge case
+    // where |step| == 1 << 14.
+ int subpixel_x = initial_subpixel_x;
+ int x = 0;
+ do {
+ int sum = 0;
+ const Pixel* const src_x = &src[subpixel_x >> kSuperResScaleBits];
+ const int src_x_subpixel =
+ (subpixel_x & kSuperResScaleMask) >> kSuperResExtraBits;
+ // The sign of each tap is: - + - + + - + -
+ sum -= src_x[0] * kUpscaleFilterUnsigned[src_x_subpixel][0];
+ sum += src_x[1] * kUpscaleFilterUnsigned[src_x_subpixel][1];
+ sum -= src_x[2] * kUpscaleFilterUnsigned[src_x_subpixel][2];
+ sum += src_x[3] * kUpscaleFilterUnsigned[src_x_subpixel][3];
+ sum += src_x[4] * kUpscaleFilterUnsigned[src_x_subpixel][4];
+ sum -= src_x[5] * kUpscaleFilterUnsigned[src_x_subpixel][5];
+ sum += src_x[6] * kUpscaleFilterUnsigned[src_x_subpixel][6];
+ sum -= src_x[7] * kUpscaleFilterUnsigned[src_x_subpixel][7];
+ dst[x] = Clip3(RightShiftWithRounding(sum, kFilterBits), 0,
+ (1 << bitdepth) - 1);
+ subpixel_x += step;
+ } while (++x < upscaled_width);
+ src += stride;
+ dst += stride;
+ } while (--y != 0);
+}
+
+void Init8bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+ dsp->super_res_coefficients = nullptr;
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->super_res = SuperRes_C<8, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_SuperRes
+ dsp->super_res = SuperRes_C<8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+ dsp->super_res_coefficients = nullptr;
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->super_res = SuperRes_C<10, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_SuperRes
+ dsp->super_res = SuperRes_C<10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+} // namespace
+
+void SuperResInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/super_res.h b/src/dsp/super_res.h
new file mode 100644
index 0000000..2ca9d2b
--- /dev/null
+++ b/src/dsp/super_res.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_SUPER_RES_H_
+#define LIBGAV1_SRC_DSP_SUPER_RES_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/super_res_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/super_res_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::super_res. This function is not thread-safe.
+void SuperResInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_SUPER_RES_H_
diff --git a/src/dsp/warp.cc b/src/dsp/warp.cc
new file mode 100644
index 0000000..fbde65a
--- /dev/null
+++ b/src/dsp/warp.cc
@@ -0,0 +1,475 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/warp.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <type_traits>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Number of extra bits of precision in warped filtering.
+constexpr int kWarpedDiffPrecisionBits = 10;
+
+// Warp prediction output ranges from WarpTest.ShowRange.
+// Bitdepth: 8 Input range: [ 0, 255]
+// 8bpp intermediate offset: 16384.
+// intermediate range: [ 4399, 61009]
+// first pass output range: [ 550, 7626]
+// 8bpp intermediate offset removal: 262144.
+// intermediate range: [ -620566, 1072406]
+// second pass output range: [ 0, 255]
+// compound second pass output range: [ -4848, 8378]
+//
+// Bitdepth: 10 Input range: [ 0, 1023]
+// intermediate range: [ -48081, 179025]
+// first pass output range: [ -6010, 22378]
+// intermediate range: [-2103516, 4198620]
+// second pass output range: [ 0, 1023]
+// compound second pass output range: [ 8142, 57378]
+//
+// Bitdepth: 12 Input range: [ 0, 4095]
+// intermediate range: [ -192465, 716625]
+// first pass output range: [ -6015, 22395]
+// intermediate range: [-2105190, 4201830]
+// second pass output range: [ 0, 4095]
+// compound second pass output range: [ 8129, 57403]
+
+template <bool is_compound, int bitdepth, typename Pixel>
+void Warp_C(const void* const source, ptrdiff_t source_stride,
+ const int source_width, const int source_height,
+ const int* const warp_params, const int subsampling_x,
+ const int subsampling_y, const int block_start_x,
+ const int block_start_y, const int block_width,
+ const int block_height, const int16_t alpha, const int16_t beta,
+ const int16_t gamma, const int16_t delta, void* dest,
+ ptrdiff_t dest_stride) {
+ assert(block_width >= 8 && block_height >= 8);
+ if (is_compound) {
+ assert(dest_stride == block_width);
+ }
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ constexpr int kRoundBitsVertical =
+ is_compound ? kInterRoundBitsCompoundVertical
+ : (bitdepth == 12) ? kInterRoundBitsVertical12bpp
+ : kInterRoundBitsVertical;
+
+  // Only used for 8bpp. Allows the first pass intermediates to be kept within
+  // uint16_t. With 10/12bpp the intermediate value always requires int32_t.
+ constexpr int first_pass_offset = (bitdepth == 8) ? 1 << 14 : 0;
+ constexpr int offset_removal =
+ (first_pass_offset >> kRoundBitsHorizontal) * 128;
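+  // Worked example (assuming kInterRoundBitsHorizontal == 3 for 8bpp): the
+  // offset is 1 << 14 = 16384, so offset_removal is (16384 >> 3) * 128 =
+  // 262144, matching the "8bpp intermediate offset removal" entry in the
+  // range table above.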
+
+ constexpr int kMaxPixel = (1 << bitdepth) - 1;
+ union {
+ // |intermediate_result| is the output of the horizontal filtering and
+ // rounding. The range is within int16_t.
+ int16_t intermediate_result[15][8]; // 15 rows, 8 columns.
+ // In the simple special cases where the samples in each row are all the
+ // same, store one sample per row in a column vector.
+ int16_t intermediate_result_column[15];
+ };
+ const auto* const src = static_cast<const Pixel*>(source);
+ source_stride /= sizeof(Pixel);
+ using DestType =
+ typename std::conditional<is_compound, uint16_t, Pixel>::type;
+ auto* dst = static_cast<DestType*>(dest);
+ if (!is_compound) dest_stride /= sizeof(dst[0]);
+
+ assert(block_width >= 8);
+ assert(block_height >= 8);
+
+ // Warp process applies for each 8x8 block (or smaller).
+ for (int start_y = block_start_y; start_y < block_start_y + block_height;
+ start_y += 8) {
+ for (int start_x = block_start_x; start_x < block_start_x + block_width;
+ start_x += 8) {
+ const int src_x = (start_x + 4) << subsampling_x;
+ const int src_y = (start_y + 4) << subsampling_y;
+ const int dst_x =
+ src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
+ const int dst_y =
+ src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
+ const int x4 = dst_x >> subsampling_x;
+ const int y4 = dst_y >> subsampling_y;
+ const int ix4 = x4 >> kWarpedModelPrecisionBits;
+ const int iy4 = y4 >> kWarpedModelPrecisionBits;
+
+ // A prediction block may fall outside the frame's boundaries. If a
+ // prediction block is calculated using only samples outside the frame's
+ // boundary, the filtering can be simplified. We can divide the plane
+ // into several regions and handle them differently.
+ //
+ // | |
+ // 1 | 3 | 1
+ // | |
+ // -------+-----------+-------
+ // |***********|
+ // 2 |*****4*****| 2
+ // |***********|
+ // -------+-----------+-------
+ // | |
+ // 1 | 3 | 1
+ // | |
+ //
+ // At the center, region 4 represents the frame and is the general case.
+ //
+ // In regions 1 and 2, the prediction block is outside the frame's
+ // boundary horizontally. Therefore the horizontal filtering can be
+ // simplified. Furthermore, in the region 1 (at the four corners), the
+ // prediction is outside the frame's boundary both horizontally and
+ // vertically, so we get a constant prediction block.
+ //
+ // In region 3, the prediction block is outside the frame's boundary
+      // vertically. Unfortunately, because we apply the horizontal filters
+      // first, by the time we apply the vertical filters, they no longer see
+ // simple inputs. So the only simplification is that all the rows are
+ // the same, but we still need to apply all the horizontal and vertical
+ // filters.
+
+ // Check for two simple special cases, where the horizontal filter can
+ // be significantly simplified.
+ //
+ // In general, for each row, the horizontal filter is calculated as
+ // follows:
+ // for (int x = -4; x < 4; ++x) {
+ // const int offset = ...;
+ // int sum = first_pass_offset;
+ // for (int k = 0; k < 8; ++k) {
+ // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
+ // sum += kWarpedFilters[offset][k] * src_row[column];
+ // }
+ // ...
+ // }
+ // The column index before clipping, ix4 + x + k - 3, varies in the range
+ // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
+ // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
+ // border index (source_width - 1 or 0, respectively). Then for each x,
+ // the inner for loop of the horizontal filter is reduced to multiplying
+ // the border pixel by the sum of the filter coefficients.
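+      // Sketch of that reduction (assuming each kWarpedFilters row sums to
+      // 1 << kFilterBits): the inner loop collapses to
+      //   sum = first_pass_offset + (border_pixel << kFilterBits);
+      // so, after rounding, the horizontal pass is just a shift of the border
+      // sample, as done in region 2 below.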
+ if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
+ // Regions 1 and 2.
+ // Points to the left or right border of the first row of |src|.
+ const Pixel* first_row_border =
+ (ix4 + 7 <= 0) ? src : src + source_width - 1;
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+ // Region 1.
+ // Every sample used to calculate the prediction block has the same
+ // value. So the whole prediction block has the same value.
+ const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const Pixel row_border_pixel = first_row_border[row * source_stride];
+ DestType* dst_row = dst + start_x - block_start_x;
+ if (is_compound) {
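+            // A constant input passes through both filter passes; each pass
+            // is assumed to scale by the tap sum (1 << kFilterBits) and then
+            // round, so the net result is a shift by
+            // 2 * kFilterBits - kRoundBitsHorizontal - kRoundBitsVertical,
+            // which is the expression below.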
+ int sum = row_border_pixel
+ << ((14 - kRoundBitsHorizontal) - kRoundBitsVertical);
+ sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+ Memset(dst_row, sum, 8);
+ } else {
+ Memset(dst_row, row_border_pixel, 8);
+ }
+ const DestType* const first_dst_row = dst_row;
+ dst_row += dest_stride;
+ for (int y = 1; y < 8; ++y) {
+ memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row));
+ dst_row += dest_stride;
+ }
+ // End of region 1. Continue the |start_x| for loop.
+ continue;
+ }
+
+ // Region 2.
+ // Horizontal filter.
+        // The input values in this region are generated by extending the
+        // border, which makes them identical in the horizontal direction. This
+ // computation could be inlined in the vertical pass but most
+ // implementations will need a transpose of some sort.
+ // It is not necessary to use the offset values here because the
+ // horizontal pass is a simple shift and the vertical pass will always
+ // require using 32 bits.
+ for (int y = -7; y < 8; ++y) {
+ // We may over-read up to 13 pixels above the top source row, or up
+ // to 13 pixels below the bottom source row. This is proved below.
+ const int row = iy4 + y;
+ int sum = first_row_border[row * source_stride];
+ sum <<= kFilterBits - kRoundBitsHorizontal;
+ intermediate_result_column[y + 7] = sum;
+ }
+ // Vertical filter.
+ DestType* dst_row = dst + start_x - block_start_x;
+ int sy4 =
+ (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+ for (int x = 0; x < 8; ++x) {
+ const int offset =
+ RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ assert(offset >= 0);
+ assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
+ int sum = 0;
+ for (int k = 0; k < 8; ++k) {
+ sum +=
+ kWarpedFilters[offset][k] * intermediate_result_column[y + k];
+ }
+ sum = RightShiftWithRounding(sum, kRoundBitsVertical);
+ if (is_compound) {
+ sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+ dst_row[x] = static_cast<DestType>(sum);
+ } else {
+ dst_row[x] = static_cast<DestType>(Clip3(sum, 0, kMaxPixel));
+ }
+ sy += gamma;
+ }
+ dst_row += dest_stride;
+ sy4 += delta;
+ }
+ // End of region 2. Continue the |start_x| for loop.
+ continue;
+ }
+
+ // Regions 3 and 4.
+ // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+ // It follows that -6 <= ix4 <= source_width + 5. This inequality is
+ // used below.
+
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+ // Region 3.
+ // Horizontal filter.
+ const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const Pixel* const src_row = src + row * source_stride;
+ int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ int sx = sx4 - MultiplyBy4(alpha);
+ for (int x = -4; x < 4; ++x) {
+ const int offset =
+ RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ // Since alpha and beta have been validated by SetupShear(), one
+ // can prove that 0 <= offset <= 3 * 2^6.
+ assert(offset >= 0);
+ assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
+ // For SIMD optimization:
+ // |first_pass_offset| guarantees the sum fits in uint16_t for 8bpp.
+ // For 10/12 bit, the range of sum requires 32 bits.
+ int sum = first_pass_offset;
+ for (int k = 0; k < 8; ++k) {
+ // We assume the source frame has left and right borders of at
+ // least 13 pixels that extend the frame boundary pixels.
+ //
+ // Since -4 <= x <= 3 and 0 <= k <= 7, using the inequality on
+ // ix4 above, we have
+ // -13 <= ix4 + x + k - 3 <= source_width + 12,
+ // or
+ // -13 <= column <= (source_width - 1) + 13.
+ // Therefore we may over-read up to 13 pixels before the source
+ // row, or up to 13 pixels after the source row.
+ const int column = ix4 + x + k - 3;
+ sum += kWarpedFilters[offset][k] * src_row[column];
+ }
+ intermediate_result[y + 7][x + 4] =
+ RightShiftWithRounding(sum, kRoundBitsHorizontal);
+ sx += alpha;
+ }
+ sx4 += beta;
+ }
+ } else {
+ // Region 4.
+ // Horizontal filter.
+ // At this point, we know iy4 - 7 < source_height - 1 and iy4 + 7 > 0.
+ // It follows that -6 <= iy4 <= source_height + 5. This inequality is
+ // used below.
+ int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ // We assume the source frame has top and bottom borders of at least
+ // 13 pixels that extend the frame boundary pixels.
+ //
+ // Since -7 <= y <= 7, using the inequality on iy4 above, we have
+ // -13 <= iy4 + y <= source_height + 12,
+ // or
+ // -13 <= row <= (source_height - 1) + 13.
+ // Therefore we may over-read up to 13 pixels above the top source
+ // row, or up to 13 pixels below the bottom source row.
+ const int row = iy4 + y;
+ const Pixel* const src_row = src + row * source_stride;
+ int sx = sx4 - MultiplyBy4(alpha);
+ for (int x = -4; x < 4; ++x) {
+ const int offset =
+ RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ // Since alpha and beta have been validated by SetupShear(), one
+ // can prove that 0 <= offset <= 3 * 2^6.
+ assert(offset >= 0);
+ assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
+ // For SIMD optimization:
+ // |first_pass_offset| guarantees the sum fits in uint16_t for 8bpp.
+ // For 10/12 bit, the range of sum requires 32 bits.
+ int sum = first_pass_offset;
+ for (int k = 0; k < 8; ++k) {
+ // We assume the source frame has left and right borders of at
+ // least 13 pixels that extend the frame boundary pixels.
+ //
+ // Since -4 <= x <= 3 and 0 <= k <= 7, using the inequality on
+ // ix4 above, we have
+ // -13 <= ix4 + x + k - 3 <= source_width + 12,
+ // or
+ // -13 <= column <= (source_width - 1) + 13.
+ // Therefore we may over-read up to 13 pixels before the source
+ // row, or up to 13 pixels after the source row.
+ const int column = ix4 + x + k - 3;
+ sum += kWarpedFilters[offset][k] * src_row[column];
+ }
+ intermediate_result[y + 7][x + 4] =
+ RightShiftWithRounding(sum, kRoundBitsHorizontal) -
+ offset_removal;
+ sx += alpha;
+ }
+ sx4 += beta;
+ }
+ }
+
+ // Regions 3 and 4.
+ // Vertical filter.
+ DestType* dst_row = dst + start_x - block_start_x;
+ int sy4 =
+ (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ // The spec says we should use the following loop condition:
+ // y < std::min(4, block_start_y + block_height - start_y - 4);
+ // We can prove that block_start_y + block_height - start_y >= 8, which
+ // implies std::min(4, block_start_y + block_height - start_y - 4) = 4.
+ // So the loop condition is simply y < 4.
+ //
+ // Proof:
+ // start_y < block_start_y + block_height
+ // => block_start_y + block_height - start_y > 0
+ // => block_height - (start_y - block_start_y) > 0
+ //
+ // Since block_height >= 8 and is a power of 2, it follows that
+ // block_height is a multiple of 8. start_y - block_start_y is also a
+ // multiple of 8. Therefore their difference is a multiple of 8. Since
+ // their difference is > 0, their difference must be >= 8.
+ //
+ // We then add an offset of 4 to y so that the loop starts with y = 0
+ // and continues if y < 8.
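+      // Concrete instance: with block_height = 8 and start_y = block_start_y,
+      // block_start_y + block_height - start_y = 8, so
+      // std::min(4, 8 - 4) = 4 and the simplified bound holds.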
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+ // The spec says we should use the following loop condition:
+ // x < std::min(4, block_start_x + block_width - start_x - 4);
+ // Similar to the above, we can prove that the loop condition can be
+ // simplified to x < 4.
+ //
+ // We then add an offset of 4 to x so that the loop starts with x = 0
+ // and continues if x < 8.
+ for (int x = 0; x < 8; ++x) {
+ const int offset =
+ RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ // Since gamma and delta have been validated by SetupShear(), one can
+ // prove that 0 <= offset <= 3 * 2^6.
+ assert(offset >= 0);
+ assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
+ int sum = 0;
+ for (int k = 0; k < 8; ++k) {
+ sum += kWarpedFilters[offset][k] * intermediate_result[y + k][x];
+ }
+ sum -= offset_removal;
+ sum = RightShiftWithRounding(sum, kRoundBitsVertical);
+ if (is_compound) {
+ sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+ dst_row[x] = static_cast<DestType>(sum);
+ } else {
+ dst_row[x] = static_cast<DestType>(Clip3(sum, 0, kMaxPixel));
+ }
+ sy += gamma;
+ }
+ dst_row += dest_stride;
+ sy4 += delta;
+ }
+ }
+ dst += 8 * dest_stride;
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->warp = Warp_C</*is_compound=*/false, 8, uint8_t>;
+ dsp->warp_compound = Warp_C</*is_compound=*/true, 8, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_Warp
+ dsp->warp = Warp_C</*is_compound=*/false, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WarpCompound
+ dsp->warp_compound = Warp_C</*is_compound=*/true, 8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->warp = Warp_C</*is_compound=*/false, 10, uint16_t>;
+ dsp->warp_compound = Warp_C</*is_compound=*/true, 10, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_Warp
+ dsp->warp = Warp_C</*is_compound=*/false, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WarpCompound
+ dsp->warp_compound = Warp_C</*is_compound=*/true, 10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+} // namespace
+
+void WarpInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/warp.h b/src/dsp/warp.h
new file mode 100644
index 0000000..7367a9b
--- /dev/null
+++ b/src/dsp/warp.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_WARP_H_
+#define LIBGAV1_SRC_DSP_WARP_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/warp_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/warp_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::warp. This function is not thread-safe.
+void WarpInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_WARP_H_
diff --git a/src/dsp/weight_mask.cc b/src/dsp/weight_mask.cc
new file mode 100644
index 0000000..15d6bc6
--- /dev/null
+++ b/src/dsp/weight_mask.cc
@@ -0,0 +1,227 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/weight_mask.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <type_traits>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int width, int height, int bitdepth, bool mask_is_inverse>
+void WeightMask_C(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ using PredType =
+ typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+ const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+ const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+ static_assert(width >= 8, "");
+ static_assert(height >= 8, "");
+ constexpr int rounding_bits = bitdepth - 8 + ((bitdepth == 12) ? 2 : 4);
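+  // For instance, rounding_bits is 0 + 4 = 4 for 8bpp, 2 + 4 = 6 for 10bpp,
+  // and 4 + 2 = 6 for 12bpp.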
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ const int difference = RightShiftWithRounding(
+ std::abs(pred_0[x] - pred_1[x]), rounding_bits);
+ const auto mask_value =
+ static_cast<uint8_t>(std::min(DivideBy16(difference) + 38, 64));
+ mask[x] = mask_is_inverse ? 64 - mask_value : mask_value;
+ }
+ pred_0 += width;
+ pred_1 += width;
+ mask += mask_stride;
+ }
+}
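+
+// The mask values produced above lie in [38, 64] (or [0, 26] when
+// mask_is_inverse). They feed the 64-scale mask blend, which, roughly,
+// computes something like (mask * pred_0 + (64 - mask) * pred_1 + 32) >> 6
+// per pixel (a sketch, not a quote of the MaskBlend implementation).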
+
+#define INIT_WEIGHT_MASK(width, height, bitdepth, w_index, h_index) \
+ dsp->weight_mask[w_index][h_index][0] = \
+ WeightMask_C<width, height, bitdepth, 0>; \
+ dsp->weight_mask[w_index][h_index][1] = \
+ WeightMask_C<width, height, bitdepth, 1>
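+// For example, INIT_WEIGHT_MASK(8, 16, 8, 0, 1) assigns
+// dsp->weight_mask[0][1][0] = WeightMask_C<8, 16, 8, 0> and
+// dsp->weight_mask[0][1][1] = WeightMask_C<8, 16, 8, 1>.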
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_WEIGHT_MASK(8, 8, 8, 0, 0);
+ INIT_WEIGHT_MASK(8, 16, 8, 0, 1);
+ INIT_WEIGHT_MASK(8, 32, 8, 0, 2);
+ INIT_WEIGHT_MASK(16, 8, 8, 1, 0);
+ INIT_WEIGHT_MASK(16, 16, 8, 1, 1);
+ INIT_WEIGHT_MASK(16, 32, 8, 1, 2);
+ INIT_WEIGHT_MASK(16, 64, 8, 1, 3);
+ INIT_WEIGHT_MASK(32, 8, 8, 2, 0);
+ INIT_WEIGHT_MASK(32, 16, 8, 2, 1);
+ INIT_WEIGHT_MASK(32, 32, 8, 2, 2);
+ INIT_WEIGHT_MASK(32, 64, 8, 2, 3);
+ INIT_WEIGHT_MASK(64, 16, 8, 3, 1);
+ INIT_WEIGHT_MASK(64, 32, 8, 3, 2);
+ INIT_WEIGHT_MASK(64, 64, 8, 3, 3);
+ INIT_WEIGHT_MASK(64, 128, 8, 3, 4);
+ INIT_WEIGHT_MASK(128, 64, 8, 4, 3);
+ INIT_WEIGHT_MASK(128, 128, 8, 4, 4);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x8
+ INIT_WEIGHT_MASK(8, 8, 8, 0, 0);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x16
+ INIT_WEIGHT_MASK(8, 16, 8, 0, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x32
+ INIT_WEIGHT_MASK(8, 32, 8, 0, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x8
+ INIT_WEIGHT_MASK(16, 8, 8, 1, 0);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x16
+ INIT_WEIGHT_MASK(16, 16, 8, 1, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x32
+ INIT_WEIGHT_MASK(16, 32, 8, 1, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x64
+ INIT_WEIGHT_MASK(16, 64, 8, 1, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x8
+ INIT_WEIGHT_MASK(32, 8, 8, 2, 0);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x16
+ INIT_WEIGHT_MASK(32, 16, 8, 2, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x32
+ INIT_WEIGHT_MASK(32, 32, 8, 2, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x64
+ INIT_WEIGHT_MASK(32, 64, 8, 2, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x16
+ INIT_WEIGHT_MASK(64, 16, 8, 3, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x32
+ INIT_WEIGHT_MASK(64, 32, 8, 3, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x64
+ INIT_WEIGHT_MASK(64, 64, 8, 3, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x128
+ INIT_WEIGHT_MASK(64, 128, 8, 3, 4);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x64
+ INIT_WEIGHT_MASK(128, 64, 8, 4, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x128
+ INIT_WEIGHT_MASK(128, 128, 8, 4, 4);
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_WEIGHT_MASK(8, 8, 10, 0, 0);
+ INIT_WEIGHT_MASK(8, 16, 10, 0, 1);
+ INIT_WEIGHT_MASK(8, 32, 10, 0, 2);
+ INIT_WEIGHT_MASK(16, 8, 10, 1, 0);
+ INIT_WEIGHT_MASK(16, 16, 10, 1, 1);
+ INIT_WEIGHT_MASK(16, 32, 10, 1, 2);
+ INIT_WEIGHT_MASK(16, 64, 10, 1, 3);
+ INIT_WEIGHT_MASK(32, 8, 10, 2, 0);
+ INIT_WEIGHT_MASK(32, 16, 10, 2, 1);
+ INIT_WEIGHT_MASK(32, 32, 10, 2, 2);
+ INIT_WEIGHT_MASK(32, 64, 10, 2, 3);
+ INIT_WEIGHT_MASK(64, 16, 10, 3, 1);
+ INIT_WEIGHT_MASK(64, 32, 10, 3, 2);
+ INIT_WEIGHT_MASK(64, 64, 10, 3, 3);
+ INIT_WEIGHT_MASK(64, 128, 10, 3, 4);
+ INIT_WEIGHT_MASK(128, 64, 10, 4, 3);
+ INIT_WEIGHT_MASK(128, 128, 10, 4, 4);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x8
+ INIT_WEIGHT_MASK(8, 8, 10, 0, 0);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x16
+ INIT_WEIGHT_MASK(8, 16, 10, 0, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x32
+ INIT_WEIGHT_MASK(8, 32, 10, 0, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x8
+ INIT_WEIGHT_MASK(16, 8, 10, 1, 0);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x16
+ INIT_WEIGHT_MASK(16, 16, 10, 1, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x32
+ INIT_WEIGHT_MASK(16, 32, 10, 1, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x64
+ INIT_WEIGHT_MASK(16, 64, 10, 1, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x8
+ INIT_WEIGHT_MASK(32, 8, 10, 2, 0);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x16
+ INIT_WEIGHT_MASK(32, 16, 10, 2, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x32
+ INIT_WEIGHT_MASK(32, 32, 10, 2, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x64
+ INIT_WEIGHT_MASK(32, 64, 10, 2, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x16
+ INIT_WEIGHT_MASK(64, 16, 10, 3, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x32
+ INIT_WEIGHT_MASK(64, 32, 10, 3, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x64
+ INIT_WEIGHT_MASK(64, 64, 10, 3, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x128
+ INIT_WEIGHT_MASK(64, 128, 10, 3, 4);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x64
+ INIT_WEIGHT_MASK(128, 64, 10, 4, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x128
+ INIT_WEIGHT_MASK(128, 128, 10, 4, 4);
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+} // namespace
+
+void WeightMaskInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/weight_mask.h b/src/dsp/weight_mask.h
new file mode 100644
index 0000000..43bef05
--- /dev/null
+++ b/src/dsp/weight_mask.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_WEIGHT_MASK_H_
+#define LIBGAV1_SRC_DSP_WEIGHT_MASK_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/weight_mask_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/weight_mask_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::weight_mask. This function is not thread-safe.
+void WeightMaskInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_WEIGHT_MASK_H_
diff --git a/src/dsp/x86/average_blend_sse4.cc b/src/dsp/x86/average_blend_sse4.cc
new file mode 100644
index 0000000..8e008d1
--- /dev/null
+++ b/src/dsp/x86/average_blend_sse4.cc
@@ -0,0 +1,156 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/average_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <xmmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kInterPostRoundBit = 4;
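+
+// Sketch of the math below: the two compound predictions are summed and
+// RightShiftWithRounding_S16(sum, kInterPostRoundBit + 1) computes
+// (pred_0 + pred_1 + 16) >> 5, i.e. their average with the inter-post
+// rounding removed, before packing to uint8_t.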
+
+inline void AverageBlend4Row(const int16_t* prediction_0,
+ const int16_t* prediction_1, uint8_t* dest) {
+ const __m128i pred_0 = LoadLo8(prediction_0);
+ const __m128i pred_1 = LoadLo8(prediction_1);
+ __m128i res = _mm_add_epi16(pred_0, pred_1);
+ res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1);
+ Store4(dest, _mm_packus_epi16(res, res));
+}
+
+inline void AverageBlend8Row(const int16_t* prediction_0,
+ const int16_t* prediction_1, uint8_t* dest) {
+ const __m128i pred_0 = LoadAligned16(prediction_0);
+ const __m128i pred_1 = LoadAligned16(prediction_1);
+ __m128i res = _mm_add_epi16(pred_0, pred_1);
+ res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1);
+ StoreLo8(dest, _mm_packus_epi16(res, res));
+}
+
+inline void AverageBlendLargeRow(const int16_t* prediction_0,
+ const int16_t* prediction_1, const int width,
+ uint8_t* dest) {
+ int x = 0;
+ do {
+ const __m128i pred_00 = LoadAligned16(&prediction_0[x]);
+ const __m128i pred_01 = LoadAligned16(&prediction_1[x]);
+ __m128i res0 = _mm_add_epi16(pred_00, pred_01);
+ res0 = RightShiftWithRounding_S16(res0, kInterPostRoundBit + 1);
+ const __m128i pred_10 = LoadAligned16(&prediction_0[x + 8]);
+ const __m128i pred_11 = LoadAligned16(&prediction_1[x + 8]);
+ __m128i res1 = _mm_add_epi16(pred_10, pred_11);
+ res1 = RightShiftWithRounding_S16(res1, kInterPostRoundBit + 1);
+ StoreUnaligned16(dest + x, _mm_packus_epi16(res0, res1));
+ x += 16;
+ } while (x < width);
+}
+
+void AverageBlend_SSE4_1(const void* prediction_0, const void* prediction_1,
+ const int width, const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y = height;
+
+ if (width == 4) {
+ do {
+ // TODO(b/150326556): |prediction_[01]| values are packed. It is possible
+ // to load 8 values at a time.
+ AverageBlend4Row(pred_0, pred_1, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ AverageBlend4Row(pred_0, pred_1, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ if (width == 8) {
+ do {
+ AverageBlend8Row(pred_0, pred_1, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ AverageBlend8Row(pred_0, pred_1, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ do {
+ AverageBlendLargeRow(pred_0, pred_1, width, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ AverageBlendLargeRow(pred_0, pred_1, width, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ y -= 2;
+ } while (y != 0);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(AverageBlend)
+ dsp->average_blend = AverageBlend_SSE4_1;
+#endif
+}
+
+} // namespace
+
+void AverageBlendInit_SSE4_1() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void AverageBlendInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/average_blend_sse4.h b/src/dsp/x86/average_blend_sse4.h
new file mode 100644
index 0000000..937e8e2
--- /dev/null
+++ b/src/dsp/x86/average_blend_sse4.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::average_blend. This function is not thread-safe.
+void AverageBlendInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_AverageBlend
+#define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_
diff --git a/src/dsp/x86/cdef_sse4.cc b/src/dsp/x86/cdef_sse4.cc
new file mode 100644
index 0000000..3211a2d
--- /dev/null
+++ b/src/dsp/x86/cdef_sse4.cc
@@ -0,0 +1,728 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// Used when calculating odd |cost[x]| values.
+// Holds elements 1 3 5 7 7 7 7 7
+alignas(16) constexpr uint32_t kCdefDivisionTableOddPadded[] = {
+ 420, 210, 140, 105, 105, 105, 105, 105};
+
+// ----------------------------------------------------------------------------
+// Refer to CdefDirection_C().
+//
+// int32_t partial[8][15] = {};
+// for (int i = 0; i < 8; ++i) {
+// for (int j = 0; j < 8; ++j) {
+// const int x = 1;
+// partial[0][i + j] += x;
+// partial[1][i + j / 2] += x;
+// partial[2][i] += x;
+// partial[3][3 + i - j / 2] += x;
+// partial[4][7 + i - j] += x;
+// partial[5][3 - i / 2 + j] += x;
+// partial[6][j] += x;
+// partial[7][i / 2 + j] += x;
+// }
+// }
+//
+// Using the code above, generate the position count for partial[8][15].
+//
+// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+//
+// The SIMD code shifts the input horizontally, then adds vertically to get the
+// correct partial value for the given position.
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// partial[0][i + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 00 10 11 12 13 14 15 16 17 00 00 00 00 00 00
+// 00 00 20 21 22 23 24 25 26 27 00 00 00 00 00
+// 00 00 00 30 31 32 33 34 35 36 37 00 00 00 00
+// 00 00 00 00 40 41 42 43 44 45 46 47 00 00 00
+// 00 00 00 00 00 50 51 52 53 54 55 56 57 00 00
+// 00 00 00 00 00 00 60 61 62 63 64 65 66 67 00
+// 00 00 00 00 00 00 00 70 71 72 73 74 75 76 77
+//
+// partial[4] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(__m128i* v_src_16,
+ __m128i* partial_lo,
+ __m128i* partial_hi) {
+ // 00 01 02 03 04 05 06 07
+ *partial_lo = v_src_16[0];
+ // 00 00 00 00 00 00 00 00
+ *partial_hi = _mm_setzero_si128();
+
+ // 00 10 11 12 13 14 15 16
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[1], 2));
+ // 17 00 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[1], 14));
+
+ // 00 00 20 21 22 23 24 25
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[2], 4));
+ // 26 27 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[2], 12));
+
+ // 00 00 00 30 31 32 33 34
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[3], 6));
+ // 35 36 37 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[3], 10));
+
+ // 00 00 00 00 40 41 42 43
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[4], 8));
+ // 44 45 46 47 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[4], 8));
+
+ // 00 00 00 00 00 50 51 52
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[5], 10));
+ // 53 54 55 56 57 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[5], 6));
+
+ // 00 00 00 00 00 00 60 61
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[6], 12));
+ // 62 63 64 65 66 67 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[6], 4));
+
+ // 00 00 00 00 00 00 00 70
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[7], 14));
+ // 71 72 73 74 75 76 77 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[7], 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[1][i + j / 2] += x;
+//
+// A0 = src[0] + src[1], A1 = src[2] + src[3], ...
+//
+// A0 A1 A2 A3 00 00 00 00 00 00 00 00 00 00 00
+// 00 B0 B1 B2 B3 00 00 00 00 00 00 00 00 00 00
+// 00 00 C0 C1 C2 C3 00 00 00 00 00 00 00 00 00
+// 00 00 00 D0 D1 D2 D3 00 00 00 00 00 00 00 00
+// 00 00 00 00 E0 E1 E2 E3 00 00 00 00 00 00 00
+// 00 00 00 00 00 F0 F1 F2 F3 00 00 00 00 00 00
+// 00 00 00 00 00 00 G0 G1 G2 G3 00 00 00 00 00
+// 00 00 00 00 00 00 00 H0 H1 H2 H3 00 00 00 00
+//
+// partial[3] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(__m128i* v_src_16,
+ __m128i* partial_lo,
+ __m128i* partial_hi) {
+ __m128i v_d1_temp[8];
+ const __m128i v_zero = _mm_setzero_si128();
+
+ for (int i = 0; i < 8; ++i) {
+ v_d1_temp[i] = _mm_hadd_epi16(v_src_16[i], v_zero);
+ }
+
+ *partial_lo = *partial_hi = v_zero;
+ // A0 A1 A2 A3 00 00 00 00
+ *partial_lo = _mm_add_epi16(*partial_lo, v_d1_temp[0]);
+
+ // 00 B0 B1 B2 B3 00 00 00
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[1], 2));
+
+ // 00 00 C0 C1 C2 C3 00 00
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[2], 4));
+ // 00 00 00 D0 D1 D2 D3 00
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[3], 6));
+ // 00 00 00 00 E0 E1 E2 E3
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[4], 8));
+
+ // 00 00 00 00 00 F0 F1 F2
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[5], 10));
+ // F3 00 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[5], 6));
+
+ // 00 00 00 00 00 00 G0 G1
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[6], 12));
+ // G2 G3 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[6], 4));
+
+ // 00 00 00 00 00 00 00 H0
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[7], 14));
+ // H1 H2 H3 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[7], 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[7][i / 2 + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00
+// 00 20 21 22 23 24 25 26 27 00 00 00 00 00 00
+// 00 30 31 32 33 34 35 36 37 00 00 00 00 00 00
+// 00 00 40 41 42 43 44 45 46 47 00 00 00 00 00
+// 00 00 50 51 52 53 54 55 56 57 00 00 00 00 00
+// 00 00 00 60 61 62 63 64 65 66 67 00 00 00 00
+// 00 00 00 70 71 72 73 74 75 76 77 00 00 00 00
+//
+// partial[5] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(__m128i* v_src, __m128i* partial_lo,
+ __m128i* partial_hi) {
+ __m128i v_pair_add[4];
+ // Add vertical source pairs.
+ v_pair_add[0] = _mm_add_epi16(v_src[0], v_src[1]);
+ v_pair_add[1] = _mm_add_epi16(v_src[2], v_src[3]);
+ v_pair_add[2] = _mm_add_epi16(v_src[4], v_src[5]);
+ v_pair_add[3] = _mm_add_epi16(v_src[6], v_src[7]);
+
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ *partial_lo = v_pair_add[0];
+ // 00 00 00 00 00 00 00 00
+ // 00 00 00 00 00 00 00 00
+ *partial_hi = _mm_setzero_si128();
+
+ // 00 20 21 22 23 24 25 26
+ // 00 30 31 32 33 34 35 36
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[1], 2));
+ // 27 00 00 00 00 00 00 00
+ // 37 00 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[1], 14));
+
+ // 00 00 40 41 42 43 44 45
+ // 00 00 50 51 52 53 54 55
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[2], 4));
+ // 46 47 00 00 00 00 00 00
+ // 56 57 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[2], 12));
+
+ // 00 00 00 60 61 62 63 64
+ // 00 00 00 70 71 72 73 74
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[3], 6));
+ // 65 66 67 00 00 00 00 00
+ // 75 76 77 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[3], 10));
+}
+
+LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* src, ptrdiff_t stride,
+ __m128i* partial_lo,
+ __m128i* partial_hi) {
+ // 8x8 input
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ __m128i v_src[8];
+ for (auto& i : v_src) {
+ i = LoadLo8(src);
+ src += stride;
+ }
+
+ const __m128i v_zero = _mm_setzero_si128();
+ // partial for direction 2
+ // --------------------------------------------------------------------------
+ // partial[2][i] += x;
+ // 00 10 20 30 40 50 60 70 00 00 00 00 00 00 00 00
+  // 01 11 21 31 41 51 61 71 00 00 00 00 00 00 00 00
+  // 02 12 22 32 42 52 62 72 00 00 00 00 00 00 00 00
+ // 03 13 23 33 43 53 63 73 00 00 00 00 00 00 00 00
+ // 04 14 24 34 44 54 64 74 00 00 00 00 00 00 00 00
+ // 05 15 25 35 45 55 65 75 00 00 00 00 00 00 00 00
+ // 06 16 26 36 46 56 66 76 00 00 00 00 00 00 00 00
+ // 07 17 27 37 47 57 67 77 00 00 00 00 00 00 00 00
+ const __m128i v_src_4_0 = _mm_unpacklo_epi64(v_src[0], v_src[4]);
+ const __m128i v_src_5_1 = _mm_unpacklo_epi64(v_src[1], v_src[5]);
+ const __m128i v_src_6_2 = _mm_unpacklo_epi64(v_src[2], v_src[6]);
+ const __m128i v_src_7_3 = _mm_unpacklo_epi64(v_src[3], v_src[7]);
+ const __m128i v_hsum_4_0 = _mm_sad_epu8(v_src_4_0, v_zero);
+ const __m128i v_hsum_5_1 = _mm_sad_epu8(v_src_5_1, v_zero);
+ const __m128i v_hsum_6_2 = _mm_sad_epu8(v_src_6_2, v_zero);
+ const __m128i v_hsum_7_3 = _mm_sad_epu8(v_src_7_3, v_zero);
+ const __m128i v_hsum_1_0 = _mm_unpacklo_epi16(v_hsum_4_0, v_hsum_5_1);
+ const __m128i v_hsum_3_2 = _mm_unpacklo_epi16(v_hsum_6_2, v_hsum_7_3);
+ const __m128i v_hsum_5_4 = _mm_unpackhi_epi16(v_hsum_4_0, v_hsum_5_1);
+ const __m128i v_hsum_7_6 = _mm_unpackhi_epi16(v_hsum_6_2, v_hsum_7_3);
+ partial_lo[2] =
+ _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_hsum_1_0, v_hsum_3_2),
+ _mm_unpacklo_epi32(v_hsum_5_4, v_hsum_7_6));
+
+ __m128i v_src_16[8];
+ for (int i = 0; i < 8; ++i) {
+ v_src_16[i] = _mm_cvtepu8_epi16(v_src[i]);
+ }
+
+ // partial for direction 6
+ // --------------------------------------------------------------------------
+ // partial[6][j] += x;
+ // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 00
+ // 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00 00
+ // 20 21 22 23 24 25 26 27 00 00 00 00 00 00 00 00
+ // 30 31 32 33 34 35 36 37 00 00 00 00 00 00 00 00
+ // 40 41 42 43 44 45 46 47 00 00 00 00 00 00 00 00
+ // 50 51 52 53 54 55 56 57 00 00 00 00 00 00 00 00
+ // 60 61 62 63 64 65 66 67 00 00 00 00 00 00 00 00
+ // 70 71 72 73 74 75 76 77 00 00 00 00 00 00 00 00
+ partial_lo[6] = v_src_16[0];
+ for (int i = 1; i < 8; ++i) {
+ partial_lo[6] = _mm_add_epi16(partial_lo[6], v_src_16[i]);
+ }
+
+ // partial for direction 0
+ AddPartial_D0_D4(v_src_16, &partial_lo[0], &partial_hi[0]);
+
+ // partial for direction 1
+ AddPartial_D1_D3(v_src_16, &partial_lo[1], &partial_hi[1]);
+
+ // partial for direction 7
+ AddPartial_D5_D7(v_src_16, &partial_lo[7], &partial_hi[7]);
+
+ __m128i v_src_reverse[8];
+ const __m128i reverser =
+ _mm_set_epi32(0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
+ for (int i = 0; i < 8; ++i) {
+ v_src_reverse[i] = _mm_shuffle_epi8(v_src_16[i], reverser);
+ }
+
+ // partial for direction 4
+ AddPartial_D0_D4(v_src_reverse, &partial_lo[4], &partial_hi[4]);
+
+ // partial for direction 3
+ AddPartial_D1_D3(v_src_reverse, &partial_lo[3], &partial_hi[3]);
+
+ // partial for direction 5
+ AddPartial_D5_D7(v_src_reverse, &partial_lo[5], &partial_hi[5]);
+}
+
+inline uint32_t SumVector_S32(__m128i a) {
+ a = _mm_hadd_epi32(a, a);
+ a = _mm_add_epi32(a, _mm_srli_si128(a, 4));
+ return _mm_cvtsi128_si32(a);
+}
+
+// |cost[0]| and |cost[4]| square the input, add the square of the
+// corresponding element from the other end of the vector, and scale by the
+// matching |kCdefDivisionTable[]| element:
+// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+// kCdefDivisionTable[i + 1];
+// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
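+// In the SIMD version below, |a| holds partial[0..7] and |b_reversed| holds
+// partial[14..8]; after interleaving, a single _mm_madd_epi16(x, x) produces
+// Square(partial[i]) + Square(partial[14 - i]) for each pair in one step.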
+inline uint32_t Cost0Or4(const __m128i a, const __m128i b,
+ const __m128i division_table[2]) {
+ // Reverse and clear upper 2 bytes.
+ const __m128i reverser =
+ _mm_set_epi32(0x80800100, 0x03020504, 0x07060908, 0x0b0a0d0c);
+ // 14 13 12 11 10 09 08 ZZ
+ const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
+ // 00 14 01 13 02 12 03 11
+ const __m128i ab_lo = _mm_unpacklo_epi16(a, b_reversed);
+ // 04 10 05 09 06 08 07 ZZ
+ const __m128i ab_hi = _mm_unpackhi_epi16(a, b_reversed);
+
+ // Square(partial[0][i]) + Square(partial[0][14 - i])
+ const __m128i square_lo = _mm_madd_epi16(ab_lo, ab_lo);
+ const __m128i square_hi = _mm_madd_epi16(ab_hi, ab_hi);
+
+ const __m128i c = _mm_mullo_epi32(square_lo, division_table[0]);
+ const __m128i d = _mm_mullo_epi32(square_hi, division_table[1]);
+ return SumVector_S32(_mm_add_epi32(c, d));
+}
+
+inline uint32_t CostOdd(const __m128i a, const __m128i b,
+ const __m128i division_table[2]) {
+ // Reverse and clear upper 10 bytes.
+ const __m128i reverser =
+ _mm_set_epi32(0x80808080, 0x80808080, 0x80800100, 0x03020504);
+ // 10 09 08 ZZ ZZ ZZ ZZ ZZ
+ const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
+ // 00 10 01 09 02 08 03 ZZ
+ const __m128i ab_lo = _mm_unpacklo_epi16(a, b_reversed);
+ // 04 ZZ 05 ZZ 06 ZZ 07 ZZ
+ const __m128i ab_hi = _mm_unpackhi_epi16(a, b_reversed);
+
+ // Square(partial[0][i]) + Square(partial[0][10 - i])
+ const __m128i square_lo = _mm_madd_epi16(ab_lo, ab_lo);
+ const __m128i square_hi = _mm_madd_epi16(ab_hi, ab_hi);
+
+ const __m128i c = _mm_mullo_epi32(square_lo, division_table[0]);
+ const __m128i d = _mm_mullo_epi32(square_hi, division_table[1]);
+ return SumVector_S32(_mm_add_epi32(c, d));
+}
+
+// Sum of squared elements.
+inline uint32_t SquareSum_S16(const __m128i a) {
+ const __m128i square = _mm_madd_epi16(a, a);
+ return SumVector_S32(square);
+}
+
+void CdefDirection_SSE4_1(const void* const source, ptrdiff_t stride,
+ uint8_t* const direction, int* const variance) {
+ assert(direction != nullptr);
+ assert(variance != nullptr);
+ const auto* src = static_cast<const uint8_t*>(source);
+ uint32_t cost[8];
+ __m128i partial_lo[8], partial_hi[8];
+
+ AddPartial(src, stride, partial_lo, partial_hi);
+
+ cost[2] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[2]);
+ cost[6] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[6]);
+
+ const __m128i division_table[2] = {LoadUnaligned16(kCdefDivisionTable),
+ LoadUnaligned16(kCdefDivisionTable + 4)};
+
+ cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table);
+ cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table);
+
+ const __m128i division_table_odd[2] = {
+ LoadAligned16(kCdefDivisionTableOddPadded),
+ LoadAligned16(kCdefDivisionTableOddPadded + 4)};
+
+ cost[1] = CostOdd(partial_lo[1], partial_hi[1], division_table_odd);
+ cost[3] = CostOdd(partial_lo[3], partial_hi[3], division_table_odd);
+ cost[5] = CostOdd(partial_lo[5], partial_hi[5], division_table_odd);
+ cost[7] = CostOdd(partial_lo[7], partial_hi[7], division_table_odd);
+
+ uint32_t best_cost = 0;
+ *direction = 0;
+ for (int i = 0; i < 8; ++i) {
+ if (cost[i] > best_cost) {
+ best_cost = cost[i];
+ *direction = i;
+ }
+ }
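+  // The reported variance is the gap between the winning direction and the
+  // orthogonal one ((*direction + 4) & 7), scaled down by 2^10, matching the
+  // scalar CdefDirection_C().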
+ *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+
+// -------------------------------------------------------------------------
+// CdefFilter
+
+// Load 4 vectors based on the given |direction|.
+inline void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
+ __m128i* output, const int direction) {
+  // Each |direction| describes a different set of source values. Expand this
+  // set by negating each offset. For |direction| == 0 this gives a diagonal
+  // line from top right to bottom left. The first value is y, the second is
+  // x. Negative y values move up.
+ // a b c d
+ // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
+ // c
+ // a
+ // 0
+ // b
+ // d
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = LoadUnaligned16(src - y_0 * stride - x_0);
+ output[1] = LoadUnaligned16(src + y_0 * stride + x_0);
+ output[2] = LoadUnaligned16(src - y_1 * stride - x_1);
+ output[3] = LoadUnaligned16(src + y_1 * stride + x_1);
+}
+
+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
+// do 2 rows at a time.
+void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
+ __m128i* output, const int direction) {
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = LoadHi8(LoadLo8(src - y_0 * stride - x_0),
+ src - y_0 * stride + stride - x_0);
+ output[1] = LoadHi8(LoadLo8(src + y_0 * stride + x_0),
+ src + y_0 * stride + stride + x_0);
+ output[2] = LoadHi8(LoadLo8(src - y_1 * stride - x_1),
+ src - y_1 * stride + stride - x_1);
+ output[3] = LoadHi8(LoadLo8(src + y_1 * stride + x_1),
+ src + y_1 * stride + stride + x_1);
+}
+
+inline __m128i Constrain(const __m128i& pixel, const __m128i& reference,
+ const __m128i& damping, const __m128i& threshold) {
+ const __m128i diff = _mm_sub_epi16(pixel, reference);
+ const __m128i abs_diff = _mm_abs_epi16(diff);
+ // sign(diff) * Clip3(threshold - (std::abs(diff) >> damping),
+ // 0, std::abs(diff))
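+  // Worked example with hypothetical values: diff = 20, damping shift = 3 and
+  // threshold = 4 give shifted_diff = 2, 4 - 2 = 2, min(2, 20) = 2, and the
+  // sign of diff is restored, yielding +2.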
+ const __m128i shifted_diff = _mm_srl_epi16(abs_diff, damping);
+  // For bitdepth == 8, the threshold range is [0, 15] and the damping range is
+  // [3, 6]. If pixel == kCdefLargeValue (0x4000), shifted_diff will always be
+  // larger than threshold. Subtracting with saturation will return 0 when
+  // pixel == kCdefLargeValue.
+ static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
+ const __m128i thresh_minus_shifted_diff =
+ _mm_subs_epu16(threshold, shifted_diff);
+ const __m128i clamp_abs_diff =
+ _mm_min_epi16(thresh_minus_shifted_diff, abs_diff);
+ // Restore the sign.
+ return _mm_sign_epi16(clamp_abs_diff, diff);
+}
+
+inline __m128i ApplyConstrainAndTap(const __m128i& pixel, const __m128i& val,
+ const __m128i& tap, const __m128i& damping,
+ const __m128i& threshold) {
+ const __m128i constrained = Constrain(val, pixel, damping, threshold);
+ return _mm_mullo_epi16(constrained, tap);
+}
+
+template <int width, bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_SSE4_1(const uint16_t* src, const ptrdiff_t src_stride,
+ const int height, const int primary_strength,
+ const int secondary_strength, const int damping,
+ const int direction, void* dest,
+ const ptrdiff_t dst_stride) {
+ static_assert(width == 8 || width == 4, "Invalid CDEF width.");
+ static_assert(enable_primary || enable_secondary, "");
+ constexpr bool clipping_required = enable_primary && enable_secondary;
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i primary_damping_shift, secondary_damping_shift;
+
+ // FloorLog2() requires input to be > 0.
+ // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+ if (enable_primary) {
+ // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
+ // for UV filtering.
+ primary_damping_shift =
+ _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(primary_strength)));
+ }
+ if (enable_secondary) {
+ // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+ // necessary.
+ assert(damping - FloorLog2(secondary_strength) >= 0);
+ secondary_damping_shift =
+ _mm_cvtsi32_si128(damping - FloorLog2(secondary_strength));
+ }
+
+ const __m128i primary_tap_0 =
+ _mm_set1_epi16(kCdefPrimaryTaps[primary_strength & 1][0]);
+ const __m128i primary_tap_1 =
+ _mm_set1_epi16(kCdefPrimaryTaps[primary_strength & 1][1]);
+ const __m128i secondary_tap_0 = _mm_set1_epi16(kCdefSecondaryTap0);
+ const __m128i secondary_tap_1 = _mm_set1_epi16(kCdefSecondaryTap1);
+ const __m128i cdef_large_value_mask =
+ _mm_set1_epi16(static_cast<int16_t>(~kCdefLargeValue));
+ const __m128i primary_threshold = _mm_set1_epi16(primary_strength);
+ const __m128i secondary_threshold = _mm_set1_epi16(secondary_strength);
+
+ int y = height;
+ do {
+ __m128i pixel;
+ if (width == 8) {
+ pixel = LoadUnaligned16(src);
+ } else {
+ pixel = LoadHi8(LoadLo8(src), src + src_stride);
+ }
+
+ __m128i min = pixel;
+ __m128i max = pixel;
+ __m128i sum;
+
+ if (enable_primary) {
+ // Primary |direction|.
+ __m128i primary_val[4];
+ if (width == 8) {
+ LoadDirection(src, src_stride, primary_val, direction);
+ } else {
+ LoadDirection4(src, src_stride, primary_val, direction);
+ }
+
+ if (clipping_required) {
+ min = _mm_min_epu16(min, primary_val[0]);
+ min = _mm_min_epu16(min, primary_val[1]);
+ min = _mm_min_epu16(min, primary_val[2]);
+ min = _mm_min_epu16(min, primary_val[3]);
+
+        // The source is 16 bits; however, we only really care about the lower
+        // 8 bits. The upper 8 bits contain the "large" flag. After the final
+        // primary max has been calculated, zero out the upper 8 bits. Use this
+        // to find the "16 bit" max.
+ const __m128i max_p01 = _mm_max_epu8(primary_val[0], primary_val[1]);
+ const __m128i max_p23 = _mm_max_epu8(primary_val[2], primary_val[3]);
+ const __m128i max_p = _mm_max_epu8(max_p01, max_p23);
+ max = _mm_max_epu16(max, _mm_and_si128(max_p, cdef_large_value_mask));
+ }
+
+ sum = ApplyConstrainAndTap(pixel, primary_val[0], primary_tap_0,
+ primary_damping_shift, primary_threshold);
+ sum = _mm_add_epi16(
+ sum, ApplyConstrainAndTap(pixel, primary_val[1], primary_tap_0,
+ primary_damping_shift, primary_threshold));
+ sum = _mm_add_epi16(
+ sum, ApplyConstrainAndTap(pixel, primary_val[2], primary_tap_1,
+ primary_damping_shift, primary_threshold));
+ sum = _mm_add_epi16(
+ sum, ApplyConstrainAndTap(pixel, primary_val[3], primary_tap_1,
+ primary_damping_shift, primary_threshold));
+ } else {
+ sum = _mm_setzero_si128();
+ }
+
+ if (enable_secondary) {
+ // Secondary |direction| values (+/- 2). Clamp |direction|.
+ __m128i secondary_val[8];
+ if (width == 8) {
+ LoadDirection(src, src_stride, secondary_val, direction + 2);
+ LoadDirection(src, src_stride, secondary_val + 4, direction - 2);
+ } else {
+ LoadDirection4(src, src_stride, secondary_val, direction + 2);
+ LoadDirection4(src, src_stride, secondary_val + 4, direction - 2);
+ }
+
+ if (clipping_required) {
+ min = _mm_min_epu16(min, secondary_val[0]);
+ min = _mm_min_epu16(min, secondary_val[1]);
+ min = _mm_min_epu16(min, secondary_val[2]);
+ min = _mm_min_epu16(min, secondary_val[3]);
+ min = _mm_min_epu16(min, secondary_val[4]);
+ min = _mm_min_epu16(min, secondary_val[5]);
+ min = _mm_min_epu16(min, secondary_val[6]);
+ min = _mm_min_epu16(min, secondary_val[7]);
+
+ const __m128i max_s01 =
+ _mm_max_epu8(secondary_val[0], secondary_val[1]);
+ const __m128i max_s23 =
+ _mm_max_epu8(secondary_val[2], secondary_val[3]);
+ const __m128i max_s45 =
+ _mm_max_epu8(secondary_val[4], secondary_val[5]);
+ const __m128i max_s67 =
+ _mm_max_epu8(secondary_val[6], secondary_val[7]);
+ const __m128i max_s = _mm_max_epu8(_mm_max_epu8(max_s01, max_s23),
+ _mm_max_epu8(max_s45, max_s67));
+ max = _mm_max_epu16(max, _mm_and_si128(max_s, cdef_large_value_mask));
+ }
+
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[0], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[1], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[2], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[3], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[4], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[5], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[6], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[7], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ }
+ // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)
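+ // The (sum < 0) term is realized by adding |sum_lt_0| (an arithmetic shift
+ // by 15, which is -1 for negative sums and 0 otherwise). For example,
+ // sum = -3 gives (8 - 3 - 1) >> 4 = 0 and sum = 9 gives (8 + 9) >> 4 = 1.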
+ const __m128i sum_lt_0 = _mm_srai_epi16(sum, 15);
+ // 8 + sum
+ sum = _mm_add_epi16(sum, _mm_set1_epi16(8));
+ // (... - (sum < 0)) >> 4
+ sum = _mm_add_epi16(sum, sum_lt_0);
+ sum = _mm_srai_epi16(sum, 4);
+ // pixel + ...
+ sum = _mm_add_epi16(sum, pixel);
+ if (clipping_required) {
+ // Clip3
+ sum = _mm_min_epi16(sum, max);
+ sum = _mm_max_epi16(sum, min);
+ }
+
+ const __m128i result = _mm_packus_epi16(sum, sum);
+ if (width == 8) {
+ src += src_stride;
+ StoreLo8(dst, result);
+ dst += dst_stride;
+ --y;
+ } else {
+ src += src_stride << 1;
+ Store4(dst, result);
+ dst += dst_stride;
+ Store4(dst, _mm_srli_si128(result, 4));
+ dst += dst_stride;
+ y -= 2;
+ }
+ } while (y != 0);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+ dsp->cdef_direction = CdefDirection_SSE4_1;
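+ // cdef_filters[0][*] covers 4-pixel-wide blocks and cdef_filters[1][*]
+ // covers 8-pixel-wide blocks. Within each width, index 0 applies both
+ // primary and secondary taps (the template defaults), index 1 applies
+ // primary taps only, and index 2 applies secondary taps only, as the
+ // template arguments below indicate.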
+ dsp->cdef_filters[0][0] = CdefFilter_SSE4_1<4>;
+ dsp->cdef_filters[0][1] =
+ CdefFilter_SSE4_1<4, /*enable_primary=*/true, /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] = CdefFilter_SSE4_1<4, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_SSE4_1<8>;
+ dsp->cdef_filters[1][1] =
+ CdefFilter_SSE4_1<8, /*enable_primary=*/true, /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] = CdefFilter_SSE4_1<8, /*enable_primary=*/false>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void CdefInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void CdefInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/cdef_sse4.h b/src/dsp/x86/cdef_sse4.h
new file mode 100644
index 0000000..6631eb7
--- /dev/null
+++ b/src/dsp/x86/cdef_sse4.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
+void CdefInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_CdefDirection
+#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_CdefFilters
+#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_
diff --git a/src/dsp/x86/common_avx2.h b/src/dsp/x86/common_avx2.h
new file mode 100644
index 0000000..4ce7de2
--- /dev/null
+++ b/src/dsp/x86/common_avx2.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+
+#include <immintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+namespace libgav1 {
+namespace dsp {
+
+//------------------------------------------------------------------------------
+// Compatibility functions.
+
+inline __m256i SetrM128i(const __m128i lo, const __m128i hi) {
+ // For compatibility with older gcc toolchains (< 8) use
+ // _mm256_inserti128_si256 rather than _mm256_setr_m128i. Newer gcc versions
+ // implement _mm256_setr_m128i similarly to the following; clang uses a
+ // different method, but no differences in the generated assembly have been
+ // observed.
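+ // In other words, SetrM128i(lo, hi) returns a __m256i whose lower 128 bits
+ // are |lo| and whose upper 128 bits are |hi|, matching _mm256_setr_m128i.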
+ return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+inline __m256i LoadAligned32(const void* a) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ return _mm256_load_si256(static_cast<const __m256i*>(a));
+}
+
+inline void LoadAligned64(const void* a, __m256i dst[2]) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ dst[0] = _mm256_load_si256(static_cast<const __m256i*>(a) + 0);
+ dst[1] = _mm256_load_si256(static_cast<const __m256i*>(a) + 1);
+}
+
+inline __m256i LoadUnaligned32(const void* a) {
+ return _mm256_loadu_si256(static_cast<const __m256i*>(a));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline __m256i MaskOverreads(const __m256i source,
+ const ptrdiff_t over_read_in_bytes) {
+ __m256i dst = source;
+#if LIBGAV1_MSAN
+ if (over_read_in_bytes >= 32) return _mm256_setzero_si256();
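+ // For example, |over_read_in_bytes| == 5 keeps the low 27 bytes of the
+ // 32-byte vector and 20 keeps only the low 12 bytes.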
+ if (over_read_in_bytes > 0) {
+ __m128i m = _mm_set1_epi8(-1);
+ for (ptrdiff_t i = 0; i < over_read_in_bytes % 16; ++i) {
+ m = _mm_srli_si128(m, 1);
+ }
+ const __m256i mask = (over_read_in_bytes < 16)
+ ? SetrM128i(_mm_set1_epi8(-1), m)
+ : SetrM128i(m, _mm_setzero_si128());
+ dst = _mm256_and_si256(dst, mask);
+ }
+#else
+ static_cast<void>(over_read_in_bytes);
+#endif
+ return dst;
+}
+
+inline __m256i LoadAligned32Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadAligned32(source), over_read_in_bytes);
+}
+
+inline void LoadAligned64Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i dst[2]) {
+ dst[0] = MaskOverreads(LoadAligned32(source), over_read_in_bytes);
+ dst[1] = MaskOverreads(LoadAligned32(static_cast<const __m256i*>(source) + 1),
+ over_read_in_bytes);
+}
+
+inline __m256i LoadUnaligned32Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadUnaligned32(source), over_read_in_bytes);
+}
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+inline void StoreAligned32(void* a, const __m256i v) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ _mm256_store_si256(static_cast<__m256i*>(a), v);
+}
+
+inline void StoreAligned64(void* a, const __m256i v[2]) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ _mm256_store_si256(static_cast<__m256i*>(a) + 0, v[0]);
+ _mm256_store_si256(static_cast<__m256i*>(a) + 1, v[1]);
+}
+
+inline void StoreUnaligned32(void* a, const __m256i v) {
+ _mm256_storeu_si256(static_cast<__m256i*>(a), v);
+}
+
+//------------------------------------------------------------------------------
+// Arithmetic utilities.
+
+inline __m256i RightShiftWithRounding_S16(const __m256i v_val_d, int bits) {
+ assert(bits <= 16);
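+ // For example, with bits == 4 the bias is 8, so 7 rounds to 0 and 9 rounds
+ // to 1 (halfway values round toward +infinity).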
+ const __m256i v_bias_d =
+ _mm256_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
+ const __m256i v_tmp_d = _mm256_add_epi16(v_val_d, v_bias_d);
+ return _mm256_srai_epi16(v_tmp_d, bits);
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_AVX2
+#endif // LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
diff --git a/src/dsp/x86/common_sse4.h b/src/dsp/x86/common_sse4.h
new file mode 100644
index 0000000..c510f8c
--- /dev/null
+++ b/src/dsp/x86/common_sse4.h
@@ -0,0 +1,265 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#if 0
+#include <cinttypes>
+#include <cstdio>
+
+// Useful macros and functions for debugging. Left here for convenience.
+inline void PrintReg(const __m128i r, const char* const name, int size) {
+ int n;
+ union {
+ __m128i r;
+ uint8_t i8[16];
+ uint16_t i16[8];
+ uint32_t i32[4];
+ uint64_t i64[2];
+ } tmp;
+ tmp.r = r;
+ fprintf(stderr, "%s\t: ", name);
+ if (size == 8) {
+ for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]);
+ } else if (size == 16) {
+ for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]);
+ } else if (size == 32) {
+ for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]);
+ } else {
+ for (n = 0; n < 2; ++n)
+ fprintf(stderr, "%.16" PRIx64 " ", static_cast<uint64_t>(tmp.i64[n]));
+ }
+ fprintf(stderr, "\n");
+}
+
+inline void PrintReg(const int r, const char* const name) {
+ fprintf(stderr, "%s: %d\n", name, r);
+}
+
+inline void PrintRegX(const int r, const char* const name) {
+ fprintf(stderr, "%s: %.8x\n", name, r);
+}
+
+#define PR(var, N) PrintReg(var, #var, N)
+#define PD(var) PrintReg(var, #var);
+#define PX(var) PrintRegX(var, #var);
+#endif // 0
+
+namespace libgav1 {
+namespace dsp {
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+inline __m128i Load2(const void* src) {
+ int16_t val;
+ memcpy(&val, src, sizeof(val));
+ return _mm_cvtsi32_si128(val);
+}
+
+inline __m128i Load2x2(const void* src1, const void* src2) {
+ uint16_t val1;
+ uint16_t val2;
+ memcpy(&val1, src1, sizeof(val1));
+ memcpy(&val2, src2, sizeof(val2));
+ return _mm_cvtsi32_si128(val1 | (val2 << 16));
+}
+
+// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
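+// For example, Load2<3>(buf, val) returns |val| with bytes 6 and 7 replaced
+// by the two bytes at |buf|.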
+template <int lane>
+inline __m128i Load2(const void* const buf, __m128i val) {
+ uint16_t temp;
+ memcpy(&temp, buf, 2);
+ return _mm_insert_epi16(val, temp, lane);
+}
+
+inline __m128i Load4(const void* src) {
+ // Newer compilers (e.g. clang 8.0.0) provide the _mm_loadu_si32 intrinsic;
+ // both _mm_loadu_si32(src) and the code here compile to a movss instruction.
+ //
+ // Until compiler support for _mm_loadu_si32 is widespread, its use is
+ // banned.
+ int val;
+ memcpy(&val, src, sizeof(val));
+ return _mm_cvtsi32_si128(val);
+}
+
+inline __m128i Load4x2(const void* src1, const void* src2) {
+ // Newer compilers (e.g. clang 8.0.0) provide the _mm_loadu_si32 intrinsic;
+ // both _mm_loadu_si32(src) and the code here compile to a movss instruction.
+ //
+ // Until compiler support for _mm_loadu_si32 is widespread, its use is
+ // banned.
+ int val1, val2;
+ memcpy(&val1, src1, sizeof(val1));
+ memcpy(&val2, src2, sizeof(val2));
+ return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1);
+}
+
+inline __m128i LoadLo8(const void* a) {
+ return _mm_loadl_epi64(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadHi8(const __m128i v, const void* a) {
+ const __m128 x =
+ _mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a));
+ return _mm_castps_si128(x);
+}
+
+inline __m128i LoadUnaligned16(const void* a) {
+ return _mm_loadu_si128(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadAligned16(const void* a) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
+ return _mm_load_si128(static_cast<const __m128i*>(a));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline __m128i MaskOverreads(const __m128i source,
+ const ptrdiff_t over_read_in_bytes) {
+ __m128i dst = source;
+#if LIBGAV1_MSAN
+ if (over_read_in_bytes > 0) {
+ __m128i mask = _mm_set1_epi8(-1);
+ for (ptrdiff_t i = 0; i < over_read_in_bytes; ++i) {
+ mask = _mm_srli_si128(mask, 1);
+ }
+ dst = _mm_and_si128(dst, mask);
+ }
+#else
+ static_cast<void>(over_read_in_bytes);
+#endif
+ return dst;
+}
+
+inline __m128i LoadLo8Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8);
+}
+
+inline __m128i LoadHi8Msan(const __m128i v, const void* source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadHi8(v, source), over_read_in_bytes);
+}
+
+inline __m128i LoadAligned16Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadAligned16(source), over_read_in_bytes);
+}
+
+inline __m128i LoadUnaligned16Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes);
+}
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+inline void Store2(void* dst, const __m128i x) {
+ const int val = _mm_cvtsi128_si32(x);
+ memcpy(dst, &val, 2);
+}
+
+inline void Store4(void* dst, const __m128i x) {
+ const int val = _mm_cvtsi128_si32(x);
+ memcpy(dst, &val, sizeof(val));
+}
+
+inline void StoreLo8(void* a, const __m128i v) {
+ _mm_storel_epi64(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreHi8(void* a, const __m128i v) {
+ _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v));
+}
+
+inline void StoreAligned16(void* a, const __m128i v) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
+ _mm_store_si128(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreUnaligned16(void* a, const __m128i v) {
+ _mm_storeu_si128(static_cast<__m128i*>(a), v);
+}
+
+//------------------------------------------------------------------------------
+// Arithmetic utilities.
+
+inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) {
+ assert(bits <= 16);
+ // Shift out all but the last bit.
+ const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
+ // Avg with zero will shift by 1 and round.
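+ // _mm_avg_epu16(a, 0) computes (a + 1) >> 1, so the result equals
+ // (v_val_d + (1 << (bits - 1))) >> bits; e.g. bits == 4, v_val_d == 25
+ // yields 2.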
+ return _mm_avg_epu16(v_tmp_d, _mm_setzero_si128());
+}
+
+inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) {
+ assert(bits <= 16);
+ const __m128i v_bias_d =
+ _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
+ const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
+ return _mm_srai_epi16(v_tmp_d, bits);
+}
+
+inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_srli_epi32(v_tmp_d, bits);
+}
+
+inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+//------------------------------------------------------------------------------
+// Masking utilities
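+// MaskHighNBytes(n) returns a mask whose high |n| bytes are 0xff and whose
+// low (16 - n) bytes are zero, e.g. n == 3 sets only the top 3 bytes.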
+inline __m128i MaskHighNBytes(int n) {
+ static constexpr uint8_t kMask[32] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ };
+
+ return LoadUnaligned16(kMask + n);
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+#endif // LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
diff --git a/src/dsp/x86/convolve_avx2.cc b/src/dsp/x86/convolve_avx2.cc
new file mode 100644
index 0000000..3df2120
--- /dev/null
+++ b/src/dsp/x86/convolve_avx2.cc
@@ -0,0 +1,534 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+constexpr int kHorizontalOffset = 3;
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1, which keeps the final
+// sum within the range of int16_t.
+template <int filter_index>
+__m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
+ __m256i sum;
+ if (filter_index < 2) {
+ // 6 taps.
+ const __m256i v_madd_21 = _mm256_maddubs_epi16(src[0], taps[0]); // k2k1
+ const __m256i v_madd_43 = _mm256_maddubs_epi16(src[1], taps[1]); // k4k3
+ const __m256i v_madd_65 = _mm256_maddubs_epi16(src[2], taps[2]); // k6k5
+ sum = _mm256_add_epi16(v_madd_21, v_madd_43);
+ sum = _mm256_add_epi16(sum, v_madd_65);
+ } else if (filter_index == 2) {
+ // 8 taps.
+ const __m256i v_madd_10 = _mm256_maddubs_epi16(src[0], taps[0]); // k1k0
+ const __m256i v_madd_32 = _mm256_maddubs_epi16(src[1], taps[1]); // k3k2
+ const __m256i v_madd_54 = _mm256_maddubs_epi16(src[2], taps[2]); // k5k4
+ const __m256i v_madd_76 = _mm256_maddubs_epi16(src[3], taps[3]); // k7k6
+ const __m256i v_sum_3210 = _mm256_add_epi16(v_madd_10, v_madd_32);
+ const __m256i v_sum_7654 = _mm256_add_epi16(v_madd_54, v_madd_76);
+ sum = _mm256_add_epi16(v_sum_7654, v_sum_3210);
+ } else if (filter_index == 3) {
+ // 2 taps.
+ sum = _mm256_maddubs_epi16(src[0], taps[0]); // k4k3
+ } else {
+ // 4 taps.
+ const __m256i v_madd_32 = _mm256_maddubs_epi16(src[0], taps[0]); // k3k2
+ const __m256i v_madd_54 = _mm256_maddubs_epi16(src[1], taps[1]); // k5k4
+ sum = _mm256_add_epi16(v_madd_32, v_madd_54);
+ }
+ return sum;
+}
+
+template <int filter_index>
+__m256i SumHorizontalTaps(const __m256i* const src,
+ const __m256i* const v_tap) {
+ __m256i v_src[4];
+ const __m256i src_long = *src;
+ const __m256i src_long_dup_lo = _mm256_unpacklo_epi8(src_long, src_long);
+ const __m256i src_long_dup_hi = _mm256_unpackhi_epi8(src_long, src_long);
+
+ if (filter_index < 2) {
+ // 6 taps.
+ v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21
+ v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
+ v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65
+ } else if (filter_index == 2) {
+ // 8 taps.
+ v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10
+ v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
+ v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
+ v_src[3] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76
+ } else if (filter_index == 3) {
+ // 2 taps.
+ v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
+ } else if (filter_index > 3) {
+ // 4 taps.
+ v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
+ v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
+ }
+ return SumOnePassTaps<filter_index>(v_src, v_tap);
+}
+
+template <int filter_index>
+__m256i SimpleHorizontalTaps(const __m256i* const src,
+ const __m256i* const v_tap) {
+ __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
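+ // The skipped first shift is by (kInterRoundBitsHorizontal - 1), so its
+ // rounding offset is 1 << (kInterRoundBitsHorizontal - 2); the single shift
+ // below covers both passes: (kInterRoundBitsHorizontal - 1) +
+ // (kFilterBits - kInterRoundBitsHorizontal) == kFilterBits - 1.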
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = _mm256_add_epi16(sum, _mm256_set1_epi16(first_shift_rounding_bit));
+ sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+ return _mm256_packus_epi16(sum, sum);
+}
+
+template <int filter_index>
+__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]);
+
+ if (filter_index == 3) {
+ // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17
+ const __m128i v_src_43 = _mm_shuffle_epi8(
+ v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403));
+ const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
+ return v_sum_43;
+ }
+
+ // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
+ const __m128i v_src_32 = _mm_shuffle_epi8(
+ v_src, _mm_set_epi32(0x0e0d0d0c, 0x0c0b0b0a, 0x06050504, 0x04030302));
+ // 04 05 05 06 06 07 07 xx 14 15 15 16 16 17 17 xx
+ const __m128i v_src_54 = _mm_shuffle_epi8(
+ v_src, _mm_set_epi32(0x800f0f0e, 0x0e0d0d0c, 0x80070706, 0x06050504));
+ const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
+ const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
+ const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
+ return v_sum_5432;
+}
+
+template <int filter_index>
+__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+ sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+ return _mm_packus_epi16(sum, sum);
+}
+
+template <int filter_index>
+__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ const __m128i sum =
+ SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+// Filter 2xh sizes.
+template <int num_taps, int step, int filter_index, bool is_2d = false,
+ bool is_compound = false>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int /*width*/, const int height,
+ const __m128i* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+
+ // Horizontal passes only need to account for |num_taps| 2 and 4 when
+ // |width| <= 4.
+ assert(num_taps <= 4);
+ if (num_taps <= 4) {
+ if (!is_compound) {
+ int y = 0;
+ do {
+ if (is_2d) {
+ const __m128i sum =
+ HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+ Store4(&dest16[0], sum);
+ dest16 += pred_stride;
+ Store4(&dest16[0], _mm_srli_si128(sum, 8));
+ dest16 += pred_stride;
+ } else {
+ const __m128i sum =
+ SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ Store2(dest8, sum);
+ dest8 += pred_stride;
+ Store2(dest8, _mm_srli_si128(sum, 4));
+ dest8 += pred_stride;
+ }
+
+ src += src_stride << 1;
+ y += 2;
+ } while (y < height - 1);
+
+ // The 2d filters have an odd |height| because the horizontal pass
+ // generates context for the vertical pass.
+ if (is_2d) {
+ assert(height % 2 == 1);
+ __m128i sum;
+ const __m128i input = LoadLo8(&src[2]);
+ if (filter_index == 3) {
+ // 03 04 04 05 05 06 06 07 ....
+ const __m128i v_src_43 =
+ _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
+ sum = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
+ } else {
+ // 02 03 03 04 04 05 05 06 06 07 ....
+ const __m128i v_src_32 =
+ _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1);
+ // 04 05 05 06 06 07 07 08 ...
+ const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4);
+ const __m128i v_madd_32 =
+ _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
+ const __m128i v_madd_54 =
+ _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
+ sum = _mm_add_epi16(v_madd_54, v_madd_32);
+ }
+ sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+ Store4(dest16, sum);
+ }
+ }
+ }
+}
+
+// Filter widths >= 4.
+template <int num_taps, int step, int filter_index, bool is_2d = false,
+ bool is_compound = false>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int width, const int height,
+ const __m256i* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+
+ if (width >= 32) {
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ if (is_2d || is_compound) {
+ // placeholder
+ } else {
+ // Load src used to calculate dest8[7:0] and dest8[23:16].
+ const __m256i src_long = LoadUnaligned32(&src[x]);
+ const __m256i result =
+ SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ // Load src used to calculate dest8[15:8] and dest8[31:24].
+ const __m256i src_long2 = LoadUnaligned32(&src[x + 8]);
+ const __m256i result2 =
+ SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+ // Combine results and store.
+ StoreUnaligned32(&dest8[x], _mm256_unpacklo_epi64(result, result2));
+ }
+ x += step * 4;
+ } while (x < width);
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (--y != 0);
+ } else if (width == 16) {
+ int y = height;
+ do {
+ if (is_2d || is_compound) {
+ // placeholder
+ } else {
+ // Load into two 128-bit lanes.
+ const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]),
+ LoadUnaligned16(&src[src_stride]));
+ const __m256i result =
+ SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ const __m256i src_long2 = SetrM128i(
+ LoadUnaligned16(&src[8]), LoadUnaligned16(&src[8 + src_stride]));
+ const __m256i result2 =
+ SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+ const __m256i packed_result = _mm256_unpacklo_epi64(result, result2);
+ StoreUnaligned16(&dest8[0], _mm256_castsi256_si128(packed_result));
+ StoreUnaligned16(&dest8[pred_stride],
+ _mm256_extracti128_si256(packed_result, 1));
+ }
+ src += src_stride * 2;
+ dest8 += pred_stride * 2;
+ dest16 += pred_stride * 2;
+ y -= 2;
+ } while (y != 0);
+ } else if (width == 8) {
+ int y = height;
+ do {
+ if (is_2d || is_compound) {
+ // placeholder
+ } else {
+ const __m128i this_row = LoadUnaligned16(&src[0]);
+ const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+ // Load into two 128-bit lanes.
+ const __m256i src_long = SetrM128i(this_row, next_row);
+ const __m256i result =
+ SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ StoreLo8(&dest8[0], _mm256_castsi256_si128(result));
+ StoreLo8(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
+ }
+ src += src_stride * 2;
+ dest8 += pred_stride * 2;
+ dest16 += pred_stride * 2;
+ y -= 2;
+ } while (y != 0);
+ } else { // width == 4
+ int y = height;
+ do {
+ if (is_2d || is_compound) {
+ // placeholder
+ } else {
+ const __m128i this_row = LoadUnaligned16(&src[0]);
+ const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+ // Load into two 128-bit lanes.
+ const __m256i src_long = SetrM128i(this_row, next_row);
+ const __m256i result =
+ SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ Store4(&dest8[0], _mm256_castsi256_si128(result));
+ Store4(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
+ }
+ src += src_stride * 2;
+ dest8 += pred_stride * 2;
+ dest16 += pred_stride * 2;
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+ __m128i* v_tap) {
+ if (num_taps == 8) {
+ v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0
+ v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
+ v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
+ v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+ v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
+ }
+ } else if (num_taps == 6) {
+ const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+ v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1
+ v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
+ v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ }
+ } else if (num_taps == 4) {
+ v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
+ v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ }
+ } else { // num_taps == 2
+ const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+ v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ }
+ }
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+ __m256i* v_tap) {
+ if (num_taps == 8) {
+ v_tap[0] = _mm256_broadcastw_epi16(*filter); // k1k0
+ v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2
+ v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4
+ v_tap[3] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 6)); // k7k6
+ if (is_2d_vertical) {
+ // placeholder
+ }
+ } else if (num_taps == 6) {
+ v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 1)); // k2k1
+ v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3
+ v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 5)); // k6k5
+ if (is_2d_vertical) {
+ // placeholder
+ }
+ } else if (num_taps == 4) {
+ v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2
+ v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4
+ if (is_2d_vertical) {
+ // placeholder
+ }
+ } else { // num_taps == 2
+ v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3
+ if (is_2d_vertical) {
+ // placeholder
+ }
+ }
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(
+ const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+ const ptrdiff_t dst_stride, const int width, const int height,
+ const int filter_id, const int filter_index) {
+ assert(filter_id != 0);
+ __m128i v_tap[4];
+ const __m128i v_horizontal_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+ if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 8, 4, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 5) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 8, 5, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else { // 2 tap.
+ SetupTaps<2>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<2, 8, 3, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ }
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+ const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+ const ptrdiff_t dst_stride, const int width, const int height,
+ const int filter_id, const int filter_index) {
+ assert(filter_id != 0);
+ __m256i v_tap[4];
+ const __m128i v_horizontal_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+ if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<8, 8, 2, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 1) { // 6 tap.
+ SetupTaps<6>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<6, 8, 1, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 0) { // 6 tap.
+ SetupTaps<6>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<6, 8, 0, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 8, 4, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 5) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 8, 5, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else { // 2 tap.
+ SetupTaps<2>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<2, 8, 3, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ }
+}
+
+void ConvolveHorizontal_AVX2(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int /*vertical_filter_index*/,
+ const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ // Set |src| to the outermost tap.
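+ // kHorizontalOffset == 3 corresponds to the widest (8-tap) filter, which
+ // reaches 3 pixels to the left of the output position.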
+ const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width > 2) {
+ DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+ horizontal_filter_id, filter_index);
+ } else {
+ // Use the non-AVX2 version for smaller widths.
+ DoHorizontalPass2xH(src, reference_stride, dest, pred_stride, width, height,
+ horizontal_filter_id, filter_index);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_AVX2;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void ConvolveInit_AVX2() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_AVX2
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit_AVX2() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_AVX2
diff --git a/src/dsp/x86/convolve_avx2.h b/src/dsp/x86/convolve_avx2.h
new file mode 100644
index 0000000..6179d98
--- /dev/null
+++ b/src/dsp/x86/convolve_avx2.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve; see the defines below for specifics. This
+// function is not thread-safe.
+void ConvolveInit_AVX2();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If AVX2 is enabled and the baseline isn't already set by a higher level of
+// optimization, signal that the AVX2 implementation should be used.
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_AVX2
+#endif
+
+#endif // LIBGAV1_TARGETING_AVX2
+
+#endif // LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc
new file mode 100644
index 0000000..3a0fff5
--- /dev/null
+++ b/src/dsp/x86/convolve_sse4.cc
@@ -0,0 +1,2830 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/convolve.inc"
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1, which keeps the final
+// sum within the range of int16_t.
+template <int filter_index>
+__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
+ __m128i sum;
+ if (filter_index < 2) {
+ // 6 taps.
+ const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]); // k2k1
+ const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]); // k4k3
+ const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]); // k6k5
+ sum = _mm_add_epi16(v_madd_21, v_madd_43);
+ sum = _mm_add_epi16(sum, v_madd_65);
+ } else if (filter_index == 2) {
+ // 8 taps.
+ const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]); // k1k0
+ const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]); // k3k2
+ const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]); // k5k4
+ const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]); // k7k6
+ const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
+ const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
+ sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
+ } else if (filter_index == 3) {
+ // 2 taps.
+ sum = _mm_maddubs_epi16(src[0], taps[0]); // k4k3
+ } else {
+ // 4 taps.
+ const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]); // k3k2
+ const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]); // k5k4
+ sum = _mm_add_epi16(v_madd_32, v_madd_54);
+ }
+ return sum;
+}
+
+template <int filter_index>
+__m128i SumHorizontalTaps(const uint8_t* const src,
+ const __m128i* const v_tap) {
+ __m128i v_src[4];
+ const __m128i src_long = LoadUnaligned16(src);
+ const __m128i src_long_dup_lo = _mm_unpacklo_epi8(src_long, src_long);
+ const __m128i src_long_dup_hi = _mm_unpackhi_epi8(src_long, src_long);
+
+ if (filter_index < 2) {
+ // 6 taps.
+ v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21
+ v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
+ v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65
+ } else if (filter_index == 2) {
+ // 8 taps.
+ v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10
+ v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
+ v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
+ v_src[3] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76
+ } else if (filter_index == 3) {
+ // 2 taps.
+ v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
+ } else if (filter_index > 3) {
+ // 4 taps.
+ v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
+ v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
+ }
+ const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+ return sum;
+}
+
+template <int filter_index>
+__m128i SimpleHorizontalTaps(const uint8_t* const src,
+ const __m128i* const v_tap) {
+ __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+ sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+ return _mm_packus_epi16(sum, sum);
+}
+
+template <int filter_index>
+__m128i HorizontalTaps8To16(const uint8_t* const src,
+ const __m128i* const v_tap) {
+ const __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int filter_index>
+__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ const __m128i input0 = LoadLo8(&src[2]);
+ const __m128i input1 = LoadLo8(&src[2 + src_stride]);
+
+ if (filter_index == 3) {
+ // 03 04 04 05 05 06 06 07 ....
+ const __m128i input0_dup =
+ _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 3);
+ // 13 14 14 15 15 16 16 17 ....
+ const __m128i input1_dup =
+ _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 3);
+ const __m128i v_src_43 = _mm_unpacklo_epi64(input0_dup, input1_dup);
+ const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
+ return v_sum_43;
+ }
+
+ // 02 03 03 04 04 05 05 06 06 07 ....
+ const __m128i input0_dup =
+ _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 1);
+ // 12 13 13 14 14 15 15 16 16 17 ....
+ const __m128i input1_dup =
+ _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 1);
+ // 04 05 05 06 06 07 07 08 ...
+ const __m128i input0_dup_54 = _mm_srli_si128(input0_dup, 4);
+ // 14 15 15 16 16 17 17 18 ...
+ const __m128i input1_dup_54 = _mm_srli_si128(input1_dup, 4);
+ const __m128i v_src_32 = _mm_unpacklo_epi64(input0_dup, input1_dup);
+ const __m128i v_src_54 = _mm_unpacklo_epi64(input0_dup_54, input1_dup_54);
+ const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
+ const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
+ const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
+ return v_sum_5432;
+}
+
+template <int filter_index>
+__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+ sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+ return _mm_packus_epi16(sum, sum);
+}
+
+template <int filter_index>
+__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ const __m128i sum =
+ SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int num_taps, int step, int filter_index, bool is_2d = false,
+ bool is_compound = false>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int width, const int height,
+ const __m128i* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+
+ // 4 tap filters are never used when width > 4.
+ if (num_taps != 4 && width > 4) {
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ if (is_2d || is_compound) {
+ const __m128i v_sum =
+ HorizontalTaps8To16<filter_index>(&src[x], v_tap);
+ if (is_2d) {
+ StoreAligned16(&dest16[x], v_sum);
+ } else {
+ StoreUnaligned16(&dest16[x], v_sum);
+ }
+ } else {
+ const __m128i result =
+ SimpleHorizontalTaps<filter_index>(&src[x], v_tap);
+ StoreLo8(&dest8[x], result);
+ }
+ x += step;
+ } while (x < width);
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (++y < height);
+ return;
+ }
+
+ // Horizontal passes only need to account for |num_taps| 2 and 4 when
+ // |width| <= 4.
+ assert(width <= 4);
+ assert(num_taps <= 4);
+ if (num_taps <= 4) {
+ if (width == 4) {
+ int y = 0;
+ do {
+ if (is_2d || is_compound) {
+ const __m128i v_sum = HorizontalTaps8To16<filter_index>(src, v_tap);
+ StoreLo8(dest16, v_sum);
+ } else {
+ const __m128i result = SimpleHorizontalTaps<filter_index>(src, v_tap);
+ Store4(&dest8[0], result);
+ }
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (++y < height);
+ return;
+ }
+
+ if (!is_compound) {
+ int y = 0;
+ do {
+ if (is_2d) {
+ const __m128i sum =
+ HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+ Store4(&dest16[0], sum);
+ dest16 += pred_stride;
+ Store4(&dest16[0], _mm_srli_si128(sum, 8));
+ dest16 += pred_stride;
+ } else {
+ const __m128i sum =
+ SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ Store2(dest8, sum);
+ dest8 += pred_stride;
+ Store2(dest8, _mm_srli_si128(sum, 4));
+ dest8 += pred_stride;
+ }
+
+ src += src_stride << 1;
+ y += 2;
+ } while (y < height - 1);
+
+ // The 2d filters have an odd |height| because the horizontal pass
+ // generates context for the vertical pass.
+ if (is_2d) {
+ assert(height % 2 == 1);
+ __m128i sum;
+ const __m128i input = LoadLo8(&src[2]);
+ if (filter_index == 3) {
+ // 03 04 04 05 05 06 06 07 ....
+ const __m128i v_src_43 =
+ _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
+ sum = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
+ } else {
+ // 02 03 03 04 04 05 05 06 06 07 ....
+ const __m128i v_src_32 =
+ _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1);
+ // 04 05 05 06 06 07 07 08 ...
+ const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4);
+ const __m128i v_madd_32 =
+ _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
+ const __m128i v_madd_54 =
+ _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
+ sum = _mm_add_epi16(v_madd_54, v_madd_32);
+ }
+ sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+ Store4(dest16, sum);
+ }
+ }
+ }
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+ __m128i* v_tap) {
+ if (num_taps == 8) {
+ v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0
+ v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
+ v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
+ v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+ v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
+ }
+ } else if (num_taps == 6) {
+ const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+ v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1
+ v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
+ v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ }
+ } else if (num_taps == 4) {
+ v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
+ v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ }
+ } else { // num_taps == 2
+ const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+ v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ }
+ }
+}
+
+template <int num_taps, bool is_compound>
+__m128i SimpleSum2DVerticalTaps(const __m128i* const src,
+ const __m128i* const taps) {
+ __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]);
+ __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]);
+ if (num_taps >= 4) {
+ __m128i madd_lo =
+ _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]);
+ __m128i madd_hi =
+ _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]);
+ sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+ if (num_taps >= 6) {
+ madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]);
+ madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]);
+ sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+ if (num_taps == 8) {
+ madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]);
+ madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]);
+ sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+ }
+ }
+ }
+
+ if (is_compound) {
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ RightShiftWithRounding_S32(sum_hi,
+ kInterRoundBitsCompoundVertical - 1));
+ }
+
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+ RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const __m128i* const taps) {
+ assert(width >= 8);
+ constexpr int next_row = num_taps - 1;
+ // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
+ const ptrdiff_t src_stride = width;
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ int x = 0;
+ do {
+ __m128i srcs[8];
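+ // |srcs| is a sliding window over the intermediate rows: the loads below
+ // prime rows [0, num_taps - 2], then each iteration of the |y| loop loads
+ // one new row into srcs[next_row], filters, and shifts the window down by
+ // one row.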
+ const uint16_t* src_x = src + x;
+ srcs[0] = LoadAligned16(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = LoadAligned16(src_x);
+ src_x += src_stride;
+ srcs[2] = LoadAligned16(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = LoadAligned16(src_x);
+ src_x += src_stride;
+ srcs[4] = LoadAligned16(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = LoadAligned16(src_x);
+ src_x += src_stride;
+ srcs[6] = LoadAligned16(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[next_row] = LoadAligned16(src_x);
+ src_x += src_stride;
+
+ const __m128i sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ StoreUnaligned16(dst16 + x + y * dst_stride, sum);
+ } else {
+ StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(sum, sum));
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (++y < height);
+ x += 8;
+ } while (x < width);
+}
+
+// Take advantage of |src_stride| == |width| to process two rows at a time.
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical4xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const __m128i* const taps) {
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ __m128i srcs[9];
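+ // Each aligned 16-byte load holds two consecutive 4-wide rows. Even |srcs|
+ // entries come straight from memory; the odd entries are stitched together
+ // from the neighboring even entries so that srcs[i] always starts at row i.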
+ srcs[0] = LoadAligned16(src);
+ src += 8;
+ if (num_taps >= 4) {
+ srcs[2] = LoadAligned16(src);
+ src += 8;
+ srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]);
+ if (num_taps >= 6) {
+ srcs[4] = LoadAligned16(src);
+ src += 8;
+ srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]);
+ if (num_taps == 8) {
+ srcs[6] = LoadAligned16(src);
+ src += 8;
+ srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]);
+ }
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[num_taps] = LoadAligned16(src);
+ src += 8;
+ srcs[num_taps - 1] = _mm_unpacklo_epi64(
+ _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]);
+
+ const __m128i sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ StoreUnaligned16(dst16, sum);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results = _mm_packus_epi16(sum, sum);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ if (num_taps == 8) {
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ }
+ }
+ }
+ y += 2;
+ } while (y < height);
+}
+
+// Take advantage of |src_stride| == |width| to process four rows at a time.
+template <int num_taps>
+void Filter2DVertical2xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const __m128i* const taps) {
+ constexpr int next_row = (num_taps < 6) ? 4 : 8;
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ __m128i srcs[9];
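+ // Each 16-byte load holds four consecutive 2-wide rows (4 bytes per row),
+ // so _mm_alignr_epi8 with a 4-byte offset advances the window by one row.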
+ srcs[0] = LoadAligned16(src);
+ src += 8;
+ if (num_taps >= 6) {
+ srcs[4] = LoadAligned16(src);
+ src += 8;
+ srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+ if (num_taps == 8) {
+ srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+ srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[next_row] = LoadAligned16(src);
+ src += 8;
+ if (num_taps == 2) {
+ srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+ } else if (num_taps == 4) {
+ srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+ srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+ srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+ } else if (num_taps == 6) {
+ srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+ srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+ srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+ } else if (num_taps == 8) {
+ srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+ srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8);
+ srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12);
+ }
+
+ const __m128i sum =
+ SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
+ const __m128i results = _mm_packus_epi16(sum, sum);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
+ // Therefore we don't need to check this condition when |height| > 4.
+ if (num_taps <= 4 && height == 2) return;
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ if (num_taps == 6) {
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ } else if (num_taps == 8) {
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ }
+
+ y += 4;
+ } while (y < height);
+}
+
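+// Dispatches the horizontal pass to the FilterHorizontal instantiation that
+// matches |filter_index|: 8 tap (2), 6 tap (0 and 1), 4 tap (4 and 5) or
+// 2 tap (3).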
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+ const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+ const ptrdiff_t dst_stride, const int width, const int height,
+ const int filter_id, const int filter_index) {
+ assert(filter_id != 0);
+ __m128i v_tap[4];
+ const __m128i v_horizontal_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+ if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<8, 8, 2, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 1) { // 6 tap.
+ SetupTaps<6>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<6, 8, 1, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 0) { // 6 tap.
+ SetupTaps<6>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<6, 8, 0, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 8, 4, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 5) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 8, 5, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else { // 2 tap.
+ SetupTaps<2>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<2, 8, 3, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ }
+}
+
+void Convolve2D_SSE4_1(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+
+ // The output of the horizontal filter is guaranteed to fit in 16 bits.
+ alignas(16) uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+ const int intermediate_height = height + vertical_taps - 1;
+
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+
+ DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
+ width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+
+ // Vertical filter.
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ __m128i taps[4];
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ }
+}
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// vertical calculations.
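+// A scalar sketch of the rounding applied here; the "- 1" compensates for the
+// halved filter taps used throughout this file:
+//   int bits = kInterRoundBitsHorizontal - 1;
+//   compound_result = (sum + (1 << (bits - 1))) >> bits;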
+__m128i Compound1DShift(const __m128i sum) {
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int filter_index>
+__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
+ __m128i v_src[4];
+
+ if (filter_index < 2) {
+ // 6 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+ } else if (filter_index == 2) {
+ // 8 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+ v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
+ } else if (filter_index == 3) {
+ // 2 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ } else if (filter_index > 3) {
+ // 4 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+ }
+ const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+ return sum;
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int width, const int height,
+ const __m128i* const v_tap) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ const int next_row = num_taps - 1;
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+ assert(width >= 8);
+
+ int x = 0;
+ do {
+ const uint8_t* src_x = src + x;
+ __m128i srcs[8];
+ srcs[0] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[2] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[4] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[6] = LoadLo8(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[next_row] = LoadLo8(src_x);
+ src_x += src_stride;
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16 + x + y * dst_stride, results);
+ } else {
+ const __m128i results =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(results, results));
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (++y < height);
+ x += 8;
+ } while (x < width);
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int height, const __m128i* const v_tap) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ __m128i srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ y += 2;
+ } while (y < height);
+ } else if (num_taps == 4) {
+ srcs[4] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+
+ int y = 0;
+ do {
+ // 30 31 32 33
+ const __m128i b = Load4(src);
+ // 20 21 22 23 30 31 32 33
+ srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+ src += src_stride;
+ // 40 41 42 43
+ srcs[4] = Load4(src);
+ src += src_stride;
+ // 30 31 32 33 40 41 42 43
+ srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ y += 2;
+ } while (y < height);
+ } else if (num_taps == 6) {
+ srcs[6] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+ // 30 31 32 33
+ const __m128i b = Load4(src);
+ // 20 21 22 23 30 31 32 33
+ srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+ src += src_stride;
+ // 40 41 42 43
+ srcs[4] = Load4(src);
+ src += src_stride;
+ // 30 31 32 33 40 41 42 43
+ srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+
+ int y = 0;
+ do {
+ // 50 51 52 53
+ const __m128i c = Load4(src);
+ // 40 41 42 43 50 51 52 53
+ srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
+ src += src_stride;
+ // 60 61 62 63
+ srcs[6] = Load4(src);
+ src += src_stride;
+ // 50 51 52 53 60 61 62 63
+ srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ y += 2;
+ } while (y < height);
+ } else if (num_taps == 8) {
+ srcs[8] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+ // 30 31 32 33
+ const __m128i b = Load4(src);
+ // 20 21 22 23 30 31 32 33
+ srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+ src += src_stride;
+ // 40 41 42 43
+ srcs[4] = Load4(src);
+ src += src_stride;
+ // 30 31 32 33 40 41 42 43
+ srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+ // 50 51 52 53
+ const __m128i c = Load4(src);
+ // 40 41 42 43 50 51 52 53
+ srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
+ src += src_stride;
+ // 60 61 62 63
+ srcs[6] = Load4(src);
+ src += src_stride;
+ // 50 51 52 53 60 61 62 63
+ srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
+
+ int y = 0;
+ do {
+ // 70 71 72 73
+ const __m128i d = Load4(src);
+ // 60 61 62 63 70 71 72 73
+ srcs[6] = _mm_unpacklo_epi32(srcs[6], d);
+ src += src_stride;
+ // 80 81 82 83
+ srcs[8] = Load4(src);
+ src += src_stride;
+ // 70 71 72 73 80 81 82 83
+ srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ y += 2;
+ } while (y < height);
+ }
+}
+
+template <int filter_index, bool negative_outside_taps = false>
+void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int height, const __m128i* const v_tap) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ __m128i srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = _mm_setzero_si128();
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[2] = Load2<0>(src, srcs[2]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41
+ const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_2, 2);
+ // This uses srcs[0]..srcs[1].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[2];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 4) {
+ srcs[4] = _mm_setzero_si128();
+
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[4] = Load2<0>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_4, 2);
+ // 20 21 30 31 40 41 50 51
+ srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+ // 30 31 40 41 50 51 60 61
+ srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+
+ // This uses srcs[0]..srcs[3].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 6) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = _mm_setzero_si128();
+
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[4] = Load2(src);
+ src += src_stride;
+    // 00 01 10 11 20 21 30 31 40 41
+ const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_4x, 2);
+
+ int y = 0;
+ do {
+ // 40 41 50 51
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61 70 71
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ // 80 81
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 20 21 30 31 40 41 50 51
+ srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+ // 30 31 40 41 50 51 60 61
+ srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+ const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
+ // 50 51 60 61 70 71 80 81
+ srcs[5] = _mm_srli_si128(srcs_4_8, 2);
+
+ // This uses srcs[0]..srcs[5].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 8) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = _mm_setzero_si128();
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[4] = Load2(src);
+ src += src_stride;
+ // 40 41 50 51
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_4, 2);
+ // 20 21 30 31 40 41 50 51
+ srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+ // 30 31 40 41 50 51 60 61
+ srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+
+ int y = 0;
+ do {
+ // 40 41 50 51 60 61 70 71
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ // 80 81
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+ // 80 81 90 91
+ srcs[8] = Load2<1>(src, srcs[8]);
+ src += src_stride;
+ // 80 81 90 91 a0 a1
+ srcs[8] = Load2<2>(src, srcs[8]);
+ src += src_stride;
+
+ // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1
+ const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
+ // 50 51 60 61 70 71 80 81
+ srcs[5] = _mm_srli_si128(srcs_4_8, 2);
+ // 60 61 70 71 80 81 90 91
+ srcs[6] = _mm_srli_si128(srcs_4_8, 4);
+ // 70 71 80 81 90 91 a0 a1
+ srcs[7] = _mm_srli_si128(srcs_4_8, 6);
+
+ // This uses srcs[0]..srcs[7].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ y += 4;
+ } while (y < height);
+ }
+}
+
+void ConvolveVertical_SSE4_1(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int vertical_filter_index,
+ const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ __m128i taps[4];
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+ if (filter_index < 2) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else {
+ // TODO(slavarnway): Investigate adding |filter_index| == 1 special cases.
+ // See convolve_neon.cc
+ SetupTaps<4>(&v_filter, taps);
+
+ if (width == 2) {
+ FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ }
+}
+
+void ConvolveCompoundCopy_SSE4(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/,
+ const int width, const int height,
+ void* prediction, const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ const ptrdiff_t src_stride = reference_stride;
+ auto* dest = static_cast<uint16_t*>(prediction);
+ constexpr int kRoundBitsVertical =
+ kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
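+  // A scalar sketch of the copy (assuming the 8-bpp constants, where
+  // kInterRoundBitsVertical - kInterRoundBitsCompoundVertical == 4):
+  //   dest[x] = src[x] << 4;
+  // which places the plain copy on the same scale as the filtered compound
+  // predictions produced elsewhere in this file.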
+ if (width >= 16) {
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&src[x]);
+ const __m128i v_src_ext_lo = _mm_cvtepu8_epi16(v_src);
+ const __m128i v_src_ext_hi =
+ _mm_cvtepu8_epi16(_mm_srli_si128(v_src, 8));
+ const __m128i v_dest_lo =
+ _mm_slli_epi16(v_src_ext_lo, kRoundBitsVertical);
+ const __m128i v_dest_hi =
+ _mm_slli_epi16(v_src_ext_hi, kRoundBitsVertical);
+ // TODO(slavarnway): Investigate using aligned stores.
+ StoreUnaligned16(&dest[x], v_dest_lo);
+ StoreUnaligned16(&dest[x + 8], v_dest_hi);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ dest += pred_stride;
+ } while (--y != 0);
+ } else if (width == 8) {
+ int y = height;
+ do {
+ const __m128i v_src = LoadLo8(&src[0]);
+ const __m128i v_src_ext = _mm_cvtepu8_epi16(v_src);
+ const __m128i v_dest = _mm_slli_epi16(v_src_ext, kRoundBitsVertical);
+ StoreUnaligned16(&dest[0], v_dest);
+ src += src_stride;
+ dest += pred_stride;
+ } while (--y != 0);
+ } else { /* width == 4 */
+ int y = height;
+ do {
+ const __m128i v_src0 = Load4(&src[0]);
+ const __m128i v_src1 = Load4(&src[src_stride]);
+ const __m128i v_src = _mm_unpacklo_epi32(v_src0, v_src1);
+ const __m128i v_src_ext = _mm_cvtepu8_epi16(v_src);
+ const __m128i v_dest = _mm_slli_epi16(v_src_ext, kRoundBitsVertical);
+ StoreLo8(&dest[0], v_dest);
+ StoreHi8(&dest[pred_stride], v_dest);
+ src += src_stride * 2;
+ dest += pred_stride * 2;
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+void ConvolveCompoundVertical_SSE4_1(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int vertical_filter_index,
+ const int /*horizontal_filter_id*/, const int vertical_filter_id,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* dest = static_cast<uint16_t*>(prediction);
+ assert(vertical_filter_id != 0);
+
+ __m128i taps[4];
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+ if (filter_index < 2) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps);
+ if (width == 4) {
+ FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps);
+
+ if (width == 4) {
+ FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps);
+
+ if (width == 4) {
+ FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_filter, taps);
+
+ if (width == 4) {
+ FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else {
+ SetupTaps<4>(&v_filter, taps);
+
+ if (width == 4) {
+ FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ }
+}
+
+void ConvolveHorizontal_SSE4_1(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int /*vertical_filter_index*/,
+ const int horizontal_filter_id,
+ const int /*vertical_filter_id*/,
+ const int width, const int height,
+ void* prediction, const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ // Set |src| to the outermost tap.
+ const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+ horizontal_filter_id, filter_index);
+}
+
+void ConvolveCompoundHorizontal_SSE4_1(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int horizontal_filter_index, const int /*vertical_filter_index*/,
+ const int horizontal_filter_id, const int /*vertical_filter_id*/,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* dest = static_cast<uint16_t*>(prediction);
+
+ DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
+ src, reference_stride, dest, width, width, height, horizontal_filter_id,
+ filter_index);
+}
+
+void ConvolveCompound2D_SSE4_1(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ alignas(16) uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [3, 5].
+ // Similarly for height.
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+ const int intermediate_height = height + vertical_taps - 1;
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* const src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride -
+ kHorizontalOffset;
+
+ DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+ src, src_stride, intermediate_result, width, width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+
+ // Vertical filter.
+ auto* dest = static_cast<uint16_t*>(prediction);
+ assert(vertical_filter_id != 0);
+
+ const ptrdiff_t dest_stride = width;
+ __m128i taps[4];
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<8, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<6, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<4, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<2, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ }
+}
+
+// Pre-transposed filters.
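+// Each row of the tables below holds one filter tap across all 16 sub-pixel
+// phases (the columns of the original filter tables), so a single pshufb
+// with the per-pixel phase id selects that tap's coefficient for every lane.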
+template <int filter_index>
+inline void GetHalfSubPixelFilter(__m128i* output) {
+ // Filter 0
+ alignas(
+ 16) static constexpr int8_t kHalfSubPixel6TapSignedFilterColumns[6][16] =
+ {{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
+ {0, -3, -5, -6, -7, -7, -8, -7, -7, -6, -6, -6, -5, -4, -2, -1},
+ {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+ {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+ {0, -1, -2, -4, -5, -6, -6, -6, -7, -7, -8, -7, -7, -6, -5, -3},
+ {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+ // Filter 1
+ alignas(16) static constexpr int8_t
+ kHalfSubPixel6TapMixedSignedFilterColumns[6][16] = {
+ {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
+ {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
+ {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+ {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+ {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14},
+ {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
+ // Filter 2
+ alignas(
+ 16) static constexpr int8_t kHalfSubPixel8TapSignedFilterColumns[8][16] =
+ {{0, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, 0},
+ {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
+ {0, -3, -6, -9, -11, -11, -12, -12, -12, -11, -10, -9, -7, -5, -3, -1},
+ {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
+ {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
+ {0, -1, -3, -5, -7, -9, -10, -11, -12, -12, -12, -11, -11, -9, -6, -3},
+ {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
+ {0, 0, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1}};
+ // Filter 3
+ alignas(16) static constexpr uint8_t kHalfSubPixel2TapFilterColumns[2][16] = {
+ {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
+ {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
+ // Filter 4
+ alignas(
+ 16) static constexpr int8_t kHalfSubPixel4TapSignedFilterColumns[4][16] =
+ {{0, -2, -4, -5, -6, -6, -7, -6, -6, -5, -5, -5, -4, -3, -2, -1},
+ {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+ {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+ {0, -1, -2, -3, -4, -5, -5, -5, -6, -6, -7, -6, -6, -5, -4, -2}};
+ // Filter 5
+ alignas(
+ 16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
+ {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
+ {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+ {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+ {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
+ switch (filter_index) {
+ case 0:
+ output[0] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[1]);
+ output[2] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[2]);
+ output[3] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[3]);
+ output[4] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[4]);
+ output[5] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[5]);
+ break;
+ case 1:
+ // The term "mixed" refers to the fact that the outer taps have a mix of
+ // negative and positive values.
+ output[0] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[1]);
+ output[2] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[2]);
+ output[3] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[3]);
+ output[4] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[4]);
+ output[5] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[5]);
+ break;
+ case 2:
+ output[0] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[1]);
+ output[2] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[2]);
+ output[3] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[3]);
+ output[4] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[4]);
+ output[5] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[5]);
+ output[6] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[6]);
+ output[7] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[7]);
+ break;
+ case 3:
+ output[0] = LoadAligned16(kHalfSubPixel2TapFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel2TapFilterColumns[1]);
+ break;
+ case 4:
+ output[0] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[1]);
+ output[2] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[2]);
+ output[3] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[3]);
+ break;
+ default:
+ assert(filter_index == 5);
+ output[0] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[0]);
+ output[1] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[1]);
+ output[2] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[2]);
+ output[3] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[3]);
+ break;
+ }
+}
+
+// There are many opportunities for overreading in scaled convolve, because
+// the range of starting points for filter windows is anywhere from 0 to 16
+// for 8 destination pixels, and the window sizes range from 2 to 8. To
+// accommodate this range concisely, we use |grade_x| to mean the most steps
+// in src that can be traversed in a single |step_x| increment, i.e. 1 or 2.
+// More importantly, |grade_x| answers the question "how many vector loads are
+// needed to cover the source values?"
+// When |grade_x| == 1, the maximum number of source values needed is 8 separate
+// starting positions plus 7 more to cover taps, all fitting into 16 bytes.
+// When |grade_x| > 1, we are guaranteed to exceed 8 whole steps in src for
+// every 8 |step_x| increments, on top of 8 possible taps. The first load covers
+// the starting sources for each kernel, while the final load covers the taps.
+// Since the offset value of src_x cannot exceed 8 and |num_taps| does not
+// exceed 4 when width <= 4, |grade_x| is set to 1 regardless of the value of
+// |step_x|.
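+// A sketch of the two load patterns used below: with |grade_x| == 1 a single
+// LoadUnaligned16() feeds every tap (each subsequent pair of taps reuses the
+// same 16 bytes shifted right by another 2 bytes), while with |grade_x| == 2
+// an extra LoadLo8(src + 16) is stitched on with _mm_alignr_epi8 so the
+// filters can reach source bytes up to offset 23.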
+template <int num_taps, int grade_x>
+inline void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices,
+ __m128i* const source /*[num_taps >> 1]*/) {
+ const __m128i src_vals = LoadUnaligned16(src);
+ source[0] = _mm_shuffle_epi8(src_vals, src_indices);
+ if (grade_x == 1) {
+ if (num_taps > 2) {
+ source[1] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 2), src_indices);
+ }
+ if (num_taps > 4) {
+ source[2] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 4), src_indices);
+ }
+ if (num_taps > 6) {
+ source[3] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 6), src_indices);
+ }
+ } else {
+ assert(grade_x > 1);
+ assert(num_taps != 4);
+    // |grade_x| > 1 also implies |width| >= 8 && |num_taps| != 4.
+ const __m128i src_vals_ext = LoadLo8(src + 16);
+ if (num_taps > 2) {
+ source[1] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 2),
+ src_indices);
+ source[2] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 4),
+ src_indices);
+ }
+ if (num_taps > 6) {
+ source[3] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 6),
+ src_indices);
+ }
+ }
+}
+
+template <int num_taps>
+inline void PrepareHorizontalTaps(const __m128i subpel_indices,
+ const __m128i* filter_taps,
+ __m128i* out_taps) {
+ const __m128i scale_index_offsets =
+ _mm_srli_epi16(subpel_indices, kFilterIndexShift);
+ const __m128i filter_index_mask = _mm_set1_epi8(kSubPixelMask);
+ const __m128i filter_indices =
+ _mm_and_si128(_mm_packus_epi16(scale_index_offsets, scale_index_offsets),
+ filter_index_mask);
+ // Line up taps for maddubs_epi16.
+ // The unpack is also assumed to be lighter than shift+alignr.
+ for (int k = 0; k < (num_taps >> 1); ++k) {
+ const __m128i taps0 = _mm_shuffle_epi8(filter_taps[2 * k], filter_indices);
+ const __m128i taps1 =
+ _mm_shuffle_epi8(filter_taps[2 * k + 1], filter_indices);
+ out_taps[k] = _mm_unpacklo_epi8(taps0, taps1);
+ }
+}
+
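+// Returns, for each of the eight output pixels, the byte offset of its first
+// source sample within the 16-byte load, interleaved with that offset + 1.
+// The result is the pshufb mask PrepareSourceVectors() uses to pair adjacent
+// samples for maddubs.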
+inline __m128i HorizontalScaleIndices(const __m128i subpel_indices) {
+ const __m128i src_indices16 =
+ _mm_srli_epi16(subpel_indices, kScaleSubPixelBits);
+ const __m128i src_indices = _mm_packus_epi16(src_indices16, src_indices16);
+ return _mm_unpacklo_epi8(src_indices,
+ _mm_add_epi8(src_indices, _mm_set1_epi8(1)));
+}
+
+template <int grade_x, int filter_index, int num_taps>
+inline void ConvolveHorizontalScale(const uint8_t* src, ptrdiff_t src_stride,
+ int width, int subpixel_x, int step_x,
+ int intermediate_height,
+ int16_t* intermediate) {
+  // Account for the 0-taps that precede the nonzero taps.
+ const int kernel_offset = (8 - num_taps) >> 1;
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ __m128i filter_taps[num_taps];
+ GetHalfSubPixelFilter<filter_index>(filter_taps);
+ const __m128i index_steps =
+ _mm_mullo_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
+ _mm_set1_epi16(static_cast<int16_t>(step_x)));
+
+ __m128i taps[num_taps >> 1];
+ __m128i source[num_taps >> 1];
+ int p = subpixel_x;
+  // Filter indices >= 3 are the only ones for which |width| <= 4 is possible.
+ if (filter_index >= 3) {
+ if (filter_index > 3 || width <= 4) {
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const __m128i p_fraction = _mm_set1_epi16(p & 1023);
+ const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
+ PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps);
+ const __m128i packed_indices = HorizontalScaleIndices(subpel_indices);
+
+ int y = intermediate_height;
+ do {
+ // Load and line up source values with the taps. Width 4 means no need
+ // to load extended source.
+ PrepareSourceVectors<num_taps, /*grade_x=*/1>(src_x, packed_indices,
+ source);
+
+ StoreLo8(intermediate, RightShiftWithRounding_S16(
+ SumOnePassTaps<filter_index>(source, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate += kIntermediateStride;
+ } while (--y != 0);
+ return;
+ }
+ }
+
+ // |width| >= 8
+ int x = 0;
+ do {
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ int16_t* intermediate_x = intermediate + x;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const __m128i p_fraction = _mm_set1_epi16(p & 1023);
+ const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
+ PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps);
+ const __m128i packed_indices = HorizontalScaleIndices(subpel_indices);
+
+ int y = intermediate_height;
+ do {
+      // For each x, a lane of source[k] contains src_x[k].
+ PrepareSourceVectors<num_taps, grade_x>(src_x, packed_indices, source);
+
+ // Shift by one less because the taps are halved.
+ StoreAligned16(
+ intermediate_x,
+ RightShiftWithRounding_S16(SumOnePassTaps<filter_index>(source, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate_x += kIntermediateStride;
+ } while (--y != 0);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+template <int num_taps>
+inline void PrepareVerticalTaps(const int8_t* taps, __m128i* output) {
+ // Avoid overreading the filter due to starting at kernel_offset.
+ // The only danger of overread is in the final filter, which has 4 taps.
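+  // For example, with |num_taps| == 4 the |taps| pointer already starts at
+  // kernel_offset == 2 within the 8-entry filter row, so a LoadLo8() would
+  // read two bytes past the row; Load4() keeps the read in bounds.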
+ const __m128i filter =
+ _mm_cvtepi8_epi16((num_taps > 4) ? LoadLo8(taps) : Load4(taps));
+ output[0] = _mm_shuffle_epi32(filter, 0);
+ if (num_taps > 2) {
+ output[1] = _mm_shuffle_epi32(filter, 0x55);
+ }
+ if (num_taps > 4) {
+ output[2] = _mm_shuffle_epi32(filter, 0xAA);
+ }
+ if (num_taps > 6) {
+ output[3] = _mm_shuffle_epi32(filter, 0xFF);
+ }
+}
+
+// Process eight 16 bit inputs and output eight 16 bit values.
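+// A scalar sketch of one output lane, where row[k] is the k-th intermediate
+// row sample above the output (the "- 1" again reflects the halved taps):
+//   int32_t sum = 0;
+//   for (int k = 0; k < num_taps; ++k) sum += row[k] * taps[k];
+//   const int shift = (is_compound ? kInterRoundBitsCompoundVertical
+//                                  : kInterRoundBitsVertical) - 1;
+//   output = Clip3(RightShiftWithRounding(sum, shift), INT16_MIN, INT16_MAX);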
+template <int num_taps, bool is_compound>
+inline __m128i Sum2DVerticalTaps(const __m128i* const src,
+ const __m128i* taps) {
+ const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]);
+ __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps[0]);
+ const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]);
+ __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps[0]);
+ if (num_taps > 2) {
+ const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps[1]));
+ const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps[1]));
+ }
+ if (num_taps > 4) {
+ const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps[2]));
+ const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps[2]));
+ }
+ if (num_taps > 6) {
+ const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps[3]));
+ const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps[3]));
+ }
+ if (is_compound) {
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ RightShiftWithRounding_S32(sum_hi,
+ kInterRoundBitsCompoundVertical - 1));
+ }
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+ RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+// Bottom half of each src[k] is the source for one filter, and the top half
+// is the source for the other filter, for the next destination row.
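+// In the scaled vertical pass each destination row may use a different filter
+// phase, which is why there are separate |taps_lo| (low halves, first row)
+// and |taps_hi| (high halves, second row) coefficient sets here.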
+template <int num_taps, bool is_compound>
+__m128i Sum2DVerticalTaps4x2(const __m128i* const src, const __m128i* taps_lo,
+ const __m128i* taps_hi) {
+ const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]);
+ __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps_lo[0]);
+ const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]);
+ __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps_hi[0]);
+ if (num_taps > 2) {
+ const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps_lo[1]));
+ const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps_hi[1]));
+ }
+ if (num_taps > 4) {
+ const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps_lo[2]));
+ const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps_hi[2]));
+ }
+ if (num_taps > 6) {
+ const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps_lo[3]));
+ const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps_hi[3]));
+ }
+
+ if (is_compound) {
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ RightShiftWithRounding_S32(sum_hi,
+ kInterRoundBitsCompoundVertical - 1));
+ }
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+ RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+// |width_class| is 2, 4, or 8, according to the Store function that should be
+// used.
+template <int num_taps, int width_class, bool is_compound>
+#if LIBGAV1_MSAN
+__attribute__((no_sanitize_memory)) void ConvolveVerticalScale(
+#else
+inline void ConvolveVerticalScale(
+#endif
+ const int16_t* src, const int width, const int subpixel_y,
+ const int filter_index, const int step_y, const int height, void* dest,
+ const ptrdiff_t dest_stride) {
+ constexpr ptrdiff_t src_stride = kIntermediateStride;
+ constexpr int kernel_offset = (8 - num_taps) / 2;
+ const int16_t* src_y = src;
+ // |dest| is 16-bit in compound mode, Pixel otherwise.
+ auto* dest16_y = static_cast<uint16_t*>(dest);
+ auto* dest_y = static_cast<uint8_t*>(dest);
+ __m128i s[num_taps];
+
+ int p = subpixel_y & 1023;
+ int y = height;
+ if (width_class <= 4) {
+ __m128i filter_taps_lo[num_taps >> 1];
+ __m128i filter_taps_hi[num_taps >> 1];
+ do { // y > 0
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = LoadLo8(src_y + i * src_stride);
+ }
+ int filter_id = (p >> 6) & kSubPixelMask;
+ const int8_t* filter0 =
+ kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+ PrepareVerticalTaps<num_taps>(filter0, filter_taps_lo);
+ p += step_y;
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = LoadHi8(s[i], src_y + i * src_stride);
+ }
+ filter_id = (p >> 6) & kSubPixelMask;
+ const int8_t* filter1 =
+ kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+ PrepareVerticalTaps<num_taps>(filter1, filter_taps_hi);
+ p += step_y;
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+
+ const __m128i sums = Sum2DVerticalTaps4x2<num_taps, is_compound>(
+ s, filter_taps_lo, filter_taps_hi);
+ if (is_compound) {
+ assert(width_class > 2);
+ StoreLo8(dest16_y, sums);
+ dest16_y += dest_stride;
+ StoreHi8(dest16_y, sums);
+ dest16_y += dest_stride;
+ } else {
+ const __m128i result = _mm_packus_epi16(sums, sums);
+ if (width_class == 2) {
+ Store2(dest_y, result);
+ dest_y += dest_stride;
+ Store2(dest_y, _mm_srli_si128(result, 4));
+ } else {
+ Store4(dest_y, result);
+ dest_y += dest_stride;
+ Store4(dest_y, _mm_srli_si128(result, 4));
+ }
+ dest_y += dest_stride;
+ }
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ // |width_class| >= 8
+ __m128i filter_taps[num_taps >> 1];
+ do { // y > 0
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+ const int filter_id = (p >> 6) & kSubPixelMask;
+ const int8_t* filter =
+ kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+ PrepareVerticalTaps<num_taps>(filter, filter_taps);
+
+ int x = 0;
+ do { // x < width
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = LoadUnaligned16(src_y + i * src_stride);
+ }
+
+ const __m128i sums =
+ Sum2DVerticalTaps<num_taps, is_compound>(s, filter_taps);
+ if (is_compound) {
+ StoreUnaligned16(dest16_y + x, sums);
+ } else {
+ StoreLo8(dest_y + x, _mm_packus_epi16(sums, sums));
+ }
+ x += 8;
+ src_y += 8;
+ } while (x < width);
+ p += step_y;
+ dest_y += dest_stride;
+ dest16_y += dest_stride;
+ } while (--y != 0);
+}
+
+template <bool is_compound>
+void ConvolveScale2D_SSE4_1(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int subpixel_x, const int subpixel_y,
+ const int step_x, const int step_y, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ assert(step_x <= 2048);
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ // TODO(petersonab): Reduce intermediate block stride to width to make smaller
+ // blocks faster.
+ alignas(16) int16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (2 * kMaxSuperBlockSizeInPixels + kSubPixelTaps)];
+ const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
+ const int intermediate_height =
+ (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+ kScaleSubPixelBits) +
+ num_vert_taps;
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [3, 5].
+ // Similarly for height.
+ int16_t* intermediate = intermediate_result;
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference);
+ const int vert_kernel_offset = (8 - num_vert_taps) / 2;
+ src += vert_kernel_offset * src_stride;
+
+  // Derive the maximum value of |step_x| at which all source values fit in
+  // one 16-byte load, i.e. the final index src_x + |num_taps| - 1 must stay
+  // below 16.
+  // |step_x| * 7 is the final base sub-pixel index for the shuffle mask for
+  // filter inputs in each iteration on large blocks. When |step_x| is large,
+  // a second register and alignr are required to gather all filter inputs.
+  // |num_taps| - 1 is the offset for the shuffle of inputs to the final tap.
+ const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+ const int kernel_start_ceiling = 16 - num_horiz_taps;
+ // This truncated quotient |grade_x_threshold| selects |step_x| such that:
+ // (step_x * 7) >> kScaleSubPixelBits < single load limit
+ const int grade_x_threshold =
+ (kernel_start_ceiling << kScaleSubPixelBits) / 7;
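+  // For example, with 6-tap horizontal filters |kernel_start_ceiling| is
+  // 16 - 6 = 10, so any |step_x| up to (10 << kScaleSubPixelBits) / 7 keeps
+  // the single-load (|grade_x| == 1) path.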
+ switch (horiz_filter_index) {
+ case 0:
+ if (step_x > grade_x_threshold) {
+ ConvolveHorizontalScale<2, 0, 6>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveHorizontalScale<1, 0, 6>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 1:
+ if (step_x > grade_x_threshold) {
+ ConvolveHorizontalScale<2, 1, 6>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+
+ } else {
+ ConvolveHorizontalScale<1, 1, 6>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 2:
+ if (step_x > grade_x_threshold) {
+ ConvolveHorizontalScale<2, 2, 8>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveHorizontalScale<1, 2, 8>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 3:
+ if (step_x > grade_x_threshold) {
+ ConvolveHorizontalScale<2, 3, 2>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveHorizontalScale<1, 3, 2>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 4:
+ assert(width <= 4);
+ ConvolveHorizontalScale<1, 4, 4>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ break;
+ default:
+ assert(horiz_filter_index == 5);
+ assert(width <= 4);
+ ConvolveHorizontalScale<1, 5, 4>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+
+ // Vertical filter.
+ intermediate = intermediate_result;
+ switch (vert_filter_index) {
+ case 0:
+ case 1:
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale<6, 2, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale<6, 4, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<6, 8, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ break;
+ case 2:
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale<8, 2, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale<8, 4, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<8, 8, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ break;
+ case 3:
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale<2, 2, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale<2, 4, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<2, 8, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ break;
+ default:
+ assert(vert_filter_index == 4 || vert_filter_index == 5);
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale<4, 2, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale<4, 4, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<4, 8, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ }
+}
+
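+// _mm_avg_epu8 computes (a + b + 1) >> 1, so this is the rounding average of
+// each pixel with its right neighbor, i.e. the half-pel horizontal filter used
+// by the intra block copy functions below.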
+inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
+ const __m128i left = LoadUnaligned16(src);
+ const __m128i right = LoadUnaligned16(src + 1);
+ StoreUnaligned16(dst, _mm_avg_epu8(left, right));
+}
+
+template <int width>
+inline void IntraBlockCopyHorizontal(const uint8_t* src,
+ const ptrdiff_t src_stride,
+ const int height, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+
+ int y = height;
+ do {
+ HalfAddHorizontal(src, dst);
+ if (width >= 32) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ if (width >= 64) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ if (width == 128) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ }
+ }
+ }
+ src += src_remainder_stride;
+ dst += dst_remainder_stride;
+ } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyHorizontal_SSE4_1(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+ const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
+ const int height, void* const prediction, const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width == 128) {
+ IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 8) {
+ int y = height;
+ do {
+ const __m128i left = LoadLo8(src);
+ const __m128i right = LoadLo8(src + 1);
+ StoreLo8(dest, _mm_avg_epu8(left, right));
+
+ src += reference_stride;
+ dest += pred_stride;
+ } while (--y != 0);
+ } else if (width == 4) {
+ int y = height;
+ do {
+ __m128i left = Load4(src);
+ __m128i right = Load4(src + 1);
+ src += reference_stride;
+ left = _mm_unpacklo_epi32(left, Load4(src));
+ right = _mm_unpacklo_epi32(right, Load4(src + 1));
+ src += reference_stride;
+
+ const __m128i result = _mm_avg_epu8(left, right);
+
+ Store4(dest, result);
+ dest += pred_stride;
+ Store4(dest, _mm_srli_si128(result, 4));
+ dest += pred_stride;
+ y -= 2;
+ } while (y != 0);
+ } else {
+ assert(width == 2);
+ __m128i left = _mm_setzero_si128();
+ __m128i right = _mm_setzero_si128();
+ int y = height;
+ do {
+ left = Load2<0>(src, left);
+ right = Load2<0>(src + 1, right);
+ src += reference_stride;
+ left = Load2<1>(src, left);
+ right = Load2<1>(src + 1, right);
+ src += reference_stride;
+
+ const __m128i result = _mm_avg_epu8(left, right);
+
+ Store2(dest, result);
+ dest += pred_stride;
+ Store2(dest, _mm_srli_si128(result, 2));
+ dest += pred_stride;
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
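+// Vertical half-pel intra block copy: each output row is the rounding average
+// of a source row and the row below it. The previously loaded bottom row is
+// carried over in |row| so every source row is loaded only once.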
+template <int width>
+inline void IntraBlockCopyVertical(const uint8_t* src,
+ const ptrdiff_t src_stride, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+ __m128i row[8], below[8];
+
+ row[0] = LoadUnaligned16(src);
+ if (width >= 32) {
+ src += 16;
+ row[1] = LoadUnaligned16(src);
+ if (width >= 64) {
+ src += 16;
+ row[2] = LoadUnaligned16(src);
+ src += 16;
+ row[3] = LoadUnaligned16(src);
+ if (width == 128) {
+ src += 16;
+ row[4] = LoadUnaligned16(src);
+ src += 16;
+ row[5] = LoadUnaligned16(src);
+ src += 16;
+ row[6] = LoadUnaligned16(src);
+ src += 16;
+ row[7] = LoadUnaligned16(src);
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ int y = height;
+ do {
+ below[0] = LoadUnaligned16(src);
+ if (width >= 32) {
+ src += 16;
+ below[1] = LoadUnaligned16(src);
+ if (width >= 64) {
+ src += 16;
+ below[2] = LoadUnaligned16(src);
+ src += 16;
+ below[3] = LoadUnaligned16(src);
+ if (width == 128) {
+ src += 16;
+ below[4] = LoadUnaligned16(src);
+ src += 16;
+ below[5] = LoadUnaligned16(src);
+ src += 16;
+ below[6] = LoadUnaligned16(src);
+ src += 16;
+ below[7] = LoadUnaligned16(src);
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ StoreUnaligned16(dst, _mm_avg_epu8(row[0], below[0]));
+ row[0] = below[0];
+ if (width >= 32) {
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[1], below[1]));
+ row[1] = below[1];
+ if (width >= 64) {
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[2], below[2]));
+ row[2] = below[2];
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[3], below[3]));
+ row[3] = below[3];
+        if (width == 128) {
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[4], below[4]));
+ row[4] = below[4];
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[5], below[5]));
+ row[5] = below[5];
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[6], below[6]));
+ row[6] = below[6];
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[7], below[7]));
+ row[7] = below[7];
+ }
+ }
+ }
+ dst += dst_remainder_stride;
+ } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyVertical_SSE4_1(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+ const int width, const int height, void* const prediction,
+ const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width == 128) {
+ IntraBlockCopyVertical<128>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopyVertical<64>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopyVertical<32>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopyVertical<16>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 8) {
+ __m128i row, below;
+ row = LoadLo8(src);
+ src += reference_stride;
+
+ int y = height;
+ do {
+ below = LoadLo8(src);
+ src += reference_stride;
+
+ StoreLo8(dest, _mm_avg_epu8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (--y != 0);
+ } else if (width == 4) {
+ __m128i row = Load4(src);
+ src += reference_stride;
+
+ int y = height;
+ do {
+ __m128i below = Load4(src);
+ src += reference_stride;
+
+ Store4(dest, _mm_avg_epu8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (--y != 0);
+ } else {
+ assert(width == 2);
+ __m128i row = Load2(src);
+ __m128i below = _mm_setzero_si128();
+ src += reference_stride;
+
+ int y = height;
+ do {
+ below = Load2<0>(src, below);
+ src += reference_stride;
+
+ Store2(dest, _mm_avg_epu8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (--y != 0);
+ }
+}
+
+// Loads two groups of 8 uint8_t values, widens them to uint16_t, and returns
+// their sum.
+inline __m128i LoadU8AndAddLong(const uint8_t* src, const uint8_t* src1) {
+ const __m128i a = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i b = _mm_cvtepu8_epi16(LoadLo8(src1));
+ return _mm_add_epi16(a, b);
+}
+
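+// Adds two vectors of 16-bit pair sums and scales back to 8-bit pixels. The
+// two shift-with-round steps below are equivalent to (v0 + v1 + 2) >> 2.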
+inline __m128i AddU16RightShift2AndPack(__m128i v0, __m128i v1) {
+ const __m128i a = _mm_add_epi16(v0, v1);
+ const __m128i b = _mm_srli_epi16(a, 1);
+ // Use avg here to shift right by 1 with round.
+ const __m128i c = _mm_avg_epu16(b, _mm_setzero_si128());
+ return _mm_packus_epi16(c, c);
+}
+
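+// 2D half-pel intra block copy: horizontal pair sums are kept as 16-bit values
+// in |row| and reused as the top input when the next source row is processed.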
+template <int width>
+inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
+ const int height, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
+ __m128i row[16];
+ row[0] = LoadU8AndAddLong(src, src + 1);
+ if (width >= 16) {
+ src += 8;
+ row[1] = LoadU8AndAddLong(src, src + 1);
+ if (width >= 32) {
+ src += 8;
+ row[2] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[3] = LoadU8AndAddLong(src, src + 1);
+ if (width >= 64) {
+ src += 8;
+ row[4] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[5] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[6] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[7] = LoadU8AndAddLong(src, src + 1);
+ if (width == 128) {
+ src += 8;
+ row[8] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[9] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[10] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[11] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[12] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[13] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[14] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[15] = LoadU8AndAddLong(src, src + 1);
+ }
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ int y = height;
+ do {
+ const __m128i below_0 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[0], below_0));
+ row[0] = below_0;
+ if (width >= 16) {
+ src += 8;
+ dst += 8;
+
+ const __m128i below_1 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[1], below_1));
+ row[1] = below_1;
+ if (width >= 32) {
+ src += 8;
+ dst += 8;
+
+ const __m128i below_2 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[2], below_2));
+ row[2] = below_2;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_3 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[3], below_3));
+ row[3] = below_3;
+ if (width >= 64) {
+ src += 8;
+ dst += 8;
+
+ const __m128i below_4 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[4], below_4));
+ row[4] = below_4;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_5 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[5], below_5));
+ row[5] = below_5;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_6 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[6], below_6));
+ row[6] = below_6;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_7 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[7], below_7));
+ row[7] = below_7;
+ if (width == 128) {
+ src += 8;
+ dst += 8;
+
+ const __m128i below_8 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[8], below_8));
+ row[8] = below_8;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_9 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[9], below_9));
+ row[9] = below_9;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_10 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[10], below_10));
+ row[10] = below_10;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_11 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[11], below_11));
+ row[11] = below_11;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_12 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[12], below_12));
+ row[12] = below_12;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_13 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[13], below_13));
+ row[13] = below_13;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_14 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[14], below_14));
+ row[14] = below_14;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_15 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[15], below_15));
+ row[15] = below_15;
+ }
+ }
+ }
+ }
+ src += src_remainder_stride;
+ dst += dst_remainder_stride;
+ } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopy2D_SSE4_1(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+ const int width, const int height, void* const prediction,
+ const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+  // Note: the 2D filter reads |height| + 1 rows. Because this function is
+  // only used for the u/v planes of intra block copy, the extra row is
+  // guaranteed to be within the prediction block.
+
+ if (width == 128) {
+ IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 8) {
+ IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 4) {
+ __m128i left = _mm_cvtepu8_epi16(Load4(src));
+ __m128i right = _mm_cvtepu8_epi16(Load4(src + 1));
+ src += reference_stride;
+
+ __m128i row = _mm_add_epi16(left, right);
+
+ int y = height;
+ do {
+ left = Load4(src);
+ right = Load4(src + 1);
+ src += reference_stride;
+ left = _mm_unpacklo_epi32(left, Load4(src));
+ right = _mm_unpacklo_epi32(right, Load4(src + 1));
+ src += reference_stride;
+
+ const __m128i below =
+ _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
+ const __m128i result =
+ AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below);
+
+ Store4(dest, result);
+ dest += pred_stride;
+ Store4(dest, _mm_srli_si128(result, 4));
+ dest += pred_stride;
+
+ row = _mm_srli_si128(below, 8);
+ y -= 2;
+ } while (y != 0);
+ } else {
+ __m128i left = Load2(src);
+ __m128i right = Load2(src + 1);
+ src += reference_stride;
+
+ __m128i row =
+ _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
+
+ int y = height;
+ do {
+ left = Load2<0>(src, left);
+ right = Load2<0>(src + 1, right);
+ src += reference_stride;
+ left = Load2<2>(src, left);
+ right = Load2<2>(src + 1, right);
+ src += reference_stride;
+
+ const __m128i below =
+ _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
+ const __m128i result =
+ AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below);
+
+ Store2(dest, result);
+ dest += pred_stride;
+ Store2(dest, _mm_srli_si128(result, 4));
+ dest += pred_stride;
+
+ row = _mm_srli_si128(below, 8);
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
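+  // The convolve table appears to be indexed as [is_intra_block_copy]
+  // [is_compound][has_vertical_filter][has_horizontal_filter]; e.g.
+  // [0][0][0][1] below is the plain horizontal filter.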
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_SSE4_1;
+ dsp->convolve[0][0][1][0] = ConvolveVertical_SSE4_1;
+ dsp->convolve[0][0][1][1] = Convolve2D_SSE4_1;
+
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4;
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_SSE4_1;
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1;
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1;
+
+ dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_SSE4_1;
+ dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_SSE4_1;
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_SSE4_1;
+
+ dsp->convolve_scale[0] = ConvolveScale2D_SSE4_1<false>;
+ dsp->convolve_scale[1] = ConvolveScale2D_SSE4_1<true>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void ConvolveInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/convolve_sse4.h b/src/dsp/x86/convolve_sse4.h
new file mode 100644
index 0000000..d6c3155
--- /dev/null
+++ b/src/dsp/x86/convolve_sse4.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve; see the defines below for specifics. This
+// function is not thread-safe.
+void ConvolveInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical
+#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Convolve2D
+#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundCopy
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundCopy LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompound2D
+#define LIBGAV1_Dsp8bpp_ConvolveCompound2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveScale2D
+#define LIBGAV1_Dsp8bpp_ConvolveScale2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
diff --git a/src/dsp/x86/distance_weighted_blend_sse4.cc b/src/dsp/x86/distance_weighted_blend_sse4.cc
new file mode 100644
index 0000000..deb57ef
--- /dev/null
+++ b/src/dsp/x86/distance_weighted_blend_sse4.cc
@@ -0,0 +1,230 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/distance_weighted_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <xmmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>  // memcpy
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kInterPostRoundBit = 4;
+
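+// |weights| holds interleaved (weight_0, weight_1) pairs, so _mm_madd_epi16
+// yields weight_0 * pred_0 + weight_1 * pred_1 in each 32-bit lane. The extra
+// shift by 4 divides by the weight pair sum, which is 16 for AV1 distance
+// weighted blending.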
+inline __m128i ComputeWeightedAverage8(const __m128i& pred0,
+ const __m128i& pred1,
+ const __m128i& weights) {
+ // TODO(https://issuetracker.google.com/issues/150325685): Investigate range.
+ const __m128i preds_lo = _mm_unpacklo_epi16(pred0, pred1);
+ const __m128i mult_lo = _mm_madd_epi16(preds_lo, weights);
+ const __m128i result_lo =
+ RightShiftWithRounding_S32(mult_lo, kInterPostRoundBit + 4);
+
+ const __m128i preds_hi = _mm_unpackhi_epi16(pred0, pred1);
+ const __m128i mult_hi = _mm_madd_epi16(preds_hi, weights);
+ const __m128i result_hi =
+ RightShiftWithRounding_S32(mult_hi, kInterPostRoundBit + 4);
+
+ return _mm_packs_epi32(result_lo, result_hi);
+}
+
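+// Handles four 4-pixel rows per loop iteration: rows 0-1 and rows 2-3 are
+// blended in separate 8-lane registers, then packed so that each 32-bit lane
+// of |result_pixels| holds one output row.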
+template <int height>
+inline void DistanceWeightedBlend4xH_SSE4_1(
+ const int16_t* pred_0, const int16_t* pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+
+ for (int y = 0; y < height; y += 4) {
+ // TODO(b/150326556): Use larger loads.
+ const __m128i src_00 = LoadLo8(pred_0);
+ const __m128i src_10 = LoadLo8(pred_1);
+ pred_0 += 4;
+ pred_1 += 4;
+ __m128i src_0 = LoadHi8(src_00, pred_0);
+ __m128i src_1 = LoadHi8(src_10, pred_1);
+ pred_0 += 4;
+ pred_1 += 4;
+ const __m128i res0 = ComputeWeightedAverage8(src_0, src_1, weights);
+
+ const __m128i src_01 = LoadLo8(pred_0);
+ const __m128i src_11 = LoadLo8(pred_1);
+ pred_0 += 4;
+ pred_1 += 4;
+ src_0 = LoadHi8(src_01, pred_0);
+ src_1 = LoadHi8(src_11, pred_1);
+ pred_0 += 4;
+ pred_1 += 4;
+ const __m128i res1 = ComputeWeightedAverage8(src_0, src_1, weights);
+
+ const __m128i result_pixels = _mm_packus_epi16(res0, res1);
+ Store4(dst, result_pixels);
+ dst += dest_stride;
+ const int result_1 = _mm_extract_epi32(result_pixels, 1);
+ memcpy(dst, &result_1, sizeof(result_1));
+ dst += dest_stride;
+ const int result_2 = _mm_extract_epi32(result_pixels, 2);
+ memcpy(dst, &result_2, sizeof(result_2));
+ dst += dest_stride;
+ const int result_3 = _mm_extract_epi32(result_pixels, 3);
+ memcpy(dst, &result_3, sizeof(result_3));
+ dst += dest_stride;
+ }
+}
+
+template <int height>
+inline void DistanceWeightedBlend8xH_SSE4_1(
+ const int16_t* pred_0, const int16_t* pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+
+ for (int y = 0; y < height; y += 2) {
+ const __m128i src_00 = LoadAligned16(pred_0);
+ const __m128i src_10 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res0 = ComputeWeightedAverage8(src_00, src_10, weights);
+
+ const __m128i src_01 = LoadAligned16(pred_0);
+ const __m128i src_11 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res1 = ComputeWeightedAverage8(src_01, src_11, weights);
+
+ const __m128i result_pixels = _mm_packus_epi16(res0, res1);
+ StoreLo8(dst, result_pixels);
+ dst += dest_stride;
+ StoreHi8(dst, result_pixels);
+ dst += dest_stride;
+ }
+}
+
+inline void DistanceWeightedBlendLarge_SSE4_1(
+ const int16_t* pred_0, const int16_t* pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, const int width, const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const __m128i src_0_lo = LoadAligned16(pred_0 + x);
+ const __m128i src_1_lo = LoadAligned16(pred_1 + x);
+ const __m128i res_lo =
+ ComputeWeightedAverage8(src_0_lo, src_1_lo, weights);
+
+ const __m128i src_0_hi = LoadAligned16(pred_0 + x + 8);
+ const __m128i src_1_hi = LoadAligned16(pred_1 + x + 8);
+ const __m128i res_hi =
+ ComputeWeightedAverage8(src_0_hi, src_1_hi, weights);
+
+ StoreUnaligned16(dst + x, _mm_packus_epi16(res_lo, res_hi));
+ x += 16;
+ } while (x < width);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+}
+
+void DistanceWeightedBlend_SSE4_1(const void* prediction_0,
+ const void* prediction_1,
+ const uint8_t weight_0,
+ const uint8_t weight_1, const int width,
+ const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ if (width == 4) {
+ if (height == 4) {
+ DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
+ dest, dest_stride);
+ } else if (height == 8) {
+ DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
+ dest, dest_stride);
+ } else {
+ assert(height == 16);
+ DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
+ dest, dest_stride);
+ }
+ return;
+ }
+
+ if (width == 8) {
+ switch (height) {
+ case 4:
+ DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
+ dest, dest_stride);
+ return;
+ case 8:
+ DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
+ dest, dest_stride);
+ return;
+ case 16:
+ DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
+ dest, dest_stride);
+ return;
+ default:
+ assert(height == 32);
+ DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight_0, weight_1,
+ dest, dest_stride);
+ return;
+ }
+ }
+
+ DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight_0, weight_1, width,
+ height, dest, dest_stride);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(DistanceWeightedBlend)
+ dsp->distance_weighted_blend = DistanceWeightedBlend_SSE4_1;
+#endif
+}
+
+} // namespace
+
+void DistanceWeightedBlendInit_SSE4_1() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void DistanceWeightedBlendInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/distance_weighted_blend_sse4.h b/src/dsp/x86/distance_weighted_blend_sse4.h
new file mode 100644
index 0000000..8646eca
--- /dev/null
+++ b/src/dsp/x86/distance_weighted_blend_sse4.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::distance_weighted_blend. This function is not thread-safe.
+void DistanceWeightedBlendInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_DistanceWeightedBlend
+#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_
diff --git a/src/dsp/x86/intra_edge_sse4.cc b/src/dsp/x86/intra_edge_sse4.cc
new file mode 100644
index 0000000..4a8658d
--- /dev/null
+++ b/src/dsp/x86/intra_edge_sse4.cc
@@ -0,0 +1,270 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <xmmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring> // memcpy
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kKernelTaps = 5;
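+// Each kernel's taps sum to 16, so filter outputs are normalized with a
+// rounded right shift by 4.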
+constexpr int kKernels[3][kKernelTaps] = {
+ {0, 4, 8, 4, 0}, {0, 5, 6, 5, 0}, {2, 4, 4, 4, 2}};
+constexpr int kMaxEdgeBufferSize = 129;
+
+// This function applies the kernel [0, 4, 8, 4, 0] to 12 values.
+// Assumes |edge| has 16 packed byte values. Produces 12 filter outputs to
+// write as overlapping sets of 8 bytes.
+inline void ComputeKernel1Store12(uint8_t* dest, const uint8_t* source) {
+ const __m128i edge_lo = LoadUnaligned16(source);
+ const __m128i edge_hi = _mm_srli_si128(edge_lo, 6);
+ // Samples matched with the '4' tap, expanded to 16-bit.
+ const __m128i outers_lo = _mm_cvtepu8_epi16(edge_lo);
+ const __m128i outers_hi = _mm_cvtepu8_epi16(edge_hi);
+ // Samples matched with the '8' tap, expanded to 16-bit.
+ const __m128i centers_lo = _mm_srli_si128(outers_lo, 2);
+ const __m128i centers_hi = _mm_srli_si128(outers_hi, 2);
+
+ // Apply the taps by shifting.
+ const __m128i outers4_lo = _mm_slli_epi16(outers_lo, 2);
+ const __m128i outers4_hi = _mm_slli_epi16(outers_hi, 2);
+ const __m128i centers8_lo = _mm_slli_epi16(centers_lo, 3);
+ const __m128i centers8_hi = _mm_slli_epi16(centers_hi, 3);
+ // Move latter 4x values down to add with first 4x values for each output.
+ const __m128i partial_sums_lo =
+ _mm_add_epi16(outers4_lo, _mm_srli_si128(outers4_lo, 4));
+ const __m128i partial_sums_hi =
+ _mm_add_epi16(outers4_hi, _mm_srli_si128(outers4_hi, 4));
+ // Move 6x values down to add for the final kernel sum for each output.
+ const __m128i sums_lo = RightShiftWithRounding_U16(
+ _mm_add_epi16(partial_sums_lo, centers8_lo), 4);
+ const __m128i sums_hi = RightShiftWithRounding_U16(
+ _mm_add_epi16(partial_sums_hi, centers8_hi), 4);
+
+ const __m128i result_lo = _mm_packus_epi16(sums_lo, sums_lo);
+ const __m128i result_hi = _mm_packus_epi16(sums_hi, sums_hi);
+ const __m128i result =
+ _mm_alignr_epi8(result_hi, _mm_slli_si128(result_lo, 10), 10);
+ StoreUnaligned16(dest, result);
+}
+
+// This function applies the kernel [0, 5, 6, 5, 0] to 12 values.
+// Assumes 16 packed byte values can be loaded from |edge|; outputs depending
+// on the trailing invalid values are overwritten or safely discarded.
+inline void ComputeKernel2Store12(uint8_t* dest, const uint8_t* source) {
+ const __m128i edge_lo = LoadUnaligned16(source);
+ const __m128i edge_hi = _mm_srli_si128(edge_lo, 6);
+ const __m128i outers_lo = _mm_cvtepu8_epi16(edge_lo);
+ const __m128i centers_lo = _mm_srli_si128(outers_lo, 2);
+ const __m128i outers_hi = _mm_cvtepu8_epi16(edge_hi);
+ const __m128i centers_hi = _mm_srli_si128(outers_hi, 2);
+ // Samples matched with the '5' tap, expanded to 16-bit. Add x + 4x.
+ const __m128i outers5_lo =
+ _mm_add_epi16(outers_lo, _mm_slli_epi16(outers_lo, 2));
+ const __m128i outers5_hi =
+ _mm_add_epi16(outers_hi, _mm_slli_epi16(outers_hi, 2));
+ // Samples matched with the '6' tap, expanded to 16-bit. Add 2x + 4x.
+ const __m128i centers6_lo = _mm_add_epi16(_mm_slli_epi16(centers_lo, 1),
+ _mm_slli_epi16(centers_lo, 2));
+ const __m128i centers6_hi = _mm_add_epi16(_mm_slli_epi16(centers_hi, 1),
+ _mm_slli_epi16(centers_hi, 2));
+ // Move latter 5x values down to add with first 5x values for each output.
+ const __m128i partial_sums_lo =
+ _mm_add_epi16(outers5_lo, _mm_srli_si128(outers5_lo, 4));
+ // Move 6x values down to add for the final kernel sum for each output.
+ const __m128i sums_lo = RightShiftWithRounding_U16(
+ _mm_add_epi16(centers6_lo, partial_sums_lo), 4);
+ // Shift latter 5x values to add with first 5x values for each output.
+ const __m128i partial_sums_hi =
+ _mm_add_epi16(outers5_hi, _mm_srli_si128(outers5_hi, 4));
+ // Move 6x values down to add for the final kernel sum for each output.
+ const __m128i sums_hi = RightShiftWithRounding_U16(
+ _mm_add_epi16(centers6_hi, partial_sums_hi), 4);
+ // First 6 values are valid outputs.
+ const __m128i result_lo = _mm_packus_epi16(sums_lo, sums_lo);
+ const __m128i result_hi = _mm_packus_epi16(sums_hi, sums_hi);
+ const __m128i result =
+ _mm_alignr_epi8(result_hi, _mm_slli_si128(result_lo, 10), 10);
+ StoreUnaligned16(dest, result);
+}
+
+// This function applies the kernel [2, 4, 4, 4, 2] to 8 values.
+inline void ComputeKernel3Store8(uint8_t* dest, const uint8_t* source) {
+ const __m128i edge_lo = LoadUnaligned16(source);
+ const __m128i edge_hi = _mm_srli_si128(edge_lo, 4);
+ // Finish |edge_lo| life cycle quickly.
+ // Multiply for 2x.
+ const __m128i source2_lo = _mm_slli_epi16(_mm_cvtepu8_epi16(edge_lo), 1);
+ // Multiply 2x by 2 and align.
+ const __m128i source4_lo = _mm_srli_si128(_mm_slli_epi16(source2_lo, 1), 2);
+ // Finish |source2| life cycle quickly.
+ // Move latter 2x values down to add with first 2x values for each output.
+ __m128i sum = _mm_add_epi16(source2_lo, _mm_srli_si128(source2_lo, 8));
+ // First 4x values already aligned to add with running total.
+ sum = _mm_add_epi16(sum, source4_lo);
+ // Move second 4x values down to add with running total.
+ sum = _mm_add_epi16(sum, _mm_srli_si128(source4_lo, 2));
+ // Move third 4x values down to add with running total.
+ sum = _mm_add_epi16(sum, _mm_srli_si128(source4_lo, 4));
+ // Multiply for 2x.
+ const __m128i source2_hi = _mm_slli_epi16(_mm_cvtepu8_epi16(edge_hi), 1);
+ // Multiply 2x by 2 and align.
+ const __m128i source4_hi = _mm_srli_si128(_mm_slli_epi16(source2_hi, 1), 2);
+ // Move latter 2x values down to add with first 2x values for each output.
+ __m128i sum_hi = _mm_add_epi16(source2_hi, _mm_srli_si128(source2_hi, 8));
+ // First 4x values already aligned to add with running total.
+ sum_hi = _mm_add_epi16(sum_hi, source4_hi);
+ // Move second 4x values down to add with running total.
+ sum_hi = _mm_add_epi16(sum_hi, _mm_srli_si128(source4_hi, 2));
+ // Move third 4x values down to add with running total.
+ sum_hi = _mm_add_epi16(sum_hi, _mm_srli_si128(source4_hi, 4));
+
+ // Because we have only 8 values here, it is safe to align before packing down
+ // to 8-bit without losing data.
+ sum = _mm_alignr_epi8(sum_hi, _mm_slli_si128(sum, 8), 8);
+ sum = RightShiftWithRounding_U16(sum, 4);
+ StoreLo8(dest, _mm_packus_epi16(sum, sum));
+}
+
+void IntraEdgeFilter_SSE4_1(void* buffer, int size, int strength) {
+ uint8_t edge[kMaxEdgeBufferSize + 4];
+ memcpy(edge, buffer, size);
+ auto* dst_buffer = static_cast<uint8_t*>(buffer);
+
+  // Only elements 1..|size| - 1 are filtered, so a single-element buffer
+  // needs no work.
+ if (size == 1) return;
+
+ int i = 0;
+ switch (strength) {
+ case 1:
+      // To avoid overwriting, we stop short of the total write size plus the
+ // initial offset. In this case 12 valid values are written in two blocks
+ // of 8 bytes each.
+ for (; i < size - 17; i += 12) {
+ ComputeKernel1Store12(dst_buffer + i + 1, edge + i);
+ }
+ break;
+ case 2:
+ // See the comment for case 1.
+ for (; i < size - 17; i += 12) {
+ ComputeKernel2Store12(dst_buffer + i + 1, edge + i);
+ }
+ break;
+ default:
+ assert(strength == 3);
+ // The first filter input is repeated for taps of value 2 and 4.
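+      // For output 1 the taps cover edge[-1..3]; clamping edge[-1] to edge[0]
+      // merges the 2 and 4 taps into the 6 * edge[0] term below.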
+ dst_buffer[1] = RightShiftWithRounding(
+ (6 * edge[0] + 4 * edge[1] + 4 * edge[2] + 2 * edge[3]), 4);
+ // In this case, one block of 8 bytes is written in each iteration, with
+ // an offset of 2.
+ for (; i < size - 10; i += 8) {
+ ComputeKernel3Store8(dst_buffer + i + 2, edge + i);
+ }
+ }
+ const int kernel_index = strength - 1;
+ for (int final_index = Clip3(i, 1, size - 2); final_index < size;
+ ++final_index) {
+ int sum = 0;
+ for (int j = 0; j < kKernelTaps; ++j) {
+ const int k = Clip3(final_index + j - 2, 0, size - 1);
+ sum += kKernels[kernel_index][j] * edge[k];
+ }
+ dst_buffer[final_index] = RightShiftWithRounding(sum, 4);
+ }
+}
+
+constexpr int kMaxUpsampleSize = 16;
+
+// Applies the upsampling kernel [-1, 9, 9, -1] to produce a new half-sample
+// pixel between each pair of original values and interleaves the results with
+// the originals. This implementation assumes it is safe to write the maximum
+// number of upsampled pixels (32) to the edge buffer, even when |size| is
+// small.
+void IntraEdgeUpsampler_SSE4_1(void* buffer, int size) {
+ assert(size % 4 == 0 && size <= kMaxUpsampleSize);
+ auto* const pixel_buffer = static_cast<uint8_t*>(buffer);
+ uint8_t temp[kMaxUpsampleSize + 8];
+ temp[0] = temp[1] = pixel_buffer[-1];
+ memcpy(temp + 2, pixel_buffer, sizeof(temp[0]) * size);
+ temp[size + 2] = pixel_buffer[size - 1];
+
+ pixel_buffer[-2] = temp[0];
+ const __m128i data = LoadUnaligned16(temp);
+ const __m128i src_lo = _mm_cvtepu8_epi16(data);
+ const __m128i src_hi = _mm_unpackhi_epi8(data, _mm_setzero_si128());
+ const __m128i src9_hi = _mm_add_epi16(src_hi, _mm_slli_epi16(src_hi, 3));
+ const __m128i src9_lo = _mm_add_epi16(src_lo, _mm_slli_epi16(src_lo, 3));
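+  // With src9 = 9 * src, the alignr/add/sub sequence below forms
+  // 9 * b + 9 * c - a - d for each output, i.e. the [-1, 9, 9, -1] kernel,
+  // before the rounded shift by 4.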
+ __m128i sum_lo = _mm_sub_epi16(_mm_alignr_epi8(src9_hi, src9_lo, 2), src_lo);
+ sum_lo = _mm_add_epi16(sum_lo, _mm_alignr_epi8(src9_hi, src9_lo, 4));
+ sum_lo = _mm_sub_epi16(sum_lo, _mm_alignr_epi8(src_hi, src_lo, 6));
+ sum_lo = RightShiftWithRounding_S16(sum_lo, 4);
+ const __m128i result_lo = _mm_unpacklo_epi8(_mm_packus_epi16(sum_lo, sum_lo),
+ _mm_srli_si128(data, 2));
+ StoreUnaligned16(pixel_buffer - 1, result_lo);
+ if (size > 8) {
+ const __m128i src_hi_extra = _mm_cvtepu8_epi16(LoadLo8(temp + 16));
+ const __m128i src9_hi_extra =
+ _mm_add_epi16(src_hi_extra, _mm_slli_epi16(src_hi_extra, 3));
+ __m128i sum_hi =
+ _mm_sub_epi16(_mm_alignr_epi8(src9_hi_extra, src9_hi, 2), src_hi);
+ sum_hi = _mm_add_epi16(sum_hi, _mm_alignr_epi8(src9_hi_extra, src9_hi, 4));
+ sum_hi = _mm_sub_epi16(sum_hi, _mm_alignr_epi8(src_hi_extra, src_hi, 6));
+ sum_hi = RightShiftWithRounding_S16(sum_hi, 4);
+ const __m128i result_hi =
+ _mm_unpacklo_epi8(_mm_packus_epi16(sum_hi, sum_hi), LoadLo8(temp + 10));
+ StoreUnaligned16(pixel_buffer + 15, result_hi);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(IntraEdgeFilter)
+ dsp->intra_edge_filter = IntraEdgeFilter_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(IntraEdgeUpsampler)
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_SSE4_1;
+#endif
+}
+
+} // namespace
+
+void IntraEdgeInit_SSE4_1() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraEdgeInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intra_edge_sse4.h b/src/dsp/x86/intra_edge_sse4.h
new file mode 100644
index 0000000..6ed4d40
--- /dev/null
+++ b/src/dsp/x86/intra_edge_sse4.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_edge_filter and Dsp::intra_edge_upsampler. This
+// function is not thread-safe.
+void IntraEdgeInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeFilter
+#define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeUpsampler
+#define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_
diff --git a/src/dsp/x86/intrapred_cfl_sse4.cc b/src/dsp/x86/intrapred_cfl_sse4.cc
new file mode 100644
index 0000000..fac1556
--- /dev/null
+++ b/src/dsp/x86/intrapred_cfl_sse4.cc
@@ -0,0 +1,976 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>  // memcpy
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor_SSE4_1
+
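+// _mm_mulhrs_epi16(x, y) computes (x * y + (1 << 14)) >> 15, so with
+// |alpha_q12| == |alpha| << 9 the product below is (|ac_q3| * |alpha|) >> 6
+// with rounding; the combined sign of alpha and the AC value is then applied.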
+inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
+ __m128i alpha_sign, __m128i dc_q0) {
+ __m128i ac_q3 = LoadUnaligned16(input);
+ __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
+ __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
+ scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
+ return _mm_add_epi16(scaled_luma_q0, dc_q0);
+}
+
+template <int width, int height>
+void CflIntraPredictor_SSE4_1(
+ void* const dest, ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i alpha_sign = _mm_set1_epi16(alpha);
+ const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+ auto* row = reinterpret_cast<const __m128i*>(luma);
+ const int kCflLumaBufferStrideLog2_16i = 5;
+ const int kCflLumaBufferStrideLog2_128i = kCflLumaBufferStrideLog2_16i - 3;
+ const __m128i* row_end = row + (height << kCflLumaBufferStrideLog2_128i);
+ const __m128i dc_val = _mm_set1_epi16(dst[0]);
+ do {
+ __m128i res = CflPredictUnclipped(row, alpha_q12, alpha_sign, dc_val);
+ if (width < 16) {
+ res = _mm_packus_epi16(res, res);
+ if (width == 4) {
+ Store4(dst, res);
+ } else {
+ StoreLo8(dst, res);
+ }
+ } else {
+ __m128i next =
+ CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
+ res = _mm_packus_epi16(res, next);
+ StoreUnaligned16(dst, res);
+ if (width == 32) {
+ res = CflPredictUnclipped(row + 2, alpha_q12, alpha_sign, dc_val);
+ next = CflPredictUnclipped(row + 3, alpha_q12, alpha_sign, dc_val);
+ res = _mm_packus_epi16(res, next);
+ StoreUnaligned16(dst + 16, res);
+ }
+ }
+ dst += stride;
+ } while ((row += (1 << kCflLumaBufferStrideLog2_128i)) < row_end);
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 4, "");
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const auto* src = static_cast<const uint8_t*>(source);
+ __m128i sum = _mm_setzero_si128();
+ int16_t* luma_ptr = luma[0];
+ const __m128i zero = _mm_setzero_si128();
+ __m128i samples;
+ int y = 0;
+ do {
+ samples = Load4(src);
+ src += stride;
+ int src_bytes;
+ memcpy(&src_bytes, src, 4);
+ samples = _mm_insert_epi32(samples, src_bytes, 1);
+ src += stride;
+ samples = _mm_slli_epi16(_mm_cvtepu8_epi16(samples), 3);
+ StoreLo8(luma_ptr, samples);
+ luma_ptr += kCflLumaBufferStride;
+ StoreHi8(luma_ptr, samples);
+ luma_ptr += kCflLumaBufferStride;
+
+    // The maximum value here is 2**bd * H * 2**shift. Since the maximum H for
+    // 4xH is 16 = 2**4, we have 2**(8 + 4 + 3) = 2**15, which fits in 16 bits.
+ sum = _mm_add_epi16(sum, samples);
+ y += 2;
+ } while (y < visible_height);
+
+ if (!is_inside) {
+ int y = visible_height;
+ do {
+ StoreHi8(luma_ptr, samples);
+ luma_ptr += kCflLumaBufferStride;
+ sum = _mm_add_epi16(sum, samples);
+ ++y;
+ } while (y < block_height);
+ }
+
+ __m128i sum_tmp = _mm_unpackhi_epi16(sum, zero);
+ sum = _mm_cvtepu16_epi32(sum);
+ sum = _mm_add_epi32(sum, sum_tmp);
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+ __m128i averages = RightShiftWithRounding_U32(
+ sum, block_height_log2 + 2 /* log2 of width 4 */);
+ averages = _mm_shufflelo_epi16(averages, 0);
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ const __m128i samples = LoadLo8(luma_ptr);
+ StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 4, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 4;
+
+ if (block_height <= max_luma_height && block_width <= max_luma_width) {
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+template <int block_height_log2, bool inside>
+void CflSubsampler444_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 5, "");
+ const int block_height = 1 << block_height_log2, block_width = 8;
+ const int visible_height = max_luma_height;
+ const int invisible_width = inside ? 0 : block_width - max_luma_width;
+ const int visible_width = max_luma_width;
+ const __m128i blend_mask =
+ inside ? _mm_setzero_si128() : MaskHighNBytes(8 + invisible_width);
+ const __m128i dup16 = _mm_set1_epi32(0x01000100);
+ const auto* src = static_cast<const uint8_t*>(source);
+ int16_t* luma_ptr = luma[0];
+ const __m128i zero = _mm_setzero_si128();
+  // Since the maximum height is 32, splitting the rows by parity means each
+  // accumulator only sums 16 rows. As in the 4xH case, the sums therefore fit
+  // in 16 bits without widening to 32 bits.
+ __m128i sum_even = _mm_setzero_si128(), sum_odd = _mm_setzero_si128();
+ __m128i sum;
+ __m128i samples1;
+
+ int y = 0;
+ do {
+ __m128i samples0 = LoadLo8(src);
+ if (!inside) {
+ const __m128i border0 =
+ _mm_set1_epi8(static_cast<int8_t>(src[visible_width - 1]));
+ samples0 = _mm_blendv_epi8(samples0, border0, blend_mask);
+ }
+ src += stride;
+ samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples0), 3);
+ StoreUnaligned16(luma_ptr, samples0);
+ luma_ptr += kCflLumaBufferStride;
+
+ sum_even = _mm_add_epi16(sum_even, samples0);
+
+ samples1 = LoadLo8(src);
+ if (!inside) {
+ const __m128i border1 =
+ _mm_set1_epi8(static_cast<int8_t>(src[visible_width - 1]));
+ samples1 = _mm_blendv_epi8(samples1, border1, blend_mask);
+ }
+ src += stride;
+ samples1 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples1), 3);
+ StoreUnaligned16(luma_ptr, samples1);
+ luma_ptr += kCflLumaBufferStride;
+
+ sum_odd = _mm_add_epi16(sum_odd, samples1);
+ y += 2;
+ } while (y < visible_height);
+
+ if (!inside) {
+ for (int y = visible_height; y < block_height; y += 2) {
+ sum_even = _mm_add_epi16(sum_even, samples1);
+ StoreUnaligned16(luma_ptr, samples1);
+ luma_ptr += kCflLumaBufferStride;
+
+ sum_odd = _mm_add_epi16(sum_odd, samples1);
+ StoreUnaligned16(luma_ptr, samples1);
+ luma_ptr += kCflLumaBufferStride;
+ }
+ }
+
+ sum = _mm_add_epi32(_mm_unpackhi_epi16(sum_even, zero),
+ _mm_cvtepu16_epi32(sum_even));
+ sum = _mm_add_epi32(sum, _mm_unpackhi_epi16(sum_odd, zero));
+ sum = _mm_add_epi32(sum, _mm_cvtepu16_epi32(sum_odd));
+
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+ __m128i averages = RightShiftWithRounding_U32(
+ sum, block_height_log2 + 3 /* log2 of width 8 */);
+ averages = _mm_shuffle_epi8(averages, dup16);
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ const __m128i samples = LoadUnaligned16(luma_ptr);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 8;
+
+ const int horz_inside = block_width <= max_luma_width;
+ const int vert_inside = block_height <= max_luma_height;
+ if (horz_inside && vert_inside) {
+ CflSubsampler444_8xH_SSE4_1<block_height_log2, true>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler444_8xH_SSE4_1<block_height_log2, false>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+// This function will only work for block_width 16 and 32.
+template <int block_width_log2, int block_height_log2, bool inside>
+void CflSubsampler444_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_width_log2 == 4 || block_width_log2 == 5, "");
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 1 << block_width_log2;
+
+ const int visible_height = max_luma_height;
+ const int visible_width_16 = inside ? 16 : std::min(16, max_luma_width);
+ const int invisible_width_16 = 16 - visible_width_16;
+ const __m128i blend_mask_16 = MaskHighNBytes(invisible_width_16);
+ const int visible_width_32 = inside ? 32 : max_luma_width;
+ const int invisible_width_32 = 32 - visible_width_32;
+ const __m128i blend_mask_32 =
+ MaskHighNBytes(std::min(16, invisible_width_32));
+
+ const __m128i dup16 = _mm_set1_epi32(0x01000100);
+ const __m128i zero = _mm_setzero_si128();
+ const auto* src = static_cast<const uint8_t*>(source);
+ int16_t* luma_ptr = luma[0];
+ __m128i sum = _mm_setzero_si128();
+
+ __m128i samples0, samples1;
+ __m128i samples2, samples3;
+ __m128i inner_sum_lo, inner_sum_hi;
+ int y = 0;
+ do {
+#if LIBGAV1_MSAN // We can load uninitialized values here. Even though they are
+ // then masked off by blendv, MSAN isn't smart enough to
+ // understand that. So we switch to a C implementation here.
+ uint16_t c_arr[16];
+ for (int x = 0; x < 16; x++) {
+ const int x_index = std::min(x, visible_width_16 - 1);
+ c_arr[x] = src[x_index] << 3;
+ }
+ samples0 = LoadUnaligned16(c_arr);
+ samples1 = LoadUnaligned16(c_arr + 8);
+ static_cast<void>(blend_mask_16);
+#else
+ __m128i samples01 = LoadUnaligned16(src);
+
+ if (!inside) {
+ const __m128i border16 =
+ _mm_set1_epi8(static_cast<int8_t>(src[visible_width_16 - 1]));
+ samples01 = _mm_blendv_epi8(samples01, border16, blend_mask_16);
+ }
+ samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples01), 3);
+ samples1 = _mm_slli_epi16(_mm_unpackhi_epi8(samples01, zero), 3);
+#endif // LIBGAV1_MSAN
+
+ StoreUnaligned16(luma_ptr, samples0);
+ StoreUnaligned16(luma_ptr + 8, samples1);
+ __m128i inner_sum = _mm_add_epi16(samples0, samples1);
+
+ if (block_width == 32) {
+#if LIBGAV1_MSAN // We can load uninitialized values here. Even though they are
+ // then masked off by blendv, MSAN isn't smart enough to
+ // understand that. So we switch to a C implementation here.
+ uint16_t c_arr[16];
+ for (int x = 16; x < 32; x++) {
+ const int x_index = std::min(x, visible_width_32 - 1);
+ c_arr[x - 16] = src[x_index] << 3;
+ }
+ samples2 = LoadUnaligned16(c_arr);
+ samples3 = LoadUnaligned16(c_arr + 8);
+ static_cast<void>(blend_mask_32);
+#else
+ __m128i samples23 = LoadUnaligned16(src + 16);
+ if (!inside) {
+ const __m128i border32 =
+ _mm_set1_epi8(static_cast<int8_t>(src[visible_width_32 - 1]));
+ samples23 = _mm_blendv_epi8(samples23, border32, blend_mask_32);
+ }
+ samples2 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples23), 3);
+ samples3 = _mm_slli_epi16(_mm_unpackhi_epi8(samples23, zero), 3);
+#endif // LIBGAV1_MSAN
+
+ StoreUnaligned16(luma_ptr + 16, samples2);
+ StoreUnaligned16(luma_ptr + 24, samples3);
+ inner_sum = _mm_add_epi16(samples2, inner_sum);
+ inner_sum = _mm_add_epi16(samples3, inner_sum);
+ }
+
+ inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
+ inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
+ sum = _mm_add_epi32(sum, inner_sum_lo);
+ sum = _mm_add_epi32(sum, inner_sum_hi);
+ luma_ptr += kCflLumaBufferStride;
+ src += stride;
+ } while (++y < visible_height);
+
+ if (!inside) {
+ for (int y = visible_height; y < block_height;
+ luma_ptr += kCflLumaBufferStride, ++y) {
+ sum = _mm_add_epi32(sum, inner_sum_lo);
+ StoreUnaligned16(luma_ptr, samples0);
+ sum = _mm_add_epi32(sum, inner_sum_hi);
+ StoreUnaligned16(luma_ptr + 8, samples1);
+ if (block_width == 32) {
+ StoreUnaligned16(luma_ptr + 16, samples2);
+ StoreUnaligned16(luma_ptr + 24, samples3);
+ }
+ }
+ }
+
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+ __m128i averages =
+ RightShiftWithRounding_U32(sum, block_width_log2 + block_height_log2);
+ averages = _mm_shuffle_epi8(averages, dup16);
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ for (int x = 0; x < block_width; x += 8) {
+ __m128i samples = LoadUnaligned16(&luma_ptr[x]);
+ StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples, averages));
+ }
+ }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler444_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_width_log2 == 4 || block_width_log2 == 5, "");
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 1 << block_width_log2;
+ const int horz_inside = block_width <= max_luma_width;
+ const int vert_inside = block_height <= max_luma_height;
+ if (horz_inside && vert_inside) {
+ CflSubsampler444_SSE4_1<block_width_log2, block_height_log2, true>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler444_SSE4_1<block_width_log2, block_height_log2, false>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+// Takes in two sums of input row pairs, and completes the computation for two
+// output rows.
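+// The shift by 1 converts each sum of four luma samples to the Q3 (x8) scale
+// used by the CfL buffer: 2 * sum == 8 * average.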
+inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0,
+ const __m128i vertical_sum1,
+ int16_t* luma_ptr) {
+ __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
+ result = _mm_slli_epi16(result, 1);
+ StoreLo8(luma_ptr, result);
+ StoreHi8(luma_ptr + kCflLumaBufferStride, result);
+ return result;
+}
+
+// Takes two halves of a vertically added pair of rows and completes the
+// computation for one output row.
+inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0,
+ const __m128i vertical_sum1,
+ int16_t* luma_ptr) {
+ __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
+ result = _mm_slli_epi16(result, 1);
+ StoreUnaligned16(luma_ptr, result);
+ return result;
+}
+
+template <int block_height_log2>
+void CflSubsampler420_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint8_t*>(source);
+ int16_t* luma_ptr = luma[0];
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = 0;
+ do {
+ // Note that with 4:2:0 subsampling the luma row is twice the output width
+ // (8 samples); converting them to 16 bits fills the vector.
+ const __m128i samples_row0 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i samples_row1 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i luma_sum01 = _mm_add_epi16(samples_row0, samples_row1);
+
+ const __m128i samples_row2 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i samples_row3 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i luma_sum23 = _mm_add_epi16(samples_row2, samples_row3);
+ __m128i sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ const __m128i samples_row4 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i samples_row5 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i luma_sum45 = _mm_add_epi16(samples_row4, samples_row5);
+
+ const __m128i samples_row6 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i samples_row7 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i luma_sum67 = _mm_add_epi16(samples_row6, samples_row7);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+ y += 4;
+ } while (y < luma_height);
+ const __m128i final_fill = LoadLo8(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill_to_sum = _mm_cvtepu16_epi32(final_fill);
+ for (; y < block_height; ++y) {
+ StoreLo8(luma_ptr, final_fill);
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ }
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_U32(
+ final_sum, block_height_log2 + 2 /*log2 of width 4*/);
+
+ averages = _mm_shufflelo_epi16(averages, 0);
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ const __m128i samples = LoadLo8(luma_ptr);
+ StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+ }
+}
+
+// This duplicates the last two 16-bit values in |row|.
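+// Shuffling with 0xFF broadcasts dword 3, which holds samples 6 and 7.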
+inline __m128i LastRowSamples(const __m128i row) {
+ return _mm_shuffle_epi32(row, 0xFF);
+}
+
+// This duplicates the last 16-bit value in |row|.
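+// _mm_shufflehi_epi16 with 0xFF copies word 7 across the upper half; the
+// dword shuffle then broadcasts that pair across the register.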
+inline __m128i LastRowResult(const __m128i row) {
+ const __m128i dup_row = _mm_shufflehi_epi16(row, 0xFF);
+ return _mm_shuffle_epi32(dup_row, 0xFF);
+}
+
+template <int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint8_t*>(source);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ int16_t* luma_ptr = luma[0];
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = 0;
+
+ do {
+ const __m128i samples_row00 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row01 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row00);
+ src += stride;
+ const __m128i samples_row10 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row11 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row10);
+ src += stride;
+ const __m128i luma_sum00 = _mm_add_epi16(samples_row00, samples_row10);
+ const __m128i luma_sum01 = _mm_add_epi16(samples_row01, samples_row11);
+ __m128i sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row20 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row21 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row20);
+ src += stride;
+ const __m128i samples_row30 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row31 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row30);
+ src += stride;
+ const __m128i luma_sum10 = _mm_add_epi16(samples_row20, samples_row30);
+ const __m128i luma_sum11 = _mm_add_epi16(samples_row21, samples_row31);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row40 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row41 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row40);
+ src += stride;
+ const __m128i samples_row50 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row51 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row50);
+ src += stride;
+ const __m128i luma_sum20 = _mm_add_epi16(samples_row40, samples_row50);
+ const __m128i luma_sum21 = _mm_add_epi16(samples_row41, samples_row51);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row60 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row61 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row60);
+ src += stride;
+ const __m128i samples_row70 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row71 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row70);
+ src += stride;
+ const __m128i luma_sum30 = _mm_add_epi16(samples_row60, samples_row70);
+ const __m128i luma_sum31 = _mm_add_epi16(samples_row61, samples_row71);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+ y += 4;
+ } while (y < luma_height);
+ // Duplicate the final row downward to fill the rows beyond max_luma_height.
+ const __m128i final_fill = LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill_to_sum0 = _mm_cvtepi16_epi32(final_fill);
+ const __m128i final_fill_to_sum1 =
+ _mm_cvtepi16_epi32(_mm_srli_si128(final_fill, 8));
+ const __m128i final_fill_to_sum =
+ _mm_add_epi32(final_fill_to_sum0, final_fill_to_sum1);
+ for (; y < block_height; ++y) {
+ StoreUnaligned16(luma_ptr, final_fill);
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ }
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_S32(
+ final_sum, block_height_log2 + 3 /*log2 of width 8*/);
+
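+  // Broadcast the 16-bit average: shufflelo copies word 0 across the low four
+  // words, then the dword shuffle replicates dword 0 to every lane.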
+ averages = _mm_shufflelo_epi16(averages, 0);
+ averages = _mm_shuffle_epi32(averages, 0);
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ const __m128i samples = LoadUnaligned16(luma_ptr);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler420_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ if (max_luma_width == 8) {
+ CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 16>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+template <int block_width_log2, int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const auto* src = static_cast<const uint8_t*>(source);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ const int block_height = 1 << block_height_log2;
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+
+ int16_t* luma_ptr = luma[0];
+ __m128i final_row_result;
+ // Begin first y section, covering width up to 16.
+ int y = 0;
+ do {
+ const uint8_t* src_next = src + stride;
+ const __m128i samples_row0_lo = LoadUnaligned16(src);
+ const __m128i samples_row00 = _mm_cvtepu8_epi16(samples_row0_lo);
+ const __m128i samples_row01 = (max_luma_width >= 16)
+ ? _mm_unpackhi_epi8(samples_row0_lo, zero)
+ : LastRowSamples(samples_row00);
+ const __m128i samples_row0_hi = LoadUnaligned16(src + 16);
+ const __m128i samples_row02 = (max_luma_width >= 24)
+ ? _mm_cvtepu8_epi16(samples_row0_hi)
+ : LastRowSamples(samples_row01);
+ const __m128i samples_row03 = (max_luma_width == 32)
+ ? _mm_unpackhi_epi8(samples_row0_hi, zero)
+ : LastRowSamples(samples_row02);
+ const __m128i samples_row1_lo = LoadUnaligned16(src_next);
+ const __m128i samples_row10 = _mm_cvtepu8_epi16(samples_row1_lo);
+ const __m128i samples_row11 = (max_luma_width >= 16)
+ ? _mm_unpackhi_epi8(samples_row1_lo, zero)
+ : LastRowSamples(samples_row10);
+ const __m128i samples_row1_hi = LoadUnaligned16(src_next + 16);
+ const __m128i samples_row12 = (max_luma_width >= 24)
+ ? _mm_cvtepu8_epi16(samples_row1_hi)
+ : LastRowSamples(samples_row11);
+ const __m128i samples_row13 = (max_luma_width == 32)
+ ? _mm_unpackhi_epi8(samples_row1_hi, zero)
+ : LastRowSamples(samples_row12);
+ const __m128i luma_sum0 = _mm_add_epi16(samples_row00, samples_row10);
+ const __m128i luma_sum1 = _mm_add_epi16(samples_row01, samples_row11);
+ const __m128i luma_sum2 = _mm_add_epi16(samples_row02, samples_row12);
+ const __m128i luma_sum3 = _mm_add_epi16(samples_row03, samples_row13);
+ __m128i sum = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
+ final_row_result =
+ StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
+ sum = _mm_add_epi16(sum, final_row_result);
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+ src += stride << 1;
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < luma_height);
+
+ // Because max_luma_width is at most 32, any values beyond x=16 will
+ // necessarily be duplicated.
+ if (block_width_log2 == 5) {
+ const __m128i wide_fill = LastRowResult(final_row_result);
+ // Multiply duplicated value by number of occurrences, height * 4, since
+ // there are 16 in each row and the value appears in the vector 4 times.
+ final_sum = _mm_add_epi32(
+ final_sum,
+ _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), block_height_log2 + 2));
+ }
+
+ // Begin second y section.
+ if (y < block_height) {
+ const __m128i final_fill0 =
+ LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill1 =
+ LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8);
+ const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1);
+ const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum);
+ const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero);
+ const __m128i final_fill_to_sum =
+ _mm_add_epi32(final_inner_sum0, final_inner_sum1);
+
+ do {
+ StoreUnaligned16(luma_ptr, final_fill0);
+ StoreUnaligned16(luma_ptr + 8, final_fill1);
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ } while (++y < block_height);
+ } // End second y section.
+
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_S32(
+ final_sum, block_width_log2 + block_height_log2);
+ averages = _mm_shufflelo_epi16(averages, 0);
+ averages = _mm_shuffle_epi32(averages, 0);
+
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ const __m128i samples0 = LoadUnaligned16(luma_ptr);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples0, averages));
+ const __m128i samples1 = LoadUnaligned16(luma_ptr + 8);
+ final_row_result = _mm_sub_epi16(samples1, averages);
+ StoreUnaligned16(luma_ptr + 8, final_row_result);
+ }
+ if (block_width_log2 == 5) {
+ int16_t* wide_luma_ptr = luma[0] + 16;
+ const __m128i wide_fill = LastRowResult(final_row_result);
+ for (int i = 0; i < block_height;
+ ++i, wide_luma_ptr += kCflLumaBufferStride) {
+ StoreUnaligned16(wide_luma_ptr, wide_fill);
+ StoreUnaligned16(wide_luma_ptr + 8, wide_fill);
+ }
+ }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler420_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ switch (max_luma_width) {
+ case 8:
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ return;
+ case 16:
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 16>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ return;
+ case 24:
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 24>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ return;
+ default:
+ assert(max_luma_width == 32);
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 32>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ return;
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler420_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler420_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler420_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<5, 5>;
+#endif
+
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler444_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler444_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler444_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<5, 5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor_SSE4_1<4, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize4x16] =
+ CflIntraPredictor_SSE4_1<4, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor_SSE4_1<8, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor_SSE4_1<8, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x16] =
+ CflIntraPredictor_SSE4_1<8, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x32] =
+ CflIntraPredictor_SSE4_1<8, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x4] =
+ CflIntraPredictor_SSE4_1<16, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x8] =
+ CflIntraPredictor_SSE4_1<16, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor_SSE4_1<16, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor_SSE4_1<16, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize32x8] =
+ CflIntraPredictor_SSE4_1<32, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor_SSE4_1<32, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor_SSE4_1<32, 32>;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void IntraPredCflInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredCflInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intrapred_smooth_sse4.cc b/src/dsp/x86/intrapred_smooth_sse4.cc
new file mode 100644
index 0000000..e944ea3
--- /dev/null
+++ b/src/dsp/x86/intrapred_smooth_sse4.cc
@@ -0,0 +1,2662 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring> // memcpy
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Note these constants are duplicated from intrapred.cc to give the compiler
+// visibility of the values. This helps reduce loads and aids the creation of
+// the inverse weights.
+constexpr uint8_t kSmoothWeights[] = {
+ // block dimension = 4
+ 255, 149, 85, 64,
+ // block dimension = 8
+ 255, 197, 146, 105, 73, 50, 37, 32,
+ // block dimension = 16
+ 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+ // block dimension = 32
+ 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+ 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+ // block dimension = 64
+ 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+ 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
+ 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
+ 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4};
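+
+// The per-dimension tables above are stored contiguously, so the weights for
+// dimension d begin at kSmoothWeights[d - 4] (offsets 0, 4, 12, 28 and 60).
+// Inverse weights are derived as 256 - w, e.g. for block dimension 4:
+// 256 - {255, 149, 85, 64} = {1, 107, 171, 192}.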
+
+template <int y_mask>
+inline void WriteSmoothHorizontalSum4(void* const dest, const __m128i& left,
+ const __m128i& weights,
+ const __m128i& scaled_top_right,
+ const __m128i& round) {
+ const __m128i left_y = _mm_shuffle_epi32(left, y_mask);
+ const __m128i weighted_left_y = _mm_mullo_epi16(left_y, weights);
+ const __m128i pred_sum = _mm_add_epi32(scaled_top_right, weighted_left_y);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 8);
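+  // The mask 0x0C080400 gathers the low byte of each 32-bit lane (bytes 0, 4,
+  // 8, 12) into the first four bytes for the 4-pixel store.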
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8));
+}
+
+template <int y_mask>
+inline __m128i SmoothVerticalSum4(const __m128i& top, const __m128i& weights,
+ const __m128i& scaled_bottom_left) {
+ const __m128i weights_y = _mm_shuffle_epi32(weights, y_mask);
+ const __m128i weighted_top_y = _mm_mullo_epi16(top, weights_y);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi32(scaled_bottom_left, y_mask);
+ return _mm_add_epi32(scaled_bottom_left_y, weighted_top_y);
+}
+
+template <int y_mask>
+inline void WriteSmoothVerticalSum4(uint8_t* dest, const __m128i& top,
+ const __m128i& weights,
+ const __m128i& scaled_bottom_left,
+ const __m128i& round) {
+ __m128i pred_sum =
+ SmoothVerticalSum4<y_mask>(top, weights, scaled_bottom_left);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ pred_sum = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 8);
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ Store4(dest, _mm_shuffle_epi8(pred_sum, cvtepi32_epi8));
+}
+
+// For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V,
+// |pixels| is a segment of the top row or the whole top row, and |weights| is
+// repeated.
+inline __m128i SmoothDirectionalSum8(const __m128i& pixels,
+ const __m128i& weights,
+ const __m128i& scaled_corner) {
+ const __m128i weighted_px = _mm_mullo_epi16(pixels, weights);
+ return _mm_add_epi16(scaled_corner, weighted_px);
+}
+
+inline void WriteSmoothDirectionalSum8(uint8_t* dest, const __m128i& pixels,
+ const __m128i& weights,
+ const __m128i& scaled_corner,
+ const __m128i& round) {
+ const __m128i pred_sum =
+ SmoothDirectionalSum8(pixels, weights, scaled_corner);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, round), 8);
+ StoreLo8(dest, _mm_packus_epi16(pred, pred));
+}
+
+// For Horizontal, pixels1 and pixels2 are the same repeated value. For
+// Vertical, weights1 and weights2 are the same, and scaled_corner1 and
+// scaled_corner2 are the same.
+inline void WriteSmoothDirectionalSum16(uint8_t* dest, const __m128i& pixels1,
+ const __m128i& pixels2,
+ const __m128i& weights1,
+ const __m128i& weights2,
+ const __m128i& scaled_corner1,
+ const __m128i& scaled_corner2,
+ const __m128i& round) {
+ const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
+ const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
+ const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
+ const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
+ const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
+ StoreUnaligned16(dest, _mm_packus_epi16(pred1, pred2));
+}
+
+template <int y_mask>
+inline void WriteSmoothPredSum4(uint8_t* const dest, const __m128i& top,
+ const __m128i& left, const __m128i& weights_x,
+ const __m128i& weights_y,
+ const __m128i& scaled_bottom_left,
+ const __m128i& scaled_top_right,
+ const __m128i& round) {
+ const __m128i left_y = _mm_shuffle_epi32(left, y_mask);
+ const __m128i weighted_left_y = _mm_mullo_epi32(left_y, weights_x);
+ const __m128i weight_y = _mm_shuffle_epi32(weights_y, y_mask);
+ const __m128i weighted_top = _mm_mullo_epi32(weight_y, top);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi32(scaled_bottom_left, y_mask);
+ const __m128i col_pred = _mm_add_epi32(scaled_bottom_left_y, weighted_left_y);
+ const __m128i row_pred = _mm_add_epi32(scaled_top_right, weighted_top);
+ const __m128i pred_sum = _mm_add_epi32(row_pred, col_pred);
+
+ // Equivalent to RightShiftWithRounding(pred[x][y], 9).
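+  // The vertical and horizontal terms are each scaled by 256, so the combined
+  // sum carries a scale of 512; with |round| == 256 this is (v + 256) >> 9.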
+ const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 9);
+
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8));
+}
+
+// pixels[0]: above and below_pred interleave vector
+// pixels[1]: left vector
+// pixels[2]: right_pred vector
+inline void LoadSmoothPixels4(const uint8_t* above, const uint8_t* left,
+ const int height, __m128i* pixels) {
+ if (height == 4) {
+ pixels[1] = Load4(left);
+ } else if (height == 8) {
+ pixels[1] = LoadLo8(left);
+ } else {
+ pixels[1] = LoadUnaligned16(left);
+ }
+
+ const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+ const __m128i top = _mm_cvtepu8_epi16(Load4(above));
+ pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
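+  // Interleaving the top samples with the bottom-left value lets a single
+  // _mm_madd_epi16 against {w_y, 256 - w_y} pairs compute the vertical term
+  // w_y * top + (256 - w_y) * bottom_left per pixel.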
+ pixels[2] = _mm_set1_epi16(above[3]);
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], second half for height = 16 only
+// weight_h[3]: same as [1], second half for height = 16 only
+// weight_w[0]: weights_w and scale - weights_w interleave vector
+inline void LoadSmoothWeights4(const uint8_t* weight_array, const int height,
+ __m128i* weight_h, __m128i* weight_w) {
+ const __m128i scale = _mm_set1_epi16(256);
+ const __m128i x_weights = Load4(weight_array);
+ weight_h[0] = _mm_cvtepu8_epi16(x_weights);
+ weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
+ weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+
+ if (height == 8) {
+ const __m128i y_weights = LoadLo8(weight_array + 4);
+ weight_h[0] = _mm_cvtepu8_epi16(y_weights);
+ weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
+ } else if (height == 16) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i y_weights = LoadUnaligned16(weight_array + 12);
+ weight_h[0] = _mm_cvtepu8_epi16(y_weights);
+ weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(y_weights, zero);
+ weight_h[3] = _mm_sub_epi16(scale, weight_h[2]);
+ }
+}
+
+inline void WriteSmoothPred4x8(const __m128i* pixel, const __m128i* weights_y,
+ const __m128i* weight_x, uint8_t* dst,
+ const ptrdiff_t stride,
+ const bool use_second_half) {
+ const __m128i round = _mm_set1_epi32(256);
+ const __m128i mask_increment = _mm_set1_epi16(0x0202);
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i left = use_second_half ? _mm_unpackhi_epi8(pixel[1], zero)
+ : _mm_unpacklo_epi8(pixel[1], zero);
+ __m128i y_select = _mm_set1_epi16(0x0100);
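+  // |y_select| holds the byte indices of 16-bit element y ({0, 1} for row 0)
+  // and advances by two bytes per row, so _mm_shuffle_epi8 broadcasts the y-th
+  // weight and the y-th left sample.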
+
+ for (int i = 0; i < 8; ++i) {
+ const __m128i weight_y = _mm_shuffle_epi8(weights_y[0], y_select);
+ const __m128i inverted_weight_y = _mm_shuffle_epi8(weights_y[1], y_select);
+ const __m128i interleaved_weights =
+ _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+ __m128i vertical_pred = _mm_madd_epi16(pixel[0], interleaved_weights);
+
+ __m128i horizontal_vect = _mm_shuffle_epi8(left, y_select);
+ horizontal_vect = _mm_unpacklo_epi16(horizontal_vect, pixel[2]);
+ __m128i sum = _mm_madd_epi16(horizontal_vect, weight_x[0]);
+
+ sum = _mm_add_epi32(vertical_pred, sum);
+ sum = _mm_add_epi32(sum, round);
+ sum = _mm_srai_epi32(sum, 9);
+
+ sum = _mm_shuffle_epi8(sum, cvtepi32_epi8);
+ Store4(dst, sum);
+ dst += stride;
+
+ y_select = _mm_add_epi16(y_select, mask_increment);
+ }
+}
+
+// The interleaving approach has some overhead that causes it to underperform in
+// the 4x4 case.
+void Smooth4x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+ const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+ const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+ const __m128i scale = _mm_set1_epi32(256);
+  // The fourth 32-bit lane is top_row[3].
+ const __m128i top_right = _mm_shuffle_epi32(top, 0xFF);
+  // The fourth 32-bit lane is left_column[3].
+ const __m128i bottom_left = _mm_shuffle_epi32(left, 0xFF);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ auto* dst = static_cast<uint8_t*>(dest);
+ // AV1 spec 7.11.2.6 (3) describes the sum:
+  //   smoothPred[y][x:x+3] = weighted_top + scaled_right + weighted_left[y] +
+  //                          scaled_bottom[y]
+  // This could be a loop, were it not for the immediate values required by the
+  // shuffles.
+ WriteSmoothPredSum4<0>(dst, top, left, weights, weights, scaled_bottom_left,
+ scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothPredSum4<0x55>(dst, top, left, weights, weights,
+ scaled_bottom_left, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothPredSum4<0xAA>(dst, top, left, weights, weights,
+ scaled_bottom_left, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothPredSum4<0xFF>(dst, top, left, weights, weights,
+ scaled_bottom_left, scaled_top_right, scale);
+}
+
+void Smooth4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ __m128i weights_x[1];
+ __m128i weights_y[2];
+ LoadSmoothWeights4(kSmoothWeights, 8, weights_y, weights_x);
+ __m128i pixels[3];
+ LoadSmoothPixels4(top_ptr, left_ptr, 8, pixels);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false);
+}
+
+void Smooth4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ __m128i weights_x[1];
+ __m128i weights_y[4];
+ LoadSmoothWeights4(kSmoothWeights, 16, weights_y, weights_x);
+ __m128i pixels[3];
+ LoadSmoothPixels4(top_ptr, left_ptr, 16, pixels);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false);
+ dst += stride << 3;
+ WriteSmoothPred4x8(pixels, &weights_y[2], weights_x, dst, stride, true);
+}
+
+// pixels[0]: above and below_pred interleave vector, first half
+// pixels[1]: above and below_pred interleave vector, second half
+// pixels[2]: left vector
+// pixels[3]: right_pred vector
+// pixels[4]: above and below_pred interleave vector, first half
+// pixels[5]: above and below_pred interleave vector, second half
+// pixels[6]: left vector + 16
+// pixels[7]: right_pred vector
+inline void LoadSmoothPixels8(const uint8_t* above, const uint8_t* left,
+ const int height, __m128i* pixels) {
+ const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+ __m128i top_row = _mm_cvtepu8_epi16(LoadLo8(above));
+ pixels[0] = _mm_unpacklo_epi16(top_row, bottom_left);
+ pixels[1] = _mm_unpackhi_epi16(top_row, bottom_left);
+
+ pixels[3] = _mm_set1_epi16(above[7]);
+
+ if (height == 4) {
+ pixels[2] = Load4(left);
+ } else if (height == 8) {
+ pixels[2] = LoadLo8(left);
+ } else if (height == 16) {
+ pixels[2] = LoadUnaligned16(left);
+ } else {
+ pixels[2] = LoadUnaligned16(left);
+ pixels[4] = pixels[0];
+ pixels[5] = pixels[1];
+ pixels[6] = LoadUnaligned16(left + 16);
+ pixels[7] = pixels[3];
+ }
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], offset 8
+// weight_h[3]: same as [1], offset 8
+// weight_h[4]: same as [0], offset 16
+// weight_h[5]: same as [1], offset 16
+// weight_h[6]: same as [0], offset 24
+// weight_h[7]: same as [1], offset 24
+// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
+// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
+inline void LoadSmoothWeights8(const uint8_t* weight_array, const int height,
+ __m128i* weight_w, __m128i* weight_h) {
+ const int offset = (height < 8) ? 0 : 4;
+ __m128i loaded_weights = LoadUnaligned16(&weight_array[offset]);
+ weight_h[0] = _mm_cvtepu8_epi16(loaded_weights);
+ const __m128i inverter = _mm_set1_epi16(256);
+ weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
+
+ if (height == 4) {
+ loaded_weights = _mm_srli_si128(loaded_weights, 4);
+ __m128i weights_x = _mm_cvtepu8_epi16(loaded_weights);
+ __m128i inverted_weights_x = _mm_sub_epi16(inverter, weights_x);
+ weight_w[0] = _mm_unpacklo_epi16(weights_x, inverted_weights_x);
+ weight_w[1] = _mm_unpackhi_epi16(weights_x, inverted_weights_x);
+ } else {
+ weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+ weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
+ }
+
+ if (height == 16) {
+ const __m128i zero = _mm_setzero_si128();
+ loaded_weights = LoadUnaligned16(weight_array + 12);
+ weight_h[0] = _mm_cvtepu8_epi16(loaded_weights);
+ weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(loaded_weights, zero);
+ weight_h[3] = _mm_sub_epi16(inverter, weight_h[2]);
+ } else if (height == 32) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i weight_lo = LoadUnaligned16(weight_array + 28);
+ weight_h[0] = _mm_cvtepu8_epi16(weight_lo);
+ weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
+ weight_h[3] = _mm_sub_epi16(inverter, weight_h[2]);
+ const __m128i weight_hi = LoadUnaligned16(weight_array + 44);
+ weight_h[4] = _mm_cvtepu8_epi16(weight_hi);
+ weight_h[5] = _mm_sub_epi16(inverter, weight_h[4]);
+ weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
+ weight_h[7] = _mm_sub_epi16(inverter, weight_h[6]);
+ }
+}
+
+inline void WriteSmoothPred8xH(const __m128i* pixels, const __m128i* weights_x,
+ const __m128i* weights_y, const int height,
+ uint8_t* dst, const ptrdiff_t stride,
+ const bool use_second_half) {
+ const __m128i round = _mm_set1_epi32(256);
+ const __m128i mask_increment = _mm_set1_epi16(0x0202);
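+  // Gathers the low byte of every 16-bit lane (bytes 0, 2, ..., 14) into the
+  // low eight bytes of the result.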
+  const __m128i cvt_epu16_epi8 = _mm_set_epi32(0, 0, 0x0E0C0A08, 0x06040200);
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i left = use_second_half ? _mm_unpackhi_epi8(pixels[2], zero)
+ : _mm_unpacklo_epi8(pixels[2], zero);
+ __m128i y_select = _mm_set1_epi16(0x100);
+
+ for (int i = 0; i < height; ++i) {
+ const __m128i weight_y = _mm_shuffle_epi8(weights_y[0], y_select);
+ const __m128i inverted_weight_y = _mm_shuffle_epi8(weights_y[1], y_select);
+ const __m128i interleaved_weights =
+ _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+ const __m128i vertical_sum0 =
+ _mm_madd_epi16(pixels[0], interleaved_weights);
+ const __m128i vertical_sum1 =
+ _mm_madd_epi16(pixels[1], interleaved_weights);
+
+ __m128i horizontal_pixels = _mm_shuffle_epi8(left, y_select);
+ horizontal_pixels = _mm_unpacklo_epi16(horizontal_pixels, pixels[3]);
+ const __m128i horizontal_sum0 =
+ _mm_madd_epi16(horizontal_pixels, weights_x[0]);
+ const __m128i horizontal_sum1 =
+ _mm_madd_epi16(horizontal_pixels, weights_x[1]);
+
+ __m128i sum0 = _mm_add_epi32(vertical_sum0, horizontal_sum0);
+ sum0 = _mm_add_epi32(sum0, round);
+ sum0 = _mm_srai_epi32(sum0, 9);
+
+ __m128i sum1 = _mm_add_epi32(vertical_sum1, horizontal_sum1);
+ sum1 = _mm_add_epi32(sum1, round);
+ sum1 = _mm_srai_epi32(sum1, 9);
+
+ sum0 = _mm_packus_epi16(sum0, sum1);
+ sum0 = _mm_shuffle_epi8(sum0, cvt_epu16_epi8);
+ StoreLo8(dst, sum0);
+ dst += stride;
+
+ y_select = _mm_add_epi16(y_select, mask_increment);
+ }
+}
+
+void Smooth8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ __m128i pixels[4];
+ LoadSmoothPixels8(top_ptr, left_ptr, 4, pixels);
+
+ __m128i weights_x[2], weights_y[2];
+ LoadSmoothWeights8(kSmoothWeights, 4, weights_x, weights_y);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred8xH(pixels, weights_x, weights_y, 4, dst, stride, false);
+}
+
+void Smooth8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+
+ __m128i pixels[4];
+ LoadSmoothPixels8(top_ptr, left_ptr, 8, pixels);
+
+ __m128i weights_x[2], weights_y[2];
+ LoadSmoothWeights8(kSmoothWeights, 8, weights_x, weights_y);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
+}
+
+void Smooth8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ __m128i pixels[4];
+ LoadSmoothPixels8(top_ptr, left_ptr, 16, pixels);
+
+ __m128i weights_x[2], weights_y[4];
+ LoadSmoothWeights8(kSmoothWeights, 16, weights_x, weights_y);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
+ dst += stride << 3;
+ WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true);
+}
+
+void Smooth8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ __m128i pixels[8];
+ LoadSmoothPixels8(top_ptr, left_ptr, 32, pixels);
+
+ __m128i weights_x[2], weights_y[8];
+ LoadSmoothWeights8(kSmoothWeights, 32, weights_x, weights_y);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
+ dst += stride << 3;
+ WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true);
+ dst += stride << 3;
+ WriteSmoothPred8xH(&pixels[4], weights_x, &weights_y[4], 8, dst, stride,
+ false);
+ dst += stride << 3;
+ WriteSmoothPred8xH(&pixels[4], weights_x, &weights_y[6], 8, dst, stride,
+ true);
+}
+
+template <int width, int height>
+void SmoothWxH(void* const dest, const ptrdiff_t stride,
+ const void* const top_row, const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const uint8_t* const sm_weights_h = kSmoothWeights + height - 4;
+ const uint8_t* const sm_weights_w = kSmoothWeights + width - 4;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i scale_value = _mm_set1_epi16(256);
+ const __m128i bottom_left = _mm_cvtsi32_si128(left_ptr[height - 1]);
+ const __m128i top_right = _mm_set1_epi16(top_ptr[width - 1]);
+ const __m128i round = _mm_set1_epi32(256);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < height; ++y) {
+ const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
+ const __m128i left_y = _mm_cvtsi32_si128(left_ptr[y]);
+ const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
+ __m128i scaled_bottom_left =
+ _mm_mullo_epi16(scale_m_weights_y, bottom_left);
+ const __m128i weight_left_y =
+ _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
+ scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
+ scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
+ for (int x = 0; x < width; x += 8) {
+ const __m128i top_x = LoadLo8(top_ptr + x);
+ const __m128i weights_x = LoadLo8(sm_weights_w + x);
+ const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
+ const __m128i top_weights_x_lo = _mm_cvtepu8_epi16(top_weights_x);
+ const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);
+
+      // Each pixel is multiplied by the weight from the opposite direction:
+      // the madd of {top, w_x} pairs with {w_y, left} pairs yields
+      // w_y * top + w_x * left. The interleaving order is reflected in the
+      // variable names.
+ __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
+ __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);
+
+ // |scaled_bottom_left| is always scaled by the same weight each row, so
+ // we only derive |scaled_top_right| values here.
+ const __m128i inverted_weights_x =
+ _mm_sub_epi16(scale_value, _mm_cvtepu8_epi16(weights_x));
+ const __m128i scaled_top_right =
+ _mm_mullo_epi16(inverted_weights_x, top_right);
+ const __m128i scaled_top_right_lo = _mm_cvtepu16_epi32(scaled_top_right);
+ const __m128i scaled_top_right_hi =
+ _mm_unpackhi_epi16(scaled_top_right, zero);
+ pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
+ pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
+ pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
+ pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);
+
+ // The round value for RightShiftWithRounding was added with
+ // |scaled_bottom_left|.
+ pred_lo = _mm_srli_epi32(pred_lo, 9);
+ pred_hi = _mm_srli_epi32(pred_hi, 9);
+ const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+ StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
+ }
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal4x4_SSE4_1(void* dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi32(top_ptr[3]);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left = _mm_cvtepu8_epi32(Load4(left_ptr));
+ const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+ __m128i scale = _mm_set1_epi32(256);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi32(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi32(top[3]);
+ const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+ __m128i scale = _mm_set1_epi32(256);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi32(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+
+ left = _mm_cvtepu8_epi32(Load4(left_ptr + 4));
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi32(top[3]);
+ const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+ __m128i scale = _mm_set1_epi32(256);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi32(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+
+ left = _mm_cvtepu8_epi32(Load4(left_ptr + 4));
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+
+ left = _mm_cvtepu8_epi32(Load4(left_ptr + 8));
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+
+ left = _mm_cvtepu8_epi32(Load4(left_ptr + 12));
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[7]);
+ const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi16(128);
+ __m128i y_select = _mm_set1_epi32(0x01000100);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x03020302);
+ left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x05040504);
+ left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x07060706);
+ left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[7]);
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[7]);
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[7]);
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal16x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ scale = _mm_set1_epi16(128);
+ __m128i y_mask = _mm_set1_epi32(0x01000100);
+ __m128i left_y = _mm_shuffle_epi8(left, y_mask);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ y_mask = _mm_set1_epi32(0x03020302);
+ left_y = _mm_shuffle_epi8(left, y_mask);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ y_mask = _mm_set1_epi32(0x05040504);
+ left_y = _mm_shuffle_epi8(left, y_mask);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ y_mask = _mm_set1_epi32(0x07060706);
+ left_y = _mm_shuffle_epi8(left, y_mask);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+}
+
+void SmoothHorizontal16x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal16x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal16x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal16x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+ }
+}
+
+void SmoothHorizontal32x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[31]);
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal32x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[31]);
+ const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+ const __m128i left2 =
+ _mm_cvtepu8_epi16(LoadLo8(static_cast<const uint8_t*>(left_column) + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal32x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[31]);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal32x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[31]);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+ }
+}
+
+void SmoothHorizontal64x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[63]);
+ const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
+ const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
+ const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
+ const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
+ const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+ const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
+ const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+ const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+ const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+ const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+ const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+ const __m128i scaled_top_right5 =
+ _mm_mullo_epi16(inverted_weights5, top_right);
+ const __m128i scaled_top_right6 =
+ _mm_mullo_epi16(inverted_weights6, top_right);
+ const __m128i scaled_top_right7 =
+ _mm_mullo_epi16(inverted_weights7, top_right);
+ const __m128i scaled_top_right8 =
+ _mm_mullo_epi16(inverted_weights8, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+ const __m128i left2 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal64x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[63]);
+ const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
+ const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
+ const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
+ const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
+ const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+ const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
+ const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+ const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+ const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+ const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+ const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+ const __m128i scaled_top_right5 =
+ _mm_mullo_epi16(inverted_weights5, top_right);
+ const __m128i scaled_top_right6 =
+ _mm_mullo_epi16(inverted_weights6, top_right);
+ const __m128i scaled_top_right7 =
+ _mm_mullo_epi16(inverted_weights7, top_right);
+ const __m128i scaled_top_right8 =
+ _mm_mullo_epi16(inverted_weights8, top_right);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left2 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+ const __m128i left3 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+ const __m128i left4 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal64x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[63]);
+ const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
+ const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
+ const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
+ const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
+ const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+ const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
+ const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+ const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+ const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+ const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+ const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+ const __m128i scaled_top_right5 =
+ _mm_mullo_epi16(inverted_weights5, top_right);
+ const __m128i scaled_top_right6 =
+ _mm_mullo_epi16(inverted_weights6, top_right);
+ const __m128i scaled_top_right7 =
+ _mm_mullo_epi16(inverted_weights7, top_right);
+ const __m128i scaled_top_right8 =
+ _mm_mullo_epi16(inverted_weights8, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+ }
+}
+
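+// The SmoothVertical kernels below mirror the horizontal ones, blending each
+// top-row pixel against the bottom-left corner with one weight per row
+// (a scalar sketch):
+//   pred[y][x] = (w[y] * top[x] + (256 - w[y]) * bottom_left + 128) >> 8
+// with w[] again taken from kSmoothWeights for the block height.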
+inline void LoadSmoothVerticalPixels4(const uint8_t* above, const uint8_t* left,
+ const int height, __m128i* pixels) {
+ __m128i top = Load4(above);
+ const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+ top = _mm_cvtepu8_epi16(top);
+ pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
+}
+
+// The output |weights| alternates weight vectors loaded from |weight_array|
+// with their inverted (256 - w) counterparts. These are precomputed by the
+// compiler when the weights table is visible to this module; removing that
+// visibility can cut speed by up to half in both 4xH and 8xH transforms.
+inline void LoadSmoothVerticalWeights4(const uint8_t* weight_array,
+ const int height, __m128i* weights) {
+ const __m128i inverter = _mm_set1_epi16(256);
+
+ if (height == 4) {
+ const __m128i weight = Load4(weight_array);
+ weights[0] = _mm_cvtepu8_epi16(weight);
+ weights[1] = _mm_sub_epi16(inverter, weights[0]);
+ } else if (height == 8) {
+ const __m128i weight = LoadLo8(weight_array + 4);
+ weights[0] = _mm_cvtepu8_epi16(weight);
+ weights[1] = _mm_sub_epi16(inverter, weights[0]);
+ } else {
+ const __m128i weight = LoadUnaligned16(weight_array + 12);
+ const __m128i zero = _mm_setzero_si128();
+ weights[0] = _mm_cvtepu8_epi16(weight);
+ weights[1] = _mm_sub_epi16(inverter, weights[0]);
+ weights[2] = _mm_unpackhi_epi8(weight, zero);
+ weights[3] = _mm_sub_epi16(inverter, weights[2]);
+ }
+}
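+// For example (a sketch; w[] are the table entries at the offset selected
+// above): with height == 8, weights[0] holds {w[0], ..., w[7]} widened to
+// 16 bits and weights[1] holds {256 - w[0], ..., 256 - w[7]}, so broadcasting
+// lane y from each and interleaving them yields the (w[y], 256 - w[y]) pairs
+// consumed by the madd in WriteSmoothVertical4xH below.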
+
+inline void WriteSmoothVertical4xH(const __m128i* pixel, const __m128i* weight,
+ const int height, uint8_t* dst,
+ const ptrdiff_t stride) {
+ const __m128i pred_round = _mm_set1_epi32(128);
+ const __m128i mask_increment = _mm_set1_epi16(0x0202);
+ const __m128i cvtepu8_epi32 = _mm_set1_epi32(0xC080400);
+ __m128i y_select = _mm_set1_epi16(0x0100);
+
+ for (int y = 0; y < height; ++y) {
+ const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
+ const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
+ const __m128i alternate_weights =
+ _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+ // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
+ // The madd instruction yields four results of the form:
+ // (top_row[x] * weight[y] + corner * inverted_weight[y])
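+ // For example, with top_row[x] = 100, corner = 50 and weight[y] = 192:
+ //   192 * 100 + (256 - 192) * 50 = 22400, and (22400 + 128) >> 8 = 88.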
+ __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
+ sum = _mm_add_epi32(sum, pred_round);
+ sum = _mm_srai_epi32(sum, 8);
+ sum = _mm_shuffle_epi8(sum, cvtepu8_epi32);
+ Store4(dst, sum);
+ dst += stride;
+ y_select = _mm_add_epi16(y_select, mask_increment);
+ }
+}
+
+void SmoothVertical4x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+ const auto* const above = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i pixels;
+ LoadSmoothVerticalPixels4(above, left, 4, &pixels);
+
+ __m128i weights[2];
+ LoadSmoothVerticalWeights4(kSmoothWeights, 4, weights);
+
+ WriteSmoothVertical4xH(&pixels, weights, 4, dst, stride);
+}
+
+void SmoothVertical4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+ const auto* const above = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i pixels;
+ LoadSmoothVerticalPixels4(above, left, 8, &pixels);
+
+ __m128i weights[2];
+ LoadSmoothVerticalWeights4(kSmoothWeights, 8, weights);
+
+ WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride);
+}
+
+void SmoothVertical4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+ const auto* const above = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i pixels;
+ LoadSmoothVerticalPixels4(above, left, 16, &pixels);
+
+ __m128i weights[4];
+ LoadSmoothVerticalWeights4(kSmoothWeights, 16, weights);
+
+ WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride);
+ dst += stride << 3;
+ WriteSmoothVertical4xH(&pixels, &weights[2], 8, dst, stride);
+}
+
+void SmoothVertical8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
+ const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ scale = _mm_set1_epi16(128);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i y_select = _mm_set1_epi32(0x01000100);
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x03020302);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x05040504);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x07060706);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+}
+
+void SmoothVertical8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ scale = _mm_set1_epi16(128);
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ scale = _mm_set1_epi16(128);
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical16x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
+ const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ scale = _mm_set1_epi16(128);
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+ __m128i y_select = _mm_set1_epi32(0x01000100);
+ __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x03020302);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x05040504);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x07060706);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+}
+
+void SmoothVertical16x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ scale = _mm_set1_epi16(128);
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical16x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+ scale = _mm_set1_epi16(128);
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical16x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ scale = _mm_set1_epi16(128);
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical16x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
+ const __m128i scale = _mm_set1_epi16(256);
+ const __m128i round = _mm_set1_epi16(128);
+ const __m128i zero = _mm_setzero_si128();
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+ const uint8_t* weights_base_ptr = kSmoothWeights + 60;
+ for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+ const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+ const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ }
+}
+
+void SmoothVertical32x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
+ const __m128i top_lo = LoadUnaligned16(top_ptr);
+ const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ scale = _mm_set1_epi16(128);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical32x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+ const __m128i top_lo = LoadUnaligned16(top_ptr);
+ const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ scale = _mm_set1_epi16(128);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical32x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i top_lo = LoadUnaligned16(top_ptr);
+ const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ scale = _mm_set1_epi16(128);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical32x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
+ const __m128i top_lo = LoadUnaligned16(top_ptr);
+ const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i scale = _mm_set1_epi16(256);
+ const __m128i round = _mm_set1_epi16(128);
+ const uint8_t* weights_base_ptr = kSmoothWeights + 60;
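+  // kSmoothWeights packs the weight sets for dimensions 4, 8, 16, 32 and 64
+  // back to back, so the 64-entry set begins at offset 4 + 8 + 16 + 32 = 60
+  // and is consumed 16 entries at a time below.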
+ for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+ const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+ const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ }
+}
+
+void SmoothVertical64x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top_lolo = LoadUnaligned16(top_ptr);
+ const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
+ const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
+ const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+ const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
+ const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ scale = _mm_set1_epi16(128);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical64x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+ const __m128i top_lolo = LoadUnaligned16(top_ptr);
+ const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+ const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
+ const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
+ const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+ const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
+ const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ scale = _mm_set1_epi16(128);
+
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical64x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
+ const __m128i top_lolo = LoadUnaligned16(top_ptr);
+ const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+ const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
+ const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
+ const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+ const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
+ const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+ const __m128i scale = _mm_set1_epi16(256);
+ const __m128i round = _mm_set1_epi16(128);
+ const uint8_t* weights_base_ptr = kSmoothWeights + 60;
+ for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+ const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+ const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+ Smooth4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+ Smooth4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+ Smooth4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+ Smooth8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+ Smooth8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+ Smooth8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+ Smooth8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+ SmoothWxH<16, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+ SmoothWxH<16, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+ SmoothWxH<16, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+ SmoothWxH<16, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+ SmoothWxH<16, 64>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+ SmoothWxH<32, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+ SmoothWxH<32, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+ SmoothWxH<32, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+ SmoothWxH<32, 64>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+ SmoothWxH<64, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+ SmoothWxH<64, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+ SmoothWxH<64, 64>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical16x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical16x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical16x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical16x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+ SmoothVertical16x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical32x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical32x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical32x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+ SmoothVertical32x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical64x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical64x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+ SmoothVertical64x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal32x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal32x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal32x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal32x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal64x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal64x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal64x64_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void IntraPredSmoothInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredSmoothInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intrapred_sse4.cc b/src/dsp/x86/intrapred_sse4.cc
new file mode 100644
index 0000000..9938dfe
--- /dev/null
+++ b/src/dsp/x86/intrapred_sse4.cc
@@ -0,0 +1,3535 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <xmmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring> // memcpy
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// Utility Functions
+
+// This is a fast way to divide by a number of the form 2^n + 2^k, n > k.
+// Divide by 2^k by right shifting by k, leaving a denominator of 2^(n-k) + 1.
+// In the block size cases, n - k is 1 or 2 (the block is proportional to 1x2
+// or 1x4), so we use a multiplier that reflects division by 2+1=3 or 4+1=5 in
+// the high bits.
+constexpr int kThreeInverse = 0x5556;
+constexpr int kFiveInverse = 0x3334;
+template <int shiftk, int multiplier>
+inline __m128i DivideByMultiplyShift_U32(const __m128i dividend) {
+ const __m128i interm = _mm_srli_epi32(dividend, shiftk);
+ return _mm_mulhi_epi16(interm, _mm_cvtsi32_si128(multiplier));
+}
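+// For example, the DC of a 4x8 block divides by 12 = 2^2 * 3:
+// DivideByMultiplyShift_U32<2, kThreeInverse>(x) computes
+// ((x >> 2) * 0x5556) >> 16, and 0x5556 / 65536 is just over 1/3, so the
+// result approximates x / 12 for the sums that occur here.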
+
+// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which
+// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes.
+constexpr int kDuplicateFirstHalf = 0x44;
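+// For example, _mm_shuffle_epi32(v, kDuplicateFirstHalf) turns the 32-bit
+// lanes {a, b, c, d} into {a, b, a, b}.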
+
+//------------------------------------------------------------------------------
+// DcPredFuncs_SSE4_1
+
+using DcSumFunc = __m128i (*)(const void* ref);
+using DcStoreFunc = void (*)(void* dest, ptrdiff_t stride, const __m128i dc);
+using WriteDuplicateFunc = void (*)(void* dest, ptrdiff_t stride,
+ const __m128i column);
+// For copying an entire column across a block.
+using ColumnStoreFunc = void (*)(void* dest, ptrdiff_t stride,
+ const void* column);
+
+// DC intra-predictors for square and non-square blocks.
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+ DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+struct DcPredFuncs_SSE4_1 {
+ DcPredFuncs_SSE4_1() = delete;
+
+ static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+};
+
+// Directional intra-predictors, parameterized on the column-store function.
+template <ColumnStoreFunc col_storefn>
+struct DirectionalPredFuncs_SSE4_1 {
+ DirectionalPredFuncs_SSE4_1() = delete;
+
+ static void Vertical(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Horizontal(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+};
+
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+ DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
+ shiftk, dc_mult>::DcTop(void* const dest,
+ ptrdiff_t stride,
+ const void* const top_row,
+ const void* /*left_column*/) {
+ const __m128i rounder = _mm_set1_epi32(1 << (width_log2 - 1));
+ const __m128i sum = top_sumfn(top_row);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2);
+ storefn(dest, stride, dc);
+}
+
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+ DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
+ shiftk,
+ dc_mult>::DcLeft(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const __m128i rounder = _mm_set1_epi32(1 << (height_log2 - 1));
+ const __m128i sum = left_sumfn(left_column);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), height_log2);
+ storefn(dest, stride, dc);
+}
+
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+ DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
+ shiftk, dc_mult>::Dc(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i rounder =
+ _mm_set1_epi32((1 << (width_log2 - 1)) + (1 << (height_log2 - 1)));
+ const __m128i sum_top = top_sumfn(top_row);
+ const __m128i sum_left = left_sumfn(left_column);
+ const __m128i sum = _mm_add_epi32(sum_top, sum_left);
+ if (width_log2 == height_log2) {
+ const __m128i dc =
+ _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2 + 1);
+ storefn(dest, stride, dc);
+ } else {
+ const __m128i dc =
+ DivideByMultiplyShift_U32<shiftk, dc_mult>(_mm_add_epi32(sum, rounder));
+ storefn(dest, stride, dc);
+ }
+}
+
+//------------------------------------------------------------------------------
+// DirectionalPredFuncs_SSE4_1
+
+template <ColumnStoreFunc col_storefn>
+void DirectionalPredFuncs_SSE4_1<col_storefn>::Horizontal(
+ void* const dest, ptrdiff_t stride, const void* /*top_row*/,
+ const void* const left_column) {
+ col_storefn(dest, stride, left_column);
+}
+
+} // namespace
+
+//------------------------------------------------------------------------------
+namespace low_bitdepth {
+namespace {
+
+// |ref| points to 4 bytes containing 4 packed 8-bit pixel values.
+inline __m128i DcSum4_SSE4_1(const void* const ref) {
+ const __m128i vals = Load4(ref);
+ const __m128i zero = _mm_setzero_si128();
+ return _mm_sad_epu8(vals, zero);
+}
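+// _mm_sad_epu8 against zero sums each group of 8 bytes into the low 16 bits of
+// its 64-bit half; the wider sums below fold the two halves together with a
+// shift and add.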
+
+inline __m128i DcSum8_SSE4_1(const void* const ref) {
+ const __m128i vals = LoadLo8(ref);
+ const __m128i zero = _mm_setzero_si128();
+ return _mm_sad_epu8(vals, zero);
+}
+
+inline __m128i DcSum16_SSE4_1(const void* const ref) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i vals = LoadUnaligned16(ref);
+ const __m128i partial_sum = _mm_sad_epu8(vals, zero);
+ return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
+}
+
+inline __m128i DcSum32_SSE4_1(const void* const ref) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i vals1 = LoadUnaligned16(ref);
+ const __m128i vals2 = LoadUnaligned16(static_cast<const uint8_t*>(ref) + 16);
+ const __m128i partial_sum1 = _mm_sad_epu8(vals1, zero);
+ const __m128i partial_sum2 = _mm_sad_epu8(vals2, zero);
+ const __m128i partial_sum = _mm_add_epi16(partial_sum1, partial_sum2);
+ return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
+}
+
+inline __m128i DcSum64_SSE4_1(const void* const ref) {
+ const auto* const ref_ptr = static_cast<const uint8_t*>(ref);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i vals1 = LoadUnaligned16(ref_ptr);
+ const __m128i vals2 = LoadUnaligned16(ref_ptr + 16);
+ const __m128i vals3 = LoadUnaligned16(ref_ptr + 32);
+ const __m128i vals4 = LoadUnaligned16(ref_ptr + 48);
+ const __m128i partial_sum1 = _mm_sad_epu8(vals1, zero);
+ const __m128i partial_sum2 = _mm_sad_epu8(vals2, zero);
+ __m128i partial_sum = _mm_add_epi16(partial_sum1, partial_sum2);
+ const __m128i partial_sum3 = _mm_sad_epu8(vals3, zero);
+ partial_sum = _mm_add_epi16(partial_sum, partial_sum3);
+ const __m128i partial_sum4 = _mm_sad_epu8(vals4, zero);
+ partial_sum = _mm_add_epi16(partial_sum, partial_sum4);
+ return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
+}
+
+template <int height>
+inline void DcStore4xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ Store4(dst, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ Store4(dst, dc_dup);
+}
+
+template <int height>
+inline void DcStore8xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ StoreLo8(dst, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ StoreLo8(dst, dc_dup);
+}
+
+template <int height>
+inline void DcStore16xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ StoreUnaligned16(dst, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ StoreUnaligned16(dst, dc_dup);
+}
+
+template <int height>
+inline void DcStore32xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ StoreUnaligned16(dst, dc_dup);
+ StoreUnaligned16(dst + 16, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ StoreUnaligned16(dst, dc_dup);
+ StoreUnaligned16(dst + 16, dc_dup);
+}
+
+template <int height>
+inline void DcStore64xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ StoreUnaligned16(dst, dc_dup);
+ StoreUnaligned16(dst + 16, dc_dup);
+ StoreUnaligned16(dst + 32, dc_dup);
+ StoreUnaligned16(dst + 48, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ StoreUnaligned16(dst, dc_dup);
+ StoreUnaligned16(dst + 16, dc_dup);
+ StoreUnaligned16(dst + 32, dc_dup);
+ StoreUnaligned16(dst + 48, dc_dup);
+}
+
+// WriteDuplicateN assumes dup has 4 sets of 4 identical bytes that are meant to
+// be copied for width N into dest.
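+// For example, WriteDuplicate8x4 writes the 4 identical bytes of |dup32| lane 0
+// twice to fill row 0, lane 1 twice to fill row 1, and so on for 4 rows.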
+inline void WriteDuplicate4x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ Store4(dst, dup32);
+ dst += stride;
+ const int row1 = _mm_extract_epi32(dup32, 1);
+ memcpy(dst, &row1, 4);
+ dst += stride;
+ const int row2 = _mm_extract_epi32(dup32, 2);
+ memcpy(dst, &row2, 4);
+ dst += stride;
+ const int row3 = _mm_extract_epi32(dup32, 3);
+ memcpy(dst, &row3, 4);
+}
+
+inline void WriteDuplicate8x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+ auto* dst = static_cast<uint8_t*>(dest);
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_lo);
+ dst += stride;
+ _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_lo));
+ dst += stride;
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_hi);
+ dst += stride;
+ _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_hi));
+}
+
+inline void WriteDuplicate16x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+}
+
+inline void WriteDuplicate32x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+}
+
+inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_3);
+}
+
+// ColStoreN<writefn> copies each of the N values in |column| across its
+// corresponding row in dest.
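+// For example, ColStore4 expands the 4 column bytes {c0, c1, c2, c3} to
+// {c0 x4, c1 x4, c2 x4, c3 x4} and hands that to writefn, which copies each
+// 4-byte group across one row.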
+template <WriteDuplicateFunc writefn>
+inline void ColStore4_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const __m128i col_data = Load4(column);
+ const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
+ const __m128i col_dup32 = _mm_unpacklo_epi16(col_dup16, col_dup16);
+ writefn(dest, stride, col_dup32);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ const __m128i col_data = LoadLo8(column);
+ const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
+ const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_dup16, col_dup16);
+ auto* dst = static_cast<uint8_t*>(dest);
+ writefn(dst, stride, col_dup32_lo);
+ dst += stride4;
+ const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_dup16, col_dup16);
+ writefn(dst, stride, col_dup32_hi);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ const __m128i col_data = _mm_loadu_si128(static_cast<const __m128i*>(column));
+ const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+ const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+ const __m128i col_dup32_lolo = _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+ auto* dst = static_cast<uint8_t*>(dest);
+ writefn(dst, stride, col_dup32_lolo);
+ dst += stride4;
+ const __m128i col_dup32_lohi = _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+ writefn(dst, stride, col_dup32_lohi);
+ dst += stride4;
+ const __m128i col_dup32_hilo = _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hilo);
+ dst += stride4;
+ const __m128i col_dup32_hihi = _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hihi);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < 32; y += 16) {
+ const __m128i col_data =
+ LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+ const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+ const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+ const __m128i col_dup32_lolo =
+ _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+ writefn(dst, stride, col_dup32_lolo);
+ dst += stride4;
+ const __m128i col_dup32_lohi =
+ _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+ writefn(dst, stride, col_dup32_lohi);
+ dst += stride4;
+ const __m128i col_dup32_hilo =
+ _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hilo);
+ dst += stride4;
+ const __m128i col_dup32_hihi =
+ _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hihi);
+ dst += stride4;
+ }
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore64_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < 64; y += 16) {
+ const __m128i col_data =
+ LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+ const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+ const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+ const __m128i col_dup32_lolo =
+ _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+ writefn(dst, stride, col_dup32_lolo);
+ dst += stride4;
+ const __m128i col_dup32_lohi =
+ _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+ writefn(dst, stride, col_dup32_lohi);
+ dst += stride4;
+ const __m128i col_dup32_hilo =
+ _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hilo);
+ dst += stride4;
+ const __m128i col_dup32_hihi =
+ _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hihi);
+ dst += stride4;
+ }
+}
+
+struct DcDefs {
+ DcDefs() = delete;
+
+ using _4x4 = DcPredFuncs_SSE4_1<2, 2, DcSum4_SSE4_1, DcSum4_SSE4_1,
+ DcStore4xH_SSE4_1<4>, 0, 0>;
+  // shiftk is the smaller of width_log2 and height_log2.
+  // dc_mult is the 16-bit fixed-point reciprocal of 3 or 5, matching whether
+  // the larger block dimension is 2x or 4x the smaller one.
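+  // For example, a 4x8 block sums 12 edge pixels; Dc() adds the rounder
+  // (1 << 1) + (1 << 2) = 6 and then DivideByMultiplyShift_U32<2,
+  // kThreeInverse> approximates division by 12.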
+ using _4x8 = DcPredFuncs_SSE4_1<2, 3, DcSum4_SSE4_1, DcSum8_SSE4_1,
+ DcStore4xH_SSE4_1<8>, 2, kThreeInverse>;
+ using _4x16 = DcPredFuncs_SSE4_1<2, 4, DcSum4_SSE4_1, DcSum16_SSE4_1,
+ DcStore4xH_SSE4_1<16>, 2, kFiveInverse>;
+
+ using _8x4 = DcPredFuncs_SSE4_1<3, 2, DcSum8_SSE4_1, DcSum4_SSE4_1,
+ DcStore8xH_SSE4_1<4>, 2, kThreeInverse>;
+ using _8x8 = DcPredFuncs_SSE4_1<3, 3, DcSum8_SSE4_1, DcSum8_SSE4_1,
+ DcStore8xH_SSE4_1<8>, 0, 0>;
+ using _8x16 = DcPredFuncs_SSE4_1<3, 4, DcSum8_SSE4_1, DcSum16_SSE4_1,
+ DcStore8xH_SSE4_1<16>, 3, kThreeInverse>;
+ using _8x32 = DcPredFuncs_SSE4_1<3, 5, DcSum8_SSE4_1, DcSum32_SSE4_1,
+ DcStore8xH_SSE4_1<32>, 3, kFiveInverse>;
+
+ using _16x4 = DcPredFuncs_SSE4_1<4, 2, DcSum16_SSE4_1, DcSum4_SSE4_1,
+ DcStore16xH_SSE4_1<4>, 2, kFiveInverse>;
+ using _16x8 = DcPredFuncs_SSE4_1<4, 3, DcSum16_SSE4_1, DcSum8_SSE4_1,
+ DcStore16xH_SSE4_1<8>, 3, kThreeInverse>;
+ using _16x16 = DcPredFuncs_SSE4_1<4, 4, DcSum16_SSE4_1, DcSum16_SSE4_1,
+ DcStore16xH_SSE4_1<16>, 0, 0>;
+ using _16x32 = DcPredFuncs_SSE4_1<4, 5, DcSum16_SSE4_1, DcSum32_SSE4_1,
+ DcStore16xH_SSE4_1<32>, 4, kThreeInverse>;
+ using _16x64 = DcPredFuncs_SSE4_1<4, 6, DcSum16_SSE4_1, DcSum64_SSE4_1,
+ DcStore16xH_SSE4_1<64>, 4, kFiveInverse>;
+
+ using _32x8 = DcPredFuncs_SSE4_1<5, 3, DcSum32_SSE4_1, DcSum8_SSE4_1,
+ DcStore32xH_SSE4_1<8>, 3, kFiveInverse>;
+ using _32x16 = DcPredFuncs_SSE4_1<5, 4, DcSum32_SSE4_1, DcSum16_SSE4_1,
+ DcStore32xH_SSE4_1<16>, 4, kThreeInverse>;
+ using _32x32 = DcPredFuncs_SSE4_1<5, 5, DcSum32_SSE4_1, DcSum32_SSE4_1,
+ DcStore32xH_SSE4_1<32>, 0, 0>;
+ using _32x64 = DcPredFuncs_SSE4_1<5, 6, DcSum32_SSE4_1, DcSum64_SSE4_1,
+ DcStore32xH_SSE4_1<64>, 5, kThreeInverse>;
+
+ using _64x16 = DcPredFuncs_SSE4_1<6, 4, DcSum64_SSE4_1, DcSum16_SSE4_1,
+ DcStore64xH_SSE4_1<16>, 4, kFiveInverse>;
+ using _64x32 = DcPredFuncs_SSE4_1<6, 5, DcSum64_SSE4_1, DcSum32_SSE4_1,
+ DcStore64xH_SSE4_1<32>, 5, kThreeInverse>;
+ using _64x64 = DcPredFuncs_SSE4_1<6, 6, DcSum64_SSE4_1, DcSum64_SSE4_1,
+ DcStore64xH_SSE4_1<64>, 0, 0>;
+};
+
+struct DirDefs {
+ DirDefs() = delete;
+
+ using _4x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate4x4>>;
+ using _4x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate4x4>>;
+ using _4x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate4x4>>;
+ using _8x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate8x4>>;
+ using _8x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate8x4>>;
+ using _8x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate8x4>>;
+ using _8x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate8x4>>;
+ using _16x4 =
+ DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate16x4>>;
+ using _16x8 =
+ DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate16x4>>;
+ using _16x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate16x4>>;
+ using _16x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate16x4>>;
+ using _16x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate16x4>>;
+ using _32x8 =
+ DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate32x4>>;
+ using _32x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate32x4>>;
+ using _32x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate32x4>>;
+ using _32x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate32x4>>;
+ using _64x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate64x4>>;
+ using _64x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate64x4>>;
+ using _64x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate64x4>>;
+};
+
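+// Scalar form of the Paeth selection implemented below: with
+// base = left + top - top_left, choose left when |base - left| <= |base - top|
+// and |base - left| <= |base - top_left|, otherwise top when
+// |base - top| <= |base - top_left|, otherwise top_left. In these functions
+// |left_dists| holds |base - left| = |top - top_left|, |top_dists| holds
+// |base - top| = |left - top_left|, and |top_left_diffs| holds
+// top - 2 * top_left, so |left + top_left_diffs| = |base - top_left|.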
+template <int y_mask>
+inline void WritePaethLine4(uint8_t* dst, const __m128i& top,
+ const __m128i& left, const __m128i& top_lefts,
+ const __m128i& top_dists, const __m128i& left_dists,
+ const __m128i& top_left_diffs) {
+ const __m128i top_dists_y = _mm_shuffle_epi32(top_dists, y_mask);
+
+ const __m128i lefts_y = _mm_shuffle_epi32(left, y_mask);
+ const __m128i top_left_dists =
+ _mm_abs_epi32(_mm_add_epi32(lefts_y, top_left_diffs));
+
+ // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
+ // operation is unavailable, so the logic for selecting top, left, or
+ // top_left is inverted.
+ __m128i not_select_left = _mm_cmpgt_epi32(left_dists, top_left_dists);
+ not_select_left =
+ _mm_or_si128(not_select_left, _mm_cmpgt_epi32(left_dists, top_dists_y));
+ const __m128i not_select_top = _mm_cmpgt_epi32(top_dists_y, top_left_dists);
+
+ const __m128i left_out = _mm_andnot_si128(not_select_left, lefts_y);
+
+ const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
+ __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
+ top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
+ top_or_top_left_out = _mm_and_si128(not_select_left, top_or_top_left_out);
+
+ // The sequence of 32-bit packed operations was found (see CL via blame) to
+ // outperform 16-bit operations, despite the availability of the packus
+ // function, when tested on a Xeon E7 v3.
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ const __m128i pred = _mm_shuffle_epi8(
+ _mm_or_si128(left_out, top_or_top_left_out), cvtepi32_epi8);
+ Store4(dst, pred);
+}
+
+// top_left_diffs is the only variable whose ints may exceed 8 bits. Otherwise
+// we would be able to do all of these operations as epi8 for a 16-pixel version
+// of this function. Still, since lefts_y is just a vector of duplicates, it
+// could pay off to accommodate top_left_dists for cmpgt, and repack into epi8
+// for the blends.
+template <int y_mask>
+inline void WritePaethLine8(uint8_t* dst, const __m128i& top,
+ const __m128i& left, const __m128i& top_lefts,
+ const __m128i& top_dists, const __m128i& left_dists,
+ const __m128i& top_left_diffs) {
+ const __m128i select_y = _mm_set1_epi32(y_mask);
+ const __m128i top_dists_y = _mm_shuffle_epi8(top_dists, select_y);
+
+ const __m128i lefts_y = _mm_shuffle_epi8(left, select_y);
+ const __m128i top_left_dists =
+ _mm_abs_epi16(_mm_add_epi16(lefts_y, top_left_diffs));
+
+ // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
+ // operation is unavailable, so the logic for selecting top, left, or
+ // top_left is inverted.
+ __m128i not_select_left = _mm_cmpgt_epi16(left_dists, top_left_dists);
+ not_select_left =
+ _mm_or_si128(not_select_left, _mm_cmpgt_epi16(left_dists, top_dists_y));
+ const __m128i not_select_top = _mm_cmpgt_epi16(top_dists_y, top_left_dists);
+
+ const __m128i left_out = _mm_andnot_si128(not_select_left, lefts_y);
+
+ const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
+ __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
+ top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
+ top_or_top_left_out = _mm_and_si128(not_select_left, top_or_top_left_out);
+
+ const __m128i pred = _mm_packus_epi16(
+ _mm_or_si128(left_out, top_or_top_left_out), /* unused */ left_out);
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), pred);
+}
+
+// |top| is an epi8 of length 16
+// |left| is epi8 of unknown length, as y_mask specifies access
+// |top_lefts| is an epi8 of 16 duplicates
+// |top_dists| is an epi8 of unknown length, as y_mask specifies access
+// |left_dists| is an epi8 of length 16
+// |left_dists_lo| is an epi16 of length 8
+// |left_dists_hi| is an epi16 of length 8
+// |top_left_diffs_lo| is an epi16 of length 8
+// |top_left_diffs_hi| is an epi16 of length 8
+// The latter two vectors are epi16 because their values may reach -510.
+// |left_dists| is provided alongside its spread out version because it doesn't
+// change between calls and interacts with both kinds of packing.
+template <int y_mask>
+inline void WritePaethLine16(uint8_t* dst, const __m128i& top,
+ const __m128i& left, const __m128i& top_lefts,
+ const __m128i& top_dists,
+ const __m128i& left_dists,
+ const __m128i& left_dists_lo,
+ const __m128i& left_dists_hi,
+ const __m128i& top_left_diffs_lo,
+ const __m128i& top_left_diffs_hi) {
+ const __m128i select_y = _mm_set1_epi32(y_mask);
+ const __m128i top_dists_y8 = _mm_shuffle_epi8(top_dists, select_y);
+ const __m128i top_dists_y16 = _mm_cvtepu8_epi16(top_dists_y8);
+ const __m128i lefts_y8 = _mm_shuffle_epi8(left, select_y);
+ const __m128i lefts_y16 = _mm_cvtepu8_epi16(lefts_y8);
+
+ const __m128i top_left_dists_lo =
+ _mm_abs_epi16(_mm_add_epi16(lefts_y16, top_left_diffs_lo));
+ const __m128i top_left_dists_hi =
+ _mm_abs_epi16(_mm_add_epi16(lefts_y16, top_left_diffs_hi));
+
+ const __m128i left_gt_top_left_lo = _mm_packs_epi16(
+ _mm_cmpgt_epi16(left_dists_lo, top_left_dists_lo), left_dists_lo);
+ const __m128i left_gt_top_left_hi =
+ _mm_packs_epi16(_mm_cmpgt_epi16(left_dists_hi, top_left_dists_hi),
+ /* unused second arg for pack */ left_dists_hi);
+ const __m128i left_gt_top_left = _mm_alignr_epi8(
+ left_gt_top_left_hi, _mm_slli_si128(left_gt_top_left_lo, 8), 8);
+
+ const __m128i not_select_top_lo =
+ _mm_packs_epi16(_mm_cmpgt_epi16(top_dists_y16, top_left_dists_lo),
+ /* unused second arg for pack */ top_dists_y16);
+ const __m128i not_select_top_hi =
+ _mm_packs_epi16(_mm_cmpgt_epi16(top_dists_y16, top_left_dists_hi),
+ /* unused second arg for pack */ top_dists_y16);
+ const __m128i not_select_top = _mm_alignr_epi8(
+ not_select_top_hi, _mm_slli_si128(not_select_top_lo, 8), 8);
+
+ const __m128i left_leq_top =
+ _mm_cmpeq_epi8(left_dists, _mm_min_epu8(top_dists_y8, left_dists));
+ const __m128i select_left = _mm_andnot_si128(left_gt_top_left, left_leq_top);
+
+ // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
+ // operation is unavailable, so the logic for selecting top, left, or
+ // top_left is inverted.
+ const __m128i left_out = _mm_and_si128(select_left, lefts_y8);
+
+ const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
+ __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
+ top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
+ top_or_top_left_out = _mm_andnot_si128(select_left, top_or_top_left_out);
+ const __m128i pred = _mm_or_si128(left_out, top_or_top_left_out);
+
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), pred);
+}
+
+void Paeth4x4_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row, const void* const left_column) {
+ const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+ const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
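+ // pTopLeft = abs(base - top[-1]) = abs(top[x] + left[y] - 2*top[-1]), which
+ // is computed below via top_left_diff = top[x] - 2*top[-1].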
+ const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
+ const __m128i top_dists = _mm_abs_epi32(_mm_sub_epi32(left, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine4<0>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+}
+
+void Paeth4x8_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row, const void* const left_column) {
+ const __m128i left = LoadLo8(left_column);
+ const __m128i left_lo = _mm_cvtepu8_epi32(left);
+ const __m128i left_hi = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
+
+ const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+ const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
+ const __m128i top_dists_lo = _mm_abs_epi32(_mm_sub_epi32(left_lo, top_lefts));
+ const __m128i top_dists_hi = _mm_abs_epi32(_mm_sub_epi32(left_hi, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine4<0>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+ top_left_diff);
+}
+
+void Paeth4x16_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left = LoadUnaligned16(left_column);
+ const __m128i left_0 = _mm_cvtepu8_epi32(left);
+ const __m128i left_1 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
+ const __m128i left_2 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 8));
+ const __m128i left_3 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 12));
+
+ const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+ const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
+ const __m128i top_dists_0 = _mm_abs_epi32(_mm_sub_epi32(left_0, top_lefts));
+ const __m128i top_dists_1 = _mm_abs_epi32(_mm_sub_epi32(left_1, top_lefts));
+ const __m128i top_dists_2 = _mm_abs_epi32(_mm_sub_epi32(left_2, top_lefts));
+ const __m128i top_dists_3 = _mm_abs_epi32(_mm_sub_epi32(left_3, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine4<0>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
+ top_left_diff);
+}
+
+void Paeth8x4_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row, const void* const left_column) {
+ const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+ const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
+ const __m128i top_dists = _mm_abs_epi16(_mm_sub_epi16(left, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine8<0x01000100>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x03020302>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x05040504>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x07060706>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+}
+
+void Paeth8x8_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row, const void* const left_column) {
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+ const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
+ const __m128i top_dists = _mm_abs_epi16(_mm_sub_epi16(left, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine8<0x01000100>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x03020302>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x05040504>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x07060706>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x09080908>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0B0A0B0A>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0D0C0D0C>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0F0E0F0E>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+}
+
+void Paeth8x16_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left = LoadUnaligned16(left_column);
+ const __m128i left_lo = _mm_cvtepu8_epi16(left);
+ const __m128i left_hi = _mm_cvtepu8_epi16(_mm_srli_si128(left, 8));
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+ const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
+ const __m128i top_dists_lo = _mm_abs_epi16(_mm_sub_epi16(left_lo, top_lefts));
+ const __m128i top_dists_hi = _mm_abs_epi16(_mm_sub_epi16(left_hi, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine8<0x01000100>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x03020302>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x05040504>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x07060706>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x09080908>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0B0A0B0A>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0D0C0D0C>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0F0E0F0E>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x01000100>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x03020302>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x05040504>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x07060706>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x09080908>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0B0A0B0A>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0D0C0D0C>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0F0E0F0E>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+}
+
+void Paeth8x32_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* const dst = static_cast<uint8_t*>(dest);
+ Paeth8x16_SSE4_1(dst, stride, top_row, left_column);
+ Paeth8x16_SSE4_1(dst + (stride << 4), stride, top_row, left_ptr + 16);
+}
+
+void Paeth16x4_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left = Load4(left_column);
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts16 = _mm_set1_epi16(top_ptr[-1]);
+ const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_ptr[-1]));
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+
+ const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
+ _mm_subs_epu8(top_lefts8, top));
+ const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
+ const __m128i left_dists_hi =
+ _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
+ const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
+ _mm_subs_epu8(top_lefts8, left));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
+ const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
+ const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
+ left_dists_lo, left_dists_hi, top_left_diff_lo,
+ top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+}
+
+// Inlined for calling with offsets in larger transform sizes, mainly to
+// preserve top_left.
+inline void WritePaeth16x8(void* const dest, ptrdiff_t stride,
+ const uint8_t top_left, const __m128i top,
+ const __m128i left) {
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+ const __m128i top_lefts16 = _mm_set1_epi16(top_left);
+ const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_left));
+
+ // Given that the spec defines "base" as top[x] + left[y] - top_left,
+ // pLeft = abs(base - left[y]) = abs(top[x] - top_left)
+ // pTop = abs(base - top[x]) = abs(left[y] - top_left)
+
+ const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
+ _mm_subs_epu8(top_lefts8, top));
+ const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
+ const __m128i left_dists_hi =
+ _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
+ const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
+ _mm_subs_epu8(top_lefts8, left));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
+ const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
+ const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
+ left_dists_lo, left_dists_hi, top_left_diff_lo,
+ top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x04040404>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x05050505>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x06060606>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x07070707>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+}
+
+void Paeth16x8_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i left = LoadLo8(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ WritePaeth16x8(static_cast<uint8_t*>(dest), stride, top_ptr[-1], top, left);
+}
+
+void WritePaeth16x16(void* const dest, ptrdiff_t stride, const uint8_t top_left,
+ const __m128i top, const __m128i left) {
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+ const __m128i top_lefts16 = _mm_set1_epi16(top_left);
+ const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_left));
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+
+ const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
+ _mm_subs_epu8(top_lefts8, top));
+ const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
+ const __m128i left_dists_hi =
+ _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
+ const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
+ _mm_subs_epu8(top_lefts8, left));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
+ const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
+ const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
+ left_dists_lo, left_dists_hi, top_left_diff_lo,
+ top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x04040404>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x05050505>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x06060606>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x07070707>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x08080808>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x09090909>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0A0A0A0A>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0B0B0B0B>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0C0C0C0C>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0D0D0D0D>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0E0E0E0E>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0F0F0F0F>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+}
+
+void Paeth16x16_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left = LoadUnaligned16(left_column);
+ const __m128i top = LoadUnaligned16(top_row);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ WritePaeth16x16(static_cast<uint8_t*>(dest), stride, top_ptr[-1], top, left);
+}
+
+void Paeth16x32_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left_0 = LoadUnaligned16(left_column);
+ const __m128i top = LoadUnaligned16(top_row);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const uint8_t top_left = top_ptr[-1];
+ auto* const dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top, left_0);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ WritePaeth16x16(dst + (stride << 4), stride, top_left, top, left_1);
+}
+
+void Paeth16x64_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const ptrdiff_t stride16 = stride << 4;
+ const __m128i left_0 = LoadUnaligned16(left_column);
+ const __m128i top = LoadUnaligned16(top_row);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top, left_0);
+ dst += stride16;
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ WritePaeth16x16(dst, stride, top_left, top, left_1);
+ dst += stride16;
+ const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
+ WritePaeth16x16(dst, stride, top_left, top, left_2);
+ dst += stride16;
+ const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
+ WritePaeth16x16(dst, stride, top_left, top, left_3);
+}
+
+void Paeth32x8_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left = LoadLo8(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_row);
+ const uint8_t top_left = top_ptr[-1];
+ auto* const dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x8(dst, stride, top_left, top_0, left);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ WritePaeth16x8(dst + 16, stride, top_left, top_1, left);
+}
+
+void Paeth32x16_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left = LoadUnaligned16(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_row);
+ const uint8_t top_left = top_ptr[-1];
+ auto* const dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left);
+}
+
+void Paeth32x32_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_0 = LoadUnaligned16(left_ptr);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_ptr);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+}
+
+void Paeth32x64_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_0 = LoadUnaligned16(left_ptr);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_ptr);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
+ const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_2);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_2);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_3);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3);
+}
+
+void Paeth64x16_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left = LoadUnaligned16(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_ptr);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left);
+}
+
+void Paeth64x32_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_0 = LoadUnaligned16(left_ptr);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_ptr);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_0);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_0);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_1);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1);
+}
+
+void Paeth64x64_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_0 = LoadUnaligned16(left_ptr);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
+ const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_ptr);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_0);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_0);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_1);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_2);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_2);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_2);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_2);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_3);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_3);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_3);
+}
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
+// upsampling is ruled out. In addition, the bits masked by 0x3F for
+// |shift_val| are 0 for all multiples of 64, so the formula
+// val = top[top_base_x]*(32-shift) + top[top_base_x+1]*shift reduces to
+// val = top[top_base_x] << 5, meaning each row is an unweighted copy of |top|.
+// Since |top_base_x| starts at 1 for the first row, |top| is offset by 1.
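+// For example, for row y the caller's top_x is (y + 1) * 64, so top_base_x is
+// y + 1 and the row copies top[y + 1 .. y + width].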
+inline void DirectionalZone1_Step64(uint8_t* dst, ptrdiff_t stride,
+ const uint8_t* const top, const int width,
+ const int height) {
+ ptrdiff_t offset = 1;
+ if (height == 4) {
+ memcpy(dst, top + offset, width);
+ dst += stride;
+ memcpy(dst, top + offset + 1, width);
+ dst += stride;
+ memcpy(dst, top + offset + 2, width);
+ dst += stride;
+ memcpy(dst, top + offset + 3, width);
+ return;
+ }
+ int y = 0;
+ do {
+ memcpy(dst, top + offset, width);
+ dst += stride;
+ memcpy(dst, top + offset + 1, width);
+ dst += stride;
+ memcpy(dst, top + offset + 2, width);
+ dst += stride;
+ memcpy(dst, top + offset + 3, width);
+ dst += stride;
+ memcpy(dst, top + offset + 4, width);
+ dst += stride;
+ memcpy(dst, top + offset + 5, width);
+ dst += stride;
+ memcpy(dst, top + offset + 6, width);
+ dst += stride;
+ memcpy(dst, top + offset + 7, width);
+ dst += stride;
+
+ offset += 8;
+ y += 8;
+ } while (y < height);
+}
+
+inline void DirectionalZone1_4xH(uint8_t* dst, ptrdiff_t stride,
+ const uint8_t* const top, const int height,
+ const int xstep, const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const int rounding_bits = 5;
+ const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
+ const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
+ const __m128i sampler = upsampled ? _mm_set_epi64x(0, 0x0706050403020100)
+ : _mm_set_epi64x(0, 0x0403030202010100);
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+ // not supported for packed integers.
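+ // That is, (top_base_x + i + 1 > max_base_x) is equivalent to
+ // (top_base_x + i >= max_base_x) for these integer indices.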
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ // All rows from |min_corner_only_y| down are filled with the corner pixel
+ // via memset. |max_base_x| is always greater than |height|, so clamping
+ // |xstep_units| to at least 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+ // Rows up to |min_corner_only_y| still need the |max_base_x| check inside
+ // the loop below; the remaining rows are corner-only.
+ int y = 0;
+ int top_x = xstep;
+
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ const int top_base_x = top_x >> scale_bits;
+
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+
+ // Load 8 values because we will select the sampled values based on
+ // |upsampled|.
+ const __m128i values = LoadLo8(top + top_base_x);
+ const __m128i sampled_values = _mm_shuffle_epi8(values, sampler);
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ __m128i prod = _mm_maddubs_epi16(sampled_values, shifts);
+ prod = RightShiftWithRounding_U16(prod, rounding_bits);
+ // Replace pixels from invalid range with top-right corner.
+ prod = _mm_blendv_epi8(prod, final_top_val, past_max);
+ Store4(dst, _mm_packus_epi16(prod, prod));
+ }
+
+ // Fill in corner-only rows.
+ for (; y < height; ++y) {
+ memset(dst, top[max_base_x], /* width */ 4);
+ dst += stride;
+ }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_Large(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const int width, const int height,
+ const int xstep, const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const __m128i sampler =
+ upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ const int scale_bits = 6 - upsample_shift;
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const int rounding_bits = 5;
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+
+ // All rows from |min_corner_only_y| down are filled with the corner pixel
+ // via memset. |max_base_x| is always greater than |height|, so clamping
+ // |xstep_units| to at least 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+ // Rows up to this y-value can be computed without checking for bounds.
+ const int max_no_corner_y = std::min(
+ LeftShift((max_base_x - (base_step * width)), scale_bits) / xstep,
+ height);
+ // No need to check for exceeding |max_base_x| in the first loop.
+ int y = 0;
+ int top_x = xstep;
+ for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
+ int top_base_x = top_x >> scale_bits;
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ int x = 0;
+ do {
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ }
+
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+ // not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+ const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+ const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+ for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+ int top_base_x = top_x >> scale_bits;
+
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+ int x = 0;
+ const int min_corner_only_x =
+ std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
+ for (; x < min_corner_only_x;
+ x += 8, top_base_x += base_step8,
+ top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
+ // reading out of bounds. If all indices are past max and we don't need to
+ // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
+ // reset for the next |y|.
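+ // (|past_max| lanes are 0xFFFF when past the limit, so once the first lanes
+ // pass it the low 32 bits are all ones and the AND-NOT below clears
+ // |top_base_x|.)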
+ top_base_x &= ~_mm_cvtsi128_si32(past_max);
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ }
+ // Corner-only section of the row.
+ memset(dest + x, top_row[max_base_x], width - x);
+ }
+ // Fill in corner-only rows.
+ for (; y < height; ++y) {
+ memset(dest, top_row[max_base_x], width);
+ dest += stride;
+ }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_SSE4_1(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const int width, const int height,
+ const int xstep, const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ if (xstep == 64) {
+ DirectionalZone1_Step64(dest, stride, top_row, width, height);
+ return;
+ }
+ if (width == 4) {
+ DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled);
+ return;
+ }
+ if (width >= 32) {
+ DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
+ upsampled);
+ return;
+ }
+ const __m128i sampler =
+ upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ const int scale_bits = 6 - upsample_shift;
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const int rounding_bits = 5;
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+
+ // No need to check for exceeding |max_base_x| in the loops.
+ if (((xstep * height) >> scale_bits) + base_step * width < max_base_x) {
+ int top_x = xstep;
+ int y = 0;
+ do {
+ int top_base_x = top_x >> scale_bits;
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ int x = 0;
+ do {
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ dest += stride;
+ top_x += xstep;
+ } while (++y < height);
+ return;
+ }
+
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+ // not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+ const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+ const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+ int top_x = xstep;
+ int y = 0;
+ do {
+ int top_base_x = top_x >> scale_bits;
+
+ if (top_base_x >= max_base_x) {
+ for (int i = y; i < height; ++i) {
+ memset(dest, top_row[max_base_x], width);
+ dest += stride;
+ }
+ return;
+ }
+
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+ int x = 0;
+ for (; x < width - 8;
+ x += 8, top_base_x += base_step8,
+ top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
+ // reading out of bounds. If all indices are past max and we don't need to
+ // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
+ // reset for the next |y|.
+ top_base_x &= ~_mm_cvtsi128_si32(past_max);
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ }
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ __m128i vals;
+ if (upsampled) {
+ vals = LoadUnaligned16(top_row + top_base_x);
+ } else {
+ const __m128i top_vals = LoadLo8(top_row + top_base_x);
+ vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_insert_epi8(vals, top_row[top_base_x + 8], 15);
+ }
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ dest += stride;
+ top_x += xstep;
+ } while (++y < height);
+}
+
+void DirectionalIntraPredictorZone1_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const int width, const int height,
+ const int xstep,
+ const bool upsampled_top) {
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ DirectionalZone1_SSE4_1(dst, stride, top_ptr, width, height, xstep,
+ upsampled_top);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4x4(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const left_column,
+ const int base_left_y, const int ystep) {
+ // For use in the non-upsampled case.
+ const __m128i sampler = _mm_set_epi64x(0, 0x0403030202010100);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const int rounding_bits = 5;
+
+ __m128i result_block[4];
+ for (int x = 0, left_y = base_left_y; x < 4; x++, left_y += ystep) {
+ const int left_base_y = left_y >> scale_bits;
+ const int shift_val = ((left_y << upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i vals;
+ if (upsampled) {
+ vals = LoadLo8(left_column + left_base_y);
+ } else {
+ const __m128i top_vals = LoadLo8(left_column + left_base_y);
+ vals = _mm_shuffle_epi8(top_vals, sampler);
+ }
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ result_block[x] = _mm_packus_epi16(vals, vals);
+ }
+ const __m128i result = Transpose4x4_U8(result_block);
+ // This is result_row0.
+ Store4(dest, result);
+ dest += stride;
+ const int result_row1 = _mm_extract_epi32(result, 1);
+ memcpy(dest, &result_row1, sizeof(result_row1));
+ dest += stride;
+ const int result_row2 = _mm_extract_epi32(result, 2);
+ memcpy(dest, &result_row2, sizeof(result_row2));
+ dest += stride;
+ const int result_row3 = _mm_extract_epi32(result, 3);
+ memcpy(dest, &result_row3, sizeof(result_row3));
+}
+
+template <bool upsampled, int height>
+inline void DirectionalZone3_8xH(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const left_column,
+ const int base_left_y, const int ystep) {
+ // For use in the non-upsampled case.
+ const __m128i sampler =
+ _mm_set_epi64x(0x0807070606050504, 0x0403030202010100);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const int rounding_bits = 5;
+
+ __m128i result_block[8];
+ for (int x = 0, left_y = base_left_y; x < 8; x++, left_y += ystep) {
+ const int left_base_y = left_y >> scale_bits;
+ const int shift_val = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i vals;
+ if (upsampled) {
+ vals = LoadUnaligned16(left_column + left_base_y);
+ } else {
+ const __m128i top_vals = LoadUnaligned16(left_column + left_base_y);
+ vals = _mm_shuffle_epi8(top_vals, sampler);
+ }
+ vals = _mm_maddubs_epi16(vals, shifts);
+ result_block[x] = RightShiftWithRounding_U16(vals, rounding_bits);
+ }
+ Transpose8x8_U16(result_block, result_block);
+ for (int y = 0; y < height; ++y) {
+ StoreLo8(dest, _mm_packus_epi16(result_block[y], result_block[y]));
+ dest += stride;
+ }
+}
+
+// 7.11.2.4 (9) angle > 180
+void DirectionalIntraPredictorZone3_SSE4_1(void* dest, ptrdiff_t stride,
+ const void* const left_column,
+ const int width, const int height,
+ const int ystep,
+ const bool upsampled) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int upsample_shift = static_cast<int>(upsampled);
+ if (width == 4 || height == 4) {
+ const ptrdiff_t stride4 = stride << 2;
+ if (upsampled) {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_4x4<true>(
+ dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+ dst_x += stride4;
+ y += 4;
+ } while (y < height);
+ left_y += ystep << 2;
+ x += 4;
+ } while (x < width);
+ } else {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_4x4<false>(dst_x, stride, left_ptr + y, left_y,
+ ystep);
+ dst_x += stride4;
+ y += 4;
+ } while (y < height);
+ left_y += ystep << 2;
+ x += 4;
+ } while (x < width);
+ }
+ return;
+ }
+
+ const ptrdiff_t stride8 = stride << 3;
+ if (upsampled) {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_8xH<true, 8>(
+ dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+ dst_x += stride8;
+ y += 8;
+ } while (y < height);
+ left_y += ystep << 3;
+ x += 8;
+ } while (x < width);
+ } else {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_8xH<false, 8>(
+ dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+ dst_x += stride8;
+ y += 8;
+ } while (y < height);
+ left_y += ystep << 3;
+ x += 8;
+ } while (x < width);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Directional Zone 2 Functions
+// 7.11.2.4 (8)
+
+// DirectionalBlend* selectively overwrites the values written by
+// DirectionalZone2FromLeftCol*. |zone_bounds| has one 16-bit index for each
+// row.
+template <int y_selector>
+inline void DirectionalBlend4_SSE4_1(uint8_t* dest,
+ const __m128i& dest_index_vect,
+ const __m128i& vals,
+ const __m128i& zone_bounds) {
+ const __m128i max_dest_x_vect = _mm_shufflelo_epi16(zone_bounds, y_selector);
+ const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
+ const __m128i original_vals = _mm_cvtepu8_epi16(Load4(dest));
+ const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
+ Store4(dest, _mm_packus_epi16(blended_vals, blended_vals));
+}
+
+inline void DirectionalBlend8_SSE4_1(uint8_t* dest,
+ const __m128i& dest_index_vect,
+ const __m128i& vals,
+ const __m128i& zone_bounds,
+ const __m128i& bounds_selector) {
+ const __m128i max_dest_x_vect =
+ _mm_shuffle_epi8(zone_bounds, bounds_selector);
+ const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
+ const __m128i original_vals = _mm_cvtepu8_epi16(LoadLo8(dest));
+ const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
+ StoreLo8(dest, _mm_packus_epi16(blended_vals, blended_vals));
+}
+
+constexpr int kDirectionalWeightBits = 5;
+// |source| is packed with 4 or 8 pairs of 8-bit values from left or top.
+// |shifts| is named to match the specification, with 4 or 8 pairs of (32 -
+// shift) and shift. Shift is guaranteed to be between 0 and 32.
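+// For each output pixel the shuffle selects an adjacent pair of source bytes,
+// so the maddubs computes
+//   val = RightShiftWithRounding(source[k]*(32 - shift) + source[k+1]*shift, 5)
+// which is the weighted blend used by the directional prediction formula.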
+inline __m128i DirectionalZone2FromSource_SSE4_1(const uint8_t* const source,
+ const __m128i& shifts,
+ const __m128i& sampler) {
+ const __m128i src_vals = LoadUnaligned16(source);
+ __m128i vals = _mm_shuffle_epi8(src_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ return RightShiftWithRounding_U16(vals, kDirectionalWeightBits);
+}
+
+// Because the source values "move backwards" as the row index increases, the
+// indices derived from ystep are generally negative. This is accommodated by
+// making sure the relative indices are within [-15, 0] when the function is
+// called, and sliding them into the inclusive range [0, 15], relative to a
+// lower base address.
+constexpr int kPositiveIndexOffset = 15;
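+// For example, a relative index of -3 becomes shuffle index 12 after the
+// offset is added, while the source pointer is moved back by 15 bytes to
+// compensate.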
+
+template <bool upsampled>
+inline void DirectionalZone2FromLeftCol_4x4_SSE4_1(
+ uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column_base,
+ __m128i left_y) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shifts = _mm_set1_epi8(32);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+ const __m128i index_increment = _mm_cvtsi32_si128(0x01010101);
+ const __m128i positive_offset = _mm_set1_epi8(kPositiveIndexOffset);
+ // |left_column| and |sampler| are both offset by 15 so the shuffle indices
+ // are always nonnegative.
+ const uint8_t* left_column = left_column_base - kPositiveIndexOffset;
+ for (int y = 0; y < 4; dst += stride, ++y) {
+ __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
+ offset_y = _mm_packs_epi16(offset_y, offset_y);
+
+ const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
+ __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
+ // Slide valid |offset_y| indices from range [-15, 0] to [0, 15] so they
+ // can work as shuffle indices. Some values may be out of bounds, but their
+ // pred results will be masked over by top prediction.
+ sampler = _mm_add_epi8(sampler, positive_offset);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+ left_column + (y << upsample_shift), shifts, sampler);
+ Store4(dst, _mm_packus_epi16(vals, vals));
+ }
+}
+
+// The height at which a load of 16 bytes will not contain enough source pixels
+// from |left_column| to supply an accurate row when computing 8 pixels at a
+// time. The values are found by inspection. By coincidence, all angles that
+// share a value of ystep >> 6 map to the same limit, so it is enough to look
+// up by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
+constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
+ 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
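+
+// Typical use (see DirectionalZone2_SSE4_1 below): the shuffle-based left
+// column path is only taken while
+//   y < std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]);
+// past that point the bounds-safe DirectionalZone3_8xH path is used instead.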
+
+template <bool upsampled>
+inline void DirectionalZone2FromLeftCol_8x8_SSE4_1(
+ uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column,
+ __m128i left_y) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shifts = _mm_set1_epi8(32);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+ const __m128i index_increment = _mm_set1_epi8(1);
+ const __m128i denegation = _mm_set1_epi8(kPositiveIndexOffset);
+ for (int y = 0; y < 8; dst += stride, ++y) {
+ __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
+ offset_y = _mm_packs_epi16(offset_y, offset_y);
+ const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
+
+ // Offset the relative index because ystep is negative in Zone 2 and shuffle
+ // indices must be nonnegative.
+ __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
+ sampler = _mm_add_epi8(sampler, denegation);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+
+ // The specification adds (y << 6) to left_y, which is subject to
+ // upsampling, but this puts sampler indices out of the 0-15 range. It is
+ // equivalent to offsetting the source address by (y << upsample_shift)
+ // instead.
+ const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+ left_column - kPositiveIndexOffset + (y << upsample_shift), shifts,
+ sampler);
+ StoreLo8(dst, _mm_packus_epi16(vals, vals));
+ }
+}
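+
+// In both FromLeftCol helpers above, |offset_y| supplies the integer part of
+// the left-column index for each lane and |shifts| supplies the fractional
+// weight pair (32 - shift, shift), so each output pixel is an interpolation
+// of two vertically adjacent left-column pixels, computed by
+// DirectionalZone2FromSource_SSE4_1.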
+
+// |zone_bounds| is an epi16 vector of the relative x index at which
+// base >= -(1 << upsampled_top), for each row. When there are 4 values, they
+// can be duplicated with a non-register (immediate) shuffle mask.
+// |shifts| is one pair of weights that applies throughout a given row.
+template <bool upsampled_top>
+inline void DirectionalZone1Blend_4x4(
+ uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
+ __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
+ const __m128i& dest_index_x, int top_x, const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled_top);
+ const int scale_bits_x = 6 - upsample_shift;
+ top_x -= xstep;
+
+ int top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals0 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x00), sampler);
+ DirectionalBlend4_SSE4_1<0x00>(dest, dest_index_x, vals0, zone_bounds);
+ top_x -= xstep;
+ dest += stride;
+
+ top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals1 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x55), sampler);
+ DirectionalBlend4_SSE4_1<0x55>(dest, dest_index_x, vals1, zone_bounds);
+ top_x -= xstep;
+ dest += stride;
+
+ top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals2 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xAA), sampler);
+ DirectionalBlend4_SSE4_1<0xAA>(dest, dest_index_x, vals2, zone_bounds);
+ top_x -= xstep;
+ dest += stride;
+
+ top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals3 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xFF), sampler);
+ DirectionalBlend4_SSE4_1<0xFF>(dest, dest_index_x, vals3, zone_bounds);
+}
+
+template <bool upsampled_top, int height>
+inline void DirectionalZone1Blend_8xH(
+ uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
+ __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
+ const __m128i& dest_index_x, int top_x, const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled_top);
+ const int scale_bits_x = 6 - upsample_shift;
+
+ __m128i y_selector = _mm_set1_epi32(0x01000100);
+ const __m128i index_increment = _mm_set1_epi32(0x02020202);
+ for (int y = 0; y < height; ++y,
+ y_selector = _mm_add_epi8(y_selector, index_increment),
+ dest += stride) {
+ top_x -= xstep;
+ const int top_base_x = top_x >> scale_bits_x;
+ const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shuffle_epi8(shifts, y_selector), sampler);
+ DirectionalBlend8_SSE4_1(dest, dest_index_x, vals, zone_bounds, y_selector);
+ }
+}
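+
+// In both blend helpers above, |zone_bounds| supplies, per row, the x index
+// at which the prediction switches from the left-derived values already in
+// |dest| to the freshly computed top-derived values; _mm_blendv_epi8 keeps
+// the existing pixel wherever dest_index_x < zone_bound.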
+
+// 7.11.2.4 (8) 90 < angle < 180
+// The strategy for this function is to know how many blocks can be processed
+// with just pixels from |top_row|, then handle mixed blocks, then handle only
+// blocks that take from |left_column|. Additionally, a fast index-shuffle
+// approach is used for pred values from |left_column| in sections that permit
+// it.
+template <bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const uint8_t* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const ptrdiff_t stride8 = stride << 3;
+ const __m128i dest_index_x =
+ _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
+ const __m128i sampler_top =
+ upsampled_top
+ ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+ // All columns from |min_top_only_x| to the right will only need |top_row| to
+ // compute. This assumes minimum |xstep| is 3.
+ const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+ // For steep angles, the source pixels from left_column may not fit in a
+ // 16-byte load for shuffling.
+ // TODO(petersonab): Find a more precise formula for this subject to x.
+ const int max_shuffle_height =
+ std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]);
+
+ const int xstep8 = xstep << 3;
+ const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
+ // Accumulate xstep across 8 rows.
+ const __m128i xstep_dup = _mm_set1_epi16(-xstep);
+ const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ const __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
+ // Offsets the original zone bound value to simplify
+ // x < (y + 1) * xstep / 64 - 1.
+ const __m128i scaled_one = _mm_set1_epi16(-64);
+ __m128i xstep_bounds_base =
+ (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
+ : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+ const int ystep8 = ystep << 3;
+ const int left_base_increment8 = ystep8 >> 6;
+ const int ystep_remainder8 = ystep8 & 0x3F;
+ const __m128i increment_left8 = _mm_set1_epi16(-ystep_remainder8);
+
+ // If the 64 scaling is regarded as a decimal point, the first value of the
+ // left_y vector omits the portion which is covered under the left_column
+ // offset. Following values need the full ystep as a relative offset.
+ const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
+ const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+ __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
+ left_y = _mm_add_epi16(ystep_init, left_y);
+
+ const __m128i increment_top8 = _mm_set1_epi16(8 << 6);
+ int x = 0;
+
+ // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
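+ // In outline (illustrative pseudocode only):
+ //   for each 8-wide group of columns x:
+ //     rows [0, max_top_only_y)                   -> top-only (Zone1)
+ //     rows [max_top_only_y, left_shuffle_stop_y) -> shuffled left + blend
+ //     rows [left_shuffle_stop_y, min_left_only_y) -> Zone3 left + blend
+ //     rows [min_left_only_y, height)             -> left-only (Zone3)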
+ for (int left_offset = -left_base_increment; x < min_top_only_x;
+ x += 8,
+ xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
+ // Note that |left_y| keeps accumulating here and can still grow large.
+ left_y = _mm_add_epi16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ uint8_t* dst_x = dst + x;
+
+ // Round down to the nearest multiple of 8.
+ const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
+ DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+ DirectionalZone1_4xH(dst_x + 4, stride,
+ top_row + ((x + 4) << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+ const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+ // All rows from |min_left_only_y| down for this set of columns only need
+ // |left_column| to compute.
+ const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
+ // At high angles such that min_left_only_y < 8, ystep is low and xstep is
+ // high. This means that max_shuffle_height is unbounded and xstep_bounds
+ // will overflow in 16 bits. This is prevented by stopping the first
+ // blending loop at min_left_only_y for such cases, which means we skip over
+ // the second blending loop as well.
+ const int left_shuffle_stop_y =
+ std::min(max_shuffle_height, min_left_only_y);
+ __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+ __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+ int top_x = -xstep_y;
+
+ for (; y < left_shuffle_stop_y;
+ y += 8, dst_x += stride8,
+ xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+ xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+ top_x -= xstep8) {
+ DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+ shift_mask),
+ 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+ DirectionalZone1Blend_8xH<upsampled_top, 8>(
+ dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+ xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+ }
+ // Pick up from the last y-value, using the 10% slower but bounds-safe
+ // method for left prediction.
+ const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
+ for (; y < min_left_only_y;
+ y += 8, dst_x += stride8,
+ xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+ xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+ top_x -= xstep8) {
+ const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+
+ DirectionalZone3_8xH<upsampled_left, 8>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+ shift_mask),
+ 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ DirectionalZone1Blend_8xH<upsampled_top, 8>(
+ dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+ xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+ }
+ // Loop over y for left_only rows.
+ for (; y < height; y += 8, dst_x += stride8) {
+ DirectionalZone3_8xH<upsampled_left, 8>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep);
+ }
+ }
+ for (; x < width; x += 4) {
+ DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
+ height, -xstep, upsampled_top);
+ }
+}
+
+template <bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const uint8_t* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const ptrdiff_t stride4 = stride << 2;
+ const __m128i dest_index_x = _mm_set_epi32(0, 0, 0x00030002, 0x00010000);
+ const __m128i sampler_top =
+ upsampled_top
+ ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ // All columns from |min_top_only_x| to the right will only need |top_row| to
+ // compute.
+ assert(xstep >= 3);
+ const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+ const int xstep4 = xstep << 2;
+ const __m128i xstep4_vect = _mm_set1_epi16(xstep4);
+ const __m128i xstep_dup = _mm_set1_epi16(-xstep);
+ const __m128i increments = _mm_set_epi32(0, 0, 0x00040003, 0x00020001);
+ __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
+ const __m128i scaled_one = _mm_set1_epi16(-64);
+ // Offsets the original zone bound value to simplify
+ // x < (y + 1) * xstep / 64 - 1.
+ __m128i xstep_bounds_base =
+ (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
+ : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+ const int ystep4 = ystep << 2;
+ const int left_base_increment4 = ystep4 >> 6;
+ // This is guaranteed to be less than 64, but accumulation may bring it past
+ // 64 for higher x values.
+ const int ystep_remainder4 = ystep4 & 0x3F;
+ const __m128i increment_left4 = _mm_set1_epi16(-ystep_remainder4);
+ const __m128i increment_top4 = _mm_set1_epi16(4 << 6);
+
+ // If the 64 scaling is regarded as a decimal point, the first value of the
+ // left_y vector omits the portion which will go into the left_column offset.
+ // Following values need the full ystep as a relative offset.
+ const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
+ const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+ __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
+ left_y = _mm_add_epi16(ystep_init, left_y);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+
+ int x = 0;
+ // Loop over x for columns with a mixture of sources.
+ for (int left_offset = -left_base_increment; x < min_top_only_x; x += 4,
+ xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top4),
+ left_y = _mm_add_epi16(left_y, increment_left4),
+ left_offset -= left_base_increment4) {
+ uint8_t* dst_x = dst + x;
+
+ // Round down to the nearest multiple of 4.
+ const int max_top_only_y = std::min((x << 6) / xstep, height) & ~3;
+ DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+ const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+ // All rows from |min_left_only_y| down for this set of columns only need
+ // |left_column| to compute. Rounded up to the nearest multiple of 4.
+ const int min_left_only_y = std::min(((x + 4) << 6) / xstep, height);
+
+ __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+ __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+ int top_x = -xstep_y;
+
+ // Loop over y for mixed rows.
+ for (; y < min_left_only_y;
+ y += 4, dst_x += stride4,
+ xstep_bounds = _mm_add_epi16(xstep_bounds, xstep4_vect),
+ xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep4_vect),
+ top_x -= xstep4) {
+ DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
+ dst_x, stride,
+ left_column + ((left_offset + y) * (1 << upsample_left_shift)),
+ left_y);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+ shift_mask),
+ 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ const __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+ DirectionalZone1Blend_4x4<upsampled_top>(
+ dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+ xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+ }
+ // Loop over y for left-only rows, if any.
+ for (; y < height; y += 4, dst_x += stride4) {
+ DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y);
+ }
+ }
+ // Loop over top-only columns, if any.
+ for (; x < width; x += 4) {
+ DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
+ height, -xstep, upsampled_top);
+ }
+}
+
+void DirectionalIntraPredictorZone2_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep,
+ const bool upsampled_top,
+ const bool upsampled_left) {
+ // Increasing the negative buffer for this function allows more rows to be
+ // processed at a time without branching in an inner loop to check the base.
+ uint8_t top_buffer[288];
+ uint8_t left_buffer[288];
+ memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
+ memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
+ const uint8_t* top_ptr = top_buffer + 144;
+ const uint8_t* left_ptr = left_buffer + 144;
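+ // With the copies above, top_ptr[-16 .. 143] and left_ptr[-16 .. 143] hold
+ // valid neighbor data, and more negative indices (down to -144) still land
+ // inside the local buffers, so speculative loads cannot read out of bounds.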
+ if (width == 4 || height == 4) {
+ if (upsampled_left) {
+ if (upsampled_top) {
+ DirectionalZone2_4_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_4_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ } else {
+ if (upsampled_top) {
+ DirectionalZone2_4_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_4_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ }
+ return;
+ }
+ if (upsampled_left) {
+ if (upsampled_top) {
+ DirectionalZone2_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ } else {
+ if (upsampled_top) {
+ DirectionalZone2_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// FilterIntraPredictor_SSE4_1
+
+// Apply all filter taps to the given 7 packed 8-bit pixel values, keeping the
+// 8th byte at zero to preserve the sum.
+inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride,
+ const __m128i& pixels, const __m128i& taps_0_1,
+ const __m128i& taps_2_3, const __m128i& taps_4_5,
+ const __m128i& taps_6_7) {
+ const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1);
+ const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3);
+ // |output_half| contains 8 partial sums.
+ __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
+ __m128i output = _mm_hadd_epi16(output_half, output_half);
+ const __m128i output_row0 =
+ _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
+ /* arbitrary pack arg */ output);
+ Store4(dst, output_row0);
+ const __m128i mul_1_01 = _mm_maddubs_epi16(pixels, taps_4_5);
+ const __m128i mul_1_23 = _mm_maddubs_epi16(pixels, taps_6_7);
+ output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
+ output = _mm_hadd_epi16(output_half, output_half);
+ const __m128i output_row1 =
+ _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
+ /* arbitrary pack arg */ output);
+ Store4(dst + stride, output_row1);
+}
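+
+// In scalar terms, each of the 8 output pixels above is, approximately,
+//   ((sum of tap[i] * neighbor[i] over the 7 neighbors + 8) >> 4)
+// clipped to [0, 255], where the neighbors are the top-left, four top, and
+// two left pixels; the 8th tap is fixed at 0 so the zero-padded byte in
+// |pixels| does not affect the sum.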
+
+// 4xH transform sizes are given special treatment because LoadLo8 goes out
+// of bounds and every block involves the left column. This implementation
+// loads TL from the top row for the first block, so it is not taken from
+// |left_column| for that block.
+inline void Filter4xH(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const top_ptr,
+ const uint8_t* const left_ptr, FilterIntraPredictor pred,
+ const int height) {
+ const __m128i taps_0_1 = LoadUnaligned16(kFilterIntraTaps[pred][0]);
+ const __m128i taps_2_3 = LoadUnaligned16(kFilterIntraTaps[pred][2]);
+ const __m128i taps_4_5 = LoadUnaligned16(kFilterIntraTaps[pred][4]);
+ const __m128i taps_6_7 = LoadUnaligned16(kFilterIntraTaps[pred][6]);
+ __m128i top = Load4(top_ptr - 1);
+ __m128i pixels = _mm_insert_epi8(top, top_ptr[3], 4);
+ __m128i left = (height == 4 ? Load4(left_ptr) : LoadLo8(left_ptr));
+ left = _mm_slli_si128(left, 5);
+
+ // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1],
+ // left[2], left[3], left[4], left[5], left[6], left[7]
+ pixels = _mm_or_si128(left, pixels);
+
+ // Duplicate first 8 bytes.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 1.
+ pixels = Load4(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1],
+ // left[0], left[1], ...
+ pixels = _mm_or_si128(left, pixels);
+
+ // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last
+ // byte is an unused value, which shall be multiplied by 0 when we apply the
+ // filter.
+ constexpr int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;
+
+ // Insert left[-1] in front as TL and put left[0] and left[1] at the end.
+ const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 2.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 3.
+
+ // Compute the middle 8 rows before using common code for the final 4 rows.
+ // The common code below this block assumes that |left| holds the next TL
+ // value at position 8, so this block must leave |left| in that state.
+ if (height == 16) {
+ // This shift allows us to use pixel_order2 twice after shifting by 2 later.
+ left = _mm_slli_si128(left, 1);
+ pixels = Load4(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4],
+ // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3]
+ pixels = _mm_or_si128(left, pixels);
+
+ // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 10, 11, 15. The
+ // last byte is an unused value, as above. The top-left was shifted to
+ // position nine to keep two empty spaces after the top pixels.
+ constexpr int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;
+
+ // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at
+ // the end.
+ const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ dest += stride; // Move to y = 4.
+
+ // First 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+
+ // Clear all but final pixel in the first 8 of left column.
+ __m128i keep_top_left = _mm_srli_si128(left, 13);
+ dest += stride; // Move to y = 5.
+ pixels = Load4(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-6],
+ // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
+ pixels = _mm_or_si128(left, pixels);
+ left = LoadLo8(left_ptr + 8);
+
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ dest += stride; // Move to y = 6.
+
+ // Second 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+
+ // Position TL value so we can use pixel_order1.
+ keep_top_left = _mm_slli_si128(keep_top_left, 6);
+ dest += stride; // Move to y = 7.
+ pixels = Load4(dest);
+ left = _mm_slli_si128(left, 7);
+ left = _mm_or_si128(left, keep_top_left);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 8.
+
+ // Third 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 9.
+
+ // Prepare final inputs.
+ pixels = Load4(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 10.
+
+ // Fourth 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 11.
+ }
+
+ // In both the 8 and 16 case, we assume that the left vector has the next TL
+ // at position 8.
+ if (height > 4) {
+ // Erase prior left pixels by shifting TL to position 0.
+ left = _mm_srli_si128(left, 8);
+ left = _mm_slli_si128(left, 6);
+ pixels = Load4(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 12 or 4.
+
+ // First of final two 4x2 blocks.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 13 or 5.
+ pixels = Load4(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 14 or 6.
+
+ // Last of final two 4x2 blocks.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ }
+}
+
+void FilterIntraPredictor_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column,
+ FilterIntraPredictor pred, const int width,
+ const int height) {
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ if (width == 4) {
+ Filter4xH(dst, stride, top_ptr, left_ptr, pred, height);
+ return;
+ }
+
+ // There is one set of 7 taps for each of the 4x2 output pixels.
+ const __m128i taps_0_1 = LoadUnaligned16(kFilterIntraTaps[pred][0]);
+ const __m128i taps_2_3 = LoadUnaligned16(kFilterIntraTaps[pred][2]);
+ const __m128i taps_4_5 = LoadUnaligned16(kFilterIntraTaps[pred][4]);
+ const __m128i taps_6_7 = LoadUnaligned16(kFilterIntraTaps[pred][6]);
+
+ // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at
+ // the end is an unused value, which shall be multiplied by 0 when we apply
+ // the filter.
+ constexpr int64_t kCondenseLeftMask = 0x0F09080403020100;
+
+ // Takes the "left section" and puts it right after p0-p4.
+ const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);
+
+ // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last
+ // byte is unused as above.
+ constexpr int64_t kInsertTopLeftMask = 0x0F0A090302010008;
+
+ // Shuffles the "top left" from the left section, to the front. Used when
+ // grabbing data from left_column and not top_row.
+ const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);
+
+ // This first pass takes care of the cases where the top left pixel comes from
+ // top_row.
+ __m128i pixels = LoadLo8(top_ptr - 1);
+ __m128i left = _mm_slli_si128(Load4(left_column), 8);
+ pixels = _mm_or_si128(pixels, left);
+
+ // Two sets of the same pixels to multiply with two sets of taps.
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, taps_6_7);
+ left = _mm_srli_si128(left, 1);
+
+ // Load the output row at y = 1; it provides the top neighbors for the next
+ // 4x2 block (rows 2 and 3).
+ pixels = Load4(dst + stride);
+
+ // Because of the above shift, this OR 'invades' the final byte of the first
+ // 8 bytes of |pixels|. This is acceptable because the 8th filter tap is
+ // always a padded 0.
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ const ptrdiff_t stride2 = stride << 1;
+ const ptrdiff_t stride4 = stride << 2;
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dst += 4;
+ for (int x = 3; x < width - 4; x += 4) {
+ pixels = Load4(top_ptr + x);
+ pixels = _mm_insert_epi8(pixels, top_ptr[x + 4], 4);
+ pixels = _mm_insert_epi8(pixels, dst[-1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ pixels = Load4(dst + stride - 1);
+ pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride + stride2 - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+ taps_4_5, taps_6_7);
+ dst += 4;
+ }
+
+ // Now we handle heights that reference previous blocks rather than top_row.
+ for (int y = 4; y < height; y += 4) {
+ // Leftmost 4x4 block for this height.
+ dst -= width;
+ dst += stride4;
+
+ // The top-left pixel is not available by offset from |dst| in these
+ // leftmost blocks, so it is read from left_ptr[y - 1] instead.
+ pixels = Load4(dst - stride);
+ left = _mm_slli_si128(Load4(left_ptr + y - 1), 8);
+ left = _mm_insert_epi8(left, left_ptr[y + 3], 12);
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+
+ // The bytes shifted into positions 6 and 7 will be ignored by the shuffle.
+ left = _mm_srli_si128(left, 2);
+ pixels = Load4(dst + stride);
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+ taps_4_5, taps_6_7);
+
+ dst += 4;
+
+ // Remaining 4x4 blocks for this height.
+ for (int x = 4; x < width; x += 4) {
+ pixels = Load4(dst - stride - 1);
+ pixels = _mm_insert_epi8(pixels, dst[-stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, dst[-1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ pixels = Load4(dst + stride - 1);
+ pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride2 + stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+ taps_4_5, taps_6_7);
+ dst += 4;
+ }
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+// These guards check that this version of the function was not superseded by
+// a higher optimization level, such as AVX. The corresponding #define also
+// prevents the C version from being added to the table.
+#if DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor)
+ dsp->filter_intra_predictor = FilterIntraPredictor_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone1)
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone2)
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone3)
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ DcDefs::_4x4::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+ DcDefs::_4x8::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+ DcDefs::_4x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+ DcDefs::_8x4::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+ DcDefs::_8x8::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+ DcDefs::_8x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+ DcDefs::_8x32::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+ DcDefs::_16x4::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+ DcDefs::_16x8::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+ DcDefs::_16x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+ DcDefs::_16x32::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+ DcDefs::_16x64::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+ DcDefs::_32x8::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+ DcDefs::_32x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+ DcDefs::_32x32::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+ DcDefs::_32x64::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+ DcDefs::_64x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+ DcDefs::_64x32::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+ DcDefs::_64x64::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ DcDefs::_4x4::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+ DcDefs::_4x8::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+ DcDefs::_4x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+ DcDefs::_8x4::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+ DcDefs::_8x8::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+ DcDefs::_8x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+ DcDefs::_8x32::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+ DcDefs::_16x4::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+ DcDefs::_16x8::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+ DcDefs::_16x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+ DcDefs::_16x32::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+ DcDefs::_16x64::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+ DcDefs::_32x8::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+ DcDefs::_32x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+ DcDefs::_32x32::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+ DcDefs::_32x64::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+ DcDefs::_64x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+ DcDefs::_64x32::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+ DcDefs::_64x64::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+ DcDefs::_4x4::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+ DcDefs::_4x8::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+ DcDefs::_4x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+ DcDefs::_8x4::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+ DcDefs::_8x8::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+ DcDefs::_8x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+ DcDefs::_8x32::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+ DcDefs::_16x4::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+ DcDefs::_16x8::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+ DcDefs::_16x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+ DcDefs::_16x32::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+ DcDefs::_16x64::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+ DcDefs::_32x8::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+ DcDefs::_32x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+ DcDefs::_32x32::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+ DcDefs::_32x64::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+ DcDefs::_64x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+ DcDefs::_64x32::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+ DcDefs::_64x64::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+ Paeth4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+ Paeth4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+ Paeth4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+ Paeth8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+ Paeth8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+ Paeth8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+ Paeth8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+ Paeth16x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+ Paeth16x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+ Paeth16x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+ Paeth16x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+ Paeth16x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+ Paeth32x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+ Paeth32x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+ Paeth32x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+ Paeth32x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+ Paeth64x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+ Paeth64x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+ Paeth64x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+ DirDefs::_4x4::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ DirDefs::_4x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ DirDefs::_4x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+ DirDefs::_8x4::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ DirDefs::_8x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+ DirDefs::_8x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ DirDefs::_8x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+ DirDefs::_16x4::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ DirDefs::_16x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+ DirDefs::_16x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+ DirDefs::_16x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+ DirDefs::_16x64::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+ DirDefs::_32x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+ DirDefs::_32x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+ DirDefs::_32x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ DirDefs::_32x64::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+ DirDefs::_64x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+ DirDefs::_64x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+ DirDefs::_64x64::Horizontal;
+#endif
+} // NOLINT(readability/fn_size)
+// TODO(petersonab): Split Init8bpp function into family-specific files.
+
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+template <int height>
+inline void DcStore4xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ StoreLo8(dst, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ StoreLo8(dst, dc_dup);
+}
+
+// WriteDuplicateN assumes |dup32| has 4 32-bit "units," each comprising 2
+// identical 16-bit pixels, and writes enough copies of each unit to fill one
+// N-pixel row of |dest|. The unpacking works the same as in the 8bpp case,
+// except that each 32-bit unit needs twice as many copies.
+inline void WriteDuplicate4x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ auto* dst = static_cast<uint8_t*>(dest);
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_lo);
+ dst += stride;
+ _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_lo));
+ dst += stride;
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_hi);
+ dst += stride;
+ _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_hi));
+}
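+
+// For example, if |dup32| holds the 16-bit lanes {a, a, b, b, c, c, d, d},
+// the four rows written above are a a a a / b b b b / c c c c / d d d d,
+// each row being four 16-bit pixels (8 bytes).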
+
+inline void WriteDuplicate8x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+}
+
+inline void WriteDuplicate16x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+}
+
+inline void WriteDuplicate32x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_3);
+}
+
+inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ for (int x = 0; x < 128; x += 16) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_0);
+ }
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ for (int x = 0; x < 128; x += 16) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_1);
+ }
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ for (int x = 0; x < 128; x += 16) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_2);
+ }
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ for (int x = 0; x < 128; x += 16) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_3);
+ }
+}
+
+// ColStoreN<height> copies each of the |height| values in |column| across its
+// corresponding row in dest.
+template <WriteDuplicateFunc writefn>
+inline void ColStore4_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const __m128i col_data = LoadLo8(column);
+ const __m128i col_dup32 = _mm_unpacklo_epi16(col_data, col_data);
+ writefn(dest, stride, col_dup32);
+}
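+
+// Example: for a Wx4 horizontal predictor, |column| holds left[0..3] as
+// int16; the unpack above produces the {l0, l0, l1, l1, l2, l2, l3, l3}
+// layout expected by the WriteDuplicate functions, which then paint one row
+// per value.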
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const __m128i col_data = LoadUnaligned16(column);
+ const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+ const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+ auto* dst = static_cast<uint8_t*>(dest);
+ writefn(dst, stride, col_dup32_lo);
+ const ptrdiff_t stride4 = stride << 2;
+ dst += stride4;
+ writefn(dst, stride, col_dup32_hi);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < 32; y += 16) {
+ const __m128i col_data =
+ LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+ const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+ const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+ writefn(dst, stride, col_dup32_lo);
+ dst += stride4;
+ writefn(dst, stride, col_dup32_hi);
+ dst += stride4;
+ }
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < 64; y += 16) {
+ const __m128i col_data =
+ LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+ const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+ const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+ writefn(dst, stride, col_dup32_lo);
+ dst += stride4;
+ writefn(dst, stride, col_dup32_hi);
+ dst += stride4;
+ }
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore64_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < 128; y += 16) {
+ const __m128i col_data =
+ LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+ const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+ const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+ writefn(dst, stride, col_dup32_lo);
+ dst += stride4;
+ writefn(dst, stride, col_dup32_hi);
+ dst += stride4;
+ }
+}
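Taken together, the ColStoreN helpers and the WriteDuplicateWx4 writers above implement the horizontal intra predictor for 10bpp blocks: every pixel in row y receives the single left-column value column[y]. A minimal scalar sketch of that behavior, for reference only (the function name and explicit width/height parameters are illustrative, not part of this file):

// Scalar reference: duplicate column[y] across row y of a width x height
// block of 10bpp (uint16_t) pixels. |stride| is in bytes, as in the SIMD code.
inline void HorizontalPredictorScalar(void* const dest, ptrdiff_t stride,
                                      const uint16_t* const column, int width,
                                      int height) {
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y = 0; y < height; ++y) {
    auto* const row = reinterpret_cast<uint16_t*>(dst);
    for (int x = 0; x < width; ++x) row[x] = column[y];
    dst += stride;
  }
}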
+
+// |ref| points to 8 bytes containing 4 packed int16 values.
+inline __m128i DcSum4_SSE4_1(const void* ref) {
+ const __m128i vals = _mm_loadl_epi64(static_cast<const __m128i*>(ref));
+ const __m128i ones = _mm_set1_epi16(1);
+
+ // half_sum[31:0] = a1+a2
+ // half_sum[63:32] = a3+a4
+ const __m128i half_sum = _mm_madd_epi16(vals, ones);
+ // Place half_sum[63:32] in shift_sum[31:0].
+ const __m128i shift_sum = _mm_srli_si128(half_sum, 4);
+ return _mm_add_epi32(half_sum, shift_sum);
+}
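The madd-with-ones trick above is simply a horizontal add. A scalar equivalent, shown only to make the lane arithmetic explicit (the name is illustrative):

// Scalar reference for DcSum4_SSE4_1: _mm_madd_epi16 with a vector of ones
// produces the pairwise sums (a1+a2, a3+a4); the shift and add then combine
// the two partial sums into the final total.
inline int32_t DcSum4Scalar(const uint16_t* const ref) {
  return static_cast<int32_t>(ref[0]) + ref[1] + ref[2] + ref[3];
}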
+
+struct DcDefs {
+ DcDefs() = delete;
+
+ using _4x4 = DcPredFuncs_SSE4_1<2, 2, DcSum4_SSE4_1, DcSum4_SSE4_1,
+ DcStore4xH_SSE4_1<4>, 0, 0>;
+};
+
+struct DirDefs {
+ DirDefs() = delete;
+
+ using _4x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate4x4>>;
+ using _4x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate4x4>>;
+ using _4x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate4x4>>;
+ using _8x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate8x4>>;
+ using _8x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate8x4>>;
+ using _8x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate8x4>>;
+ using _8x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate8x4>>;
+ using _16x4 =
+ DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate16x4>>;
+ using _16x8 =
+ DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate16x4>>;
+ using _16x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate16x4>>;
+ using _16x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate16x4>>;
+ using _16x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate16x4>>;
+ using _32x8 =
+ DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate32x4>>;
+ using _32x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate32x4>>;
+ using _32x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate32x4>>;
+ using _32x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate32x4>>;
+ using _64x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate64x4>>;
+ using _64x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate64x4>>;
+ using _64x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate64x4>>;
+};
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ DcDefs::_4x4::DcTop;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ DcDefs::_4x4::DcLeft;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+ DcDefs::_4x4::Dc;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+ DirDefs::_4x4::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ DirDefs::_4x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ DirDefs::_4x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+ DirDefs::_8x4::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ DirDefs::_8x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+ DirDefs::_8x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ DirDefs::_8x32::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+ DirDefs::_16x4::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ DirDefs::_16x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+ DirDefs::_16x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+ DirDefs::_16x32::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+ DirDefs::_16x64::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+ DirDefs::_32x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+ DirDefs::_32x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+ DirDefs::_32x32::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ DirDefs::_32x64::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+ DirDefs::_64x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+ DirDefs::_64x32::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+ DirDefs::_64x64::Horizontal;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intrapred_sse4.h b/src/dsp/x86/intrapred_sse4.h
new file mode 100644
index 0000000..7f4fcd7
--- /dev/null
+++ b/src/dsp/x86/intrapred_sse4.h
@@ -0,0 +1,1060 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*,
+// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and
+// Dsp::filter_intra_predictor; see the defines below for specifics. These
+// functions are not thread-safe.
+void IntraPredInit_SSE4_1();
+void IntraPredCflInit_SSE4_1();
+void IntraPredSmoothInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
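These Init functions fill in the writable Dsp table entries with SSE4.1 versions; in libgav1 they are typically invoked from the library's internal one-time DSP initialization rather than by applications. A hedged usage sketch (the wrapper function name is hypothetical):

// Sketch: register the SSE4.1 intra-prediction code paths once, before any
// decoding threads read the Dsp table (the Init functions are not
// thread-safe).
void RegisterSse41IntraPredictors() {
  libgav1::dsp::IntraPredInit_SSE4_1();
  libgav1::dsp::IntraPredCflInit_SSE4_1();
  libgav1::dsp::IntraPredSmoothInit_SSE4_1();
}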
+
+// If SSE4.1 is enabled and the baseline isn't already set (because a higher
+// level of optimization is enabled), signal that the SSE4.1 implementation
+// should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
+#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
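Each #ifndef block below implements a simple priority scheme: whichever optimization header defines the symbol first owns that function slot, so a header for a higher instruction-set level only needs to be considered earlier to take precedence. A sketch of the pattern with hypothetical symbol names (the AVX2 value is used purely as an example of a higher level):

// Hypothetical illustration only. If a higher-level header had already
// claimed the slot, the SSE4.1 definition would be skipped:
#define LIBGAV1_Dsp8bpp_SomeFunction LIBGAV1_CPU_AVX2  // claimed first
#ifndef LIBGAV1_Dsp8bpp_SomeFunction
#define LIBGAV1_Dsp8bpp_SomeFunction LIBGAV1_CPU_SSE4_1  // never reached
#endif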
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+//------------------------------------------------------------------------------
+// 10bpp
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_
diff --git a/src/dsp/x86/inverse_transform_sse4.cc b/src/dsp/x86/inverse_transform_sse4.cc
new file mode 100644
index 0000000..787d706
--- /dev/null
+++ b/src/dsp/x86/inverse_transform_sse4.cc
@@ -0,0 +1,3086 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+template <int store_width, int store_count>
+LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* dst, int32_t stride, int32_t idx,
+ const __m128i* s) {
+ // NOTE: It is expected that the compiler will unroll these loops.
+ if (store_width == 16) {
+ for (int i = 0; i < store_count; i += 4) {
+ StoreUnaligned16(&dst[i * stride + idx], s[i]);
+ StoreUnaligned16(&dst[(i + 1) * stride + idx], s[i + 1]);
+ StoreUnaligned16(&dst[(i + 2) * stride + idx], s[i + 2]);
+ StoreUnaligned16(&dst[(i + 3) * stride + idx], s[i + 3]);
+ }
+ }
+ if (store_width == 8) {
+ for (int i = 0; i < store_count; i += 4) {
+ StoreLo8(&dst[i * stride + idx], s[i]);
+ StoreLo8(&dst[(i + 1) * stride + idx], s[i + 1]);
+ StoreLo8(&dst[(i + 2) * stride + idx], s[i + 2]);
+ StoreLo8(&dst[(i + 3) * stride + idx], s[i + 3]);
+ }
+ }
+}
+
+template <int load_width, int load_count>
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int16_t* src, int32_t stride,
+ int32_t idx, __m128i* x) {
+ // NOTE: It is expected that the compiler will unroll these loops.
+ if (load_width == 16) {
+ for (int i = 0; i < load_count; i += 4) {
+ x[i] = LoadUnaligned16(&src[i * stride + idx]);
+ x[i + 1] = LoadUnaligned16(&src[(i + 1) * stride + idx]);
+ x[i + 2] = LoadUnaligned16(&src[(i + 2) * stride + idx]);
+ x[i + 3] = LoadUnaligned16(&src[(i + 3) * stride + idx]);
+ }
+ }
+ if (load_width == 8) {
+ for (int i = 0; i < load_count; i += 4) {
+ x[i] = LoadLo8(&src[i * stride + idx]);
+ x[i + 1] = LoadLo8(&src[(i + 1) * stride + idx]);
+ x[i + 2] = LoadLo8(&src[(i + 2) * stride + idx]);
+ x[i + 3] = LoadLo8(&src[(i + 3) * stride + idx]);
+ }
+ }
+}
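LoadSrc and StoreDst are parameterized on the byte width of each row access (16 or 8) and on how many rows to move; |stride| and |idx| are in int16_t elements. A small usage sketch (the wrapper name is illustrative):

// Sketch: round-trip an 8x8 block of contiguous int16_t coefficients through
// the helpers. Each 16-byte access covers one row of eight coefficients.
inline void CopyBlock8x8(int16_t* const block) {
  __m128i rows[8];
  LoadSrc<16, 8>(block, /*stride=*/8, /*idx=*/0, rows);
  StoreDst<16, 8>(block, /*stride=*/8, /*idx=*/0, rows);
}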
+
+// Butterfly rotate 4 values.
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(__m128i* a, __m128i* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const __m128i psin_pcos = _mm_set1_epi32(
+ static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
+ const __m128i ba = _mm_unpacklo_epi16(*a, *b);
+ const __m128i ab = _mm_unpacklo_epi16(*b, *a);
+ const __m128i sign =
+ _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001);
+ // -sin cos, -sin cos, -sin cos, -sin cos
+ const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
+ const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
+ const __m128i y0 = _mm_madd_epi16(ab, psin_pcos);
+ const __m128i x1 = RightShiftWithRounding_S32(x0, 12);
+ const __m128i y1 = RightShiftWithRounding_S32(y0, 12);
+ const __m128i x = _mm_packs_epi32(x1, x1);
+ const __m128i y = _mm_packs_epi32(y1, y1);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
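Per lane, the butterfly rotation is a fixed-point 2-D rotation: Cos128/Sin128 return cos(angle*pi/128) and sin(angle*pi/128) scaled by 2^12, and the products are rounded back down by 12 bits with saturation to int16_t. A scalar sketch of the flip == false case (the function name is illustrative):

// Scalar reference for one lane of ButterflyRotation_4/_8:
//   x = round((a * cos - b * sin) / 4096), y = round((a * sin + b * cos) / 4096)
// When |flip| is true the two outputs are simply swapped.
inline void ButterflyRotationScalar(int16_t* a, int16_t* b, int angle) {
  const int32_t cos128 = Cos128(angle);
  const int32_t sin128 = Sin128(angle);
  const int32_t x = (*a * cos128 - *b * sin128 + 2048) >> 12;
  const int32_t y = (*a * sin128 + *b * cos128 + 2048) >> 12;
  *a = static_cast<int16_t>(std::min(32767, std::max(-32768, x)));  // packs saturation
  *b = static_cast<int16_t>(std::min(32767, std::max(-32768, y)));
}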
+
+// Butterfly rotate 8 values.
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(__m128i* a, __m128i* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const __m128i psin_pcos = _mm_set1_epi32(
+ static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
+ const __m128i sign =
+ _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001);
+ // -sin cos, -sin cos, -sin cos, -sin cos
+ const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
+ const __m128i ba = _mm_unpacklo_epi16(*a, *b);
+ const __m128i ab = _mm_unpacklo_epi16(*b, *a);
+ const __m128i ba_hi = _mm_unpackhi_epi16(*a, *b);
+ const __m128i ab_hi = _mm_unpackhi_epi16(*b, *a);
+ const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
+ const __m128i y0 = _mm_madd_epi16(ab, psin_pcos);
+ const __m128i x0_hi = _mm_madd_epi16(ba_hi, msin_pcos);
+ const __m128i y0_hi = _mm_madd_epi16(ab_hi, psin_pcos);
+ const __m128i x1 = RightShiftWithRounding_S32(x0, 12);
+ const __m128i y1 = RightShiftWithRounding_S32(y0, 12);
+ const __m128i x1_hi = RightShiftWithRounding_S32(x0_hi, 12);
+ const __m128i y1_hi = RightShiftWithRounding_S32(y0_hi, 12);
+ const __m128i x = _mm_packs_epi32(x1, x1_hi);
+ const __m128i y = _mm_packs_epi32(y1, y1_hi);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(__m128i* a, __m128i* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const __m128i pcos = _mm_set1_epi16(cos128 << 3);
+ const __m128i psin = _mm_set1_epi16(-(sin128 << 3));
+ const __m128i x = _mm_mulhrs_epi16(*b, psin);
+ const __m128i y = _mm_mulhrs_epi16(*b, pcos);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(__m128i* a,
+ __m128i* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const __m128i pcos = _mm_set1_epi16(cos128 << 3);
+ const __m128i psin = _mm_set1_epi16(sin128 << 3);
+ const __m128i x = _mm_mulhrs_epi16(*a, pcos);
+ const __m128i y = _mm_mulhrs_epi16(*a, psin);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
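Both *IsZero variants lean on an identity of _mm_mulhrs_epi16: with the 12-bit trig constant pre-shifted left by 3, the instruction's (v * c * 2 + 2^15) >> 16 computation becomes an exact rounded shift by 12. In scalar form (the helper name is illustrative; the final truncation to 16 bits mirrors the instruction):

// Scalar equivalent of _mm_mulhrs_epi16(v, _mm_set1_epi16(c << 3)) when |c|
// is a 12-bit Cos128/Sin128 constant:
//   ((v * (c << 3)) * 2 + (1 << 15)) >> 16  ==  (v * c + (1 << 11)) >> 12.
inline int16_t MulhrsByShiftedConstant(int16_t v, int16_t c) {
  return static_cast<int16_t>((v * c + (1 << 11)) >> 12);
}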
+
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(__m128i* a, __m128i* b, bool flip) {
+ __m128i x, y;
+ if (flip) {
+ y = _mm_adds_epi16(*b, *a);
+ x = _mm_subs_epi16(*b, *a);
+ } else {
+ x = _mm_adds_epi16(*a, *b);
+ y = _mm_subs_epi16(*a, *b);
+ }
+ *a = x;
+ *b = y;
+}
+
+using ButterflyRotationFunc = void (*)(__m128i* a, __m128i* b, int angle,
+ bool flip);
+
+LIBGAV1_ALWAYS_INLINE __m128i ShiftResidual(const __m128i residual,
+ const __m128i v_row_shift_add,
+ const __m128i v_row_shift) {
+ const __m128i k7ffd = _mm_set1_epi16(0x7ffd);
+ // The max row_shift is 2, so int16_t values greater than 0x7ffd may
+ // overflow. Generate a mask for this case.
+ const __m128i mask = _mm_cmpgt_epi16(residual, k7ffd);
+ const __m128i x = _mm_add_epi16(residual, v_row_shift_add);
+ // Assume int16_t values.
+ const __m128i a = _mm_sra_epi16(x, v_row_shift);
+ // Assume uint16_t values.
+ const __m128i b = _mm_srl_epi16(x, v_row_shift);
+ // Select the correct shifted value.
+ return _mm_blendv_epi8(a, b, mask);
+}
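In scalar terms, ShiftResidual applies a rounded right shift to each residual but switches from an arithmetic to a logical shift for the few large positive values where adding the rounding bias would wrap the 16-bit lane negative. A sketch of one lane (names are illustrative; |shift_add| is the rounding bias the caller pairs with |row_shift|):

// Scalar reference for one lane of ShiftResidual (row_shift <= 2).
inline int16_t ShiftResidualScalar(int16_t residual, uint16_t shift_add,
                                   int row_shift) {
  const uint16_t x = static_cast<uint16_t>(residual) + shift_add;  // wrapping add
  if (residual > 0x7ffd) {
    return static_cast<int16_t>(x >> row_shift);  // logical shift (srl path)
  }
  return static_cast<int16_t>(static_cast<int16_t>(x) >> row_shift);  // sra path
}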
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
+template <int width>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src_lo = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
+ const __m128i v_src =
+ (width == 4) ? v_src_lo : _mm_shuffle_epi32(v_src_lo, 0);
+ const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+ const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask);
+ const int16_t cos128 = Cos128(32);
+ const __m128i xy = _mm_mulhrs_epi16(s0, _mm_set1_epi16(cos128 << 3));
+
+ // Expand to 32 bits to prevent int16_t overflows during the shift add.
+ const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+ const __m128i a = _mm_cvtepi16_epi32(xy);
+ const __m128i a1 = _mm_cvtepi16_epi32(_mm_srli_si128(xy, 8));
+ const __m128i b = _mm_add_epi32(a, v_row_shift_add);
+ const __m128i b1 = _mm_add_epi32(a1, v_row_shift_add);
+ const __m128i c = _mm_sra_epi32(b, v_row_shift);
+ const __m128i c1 = _mm_sra_epi32(b1, v_row_shift);
+ const __m128i xy_shifted = _mm_packs_epi32(c, c1);
+
+ if (width == 4) {
+ StoreLo8(dst, xy_shifted);
+ } else {
+ for (int i = 0; i < width; i += 8) {
+ StoreUnaligned16(dst, xy_shifted);
+ dst += 8;
+ }
+ }
+ return true;
+}
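DctDcOnly handles the adjusted_tx_height <= 1 case, where only the DC coefficient contributes: the row pass collapses to an optional row-rounding multiply, a scale by cos(pi/4), and the row shift, with the result broadcast across the row. A scalar sketch of the broadcast value (the function name is illustrative; kTransformRowMultiplier and Cos128 come from inverse_transform.inc, and the final saturation of the SIMD pack is omitted):

// Scalar reference for the value DctDcOnly writes across the first row
// (row_shift <= 2, so the rounding bias equals (1 << row_shift) >> 1).
inline int16_t DctDcOnlyValueScalar(int16_t dc, bool should_round,
                                    int row_shift) {
  int32_t v = dc;
  if (should_round) {
    v = (v * kTransformRowMultiplier + (1 << 11)) >> 12;
  }
  v = (v * Cos128(32) + (1 << 11)) >> 12;  // cos(pi/4) in 12-bit fixed point
  return static_cast<int16_t>((v + ((1 << row_shift) >> 1)) >> row_shift);
}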
+
+template <int height>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16_t cos128 = Cos128(32);
+
+ // Calculate dc values for first row.
+ if (width == 4) {
+ const __m128i v_src = LoadLo8(dst);
+ const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
+ StoreLo8(dst, xy);
+ } else {
+ int i = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&dst[i]);
+ const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
+ StoreUnaligned16(&dst[i], xy);
+ i += 8;
+ } while (i < width);
+ }
+
+ // Copy first row to the rest of the block.
+ for (int y = 1; y < height; ++y) {
+ memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
+ }
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct4Stages(__m128i* s) {
+ // stage 12.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
+ ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
+ } else {
+ butterfly_rotation(&s[0], &s[1], 32, true);
+ butterfly_rotation(&s[2], &s[3], 48, false);
+ }
+
+ // stage 17.
+ HadamardRotation(&s[0], &s[3], false);
+ HadamardRotation(&s[1], &s[2], false);
+}
+
+// Process 4 dct4 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct4_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[4], x[4];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[8];
+ LoadSrc<8, 8>(dst, step, 0, input);
+ Transpose4x8To8x4_U16(input, x);
+ } else {
+ LoadSrc<16, 4>(dst, step, 0, x);
+ }
+ } else {
+ LoadSrc<8, 4>(dst, step, 0, x);
+ if (transpose) {
+ Transpose4x4_U16(x, x);
+ }
+ }
+ // stage 1.
+ // kBitReverseLookup 0, 2, 1, 3
+ s[0] = x[0];
+ s[1] = x[2];
+ s[2] = x[1];
+ s[3] = x[3];
+
+ Dct4Stages<butterfly_rotation>(s);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[8];
+ Transpose8x4To4x8_U16(s, output);
+ StoreDst<8, 8>(dst, step, 0, output);
+ } else {
+ StoreDst<16, 4>(dst, step, 0, s);
+ }
+ } else {
+ if (transpose) {
+ Transpose4x4_U16(s, s);
+ }
+ StoreDst<8, 4>(dst, step, 0, s);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct8Stages(__m128i* s) {
+ // stage 8.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
+ ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
+ } else {
+ butterfly_rotation(&s[4], &s[7], 56, false);
+ butterfly_rotation(&s[5], &s[6], 24, false);
+ }
+
+ // stage 13.
+ HadamardRotation(&s[4], &s[5], false);
+ HadamardRotation(&s[6], &s[7], true);
+
+ // stage 18.
+ butterfly_rotation(&s[6], &s[5], 32, true);
+
+ // stage 22.
+ HadamardRotation(&s[0], &s[7], false);
+ HadamardRotation(&s[1], &s[6], false);
+ HadamardRotation(&s[2], &s[5], false);
+ HadamardRotation(&s[3], &s[4], false);
+}
+
+// Process dct8 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct8_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[8], x[8];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8_U16(input, x);
+ } else {
+ LoadSrc<8, 8>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, 0, input);
+ Transpose8x8_U16(input, x);
+ } else {
+ LoadSrc<16, 8>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1.
+ // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
+ s[0] = x[0];
+ s[1] = x[4];
+ s[2] = x[2];
+ s[3] = x[6];
+ s[4] = x[1];
+ s[5] = x[5];
+ s[6] = x[3];
+ s[7] = x[7];
+
+ Dct4Stages<butterfly_rotation>(s);
+ Dct8Stages<butterfly_rotation>(s);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[4];
+ Transpose4x8To8x4_U16(s, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ } else {
+ StoreDst<8, 8>(dst, step, 0, s);
+ }
+ } else {
+ if (transpose) {
+ __m128i output[8];
+ Transpose8x8_U16(s, output);
+ StoreDst<16, 8>(dst, step, 0, output);
+ } else {
+ StoreDst<16, 8>(dst, step, 0, s);
+ }
+ }
+}
+
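+// Stages operating on s[8..15]; the final Hadamard stage merges them with the
+// s[0..7] results produced by Dct4Stages and Dct8Stages.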
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct16Stages(__m128i* s) {
+ // stage 5.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
+ ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
+ ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
+ ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
+ } else {
+ butterfly_rotation(&s[8], &s[15], 60, false);
+ butterfly_rotation(&s[9], &s[14], 28, false);
+ butterfly_rotation(&s[10], &s[13], 44, false);
+ butterfly_rotation(&s[11], &s[12], 12, false);
+ }
+
+ // stage 9.
+ HadamardRotation(&s[8], &s[9], false);
+ HadamardRotation(&s[10], &s[11], true);
+ HadamardRotation(&s[12], &s[13], false);
+ HadamardRotation(&s[14], &s[15], true);
+
+ // stage 14.
+ butterfly_rotation(&s[14], &s[9], 48, true);
+ butterfly_rotation(&s[13], &s[10], 112, true);
+
+ // stage 19.
+ HadamardRotation(&s[8], &s[11], false);
+ HadamardRotation(&s[9], &s[10], false);
+ HadamardRotation(&s[12], &s[15], true);
+ HadamardRotation(&s[13], &s[14], true);
+
+ // stage 23.
+ butterfly_rotation(&s[13], &s[10], 32, true);
+ butterfly_rotation(&s[12], &s[11], 32, true);
+
+ // stage 26.
+ HadamardRotation(&s[0], &s[15], false);
+ HadamardRotation(&s[1], &s[14], false);
+ HadamardRotation(&s[2], &s[13], false);
+ HadamardRotation(&s[3], &s[12], false);
+ HadamardRotation(&s[4], &s[11], false);
+ HadamardRotation(&s[5], &s[10], false);
+ HadamardRotation(&s[6], &s[9], false);
+ HadamardRotation(&s[7], &s[8], false);
+}
+
+// Process dct16 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct16_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[16], x[16];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8_U16(input, x);
+ LoadSrc<16, 4>(dst, step, 8, input);
+ Transpose8x4To4x8_U16(input, &x[8]);
+ } else {
+ LoadSrc<8, 16>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, idx, input);
+ Transpose8x8_U16(input, &x[idx]);
+ }
+ } else {
+ LoadSrc<16, 16>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1
+ // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+ s[0] = x[0];
+ s[1] = x[8];
+ s[2] = x[4];
+ s[3] = x[12];
+ s[4] = x[2];
+ s[5] = x[10];
+ s[6] = x[6];
+ s[7] = x[14];
+ s[8] = x[1];
+ s[9] = x[9];
+ s[10] = x[5];
+ s[11] = x[13];
+ s[12] = x[3];
+ s[13] = x[11];
+ s[14] = x[7];
+ s[15] = x[15];
+
+ Dct4Stages<butterfly_rotation>(s);
+ Dct8Stages<butterfly_rotation>(s);
+ Dct16Stages<butterfly_rotation>(s);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[4];
+ Transpose4x8To8x4_U16(s, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ Transpose4x8To8x4_U16(&s[8], output);
+ StoreDst<16, 4>(dst, step, 8, output);
+ } else {
+ StoreDst<8, 16>(dst, step, 0, s);
+ }
+ } else {
+ if (transpose) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ __m128i output[8];
+ Transpose8x8_U16(&s[idx], output);
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 16>(dst, step, 0, s);
+ }
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct32Stages(__m128i* s) {
+ // stage 3
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
+ ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
+ ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
+ ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
+ ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
+ ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
+ ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
+ ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
+ } else {
+ butterfly_rotation(&s[16], &s[31], 62, false);
+ butterfly_rotation(&s[17], &s[30], 30, false);
+ butterfly_rotation(&s[18], &s[29], 46, false);
+ butterfly_rotation(&s[19], &s[28], 14, false);
+ butterfly_rotation(&s[20], &s[27], 54, false);
+ butterfly_rotation(&s[21], &s[26], 22, false);
+ butterfly_rotation(&s[22], &s[25], 38, false);
+ butterfly_rotation(&s[23], &s[24], 6, false);
+ }
+ // stage 6.
+ HadamardRotation(&s[16], &s[17], false);
+ HadamardRotation(&s[18], &s[19], true);
+ HadamardRotation(&s[20], &s[21], false);
+ HadamardRotation(&s[22], &s[23], true);
+ HadamardRotation(&s[24], &s[25], false);
+ HadamardRotation(&s[26], &s[27], true);
+ HadamardRotation(&s[28], &s[29], false);
+ HadamardRotation(&s[30], &s[31], true);
+
+ // stage 10.
+ butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+ butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+ butterfly_rotation(&s[26], &s[21], 24, true);
+ butterfly_rotation(&s[25], &s[22], 24 + 64, true);
+
+ // stage 15.
+ HadamardRotation(&s[16], &s[19], false);
+ HadamardRotation(&s[17], &s[18], false);
+ HadamardRotation(&s[20], &s[23], true);
+ HadamardRotation(&s[21], &s[22], true);
+ HadamardRotation(&s[24], &s[27], false);
+ HadamardRotation(&s[25], &s[26], false);
+ HadamardRotation(&s[28], &s[31], true);
+ HadamardRotation(&s[29], &s[30], true);
+
+ // stage 20.
+ butterfly_rotation(&s[29], &s[18], 48, true);
+ butterfly_rotation(&s[28], &s[19], 48, true);
+ butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+ butterfly_rotation(&s[26], &s[21], 48 + 64, true);
+
+ // stage 24.
+ HadamardRotation(&s[16], &s[23], false);
+ HadamardRotation(&s[17], &s[22], false);
+ HadamardRotation(&s[18], &s[21], false);
+ HadamardRotation(&s[19], &s[20], false);
+ HadamardRotation(&s[24], &s[31], true);
+ HadamardRotation(&s[25], &s[30], true);
+ HadamardRotation(&s[26], &s[29], true);
+ HadamardRotation(&s[27], &s[28], true);
+
+ // stage 27.
+ butterfly_rotation(&s[27], &s[20], 32, true);
+ butterfly_rotation(&s[26], &s[21], 32, true);
+ butterfly_rotation(&s[25], &s[22], 32, true);
+ butterfly_rotation(&s[24], &s[23], 32, true);
+
+ // stage 29.
+ HadamardRotation(&s[0], &s[31], false);
+ HadamardRotation(&s[1], &s[30], false);
+ HadamardRotation(&s[2], &s[29], false);
+ HadamardRotation(&s[3], &s[28], false);
+ HadamardRotation(&s[4], &s[27], false);
+ HadamardRotation(&s[5], &s[26], false);
+ HadamardRotation(&s[6], &s[25], false);
+ HadamardRotation(&s[7], &s[24], false);
+ HadamardRotation(&s[8], &s[23], false);
+ HadamardRotation(&s[9], &s[22], false);
+ HadamardRotation(&s[10], &s[21], false);
+ HadamardRotation(&s[11], &s[20], false);
+ HadamardRotation(&s[12], &s[19], false);
+ HadamardRotation(&s[13], &s[18], false);
+ HadamardRotation(&s[14], &s[17], false);
+ HadamardRotation(&s[15], &s[16], false);
+}
+
+// Process dct32 rows or columns, depending on the transpose flag.
+LIBGAV1_ALWAYS_INLINE void Dct32_SSE4_1(void* dest, const int32_t step,
+ const bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[32], x[32];
+
+ if (transpose) {
+ for (int idx = 0; idx < 32; idx += 8) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, idx, input);
+ Transpose8x8_U16(input, &x[idx]);
+ }
+ } else {
+ LoadSrc<16, 32>(dst, step, 0, x);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+ s[0] = x[0];
+ s[1] = x[16];
+ s[2] = x[8];
+ s[3] = x[24];
+ s[4] = x[4];
+ s[5] = x[20];
+ s[6] = x[12];
+ s[7] = x[28];
+ s[8] = x[2];
+ s[9] = x[18];
+ s[10] = x[10];
+ s[11] = x[26];
+ s[12] = x[6];
+ s[13] = x[22];
+ s[14] = x[14];
+ s[15] = x[30];
+
+ // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+ s[16] = x[1];
+ s[17] = x[17];
+ s[18] = x[9];
+ s[19] = x[25];
+ s[20] = x[5];
+ s[21] = x[21];
+ s[22] = x[13];
+ s[23] = x[29];
+ s[24] = x[3];
+ s[25] = x[19];
+ s[26] = x[11];
+ s[27] = x[27];
+ s[28] = x[7];
+ s[29] = x[23];
+ s[30] = x[15];
+ s[31] = x[31];
+
+ Dct4Stages<ButterflyRotation_8>(s);
+ Dct8Stages<ButterflyRotation_8>(s);
+ Dct16Stages<ButterflyRotation_8>(s);
+ Dct32Stages<ButterflyRotation_8>(s);
+
+ if (transpose) {
+ for (int idx = 0; idx < 32; idx += 8) {
+ __m128i output[8];
+ Transpose8x8_U16(&s[idx], output);
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 32>(dst, step, 0, s);
+ }
+}
+
+// Allow the compiler to call this function instead of force inlining. Tests
+// show this is slightly faster.
+void Dct64_SSE4_1(void* dest, int32_t step, bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[64], x[32];
+
+ if (transpose) {
+ // The last 32 values of every row are always zero if the |tx_width| is
+ // 64.
+ for (int idx = 0; idx < 32; idx += 8) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, idx, input);
+ Transpose8x8_U16(input, &x[idx]);
+ }
+ } else {
+ // The last 32 values of every column are always zero if the |tx_height| is
+ // 64.
+ LoadSrc<16, 32>(dst, step, 0, x);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+ s[0] = x[0];
+ s[2] = x[16];
+ s[4] = x[8];
+ s[6] = x[24];
+ s[8] = x[4];
+ s[10] = x[20];
+ s[12] = x[12];
+ s[14] = x[28];
+
+ // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+ s[16] = x[2];
+ s[18] = x[18];
+ s[20] = x[10];
+ s[22] = x[26];
+ s[24] = x[6];
+ s[26] = x[22];
+ s[28] = x[14];
+ s[30] = x[30];
+
+ // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+ s[32] = x[1];
+ s[34] = x[17];
+ s[36] = x[9];
+ s[38] = x[25];
+ s[40] = x[5];
+ s[42] = x[21];
+ s[44] = x[13];
+ s[46] = x[29];
+
+ // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
+ s[48] = x[3];
+ s[50] = x[19];
+ s[52] = x[11];
+ s[54] = x[27];
+ s[56] = x[7];
+ s[58] = x[23];
+ s[60] = x[15];
+ s[62] = x[31];
+
+ Dct4Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+ Dct8Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+ Dct16Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+ Dct32Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+
+ //-- start dct 64 stages
+ // stage 2.
+ ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
+ ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
+ ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
+ ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
+ ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
+ ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
+ ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
+ ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
+ ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
+ ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
+ ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
+ ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
+ ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
+ ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
+ ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
+ ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
+
+ // stage 4.
+ HadamardRotation(&s[32], &s[33], false);
+ HadamardRotation(&s[34], &s[35], true);
+ HadamardRotation(&s[36], &s[37], false);
+ HadamardRotation(&s[38], &s[39], true);
+ HadamardRotation(&s[40], &s[41], false);
+ HadamardRotation(&s[42], &s[43], true);
+ HadamardRotation(&s[44], &s[45], false);
+ HadamardRotation(&s[46], &s[47], true);
+ HadamardRotation(&s[48], &s[49], false);
+ HadamardRotation(&s[50], &s[51], true);
+ HadamardRotation(&s[52], &s[53], false);
+ HadamardRotation(&s[54], &s[55], true);
+ HadamardRotation(&s[56], &s[57], false);
+ HadamardRotation(&s[58], &s[59], true);
+ HadamardRotation(&s[60], &s[61], false);
+ HadamardRotation(&s[62], &s[63], true);
+
+ // stage 7.
+ ButterflyRotation_8(&s[62], &s[33], 60 - 0, true);
+ ButterflyRotation_8(&s[61], &s[34], 60 - 0 + 64, true);
+ ButterflyRotation_8(&s[58], &s[37], 60 - 32, true);
+ ButterflyRotation_8(&s[57], &s[38], 60 - 32 + 64, true);
+ ButterflyRotation_8(&s[54], &s[41], 60 - 16, true);
+ ButterflyRotation_8(&s[53], &s[42], 60 - 16 + 64, true);
+ ButterflyRotation_8(&s[50], &s[45], 60 - 48, true);
+ ButterflyRotation_8(&s[49], &s[46], 60 - 48 + 64, true);
+
+ // stage 11.
+ HadamardRotation(&s[32], &s[35], false);
+ HadamardRotation(&s[33], &s[34], false);
+ HadamardRotation(&s[36], &s[39], true);
+ HadamardRotation(&s[37], &s[38], true);
+ HadamardRotation(&s[40], &s[43], false);
+ HadamardRotation(&s[41], &s[42], false);
+ HadamardRotation(&s[44], &s[47], true);
+ HadamardRotation(&s[45], &s[46], true);
+ HadamardRotation(&s[48], &s[51], false);
+ HadamardRotation(&s[49], &s[50], false);
+ HadamardRotation(&s[52], &s[55], true);
+ HadamardRotation(&s[53], &s[54], true);
+ HadamardRotation(&s[56], &s[59], false);
+ HadamardRotation(&s[57], &s[58], false);
+ HadamardRotation(&s[60], &s[63], true);
+ HadamardRotation(&s[61], &s[62], true);
+
+ // stage 16.
+ ButterflyRotation_8(&s[61], &s[34], 56, true);
+ ButterflyRotation_8(&s[60], &s[35], 56, true);
+ ButterflyRotation_8(&s[59], &s[36], 56 + 64, true);
+ ButterflyRotation_8(&s[58], &s[37], 56 + 64, true);
+ ButterflyRotation_8(&s[53], &s[42], 56 - 32, true);
+ ButterflyRotation_8(&s[52], &s[43], 56 - 32, true);
+ ButterflyRotation_8(&s[51], &s[44], 56 - 32 + 64, true);
+ ButterflyRotation_8(&s[50], &s[45], 56 - 32 + 64, true);
+
+ // stage 21.
+ HadamardRotation(&s[32], &s[39], false);
+ HadamardRotation(&s[33], &s[38], false);
+ HadamardRotation(&s[34], &s[37], false);
+ HadamardRotation(&s[35], &s[36], false);
+ HadamardRotation(&s[40], &s[47], true);
+ HadamardRotation(&s[41], &s[46], true);
+ HadamardRotation(&s[42], &s[45], true);
+ HadamardRotation(&s[43], &s[44], true);
+ HadamardRotation(&s[48], &s[55], false);
+ HadamardRotation(&s[49], &s[54], false);
+ HadamardRotation(&s[50], &s[53], false);
+ HadamardRotation(&s[51], &s[52], false);
+ HadamardRotation(&s[56], &s[63], true);
+ HadamardRotation(&s[57], &s[62], true);
+ HadamardRotation(&s[58], &s[61], true);
+ HadamardRotation(&s[59], &s[60], true);
+
+ // stage 25.
+ ButterflyRotation_8(&s[59], &s[36], 48, true);
+ ButterflyRotation_8(&s[58], &s[37], 48, true);
+ ButterflyRotation_8(&s[57], &s[38], 48, true);
+ ButterflyRotation_8(&s[56], &s[39], 48, true);
+ ButterflyRotation_8(&s[55], &s[40], 112, true);
+ ButterflyRotation_8(&s[54], &s[41], 112, true);
+ ButterflyRotation_8(&s[53], &s[42], 112, true);
+ ButterflyRotation_8(&s[52], &s[43], 112, true);
+
+ // stage 28.
+ HadamardRotation(&s[32], &s[47], false);
+ HadamardRotation(&s[33], &s[46], false);
+ HadamardRotation(&s[34], &s[45], false);
+ HadamardRotation(&s[35], &s[44], false);
+ HadamardRotation(&s[36], &s[43], false);
+ HadamardRotation(&s[37], &s[42], false);
+ HadamardRotation(&s[38], &s[41], false);
+ HadamardRotation(&s[39], &s[40], false);
+ HadamardRotation(&s[48], &s[63], true);
+ HadamardRotation(&s[49], &s[62], true);
+ HadamardRotation(&s[50], &s[61], true);
+ HadamardRotation(&s[51], &s[60], true);
+ HadamardRotation(&s[52], &s[59], true);
+ HadamardRotation(&s[53], &s[58], true);
+ HadamardRotation(&s[54], &s[57], true);
+ HadamardRotation(&s[55], &s[56], true);
+
+ // stage 30.
+ ButterflyRotation_8(&s[55], &s[40], 32, true);
+ ButterflyRotation_8(&s[54], &s[41], 32, true);
+ ButterflyRotation_8(&s[53], &s[42], 32, true);
+ ButterflyRotation_8(&s[52], &s[43], 32, true);
+ ButterflyRotation_8(&s[51], &s[44], 32, true);
+ ButterflyRotation_8(&s[50], &s[45], 32, true);
+ ButterflyRotation_8(&s[49], &s[46], 32, true);
+ ButterflyRotation_8(&s[48], &s[47], 32, true);
+
+ // stage 31.
+ for (int i = 0; i < 32; i += 4) {
+ HadamardRotation(&s[i], &s[63 - i], false);
+ HadamardRotation(&s[i + 1], &s[63 - i - 1], false);
+ HadamardRotation(&s[i + 2], &s[63 - i - 2], false);
+ HadamardRotation(&s[i + 3], &s[63 - i - 3], false);
+ }
+ //-- end dct 64 stages
+
+ if (transpose) {
+ for (int idx = 0; idx < 64; idx += 8) {
+ __m128i output[8];
+ Transpose8x8_U16(&s[idx], output);
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 64>(dst, step, 0, s);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
+
+template <bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst4_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[8], x[4];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[8];
+ LoadSrc<8, 8>(dst, step, 0, input);
+ Transpose4x8To8x4_U16(input, x);
+ } else {
+ LoadSrc<16, 4>(dst, step, 0, x);
+ }
+ } else {
+ LoadSrc<8, 4>(dst, step, 0, x);
+ if (transpose) {
+ Transpose4x4_U16(x, x);
+ }
+ }
+
+ const __m128i kAdst4Multiplier_1 = _mm_set1_epi16(kAdst4Multiplier[1]);
+ const __m128i kAdst4Multiplier_2 = _mm_set1_epi16(kAdst4Multiplier[2]);
+ const __m128i kAdst4Multiplier_3 = _mm_set1_epi16(kAdst4Multiplier[3]);
+ const __m128i kAdst4Multiplier_m0_1 =
+ _mm_set1_epi32(static_cast<uint16_t>(kAdst4Multiplier[1]) |
+ (static_cast<uint32_t>(-kAdst4Multiplier[0]) << 16));
+ const __m128i kAdst4Multiplier_3_0 =
+ _mm_set1_epi32(static_cast<uint16_t>(kAdst4Multiplier[0]) |
+ (static_cast<uint32_t>(kAdst4Multiplier[3]) << 16));
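+ // Packing two multipliers per 32-bit lane lets a single _mm_madd_epi16 on
+ // the interleaved inputs compute x0 * k[0] + x2 * k[3] (for s[0]) and
+ // x0 * k[1] - x2 * k[0] (for s[1]).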
+
+ // stage 1.
+ const __m128i x3_x0 = _mm_unpacklo_epi16(x[0], x[3]);
+ const __m128i x2_x0 = _mm_unpacklo_epi16(x[0], x[2]);
+ const __m128i zero_x1 = _mm_cvtepu16_epi32(x[1]);
+ const __m128i zero_x2 = _mm_cvtepu16_epi32(x[2]);
+ const __m128i zero_x3 = _mm_cvtepu16_epi32(x[3]);
+
+ s[5] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_1);
+ s[6] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_3);
+
+ // stage 2.
+ // ((src[0] - src[2]) + src[3]) * kAdst4Multiplier[2]
+ const __m128i k2_x3_x0 = _mm_madd_epi16(x3_x0, kAdst4Multiplier_2);
+ const __m128i k2_zero_x2 = _mm_madd_epi16(zero_x2, kAdst4Multiplier_2);
+ const __m128i b7 = _mm_sub_epi32(k2_x3_x0, k2_zero_x2);
+
+ // stage 3.
+ s[0] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_3_0);
+ s[1] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_m0_1);
+ s[2] = b7;
+ s[3] = _mm_madd_epi16(zero_x1, kAdst4Multiplier_2);
+
+ // stage 4.
+ s[0] = _mm_add_epi32(s[0], s[5]);
+ s[1] = _mm_sub_epi32(s[1], s[6]);
+
+ // stages 5 and 6.
+ x[0] = _mm_add_epi32(s[0], s[3]);
+ x[1] = _mm_add_epi32(s[1], s[3]);
+ x[2] = _mm_add_epi32(s[0], s[1]);
+ x[3] = _mm_sub_epi32(x[2], s[3]);
+
+ x[0] = RightShiftWithRounding_S32(x[0], 12);
+ x[1] = RightShiftWithRounding_S32(x[1], 12);
+ x[2] = RightShiftWithRounding_S32(s[2], 12);
+ x[3] = RightShiftWithRounding_S32(x[3], 12);
+
+ x[0] = _mm_packs_epi32(x[0], x[1]);
+ x[2] = _mm_packs_epi32(x[2], x[3]);
+ x[1] = _mm_srli_si128(x[0], 8);
+ x[3] = _mm_srli_si128(x[2], 8);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[8];
+ Transpose8x4To4x8_U16(x, output);
+ StoreDst<8, 8>(dst, step, 0, output);
+ } else {
+ StoreDst<16, 4>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ Transpose4x4_U16(x, x);
+ }
+ StoreDst<8, 4>(dst, step, 0, x);
+ }
+}
+
+constexpr int16_t kAdst4DcOnlyMultiplier[8] = {1321, 0, 2482, 0,
+ 3344, 0, 2482, 1321};
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src =
+ _mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0), 0);
+ const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+ const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask);
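+ // |v_mask| is all ones when |should_round| is true, so the blend selects
+ // the pre-rounded source; otherwise the original coefficient passes
+ // through. The same selection pattern is used by the other DcOnly helpers
+ // below.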
+ const __m128i v_kAdst4DcOnlyMultipliers =
+ LoadUnaligned16(kAdst4DcOnlyMultiplier);
+ // s0*k0 s0*k1 s0*k2 s0*k1
+ // +
+ // s0*0 s0*0 s0*0 s0*k0
+ const __m128i x3 = _mm_madd_epi16(s0, v_kAdst4DcOnlyMultipliers);
+ const __m128i dst_0 = RightShiftWithRounding_S32(x3, 12);
+ const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+ const __m128i a = _mm_add_epi32(dst_0, v_row_shift_add);
+ const __m128i b = _mm_sra_epi32(a, v_row_shift);
+ const __m128i c = _mm_packs_epi32(b, b);
+ StoreLo8(dst, c);
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int i = 0;
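+ // Process the single nonzero input row four columns at a time. Each input
+ // value c expands to k[0]*c, k[1]*c, k[2]*c and (k[0]+k[1])*c, each right
+ // shifted with rounding by 12 bits and written to rows 0..3 of the block.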
+ do {
+ const __m128i v_src = _mm_cvtepi16_epi32(LoadLo8(&dst[i]));
+ const __m128i kAdst4Multiplier_0 = _mm_set1_epi32(kAdst4Multiplier[0]);
+ const __m128i kAdst4Multiplier_1 = _mm_set1_epi32(kAdst4Multiplier[1]);
+ const __m128i kAdst4Multiplier_2 = _mm_set1_epi32(kAdst4Multiplier[2]);
+ const __m128i s0 = _mm_mullo_epi32(kAdst4Multiplier_0, v_src);
+ const __m128i s1 = _mm_mullo_epi32(kAdst4Multiplier_1, v_src);
+ const __m128i s2 = _mm_mullo_epi32(kAdst4Multiplier_2, v_src);
+ const __m128i x0 = s0;
+ const __m128i x1 = s1;
+ const __m128i x2 = s2;
+ const __m128i x3 = _mm_add_epi32(s0, s1);
+ const __m128i dst_0 = RightShiftWithRounding_S32(x0, 12);
+ const __m128i dst_1 = RightShiftWithRounding_S32(x1, 12);
+ const __m128i dst_2 = RightShiftWithRounding_S32(x2, 12);
+ const __m128i dst_3 = RightShiftWithRounding_S32(x3, 12);
+ const __m128i dst_0_1 = _mm_packs_epi32(dst_0, dst_1);
+ const __m128i dst_2_3 = _mm_packs_epi32(dst_2, dst_3);
+ StoreLo8(&dst[i], dst_0_1);
+ StoreHi8(&dst[i + width * 1], dst_0_1);
+ StoreLo8(&dst[i + width * 2], dst_2_3);
+ StoreHi8(&dst[i + width * 3], dst_2_3);
+ i += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst8_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[8], x[8];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8_U16(input, x);
+ } else {
+ LoadSrc<8, 8>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, 0, input);
+ Transpose8x8_U16(input, x);
+ } else {
+ LoadSrc<16, 8>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1.
+ s[0] = x[7];
+ s[1] = x[0];
+ s[2] = x[5];
+ s[3] = x[2];
+ s[4] = x[3];
+ s[5] = x[4];
+ s[6] = x[1];
+ s[7] = x[6];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 60 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 60 - 16, true);
+ butterfly_rotation(&s[4], &s[5], 60 - 32, true);
+ butterfly_rotation(&s[6], &s[7], 60 - 48, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[4], false);
+ HadamardRotation(&s[1], &s[5], false);
+ HadamardRotation(&s[2], &s[6], false);
+ HadamardRotation(&s[3], &s[7], false);
+
+ // stage 4.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[2], false);
+ HadamardRotation(&s[4], &s[6], false);
+ HadamardRotation(&s[1], &s[3], false);
+ HadamardRotation(&s[5], &s[7], false);
+
+ // stage 6.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ const __m128i v_zero = _mm_setzero_si128();
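+ // Negate with a saturating subtract from zero so that -INT16_MIN clamps to
+ // INT16_MAX instead of wrapping.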
+ x[0] = s[0];
+ x[1] = _mm_subs_epi16(v_zero, s[4]);
+ x[2] = s[6];
+ x[3] = _mm_subs_epi16(v_zero, s[2]);
+ x[4] = s[3];
+ x[5] = _mm_subs_epi16(v_zero, s[7]);
+ x[6] = s[5];
+ x[7] = _mm_subs_epi16(v_zero, s[1]);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[4];
+ Transpose4x8To8x4_U16(x, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ } else {
+ StoreDst<8, 8>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ __m128i output[8];
+ Transpose8x8_U16(x, output);
+ StoreDst<16, 8>(dst, step, 0, output);
+ } else {
+ StoreDst<16, 8>(dst, step, 0, x);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ __m128i s[8];
+
+ const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
+ const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+ // stage 1.
+ s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask);
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ __m128i x[8];
+ const __m128i v_zero = _mm_setzero_si128();
+ x[0] = s[0];
+ x[1] = _mm_subs_epi16(v_zero, s[4]);
+ x[2] = s[6];
+ x[3] = _mm_subs_epi16(v_zero, s[2]);
+ x[4] = s[3];
+ x[5] = _mm_subs_epi16(v_zero, s[7]);
+ x[6] = s[5];
+ x[7] = _mm_subs_epi16(v_zero, s[1]);
+
+ const __m128i x1_x0 = _mm_unpacklo_epi16(x[0], x[1]);
+ const __m128i x3_x2 = _mm_unpacklo_epi16(x[2], x[3]);
+ const __m128i x5_x4 = _mm_unpacklo_epi16(x[4], x[5]);
+ const __m128i x7_x6 = _mm_unpacklo_epi16(x[6], x[7]);
+ const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2);
+ const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6);
+
+ const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+ const __m128i a = _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add);
+ const __m128i a1 = _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add);
+ const __m128i b = _mm_sra_epi32(a, v_row_shift);
+ const __m128i b1 = _mm_sra_epi32(a1, v_row_shift);
+ StoreUnaligned16(dst, _mm_packs_epi32(b, b1));
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ __m128i s[8];
+
+ int i = 0;
+ do {
+ const __m128i v_src = LoadLo8(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ __m128i x[8];
+ const __m128i v_zero = _mm_setzero_si128();
+ x[0] = s[0];
+ x[1] = _mm_subs_epi16(v_zero, s[4]);
+ x[2] = s[6];
+ x[3] = _mm_subs_epi16(v_zero, s[2]);
+ x[4] = s[3];
+ x[5] = _mm_subs_epi16(v_zero, s[7]);
+ x[6] = s[5];
+ x[7] = _mm_subs_epi16(v_zero, s[1]);
+
+ for (int j = 0; j < 8; ++j) {
+ StoreLo8(&dst[j * width], x[j]);
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst16_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[16], x[16];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8_U16(input, x);
+ LoadSrc<16, 4>(dst, step, 8, input);
+ Transpose8x4To4x8_U16(input, &x[8]);
+ } else {
+ LoadSrc<8, 16>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, idx, input);
+ Transpose8x8_U16(input, &x[idx]);
+ }
+ } else {
+ LoadSrc<16, 16>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1.
+ s[0] = x[15];
+ s[1] = x[0];
+ s[2] = x[13];
+ s[3] = x[2];
+ s[4] = x[11];
+ s[5] = x[4];
+ s[6] = x[9];
+ s[7] = x[6];
+ s[8] = x[7];
+ s[9] = x[8];
+ s[10] = x[5];
+ s[11] = x[10];
+ s[12] = x[3];
+ s[13] = x[12];
+ s[14] = x[1];
+ s[15] = x[14];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 62 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 62 - 8, true);
+ butterfly_rotation(&s[4], &s[5], 62 - 16, true);
+ butterfly_rotation(&s[6], &s[7], 62 - 24, true);
+ butterfly_rotation(&s[8], &s[9], 62 - 32, true);
+ butterfly_rotation(&s[10], &s[11], 62 - 40, true);
+ butterfly_rotation(&s[12], &s[13], 62 - 48, true);
+ butterfly_rotation(&s[14], &s[15], 62 - 56, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[8], false);
+ HadamardRotation(&s[1], &s[9], false);
+ HadamardRotation(&s[2], &s[10], false);
+ HadamardRotation(&s[3], &s[11], false);
+ HadamardRotation(&s[4], &s[12], false);
+ HadamardRotation(&s[5], &s[13], false);
+ HadamardRotation(&s[6], &s[14], false);
+ HadamardRotation(&s[7], &s[15], false);
+
+ // stage 4.
+ butterfly_rotation(&s[8], &s[9], 56 - 0, true);
+ butterfly_rotation(&s[13], &s[12], 8 + 0, true);
+ butterfly_rotation(&s[10], &s[11], 56 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 8 + 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[4], false);
+ HadamardRotation(&s[8], &s[12], false);
+ HadamardRotation(&s[1], &s[5], false);
+ HadamardRotation(&s[9], &s[13], false);
+ HadamardRotation(&s[2], &s[6], false);
+ HadamardRotation(&s[10], &s[14], false);
+ HadamardRotation(&s[3], &s[7], false);
+ HadamardRotation(&s[11], &s[15], false);
+
+ // stage 6.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[12], &s[13], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 48 - 32, true);
+
+ // stage 7.
+ HadamardRotation(&s[0], &s[2], false);
+ HadamardRotation(&s[4], &s[6], false);
+ HadamardRotation(&s[8], &s[10], false);
+ HadamardRotation(&s[12], &s[14], false);
+ HadamardRotation(&s[1], &s[3], false);
+ HadamardRotation(&s[5], &s[7], false);
+ HadamardRotation(&s[9], &s[11], false);
+ HadamardRotation(&s[13], &s[15], false);
+
+ // stage 8.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+ butterfly_rotation(&s[10], &s[11], 32, true);
+ butterfly_rotation(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ const __m128i v_zero = _mm_setzero_si128();
+ x[0] = s[0];
+ x[1] = _mm_subs_epi16(v_zero, s[8]);
+ x[2] = s[12];
+ x[3] = _mm_subs_epi16(v_zero, s[4]);
+ x[4] = s[6];
+ x[5] = _mm_subs_epi16(v_zero, s[14]);
+ x[6] = s[10];
+ x[7] = _mm_subs_epi16(v_zero, s[2]);
+ x[8] = s[3];
+ x[9] = _mm_subs_epi16(v_zero, s[11]);
+ x[10] = s[15];
+ x[11] = _mm_subs_epi16(v_zero, s[7]);
+ x[12] = s[5];
+ x[13] = _mm_subs_epi16(v_zero, s[13]);
+ x[14] = s[9];
+ x[15] = _mm_subs_epi16(v_zero, s[1]);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[4];
+ Transpose4x8To8x4_U16(x, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ Transpose4x8To8x4_U16(&x[8], output);
+ StoreDst<16, 4>(dst, step, 8, output);
+ } else {
+ StoreDst<8, 16>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ __m128i output[8];
+ Transpose8x8_U16(&x[idx], output);
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 16>(dst, step, 0, x);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(__m128i* s, __m128i* x) {
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);
+
+ // stage 3.
+ s[8] = s[0];
+ s[9] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[8], &s[9], 56, true);
+
+ // stage 5.
+ s[4] = s[0];
+ s[12] = s[8];
+ s[5] = s[1];
+ s[13] = s[9];
+
+ // stage 6.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+ ButterflyRotation_4(&s[12], &s[13], 48, true);
+
+ // stage 7.
+ s[2] = s[0];
+ s[6] = s[4];
+ s[10] = s[8];
+ s[14] = s[12];
+ s[3] = s[1];
+ s[7] = s[5];
+ s[11] = s[9];
+ s[15] = s[13];
+
+ // stage 8.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+ ButterflyRotation_4(&s[10], &s[11], 32, true);
+ ButterflyRotation_4(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ const __m128i v_zero = _mm_setzero_si128();
+ x[0] = s[0];
+ x[1] = _mm_subs_epi16(v_zero, s[8]);
+ x[2] = s[12];
+ x[3] = _mm_subs_epi16(v_zero, s[4]);
+ x[4] = s[6];
+ x[5] = _mm_subs_epi16(v_zero, s[14]);
+ x[6] = s[10];
+ x[7] = _mm_subs_epi16(v_zero, s[2]);
+ x[8] = s[3];
+ x[9] = _mm_subs_epi16(v_zero, s[11]);
+ x[10] = s[15];
+ x[11] = _mm_subs_epi16(v_zero, s[7]);
+ x[12] = s[5];
+ x[13] = _mm_subs_epi16(v_zero, s[13]);
+ x[14] = s[9];
+ x[15] = _mm_subs_epi16(v_zero, s[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ __m128i s[16];
+ __m128i x[16];
+
+ const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
+ const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+ // stage 1.
+ s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask);
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int i = 0; i < 2; ++i) {
+ const __m128i x1_x0 = _mm_unpacklo_epi16(x[0 + i * 8], x[1 + i * 8]);
+ const __m128i x3_x2 = _mm_unpacklo_epi16(x[2 + i * 8], x[3 + i * 8]);
+ const __m128i x5_x4 = _mm_unpacklo_epi16(x[4 + i * 8], x[5 + i * 8]);
+ const __m128i x7_x6 = _mm_unpacklo_epi16(x[6 + i * 8], x[7 + i * 8]);
+ const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2);
+ const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6);
+
+ const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+ const __m128i a = _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add);
+ const __m128i a1 =
+ _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add);
+ const __m128i b = _mm_sra_epi32(a, v_row_shift);
+ const __m128i b1 = _mm_sra_epi32(a1, v_row_shift);
+ StoreUnaligned16(&dst[i * 8], _mm_packs_epi32(b, b1));
+ }
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
+ int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int i = 0;
+ do {
+ __m128i s[16];
+ __m128i x[16];
+ const __m128i v_src = LoadUnaligned16(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int j = 0; j < 16; ++j) {
+ StoreLo8(&dst[j * width], x[j]);
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+//------------------------------------------------------------------------------
+// Identity Transforms.
+
+template <bool is_row_shift>
+LIBGAV1_ALWAYS_INLINE void Identity4_SSE4_1(void* dest, int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ if (is_row_shift) {
+ const int shift = 1;
+ const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+ const __m128i v_multiplier_one =
+ _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001);
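+ // Interleaving the rounding constant with the source lets one
+ // _mm_madd_epi16 produce src * kIdentity4Multiplier + ((1 + 2 * shift) << 11)
+ // per 32-bit lane; the arithmetic shift by (12 + shift) then yields the
+ // rounded, row-shifted result.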
+ for (int i = 0; i < 4; i += 2) {
+ const __m128i v_src = LoadUnaligned16(&dst[i * step]);
+ const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src);
+ const __m128i v_src_round_hi = _mm_unpackhi_epi16(v_dual_round, v_src);
+ const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one);
+ const __m128i a_hi = _mm_madd_epi16(v_src_round_hi, v_multiplier_one);
+ const __m128i b = _mm_srai_epi32(a, 12 + shift);
+ const __m128i b_hi = _mm_srai_epi32(a_hi, 12 + shift);
+ StoreUnaligned16(&dst[i * step], _mm_packs_epi32(b, b_hi));
+ }
+ } else {
+ const __m128i v_multiplier =
+ _mm_set1_epi16(kIdentity4MultiplierFraction << 3);
+ for (int i = 0; i < 4; i += 2) {
+ const __m128i v_src = LoadUnaligned16(&dst[i * step]);
+ const __m128i a = _mm_mulhrs_epi16(v_src, v_multiplier);
+ const __m128i b = _mm_adds_epi16(a, v_src);
+ StoreUnaligned16(&dst[i * step], b);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int tx_height) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+ const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+ const __m128i v_src = _mm_blendv_epi8(v_src0, v_src_round, v_mask);
+
+ const int shift = (tx_height < 16) ? 0 : 1;
+ const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+ const __m128i v_multiplier_one =
+ _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001);
+ const __m128i v_src_round_lo = _mm_unpacklo_epi16(v_dual_round, v_src);
+ const __m128i a = _mm_madd_epi16(v_src_round_lo, v_multiplier_one);
+ const __m128i b = _mm_srai_epi32(a, 12 + shift);
+ dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity4ColumnStoreToFrame(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source) {
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+
+ const __m128i v_multiplier_fraction =
+ _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3));
+ const __m128i v_eight = _mm_set1_epi16(8);
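+ // Apply the identity4 column scale as src + mulhrs(src, fraction << 3),
+ // round the result with (x + 8) >> 4, then add it to the frame pixels and
+ // clamp to [0, 255].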
+
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ const __m128i v_src = LoadLo8(&source[i * tx_width]);
+ const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
+ const __m128i frame_data = Load4(dst);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ Store4(dst, _mm_packus_epi16(d, d));
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&source[row + j]);
+ const __m128i v_src_mult =
+ _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
+ const __m128i frame_data = LoadLo8(dst + j);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ StoreLo8(dst + j, _mm_packus_epi16(d, d));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source) {
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+
+ const __m128i v_multiplier_fraction =
+ _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3));
+ const __m128i v_eight = _mm_set1_epi16(8);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ const __m128i v_src = LoadLo8(&source[i * tx_width]);
+ const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
+ const __m128i frame_data = Load4(dst);
+ const __m128i v_dst_row = _mm_adds_epi16(v_src_mult, v_src);
+ const __m128i v_src_mult2 =
+ _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction);
+ const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data);
+ const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row);
+ const __m128i a = _mm_adds_epi16(v_dst_col, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_adds_epi16(frame_data16, b);
+ Store4(dst, _mm_packus_epi16(c, c));
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&source[row + j]);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+ const __m128i v_dst_row = _mm_adds_epi16(v_src_round, v_src_round);
+ const __m128i v_src_mult2 =
+ _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction);
+ const __m128i frame_data = LoadLo8(dst + j);
+ const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data);
+ const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row);
+ const __m128i a = _mm_adds_epi16(v_dst_col, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_adds_epi16(frame_data16, b);
+ StoreLo8(dst + j, _mm_packus_epi16(c, c));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_SSE4_1(void* dest, int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height equal to 32 can be simplified from
+ // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
+ const __m128i v_row_multiplier = _mm_set1_epi16(1 << 14);
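+ // _mm_mulhrs_epi16 with 1 << 14 computes exactly (A + 1) >> 1 for any
+ // 16-bit A.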
+ for (int h = 0; h < 4; ++h) {
+ const __m128i v_src = LoadUnaligned16(&dst[h * step]);
+ const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_row_multiplier);
+ StoreUnaligned16(&dst[h * step], v_src_mult);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_SSE4_1(void* dest, int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ for (int h = 0; h < 4; ++h) {
+ const __m128i v_src = LoadUnaligned16(&dst[h * step]);
+ // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
+ // saturating add here is ok.
+ const __m128i a = _mm_adds_epi16(v_src, v_src);
+ StoreUnaligned16(&dst[h * step], a);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+ const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+ const __m128i v_src =
+ _mm_cvtepi16_epi32(_mm_blendv_epi8(v_src0, v_src_round, v_mask));
+ const __m128i v_srcx2 = _mm_add_epi32(v_src, v_src);
+ const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+ const __m128i a = _mm_add_epi32(v_srcx2, v_row_shift_add);
+ const __m128i b = _mm_sra_epi32(a, v_row_shift);
+ dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8ColumnStoreToFrame_SSE4_1(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source) {
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+ const __m128i v_eight = _mm_set1_epi16(8);
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ const __m128i v_src = LoadLo8(&source[row]);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
+ const __m128i frame_data = Load4(dst);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ Store4(dst, _mm_packus_epi16(d, d));
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&source[row + j]);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
+ const __m128i frame_data = LoadLo8(dst + j);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ StoreLo8(dst + j, _mm_packus_epi16(d, d));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16Row_SSE4_1(void* dest, int32_t step,
+ int shift) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+ const __m128i v_multiplier_one =
+ _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001);
+ const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);
+
+ for (int h = 0; h < 4; ++h) {
+ const __m128i v_src = LoadUnaligned16(&dst[h * step]);
+ const __m128i v_src2 = LoadUnaligned16(&dst[h * step + 8]);
+ const __m128i v_src_round0 = _mm_unpacklo_epi16(v_dual_round, v_src);
+ const __m128i v_src_round1 = _mm_unpackhi_epi16(v_dual_round, v_src);
+ const __m128i v_src2_round0 = _mm_unpacklo_epi16(v_dual_round, v_src2);
+ const __m128i v_src2_round1 = _mm_unpackhi_epi16(v_dual_round, v_src2);
+ const __m128i madd0 = _mm_madd_epi16(v_src_round0, v_multiplier_one);
+ const __m128i madd1 = _mm_madd_epi16(v_src_round1, v_multiplier_one);
+ const __m128i madd20 = _mm_madd_epi16(v_src2_round0, v_multiplier_one);
+ const __m128i madd21 = _mm_madd_epi16(v_src2_round1, v_multiplier_one);
+ const __m128i shift0 = _mm_sra_epi32(madd0, v_shift);
+ const __m128i shift1 = _mm_sra_epi32(madd1, v_shift);
+ const __m128i shift20 = _mm_sra_epi32(madd20, v_shift);
+ const __m128i shift21 = _mm_sra_epi32(madd21, v_shift);
+ StoreUnaligned16(&dst[h * step], _mm_packs_epi32(shift0, shift1));
+ StoreUnaligned16(&dst[h * step + 8], _mm_packs_epi32(shift20, shift21));
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+ const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round0 =
+ _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+ const __m128i v_src = _mm_blendv_epi8(v_src0, v_src_round0, v_mask);
+ const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+ const __m128i v_multiplier_one =
+ _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001);
+ const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);
+ const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src);
+ const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one);
+ const __m128i b = _mm_sra_epi32(a, v_shift);
+ dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16ColumnStoreToFrame_SSE4_1(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source) {
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+ const __m128i v_eight = _mm_set1_epi16(8);
+ const __m128i v_multiplier =
+ _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 4));
+
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ const __m128i v_src = LoadLo8(&source[i * tx_width]);
+ const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier);
+ const __m128i frame_data = Load4(dst);
+ const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ Store4(dst, _mm_packus_epi16(d, d));
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&source[row + j]);
+ const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier);
+ const __m128i frame_data = LoadLo8(dst + j);
+ const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ StoreLo8(dst + j, _mm_packus_epi16(d, d));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity32Row16_SSE4_1(void* dest,
+ const int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ // When combining the identity32 multiplier with the row shift, the
+ // calculation for tx_height equal to 16 can be simplified from
+ // (((A * 4) + 1) >> 1) to (A * 2).
+ for (int h = 0; h < 4; ++h) {
+ for (int i = 0; i < 32; i += 8) {
+ const __m128i v_src = LoadUnaligned16(&dst[h * step + i]);
+ // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
+ // saturating add here is ok.
+ const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
+ StoreUnaligned16(&dst[h * step + i], v_dst_i);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest,
+ int adjusted_tx_height) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src = _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+
+ // When combining the identity32 multiplier with the row shift, the
+ // calculation for tx_height equal to 16 can be simplified from
+ // (((A * 4) + 1) >> 1) to (A * 2).
+ const __m128i v_dst_0 = _mm_adds_epi16(v_src, v_src);
+ dst[0] = _mm_extract_epi16(v_dst_0, 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity32ColumnStoreToFrame(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source) {
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+ const __m128i v_two = _mm_set1_epi16(2);
+
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const __m128i v_dst_i = LoadUnaligned16(&source[row + j]);
+ const __m128i frame_data = LoadLo8(dst + j);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_two);
+ const __m128i b = _mm_srai_epi16(a, 2);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ StoreLo8(dst + j, _mm_packus_epi16(d, d));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+}
+
+//------------------------------------------------------------------------------
+// Walsh Hadamard Transform.
+
+// Process 4 wht4 rows and columns.
+LIBGAV1_ALWAYS_INLINE void Wht4_SSE4_1(Array2DView<uint8_t> frame,
+ const int start_x, const int start_y,
+ const void* source,
+ const int adjusted_tx_height) {
+ const auto* const src = static_cast<const int16_t*>(source);
+ __m128i s[4], x[4];
+
+ if (adjusted_tx_height == 1) {
+ // Special case: only src[0] is nonzero.
+ // src[0] 0 0 0
+ // 0 0 0 0
+ // 0 0 0 0
+ // 0 0 0 0
+ //
+ // After the row and column transforms are applied, we have:
+ // f h h h
+ // g i i i
+ // g i i i
+ // g i i i
+ // where f, g, h, i are computed as follows.
+ int16_t f = (src[0] >> 2) - (src[0] >> 3);
+ const int16_t g = f >> 1;
+ f = f - (f >> 1);
+ const int16_t h = (src[0] >> 3) - (src[0] >> 4);
+ const int16_t i = (src[0] >> 4);
+ s[0] = _mm_set1_epi16(h);
+ s[0] = _mm_insert_epi16(s[0], f, 0);
+ s[1] = _mm_set1_epi16(i);
+ s[1] = _mm_insert_epi16(s[1], g, 0);
+ s[2] = s[3] = s[1];
+ } else {
+ x[0] = LoadLo8(&src[0 * 4]);
+ x[2] = LoadLo8(&src[1 * 4]);
+ x[3] = LoadLo8(&src[2 * 4]);
+ x[1] = LoadLo8(&src[3 * 4]);
+
+ // Row transforms.
+ Transpose4x4_U16(x, x);
+ s[0] = _mm_srai_epi16(x[0], 2);
+ s[2] = _mm_srai_epi16(x[1], 2);
+ s[3] = _mm_srai_epi16(x[2], 2);
+ s[1] = _mm_srai_epi16(x[3], 2);
+ s[0] = _mm_add_epi16(s[0], s[2]);
+ s[3] = _mm_sub_epi16(s[3], s[1]);
+ __m128i e = _mm_sub_epi16(s[0], s[3]);
+ e = _mm_srai_epi16(e, 1);
+ s[1] = _mm_sub_epi16(e, s[1]);
+ s[2] = _mm_sub_epi16(e, s[2]);
+ s[0] = _mm_sub_epi16(s[0], s[1]);
+ s[3] = _mm_add_epi16(s[3], s[2]);
+ Transpose4x4_U16(s, s);
+
+ // Column transforms.
+ s[0] = _mm_add_epi16(s[0], s[2]);
+ s[3] = _mm_sub_epi16(s[3], s[1]);
+ e = _mm_sub_epi16(s[0], s[3]);
+ e = _mm_srai_epi16(e, 1);
+ s[1] = _mm_sub_epi16(e, s[1]);
+ s[2] = _mm_sub_epi16(e, s[2]);
+ s[0] = _mm_sub_epi16(s[0], s[1]);
+ s[3] = _mm_add_epi16(s[3], s[2]);
+ }
+
+ // Store to frame.
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+ for (int row = 0; row < 4; ++row) {
+ const __m128i frame_data = Load4(dst);
+ const __m128i a = _mm_cvtepu8_epi16(frame_data);
+ // Saturate to prevent overflowing int16_t
+ const __m128i b = _mm_adds_epi16(a, s[row]);
+ Store4(dst, _mm_packus_epi16(b, b));
+ dst += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loops
+
+template <bool enable_flip_rows = false>
+LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source,
+ TransformType tx_type) {
+ const bool flip_rows =
+ enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
+ const __m128i v_eight = _mm_set1_epi16(8);
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
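+ // Round each residual with (x + 8) >> 4, add it to the reconstructed frame
+ // pixels and clamp to [0, 255]. When |flip_rows| is set the residual rows
+ // are read in reverse order.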
+ if (tx_width == 4) {
+ for (int i = 0; i < tx_height; ++i) {
+ const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
+ const __m128i residual = LoadLo8(&source[row]);
+ const __m128i frame_data = Load4(dst);
+ // Saturate to prevent overflowing int16_t
+ const __m128i a = _mm_adds_epi16(residual, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ Store4(dst, _mm_packus_epi16(d, d));
+ dst += stride;
+ }
+ } else if (tx_width == 8) {
+ for (int i = 0; i < tx_height; ++i) {
+ const int row = flip_rows ? (tx_height - i - 1) * 8 : i * 8;
+ const __m128i residual = LoadUnaligned16(&source[row]);
+ const __m128i frame_data = LoadLo8(dst);
+ // Saturate to prevent overflowing int16_t
+ const __m128i b = _mm_adds_epi16(residual, v_eight);
+ const __m128i c = _mm_srai_epi16(b, 4);
+ const __m128i d = _mm_cvtepu8_epi16(frame_data);
+ const __m128i e = _mm_adds_epi16(d, c);
+ StoreLo8(dst, _mm_packus_epi16(e, e));
+ dst += stride;
+ }
+ } else {
+ for (int i = 0; i < tx_height; ++i) {
+ const int y = start_y + i;
+ const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
+ int j = 0;
+ do {
+ const int x = start_x + j;
+ const __m128i residual = LoadUnaligned16(&source[row + j]);
+ const __m128i residual_hi = LoadUnaligned16(&source[row + j + 8]);
+ const __m128i frame_data = LoadUnaligned16(frame[y] + x);
+ const __m128i b = _mm_adds_epi16(residual, v_eight);
+ const __m128i b_hi = _mm_adds_epi16(residual_hi, v_eight);
+ const __m128i c = _mm_srai_epi16(b, 4);
+ const __m128i c_hi = _mm_srai_epi16(b_hi, 4);
+ const __m128i d = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d_hi = _mm_cvtepu8_epi16(_mm_srli_si128(frame_data, 8));
+ const __m128i e = _mm_adds_epi16(d, c);
+ const __m128i e_hi = _mm_adds_epi16(d_hi, c_hi);
+ StoreUnaligned16(frame[y] + x, _mm_packus_epi16(e, e_hi));
+ j += 16;
+ } while (j < tx_width);
+ }
+ }
+}
+
+template <int tx_height>
+LIBGAV1_ALWAYS_INLINE void FlipColumns(int16_t* source, int tx_width) {
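+ // word_reverse_8 reverses the order of the eight 16-bit values in a
+ // 128-bit register.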
+ const __m128i word_reverse_8 =
+ _mm_set_epi32(0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
+ if (tx_width >= 16) {
+ int i = 0;
+ do {
+ // Read 16 shorts.
+ const __m128i v3210 = LoadUnaligned16(&source[i]);
+ const __m128i v7654 = LoadUnaligned16(&source[i + 8]);
+ const __m128i v0123 = _mm_shuffle_epi8(v3210, word_reverse_8);
+ const __m128i v4567 = _mm_shuffle_epi8(v7654, word_reverse_8);
+ StoreUnaligned16(&source[i], v4567);
+ StoreUnaligned16(&source[i + 8], v0123);
+ i += 16;
+ } while (i < tx_width * tx_height);
+ } else if (tx_width == 8) {
+ for (int i = 0; i < 8 * tx_height; i += 8) {
+ const __m128i a = LoadUnaligned16(&source[i]);
+ const __m128i b = _mm_shuffle_epi8(a, word_reverse_8);
+ StoreUnaligned16(&source[i], b);
+ }
+ } else {
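+ // dual_word_reverse_4 reverses the four 16-bit values within each 64-bit
+ // half, so each packed 4-wide row is reversed independently.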
+ const __m128i dual_word_reverse_4 =
+ _mm_set_epi32(0x09080b0a, 0x0d0c0f0e, 0x01000302, 0x05040706);
+ // Process two rows per iteration.
+ for (int i = 0; i < 4 * tx_height; i += 8) {
+ const __m128i a = LoadUnaligned16(&source[i]);
+ const __m128i b = _mm_shuffle_epi8(a, dual_word_reverse_4);
+ StoreUnaligned16(&source[i], b);
+ }
+ }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void ApplyRounding(int16_t* source, int num_rows) {
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
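+ // _mm_mulhrs_epi16(a, m << 3) evaluates to (a * m + 2048) >> 12, so the
+ // row multiplier is applied with a rounded shift by 12.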
+ if (tx_width == 4) {
+ // Process two rows per iteration.
+ int i = 0;
+ do {
+ const __m128i a = LoadUnaligned16(&source[i]);
+ const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
+ StoreUnaligned16(&source[i], b);
+ i += 8;
+ } while (i < tx_width * num_rows);
+ } else {
+ int i = 0;
+ do {
+ // The last 32 values of every row are always zero if the |tx_width| is
+ // 64.
+ const int non_zero_width = (tx_width < 64) ? tx_width : 32;
+ int j = 0;
+ do {
+ const __m128i a = LoadUnaligned16(&source[i * tx_width + j]);
+ const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
+ StoreUnaligned16(&source[i * tx_width + j], b);
+ j += 8;
+ } while (j < non_zero_width);
+ } while (++i < num_rows);
+ }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void RowShift(int16_t* source, int num_rows,
+ int row_shift) {
+ const __m128i v_row_shift_add = _mm_set1_epi16(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu16_epi64(v_row_shift_add);
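+ // Note: row_shift is 1 or 2 here, so the broadcast row_shift value doubles
+ // as the rounding bias 1 << (row_shift - 1).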
+ if (tx_width == 4) {
+ // Process two rows per iteration.
+ int i = 0;
+ do {
+ const __m128i residual = LoadUnaligned16(&source[i]);
+ const __m128i shifted_residual =
+ ShiftResidual(residual, v_row_shift_add, v_row_shift);
+ StoreUnaligned16(&source[i], shifted_residual);
+ i += 8;
+ } while (i < tx_width * num_rows);
+ } else {
+ int i = 0;
+ do {
+ for (int j = 0; j < tx_width; j += 8) {
+ const __m128i residual = LoadUnaligned16(&source[i * tx_width + j]);
+ const __m128i shifted_residual =
+ ShiftResidual(residual, v_row_shift_add, v_row_shift);
+ StoreUnaligned16(&source[i * tx_width + j], shifted_residual);
+ }
+ } while (++i < num_rows);
+ }
+}
+
+void Dct4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+ const int row_shift = static_cast<int>(tx_height == 16);
+
+ if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d dct4 rows in parallel.
+ Dct4_SSE4_1<ButterflyRotation_4, false>(src, /*step=*/4,
+ /*transpose=*/true);
+ } else {
+ // Process 8 1d dct4 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i * 4], /*step=*/4,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ if (tx_height == 16) {
+ RowShift<4>(src, adjusted_tx_height, 1);
+ }
+}
+
+void Dct4TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d dct4 columns in parallel.
+ Dct4_SSE4_1<ButterflyRotation_4, false>(src, tx_width,
+ /*transpose=*/false);
+ } else {
+ // Process 8 1d dct4 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i], tx_width,
+ /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound(frame, start_x, start_y, tx_width, 4, src, tx_type);
+}
+
+void Dct8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d dct8 rows in parallel.
+ Dct8_SSE4_1<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
+ } else {
+ // Process 8 1d dct8 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], /*step=*/8,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ if (row_shift > 0) {
+ RowShift<8>(src, adjusted_tx_height, row_shift);
+ }
+}
+
+void Dct8TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d dct8 columns in parallel.
+ Dct8_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ // Process 8 1d dct8 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+ /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound(frame, start_x, start_y, tx_width, 8, src, tx_type);
+}
+
+void Dct16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d dct16 rows in parallel.
+ Dct16_SSE4_1<ButterflyRotation_4, true>(src, 16, /*transpose=*/true);
+ } else {
+ int i = 0;
+ do {
+ // Process 8 1d dct16 rows in parallel per iteration.
+ Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], 16,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ // row_shift is always non-zero here.
+ RowShift<16>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct16TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d dct16 columns in parallel.
+ Dct16_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ int i = 0;
+ do {
+ // Process 8 1d dct16 columns in parallel per iteration.
+ Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+ /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound(frame, start_x, start_y, tx_width, 16, src, tx_type);
+}
+
+void Dct32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<32>(src, adjusted_tx_height);
+ }
+ // Process 8 1d dct32 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct32_SSE4_1(&src[i * 32], 32, /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ // row_shift is always non-zero here.
+ RowShift<32>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct32TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
+ // Process 8 1d dct32 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Dct32_SSE4_1(&src[i], tx_width, /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound(frame, start_x, start_y, tx_width, 32, src, tx_type);
+}
+
+void Dct64TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<64>(src, adjusted_tx_height);
+ }
+ // Process 8 1d dct64 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct64_SSE4_1(&src[i * 64], 64, /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ // row_shift is always non-zero here.
+ RowShift<64>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct64TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
+ // Process 8 1d dct64 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Dct64_SSE4_1(&src[i], tx_width, /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound(frame, start_x, start_y, tx_width, 64, src, tx_type);
+}
+
+void Adst4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const int row_shift = static_cast<int>(tx_height == 16);
+ const bool should_round = (tx_height == 8);
+
+ if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d adst4 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Adst4_SSE4_1<false>(&src[i * 4], /*step=*/4, /*transpose=*/true);
+ i += 4;
+ } while (i < adjusted_tx_height);
+
+ if (row_shift != 0) {
+ RowShift<4>(src, adjusted_tx_height, 1);
+ }
+}
+
+void Adst4TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d adst4 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Adst4_SSE4_1<false>(&src[i], tx_width, /*transpose=*/false);
+ i += 4;
+ } while (i < tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, 4, src, tx_type);
+}
+
+void Adst8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d adst8 rows in parallel.
+ Adst8_SSE4_1<ButterflyRotation_4, true>(src, /*step=*/8,
+ /*transpose=*/true);
+ } else {
+ // Process 8 1d adst8 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], /*step=*/8,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ if (row_shift > 0) {
+ RowShift<8>(src, adjusted_tx_height, row_shift);
+ }
+}
+
+void Adst8TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d adst8 columns in parallel.
+ Adst8_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ // Process 8 1d adst8 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+ /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, 8, src, tx_type);
+}
+
+void Adst16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d adst16 rows in parallel.
+ Adst16_SSE4_1<ButterflyRotation_4, true>(src, 16, /*transpose=*/true);
+ } else {
+ int i = 0;
+ do {
+ // Process 8 1d adst16 rows in parallel per iteration.
+ Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], 16,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ // row_shift is always non-zero here.
+ RowShift<16>(src, adjusted_tx_height, row_shift);
+}
+
+void Adst16TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d adst16 columns in parallel.
+ Adst16_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ int i = 0;
+ do {
+ // Process 8 1d adst16 columns in parallel per iteration.
+ Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+ /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ }
+ StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, 16, src, tx_type);
+}
+
+void Identity4TransformLoopRow_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize4x4) {
+ return;
+ }
+
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+ if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+ if (tx_height < 16) {
+ int i = 0;
+ do {
+ Identity4_SSE4_1<false>(&src[i * 4], /*step=*/4);
+ i += 4;
+ } while (i < adjusted_tx_height);
+ } else {
+ int i = 0;
+ do {
+ Identity4_SSE4_1<true>(&src[i * 4], /*step=*/4);
+ i += 4;
+ } while (i < adjusted_tx_height);
+ }
+}
+
+void Identity4TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ // Special case: Process row calculations during column transform call.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
+ Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+ return;
+ }
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ Identity4ColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity8TransformLoopRow_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize8x4) {
+ return;
+ }
+
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+ if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 16 can be simplified
+ // from ((A * 2) + 1) >> 1 to A.
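+ // The mask 0x18 below selects tx_height == 8 and tx_height == 16.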
+ if ((tx_height & 0x18) != 0) {
+ return;
+ }
+ if (tx_height == 32) {
+ int i = 0;
+ do {
+ Identity8Row32_SSE4_1(&src[i * 8], /*step=*/8);
+ i += 4;
+ } while (i < adjusted_tx_height);
+ return;
+ }
+
+ assert(tx_size == kTransformSize8x4);
+ int i = 0;
+ do {
+ Identity8Row4_SSE4_1(&src[i * 8], /*step=*/8);
+ i += 4;
+ } while (i < adjusted_tx_height);
+}
+
+void Identity8TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ Identity8ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+ if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+ int i = 0;
+ do {
+ Identity16Row_SSE4_1(&src[i * 16], /*step=*/16,
+ kTransformRowShift[tx_size]);
+ i += 4;
+ } while (i < adjusted_tx_height);
+}
+
+void Identity16TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ Identity16ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ const int tx_height = kTransformHeight[tx_size];
+ // When combining the identity32 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 32 can be simplified
+ // from ((A * 4) + 2) >> 2 to A.
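+ // The mask 0x28 below selects tx_height == 8 and tx_height == 32.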
+ if ((tx_height & 0x28) != 0) {
+ return;
+ }
+
+ // Process kTransformSize32x16. The src is always rounded before the
+ // identity transform and shifted by 1 afterwards.
+ auto* src = static_cast<int16_t*>(src_buffer);
+ if (Identity32DcOnly(src, adjusted_tx_height)) {
+ return;
+ }
+
+ assert(tx_size == kTransformSize32x16);
+ ApplyRounding<32>(src, adjusted_tx_height);
+ int i = 0;
+ do {
+ Identity32Row16_SSE4_1(&src[i * 32], /*step=*/32);
+ i += 4;
+ } while (i < adjusted_tx_height);
+}
+
+void Identity32TransformLoopColumn_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ Identity32ColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Wht4TransformLoopRow_SSE4_1(TransformType tx_type, TransformSize tx_size,
+ int /*adjusted_tx_height*/,
+ void* /*src_buffer*/, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ assert(tx_type == kTransformTypeDctDct);
+ assert(tx_size == kTransformSize4x4);
+ static_cast<void>(tx_type);
+ static_cast<void>(tx_size);
+ // Do both row and column transforms in the column-transform pass.
+}
+
+void Wht4TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ assert(tx_type == kTransformTypeDctDct);
+ assert(tx_size == kTransformSize4x4);
+ static_cast<void>(tx_type);
+ static_cast<void>(tx_size);
+
+ // Do both row and column transforms in the column-transform pass.
+ // Process 4 1d wht4 rows and columns in parallel.
+ const auto* src = static_cast<int16_t*>(src_buffer);
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ Wht4_SSE4_1(frame, start_x, start_y, src, adjusted_tx_height);
+}
+
+//------------------------------------------------------------------------------
+
+template <typename Residual, typename Pixel>
+void InitAll(Dsp* const dsp) {
+ // Maximum transform size for Dct is 64.
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+ Dct4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ Dct4TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+ Dct8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ Dct8TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+ Dct16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ Dct16TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ Dct32TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ Dct32TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ Dct64TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ Dct64TransformLoopColumn_SSE4_1;
+
+ // Maximum transform size for Adst is 16.
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+ Adst4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ Adst4TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+ Adst8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ Adst8TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ Adst16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ Adst16TransformLoopColumn_SSE4_1;
+
+ // Maximum transform size for Identity transform is 32.
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+ Identity4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ Identity4TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+ Identity8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ Identity8TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+ Identity16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ Identity16TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+ Identity32TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+ Identity32TransformLoopColumn_SSE4_1;
+
+ // Maximum transform size for Wht is 4.
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+ Wht4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+ Wht4TransformLoopColumn_SSE4_1;
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ InitAll<int16_t, uint8_t>(dsp);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformDct)
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+ Dct4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ Dct4TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformDct)
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+ Dct8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ Dct8TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformDct)
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+ Dct16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ Dct16TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformDct)
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ Dct32TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ Dct32TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize64_1DTransformDct)
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ Dct64TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ Dct64TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformAdst)
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+ Adst4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ Adst4TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformAdst)
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+ Adst8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ Adst8TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformAdst)
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ Adst16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ Adst16TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformIdentity)
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+ Identity4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ Identity4TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformIdentity)
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+ Identity8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ Identity8TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformIdentity)
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+ Identity16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ Identity16TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformIdentity)
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+ Identity32TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+ Identity32TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformWht)
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+ Wht4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+ Wht4TransformLoopColumn_SSE4_1;
+#endif
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void InverseTransformInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void InverseTransformInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/inverse_transform_sse4.h b/src/dsp/x86/inverse_transform_sse4.h
new file mode 100644
index 0000000..106084b
--- /dev/null
+++ b/src/dsp/x86/inverse_transform_sse4.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::inverse_transforms, see the defines below for specifics.
+// This function is not thread-safe.
+void InverseTransformInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct
+#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformDct
+#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformDct
+#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformDct
+#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize64_1DTransformDct
+#define LIBGAV1_Dsp8bpp_1DTransformSize64_1DTransformDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformAdst
+#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformAdst LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformAdst
+#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformAdst LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformAdst
+#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformAdst LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformIdentity
+#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformIdentity
+#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformIdentity
+#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity
+#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht
+#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+#endif // LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_
diff --git a/src/dsp/x86/loop_filter_sse4.cc b/src/dsp/x86/loop_filter_sse4.cc
new file mode 100644
index 0000000..d67b450
--- /dev/null
+++ b/src/dsp/x86/loop_filter_sse4.cc
@@ -0,0 +1,2256 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
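+// Returns |total| + |a1| + |a2| - |s1| - |s2| for each 16-bit lane. Used to
+// slide the running filter sum from one output tap to the next.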
+inline __m128i FilterAdd2Sub2(const __m128i& total, const __m128i& a1,
+ const __m128i& a2, const __m128i& s1,
+ const __m128i& s2) {
+ __m128i x = _mm_add_epi16(a1, total);
+ x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(s1, s2)), a2);
+ return x;
+}
+
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
+inline __m128i AbsDiff(const __m128i& a, const __m128i& b) {
+ return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+}
+
+inline __m128i CheckOuterThreshF4(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& outer_thresh) {
+ const __m128i fe = _mm_set1_epi8(static_cast<int8_t>(0xfe));
+ // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh;
+ const __m128i abs_pmq = AbsDiff(p1p0, q1q0);
+ const __m128i a = _mm_adds_epu8(abs_pmq, abs_pmq);
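+ // Masking with 0xfe keeps the 16-bit shift from pulling a neighboring
+ // byte's low bit into the halved value.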
+ const __m128i b = _mm_srli_epi16(_mm_and_si128(abs_pmq, fe), 1);
+ const __m128i c = _mm_adds_epu8(a, _mm_srli_si128(b, 4));
+ return _mm_subs_epu8(c, outer_thresh);
+}
+
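+// Computes the high edge variance mask: a lane is set when
+// max(|p1 - p0|, |q1 - q0|) > hev_thresh.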
+inline __m128i Hev(const __m128i& qp1, const __m128i& qp0,
+ const __m128i& hev_thresh) {
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq =
+ _mm_max_epu8(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 4));
+ const __m128i hev_mask0 = _mm_cvtepu8_epi16(max_pq);
+ const __m128i hev_mask1 = _mm_cmpgt_epi16(hev_mask0, hev_thresh);
+ const __m128i hev_mask = _mm_packs_epi16(hev_mask1, hev_mask1);
+ return hev_mask;
+}
+
+inline __m128i AddShift3(const __m128i& a, const __m128i& b) {
+ const __m128i c = _mm_adds_epi8(a, b);
+ const __m128i d = _mm_unpacklo_epi8(c, c);
+ const __m128i e = _mm_srai_epi16(d, 11); /* >> 3 */
+ return _mm_packs_epi16(e, e);
+}
+
+inline __m128i AddShift1(const __m128i& a, const __m128i& b) {
+ const __m128i c = _mm_adds_epi8(a, b);
+ const __m128i d = _mm_unpacklo_epi8(c, c);
+ const __m128i e = _mm_srai_epi16(d, 9); /* >> 1 */
+ return _mm_packs_epi16(e, e);
+}
+
+//------------------------------------------------------------------------------
+// 4-tap filters
+
+inline __m128i NeedsFilter4(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i inner_mask = _mm_subs_epu8(
+ _mm_max_epu8(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 4)), inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi8(a, zero);
+ return b;
+}
+
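+// Applies the 4-tap filter. The filter value is (p1 - q1) & hev plus
+// 3 * (q0 - p0), accumulated with signed saturation and masked by |mask|;
+// p0/q0 receive the (filter + 3) >> 3 and (filter + 4) >> 3 adjustments and
+// p1/q1 receive a rounded half of the latter when hev is not set.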
+inline void Filter4(const __m128i& qp1, const __m128i& qp0, __m128i* oqp1,
+ __m128i* oqp0, const __m128i& mask, const __m128i& hev) {
+ const __m128i t80 = _mm_set1_epi8(static_cast<int8_t>(0x80));
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i qp1qp0 = _mm_unpacklo_epi64(qp0, qp1);
+ const __m128i qps1qps0 = _mm_xor_si128(qp1qp0, t80);
+ const __m128i ps1qs0 = _mm_shuffle_epi32(qps1qps0, 0x09);
+ const __m128i qs1ps0 = _mm_shuffle_epi32(qps1qps0, 0x0c);
+ const __m128i _hev = _mm_unpacklo_epi32(hev, hev);
+ const __m128i x = _mm_subs_epi8(ps1qs0, qs1ps0);
+ __m128i a = _mm_and_si128(_mm_srli_si128(x, 4), _hev);
+
+ a = _mm_adds_epi8(a, x);
+ a = _mm_adds_epi8(a, x);
+ a = _mm_adds_epi8(a, x);
+ a = _mm_and_si128(a, mask);
+ a = _mm_unpacklo_epi32(a, a);
+
+ const __m128i t4t3 = _mm_set_epi32(0x0, 0x0, 0x04040404, 0x03030303);
+ const __m128i a1a2 = AddShift3(a, t4t3);
+ const __m128i a1a1 = _mm_shuffle_epi32(a1a2, 0x55);
+ const __m128i a3a3 = _mm_andnot_si128(_hev, AddShift1(a1a1, t1));
+ // -1 -1 -1 -1 1 1 1 1 -1 -1 -1 -1 1 1 1 1
+ const __m128i adjust_sign_for_add =
+ _mm_unpacklo_epi32(t1, _mm_cmpeq_epi8(t1, t1));
+
+ const __m128i a3a3a1a2 = _mm_unpacklo_epi64(a1a2, a3a3);
+ const __m128i ma3a3ma1a2 = _mm_sign_epi8(a3a3a1a2, adjust_sign_for_add);
+
+ const __m128i b = _mm_adds_epi8(qps1qps0, ma3a3ma1a2);
+ const __m128i c = _mm_xor_si128(b, t80);
+
+ *oqp0 = c;
+ *oqp1 = _mm_srli_si128(c, 8);
+}
+
+void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh), 0);
+
+ const __m128i p1 = Load4(dst - 2 * stride);
+ const __m128i p0 = Load4(dst - 1 * stride);
+ const __m128i q0 = Load4(dst + 0 * stride);
+ const __m128i q1 = Load4(dst + 1 * stride);
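+ // Interleave the p and q pixels so a single register holds both sides of
+ // the edge and the filter processes them together.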
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ Store4(dst - 2 * stride, oqp1);
+ Store4(dst - 1 * stride, oqp0);
+ Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+ Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+inline void Transpose4x4(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3, __m128i* d0,
+ __m128i* d1, __m128i* d2, __m128i* d3) {
+ // input
+ // x0 00 01 02 03 xx xx xx xx xx xx xx xx xx xx xx xx
+ // x1 10 11 12 13 xx xx xx xx xx xx xx xx xx xx xx xx
+ // x2 20 21 22 23 xx xx xx xx xx xx xx xx xx xx xx xx
+ // x3 30 31 32 33 xx xx xx xx xx xx xx xx xx xx xx xx
+ // output
+ // d0 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ // d1 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ // d2 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ // d3 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ *d0 = _mm_unpacklo_epi16(w0, w1);
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d1 = _mm_srli_si128(*d0, 4);
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d2 = _mm_srli_si128(*d0, 8);
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d3 = _mm_srli_si128(*d0, 12);
+}
+
+void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ __m128i x0 = Load4(dst - 2 + 0 * stride);
+ __m128i x1 = Load4(dst - 2 + 1 * stride);
+ __m128i x2 = Load4(dst - 2 + 2 * stride);
+ __m128i x3 = Load4(dst - 2 + 3 * stride);
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ const __m128i d0 = _mm_unpacklo_epi16(w0, w1);
+ const __m128i qp1 = _mm_shuffle_epi32(d0, 0xc);
+ const __m128i qp0 = _mm_srli_si128(d0, 4);
+ const __m128i q1q0 = _mm_srli_si128(d0, 8);
+ const __m128i p1p0 = _mm_shuffle_epi32(d0, 0x1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i p1 = oqp1;
+ const __m128i p0 = oqp0;
+ const __m128i q0 = _mm_srli_si128(oqp0, 4);
+ const __m128i q1 = _mm_srli_si128(oqp1, 4);
+
+ Transpose4x4(p1, p0, q0, q1, &x0, &x1, &x2, &x3);
+
+ Store4(dst - 2 + 0 * stride, x0);
+ Store4(dst - 2 + 1 * stride, x1);
+ Store4(dst - 2 + 2 * stride, x2);
+ Store4(dst - 2 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 5-tap (chroma) filters
+
+inline __m128i NeedsFilter6(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+ const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq = _mm_max_epu8(abs_qp2mqp1, abs_qp1mqp0);
+ const __m128i inner_mask = _mm_subs_epu8(
+ _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi8(a, zero);
+ return b;
+}
+
+inline __m128i IsFlat3(const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, const __m128i& flat_thresh) {
+ const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq = _mm_max_epu8(abs_pq2mpq0, abs_qp1mqp0);
+ const __m128i flat_mask = _mm_subs_epu8(
+ _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), flat_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_cmpeq_epi8(flat_mask, zero);
+ return a;
+}
+
+inline void Filter6(const __m128i& qp2, const __m128i& qp1, const __m128i& qp0,
+ __m128i* oqp1, __m128i* oqp0) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2);
+ const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1);
+ const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0);
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+ __m128i f6_lo =
+ _mm_add_epi16(_mm_add_epi16(qp2_lo, four), _mm_add_epi16(qp2_lo, qp2_lo));
+
+ f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp1_lo), qp1_lo);
+
+ f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp0_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p2 * 3 + p1 * 2 + p0 * 2 + q0
+ // q2 * 3 + q1 * 2 + q0 * 2 + p0
+ *oqp1 = _mm_srli_epi16(f6_lo, 3);
+ *oqp1 = _mm_packus_epi16(*oqp1, *oqp1);
+
+ // p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1
+ // q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1
+ f6_lo = FilterAdd2Sub2(f6_lo, pq0_lo, pq1_lo, qp2_lo, qp2_lo);
+ *oqp0 = _mm_srli_epi16(f6_lo, 3);
+ *oqp0 = _mm_packus_epi16(*oqp0, *oqp0);
+}
+
+void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ const __m128i p2 = Load4(dst - 3 * stride);
+ const __m128i p1 = Load4(dst - 2 * stride);
+ const __m128i p0 = Load4(dst - 1 * stride);
+ const __m128i q0 = Load4(dst + 0 * stride);
+ const __m128i q1 = Load4(dst + 1 * stride);
+ const __m128i q2 = Load4(dst + 2 * stride);
+ const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter6(q1q0, p1p0, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ __m128i oqp1_f6;
+ __m128i oqp0_f6;
+
+ Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+ }
+
+ Store4(dst - 2 * stride, oqp1);
+ Store4(dst - 1 * stride, oqp0);
+ Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+ Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+inline void Transpose8x4To4x8(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3, __m128i* d0,
+ __m128i* d1, __m128i* d2, __m128i* d3,
+ __m128i* d4, __m128i* d5, __m128i* d6,
+ __m128i* d7) {
+ // input
+ // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+ // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+ // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+ // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+ // output
+ // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ const __m128i ww0 = _mm_unpacklo_epi16(w0, w1);
+ // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ const __m128i ww1 = _mm_unpackhi_epi16(w0, w1);
+
+ // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d0 = ww0;
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d1 = _mm_srli_si128(ww0, 4);
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d2 = _mm_srli_si128(ww0, 8);
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d3 = _mm_srli_si128(ww0, 12);
+ // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d4 = ww1;
+ // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d5 = _mm_srli_si128(ww1, 4);
+ // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d6 = _mm_srli_si128(ww1, 8);
+ // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d7 = _mm_srli_si128(ww1, 12);
+}
+
+void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ __m128i x0 = LoadLo8(dst - 3 + 0 * stride);
+ __m128i x1 = LoadLo8(dst - 3 + 1 * stride);
+ __m128i x2 = LoadLo8(dst - 3 + 2 * stride);
+ __m128i x3 = LoadLo8(dst - 3 + 3 * stride);
+
+ __m128i p2, p1, p0, q0, q1, q2;
+ __m128i z0, z1; // not used
+
+ Transpose8x4To4x8(x0, x1, x2, x3, &p2, &p1, &p0, &q0, &q1, &q2, &z0, &z1);
+
+ const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter6(q1q0, p1p0, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ __m128i oqp1_f6;
+ __m128i oqp0_f6;
+
+ Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+ }
+
+ p1 = oqp1;
+ p0 = oqp0;
+ q0 = _mm_srli_si128(oqp0, 4);
+ q1 = _mm_srli_si128(oqp1, 4);
+
+ Transpose4x4(p1, p0, q0, q1, &x0, &x1, &x2, &x3);
+
+ Store4(dst - 2 + 0 * stride, x0);
+ Store4(dst - 2 + 1 * stride, x1);
+ Store4(dst - 2 + 2 * stride, x2);
+ Store4(dst - 2 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 7-tap filters
+
+inline __m128i NeedsFilter8(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& qp3, const __m128i& qp2,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+ const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq_a = _mm_max_epu8(abs_qp2mqp1, abs_qp1mqp0);
+ const __m128i abs_pq3mpq2 = AbsDiff(qp3, qp2);
+ const __m128i max_pq = _mm_max_epu8(max_pq_a, abs_pq3mpq2);
+ const __m128i inner_mask = _mm_subs_epu8(
+ _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi8(a, zero);
+ return b;
+}
+
+inline __m128i IsFlat4(const __m128i& qp3, const __m128i& qp2,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& flat_thresh) {
+ const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq_a = _mm_max_epu8(abs_pq2mpq0, abs_qp1mqp0);
+ const __m128i abs_pq3mpq0 = AbsDiff(qp3, qp0);
+ const __m128i max_pq = _mm_max_epu8(max_pq_a, abs_pq3mpq0);
+ const __m128i flat_mask = _mm_subs_epu8(
+ _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), flat_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_cmpeq_epi8(flat_mask, zero);
+ return a;
+}
+
+inline void Filter8(const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, __m128i* oqp2, __m128i* oqp1,
+ __m128i* oqp0) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i qp3_lo = _mm_cvtepu8_epi16(qp3);
+ const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2);
+ const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1);
+ const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0);
+ const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+ __m128i f8_lo =
+ _mm_add_epi16(_mm_add_epi16(qp3_lo, four), _mm_add_epi16(qp3_lo, qp3_lo));
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp2_lo), qp2_lo);
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp1_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0
+ // q3 + q3 + q3 + 2 * q2 + q1 + q0 + p0
+ *oqp2 = _mm_srli_epi16(f8_lo, 3);
+ *oqp2 = _mm_packus_epi16(*oqp2, *oqp2);
+
+ // p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1
+ // q3 + q3 + q2 + 2 * q1 + q0 + p0 + p1
+ f8_lo = FilterAdd2Sub2(f8_lo, qp1_lo, pq1_lo, qp3_lo, qp2_lo);
+ *oqp1 = _mm_srli_epi16(f8_lo, 3);
+ *oqp1 = _mm_packus_epi16(*oqp1, *oqp1);
+
+ // p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2
+ // q3 + q2 + q1 + 2 * q0 + p0 + p1 + p2
+ f8_lo = FilterAdd2Sub2(f8_lo, qp0_lo, pq2_lo, qp3_lo, qp1_lo);
+ *oqp0 = _mm_srli_epi16(f8_lo, 3);
+ *oqp0 = _mm_packus_epi16(*oqp0, *oqp0);
+}
+
+void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ const __m128i p3 = Load4(dst - 4 * stride);
+ const __m128i p2 = Load4(dst - 3 * stride);
+ const __m128i p1 = Load4(dst - 2 * stride);
+ const __m128i p0 = Load4(dst - 1 * stride);
+ const __m128i q0 = Load4(dst + 0 * stride);
+ const __m128i q1 = Load4(dst + 1 * stride);
+ const __m128i q2 = Load4(dst + 2 * stride);
+ const __m128i q3 = Load4(dst + 3 * stride);
+
+ const __m128i qp3 = _mm_unpacklo_epi32(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+ v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+ Store4(dst - 3 * stride, oqp2_f8);
+ Store4(dst + 2 * stride, _mm_srli_si128(oqp2_f8, 4));
+ }
+
+ Store4(dst - 2 * stride, oqp1);
+ Store4(dst - 1 * stride, oqp0);
+ Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+ Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+inline void Transpose8x8To8x4(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3,
+ const __m128i& x4, const __m128i& x5,
+ const __m128i& x6, const __m128i& x7, __m128i* d0,
+ __m128i* d1, __m128i* d2, __m128i* d3) {
+ // input
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+ // output
+ // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
+ // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+ // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
+ // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ const __m128i w2 = _mm_unpacklo_epi8(x4, x5);
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ const __m128i w3 = _mm_unpacklo_epi8(x6, x7);
+
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ const __m128i w4 = _mm_unpacklo_epi16(w0, w1);
+ // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ const __m128i w5 = _mm_unpacklo_epi16(w2, w3);
+
+ // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ *d0 = _mm_unpacklo_epi32(w4, w5);
+ *d1 = _mm_srli_si128(*d0, 8);
+ // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ *d2 = _mm_unpackhi_epi32(w4, w5);
+ *d3 = _mm_srli_si128(*d2, 8);
+}
+
+void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ __m128i x0 = LoadLo8(dst - 4 + 0 * stride);
+ __m128i x1 = LoadLo8(dst - 4 + 1 * stride);
+ __m128i x2 = LoadLo8(dst - 4 + 2 * stride);
+ __m128i x3 = LoadLo8(dst - 4 + 3 * stride);
+
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+ Transpose8x4To4x8(x0, x1, x2, x3, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+
+ const __m128i qp3 = _mm_unpacklo_epi32(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+ v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ p2 = oqp2_f8;
+ q2 = _mm_srli_si128(oqp2_f8, 4);
+ }
+
+ p1 = oqp1;
+ p0 = oqp0;
+ q0 = _mm_srli_si128(oqp0, 4);
+ q1 = _mm_srli_si128(oqp1, 4);
+
+ Transpose8x8To8x4(p3, p2, p1, p0, q0, q1, q2, q3, &x0, &x1, &x2, &x3);
+
+ StoreLo8(dst - 4 + 0 * stride, x0);
+ StoreLo8(dst - 4 + 1 * stride, x1);
+ StoreLo8(dst - 4 + 2 * stride, x2);
+ StoreLo8(dst - 4 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 13-tap filters
+
+inline void Filter14(const __m128i& qp6, const __m128i& qp5, const __m128i& qp4,
+ const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, __m128i* oqp5, __m128i* oqp4,
+ __m128i* oqp3, __m128i* oqp2, __m128i* oqp1,
+ __m128i* oqp0) {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i qp6_lo = _mm_cvtepu8_epi16(qp6);
+ const __m128i qp5_lo = _mm_cvtepu8_epi16(qp5);
+ const __m128i qp4_lo = _mm_cvtepu8_epi16(qp4);
+ const __m128i qp3_lo = _mm_cvtepu8_epi16(qp3);
+ const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2);
+ const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1);
+ const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0);
+ const __m128i pq5_lo = _mm_shuffle_epi32(qp5_lo, 0x4e);
+ const __m128i pq4_lo = _mm_shuffle_epi32(qp4_lo, 0x4e);
+ const __m128i pq3_lo = _mm_shuffle_epi32(qp3_lo, 0x4e);
+ const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+ __m128i f14_lo =
+ _mm_add_epi16(eight, _mm_sub_epi16(_mm_slli_epi16(qp6_lo, 3), qp6_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp5_lo),
+ _mm_add_epi16(qp5_lo, qp4_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp4_lo),
+ _mm_add_epi16(qp3_lo, qp2_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp1_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0
+ // q6 * 7 + q5 * 2 + q4 * 2 + q3 + q2 + q1 + q0 + p0
+ *oqp5 = _mm_srli_epi16(f14_lo, 4);
+ *oqp5 = _mm_packus_epi16(*oqp5, *oqp5);
+
+ // p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1
+ // q6 * 5 + q5 * 2 + q4 * 2 + q3 * 2 + q2 + q1 + q0 + p0 + p1
+ f14_lo = FilterAdd2Sub2(f14_lo, qp3_lo, pq1_lo, qp6_lo, qp6_lo);
+ *oqp4 = _mm_srli_epi16(f14_lo, 4);
+ *oqp4 = _mm_packus_epi16(*oqp4, *oqp4);
+
+ // p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2
+ // q6 * 4 + q5 + q4 * 2 + q3 * 2 + q2 * 2 + q1 + q0 + p0 + p1 + p2
+ f14_lo = FilterAdd2Sub2(f14_lo, qp2_lo, pq2_lo, qp6_lo, qp5_lo);
+ *oqp3 = _mm_srli_epi16(f14_lo, 4);
+ *oqp3 = _mm_packus_epi16(*oqp3, *oqp3);
+
+ // p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3
+ // q6 * 3 + q5 + q4 + q3 * 2 + q2 * 2 + q1 * 2 + q0 + p0 + p1 + p2 + p3
+ f14_lo = FilterAdd2Sub2(f14_lo, qp1_lo, pq3_lo, qp6_lo, qp4_lo);
+ *oqp2 = _mm_srli_epi16(f14_lo, 4);
+ *oqp2 = _mm_packus_epi16(*oqp2, *oqp2);
+
+ // p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + q0 + q1 + q2 + q3 + q4
+ // q6 * 2 + q5 + q4 + q3 + q2 * 2 + q1 * 2 + q0 * 2 + p0 + p1 + p2 + p3 + p4
+ f14_lo = FilterAdd2Sub2(f14_lo, qp0_lo, pq4_lo, qp6_lo, qp3_lo);
+ *oqp1 = _mm_srli_epi16(f14_lo, 4);
+ *oqp1 = _mm_packus_epi16(*oqp1, *oqp1);
+
+ // p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + q2 + q3 + q4 + q5
+ // q6 + q5 + q4 + q3 + q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1 + p2 + p3 + p4 + p5
+ f14_lo = FilterAdd2Sub2(f14_lo, pq0_lo, pq5_lo, qp6_lo, qp2_lo);
+ *oqp0 = _mm_srli_epi16(f14_lo, 4);
+ *oqp0 = _mm_packus_epi16(*oqp0, *oqp0);
+}
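+// As with Filter8(), a scalar reference for the first output (*oqp5), assuming
+// 8-bit samples; hypothetical helper, illustration only:
+//
+//   uint8_t Filter14P5(int p6, int p5, int p4, int p3, int p2, int p1, int p0,
+//                      int q0) {
+//     return static_cast<uint8_t>(
+//         (7 * p6 + 2 * p5 + 2 * p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4);
+//   }
+//
+// The mirrored q-side output uses the same weights with p and q swapped, which
+// is why the function operates on packed qp registers.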
+
+void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ const __m128i p3 = Load4(dst - 4 * stride);
+ const __m128i p2 = Load4(dst - 3 * stride);
+ const __m128i p1 = Load4(dst - 2 * stride);
+ const __m128i p0 = Load4(dst - 1 * stride);
+ const __m128i q0 = Load4(dst + 0 * stride);
+ const __m128i q1 = Load4(dst + 1 * stride);
+ const __m128i q2 = Load4(dst + 2 * stride);
+ const __m128i q3 = Load4(dst + 3 * stride);
+
+ const __m128i qp3 = _mm_unpacklo_epi32(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+ v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ const __m128i p6 = Load4(dst - 7 * stride);
+ const __m128i p5 = Load4(dst - 6 * stride);
+ const __m128i p4 = Load4(dst - 5 * stride);
+ const __m128i q4 = Load4(dst + 4 * stride);
+ const __m128i q5 = Load4(dst + 5 * stride);
+ const __m128i q6 = Load4(dst + 6 * stride);
+ const __m128i qp6 = _mm_unpacklo_epi32(p6, q6);
+ const __m128i qp5 = _mm_unpacklo_epi32(p5, q5);
+ const __m128i qp4 = _mm_unpacklo_epi32(p4, q4);
+
+ const __m128i v_isflatouter4_mask =
+ IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+ const __m128i v_flat4_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_mask, v_isflatouter4_mask), 0);
+
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ if (_mm_test_all_zeros(v_flat4_mask,
+ _mm_cmpeq_epi8(v_flat4_mask, v_flat4_mask)) == 0) {
+ __m128i oqp5_f14;
+ __m128i oqp4_f14;
+ __m128i oqp3_f14;
+ __m128i oqp2_f14;
+ __m128i oqp1_f14;
+ __m128i oqp0_f14;
+
+ Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+ &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+ oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+ oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+ oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+ oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+
+ Store4(dst - 6 * stride, oqp5_f14);
+ Store4(dst - 5 * stride, oqp4_f14);
+ Store4(dst - 4 * stride, oqp3_f14);
+ Store4(dst + 3 * stride, _mm_srli_si128(oqp3_f14, 4));
+ Store4(dst + 4 * stride, _mm_srli_si128(oqp4_f14, 4));
+ Store4(dst + 5 * stride, _mm_srli_si128(oqp5_f14, 4));
+ }
+
+ Store4(dst - 3 * stride, oqp2_f8);
+ Store4(dst + 2 * stride, _mm_srli_si128(oqp2_f8, 4));
+ }
+
+ Store4(dst - 2 * stride, oqp1);
+ Store4(dst - 1 * stride, oqp0);
+ Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+ Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+// Each of the two 8x4 blocks of input data (p7-p0 and q0-q7) is transposed to
+// 4x8, then unpacked into the corresponding qp registers (qp7-qp0).
+//
+// p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+//
+// 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
+// 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
+// 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
+// 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
+
+inline void DualTranspose8x4To4x8(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3,
+ __m128i* q0p0, __m128i* q1p1, __m128i* q2p2,
+ __m128i* q3p3, __m128i* q4p4, __m128i* q5p5,
+ __m128i* q6p6, __m128i* q7p7) {
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+ // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f
+ const __m128i w2 = _mm_unpackhi_epi8(x0, x1);
+ // 28 38 29 39 2a 3a 2b 3b 2c 3c 2d 3d 2e 3e 2f 3f
+ const __m128i w3 = _mm_unpackhi_epi8(x2, x3);
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ const __m128i ww0 = _mm_unpacklo_epi16(w0, w1);
+ // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ const __m128i ww1 = _mm_unpackhi_epi16(w0, w1);
+ // 08 18 28 38 09 19 29 39 0a 1a 2a 3a 0b 1b 2b 3b
+ const __m128i ww2 = _mm_unpacklo_epi16(w2, w3);
+ // 0c 1c 2c 3c 0d 1d 2d 3d 0e 1e 2e 3e 0f 1f 2f 3f
+ const __m128i ww3 = _mm_unpackhi_epi16(w2, w3);
+ // 00 10 20 30 0f 1f 2f 3f xx xx xx xx xx xx xx xx
+ *q7p7 = _mm_unpacklo_epi32(ww0, _mm_srli_si128(ww3, 12));
+ // 01 11 21 31 0e 1e 2e 3e xx xx xx xx xx xx xx xx
+ *q6p6 = _mm_unpackhi_epi32(_mm_slli_si128(ww0, 4), ww3);
+ // 02 12 22 32 0d 1d 2d 3d xx xx xx xx xx xx xx xx
+ *q5p5 = _mm_unpackhi_epi32(ww0, _mm_slli_si128(ww3, 4));
+ // 03 13 23 33 0c 1c 2c 3c xx xx xx xx xx xx xx xx
+ *q4p4 = _mm_unpacklo_epi32(_mm_srli_si128(ww0, 12), ww3);
+ // 04 14 24 34 0b 1b 2b 3b xx xx xx xx xx xx xx xx
+ *q3p3 = _mm_unpacklo_epi32(ww1, _mm_srli_si128(ww2, 12));
+ // 05 15 25 35 0a 1a 2a 3a xx xx xx xx xx xx xx xx
+ *q2p2 = _mm_unpackhi_epi32(_mm_slli_si128(ww1, 4), ww2);
+ // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx
+ *q1p1 = _mm_unpackhi_epi32(ww1, _mm_slli_si128(ww2, 4));
+ // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx
+ *q0p0 = _mm_unpacklo_epi32(_mm_srli_si128(ww1, 12), ww2);
+}
+
+inline void DualTranspose4x8To8x4(const __m128i& qp7, const __m128i& qp6,
+ const __m128i& qp5, const __m128i& qp4,
+ const __m128i& qp3, const __m128i& qp2,
+ const __m128i& qp1, const __m128i& qp0,
+ __m128i* x0, __m128i* x1, __m128i* x2,
+ __m128i* x3) {
+ // qp7: 00 10 20 30 0f 1f 2f 3f xx xx xx xx xx xx xx xx
+ // qp6: 01 11 21 31 0e 1e 2e 3e xx xx xx xx xx xx xx xx
+ // qp5: 02 12 22 32 0d 1d 2d 3d xx xx xx xx xx xx xx xx
+ // qp4: 03 13 23 33 0c 1c 2c 3c xx xx xx xx xx xx xx xx
+ // qp3: 04 14 24 34 0b 1b 2b 3b xx xx xx xx xx xx xx xx
+ // qp2: 05 15 25 35 0a 1a 2a 3a xx xx xx xx xx xx xx xx
+ // qp1: 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx
+ // qp0: 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx
+
+ // 00 01 10 11 20 21 30 31 0f 0e 1f 1e 2f 2e 3f 3e
+ const __m128i w0 = _mm_unpacklo_epi8(qp7, qp6);
+ // 02 03 12 13 22 23 32 33 xx xx xx xx xx xx xx xx
+ const __m128i w1 = _mm_unpacklo_epi8(qp5, qp4);
+ // 04 05 14 15 24 25 34 35 xx xx xx xx xx xx xx xx
+ const __m128i w2 = _mm_unpacklo_epi8(qp3, qp2);
+ // 06 07 16 17 26 27 36 37 xx xx xx xx xx xx xx xx
+ const __m128i w3 = _mm_unpacklo_epi8(qp1, qp0);
+ // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
+ const __m128i w4 = _mm_unpacklo_epi16(w0, w1);
+ // 04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37
+ const __m128i w5 = _mm_unpacklo_epi16(w2, w3);
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ const __m128i d0 = _mm_unpacklo_epi32(w4, w5);
+ // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
+ const __m128i d2 = _mm_unpackhi_epi32(w4, w5);
+ // xx xx xx xx xx xx xx xx 08 09 18 19 28 29 38 39
+ const __m128i w10 = _mm_unpacklo_epi8(qp0, qp1);
+ // xx xx xx xx xx xx xx xx 0a 0b 1a 1b 2a 2b 3a 3b
+ const __m128i w11 = _mm_unpacklo_epi8(qp2, qp3);
+ // xx xx xx xx xx xx xx xx 0c 0d 1c 1d 2c 2d 3c 3d
+ const __m128i w12 = _mm_unpacklo_epi8(qp4, qp5);
+ // xx xx xx xx xx xx xx xx 0e 0f 1e 1f 2e 2f 3e 3f
+ const __m128i w13 = _mm_unpacklo_epi8(qp6, qp7);
+ // 08 09 0a 0b 18 19 1a 1b 28 29 2a 2b 38 39 3a 3b
+ const __m128i w14 = _mm_unpackhi_epi16(w10, w11);
+ // 0c 0d 0e 0f 1c 1d 1e 1f 2c 2d 2e 2f 3c 3d 3e 3f
+ const __m128i w15 = _mm_unpackhi_epi16(w12, w13);
+ // 08 09 0a 0b 0c 0d 0e 0f 18 19 1a 1b 1c 1d 1e 1f
+ const __m128i d1 = _mm_unpacklo_epi32(w14, w15);
+ // 28 29 2a 2b 2c 2d 2e 2f 38 39 3a 3b 3c 3d 3e 3f
+ const __m128i d3 = _mm_unpackhi_epi32(w14, w15);
+
+ // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+ //
+ // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
+ *x0 = _mm_unpacklo_epi64(d0, d1);
+ // 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
+ *x1 = _mm_unpackhi_epi64(d0, d1);
+ // 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
+ *x2 = _mm_unpacklo_epi64(d2, d3);
+ // 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
+ *x3 = _mm_unpackhi_epi64(d2, d3);
+}
+
+void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ __m128i x0 = LoadUnaligned16(dst - 8 + 0 * stride);
+ __m128i x1 = LoadUnaligned16(dst - 8 + 1 * stride);
+ __m128i x2 = LoadUnaligned16(dst - 8 + 2 * stride);
+ __m128i x3 = LoadUnaligned16(dst - 8 + 3 * stride);
+
+ __m128i qp7, qp6, qp5, qp4, qp3, qp2, qp1, qp0;
+
+ DualTranspose8x4To4x8(x0, x1, x2, x3, &qp0, &qp1, &qp2, &qp3, &qp4, &qp5,
+ &qp6, &qp7);
+
+ const __m128i qp1qp0 = _mm_unpacklo_epi64(qp0, qp1);
+ const __m128i q1q0 = _mm_shuffle_epi32(qp1qp0, 0x0d);
+ const __m128i p1p0 = _mm_shuffle_epi32(qp1qp0, 0x08);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+ v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ const __m128i v_isflatouter4_mask =
+ IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+ const __m128i v_flat4_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_mask, v_isflatouter4_mask), 0);
+
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ if (_mm_test_all_zeros(v_flat4_mask,
+ _mm_cmpeq_epi8(v_flat4_mask, v_flat4_mask)) == 0) {
+ __m128i oqp5_f14;
+ __m128i oqp4_f14;
+ __m128i oqp3_f14;
+ __m128i oqp2_f14;
+ __m128i oqp1_f14;
+ __m128i oqp0_f14;
+
+ Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+ &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+ oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+ oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+ oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+ oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+ qp3 = oqp3_f14;
+ qp4 = oqp4_f14;
+ qp5 = oqp5_f14;
+ }
+ qp2 = oqp2_f8;
+ }
+
+ DualTranspose4x8To8x4(qp7, qp6, qp5, qp4, qp3, qp2, oqp1, oqp0, &x0, &x1, &x2,
+ &x3);
+
+ StoreUnaligned16(dst - 8 + 0 * stride, x0);
+ StoreUnaligned16(dst - 8 + 1 * stride, x1);
+ StoreUnaligned16(dst - 8 + 2 * stride, x2);
+ StoreUnaligned16(dst - 8 + 3 * stride, x3);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = Horizontal4;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = Horizontal6;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = Horizontal8;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Horizontal14;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = Vertical14;
+#endif
+}
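+// A hedged usage sketch: after Init8bpp() runs, callers fetch the entry for a
+// given filter size and edge orientation from the dsp table and invoke it on a
+// pointer positioned at the edge. Hypothetical call, for illustration only
+// (GetDspTable() and the argument names are assumed here):
+//
+//   const Dsp* const dsp = GetDspTable(kBitdepth8);
+//   dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical](
+//       dst, stride, outer_thresh, inner_thresh, hev_thresh);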
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+namespace high_bitdepth {
+namespace {
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+
+template <int bitdepth>
+struct LoopFilterFuncs_SSE4_1 {
+ LoopFilterFuncs_SSE4_1() = delete;
+
+ static constexpr int kThreshShift = bitdepth - 8;
+
+ static void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+};
+
+inline __m128i Clamp(const __m128i& min, const __m128i& max,
+ const __m128i& val) {
+ const __m128i a = _mm_min_epi16(val, max);
+ const __m128i b = _mm_max_epi16(a, min);
+ return b;
+}
+
+inline __m128i AddShift3(const __m128i& a, const __m128i& b,
+ const __m128i& vmin, const __m128i& vmax) {
+ const __m128i c = _mm_adds_epi16(a, b);
+ const __m128i d = Clamp(vmin, vmax, c);
+ const __m128i e = _mm_srai_epi16(d, 3); /* >> 3 */
+ return e;
+}
+
+inline __m128i AddShift1(const __m128i& a, const __m128i& b) {
+ const __m128i c = _mm_adds_epi16(a, b);
+ const __m128i e = _mm_srai_epi16(c, 1); /* >> 1 */
+ return e;
+}
+
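+// Unsigned absolute difference for 16-bit lanes: one of the two saturating
+// subtractions is always zero, so OR-ing them yields |a - b| without needing a
+// wider type.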
+inline __m128i AbsDiff(const __m128i& a, const __m128i& b) {
+ return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
+}
+
+inline __m128i Hev(const __m128i& qp1, const __m128i& qp0,
+ const __m128i& hev_thresh) {
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq =
+ _mm_max_epu16(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 8));
+ const __m128i hev_mask = _mm_cmpgt_epi16(max_pq, hev_thresh);
+ return hev_mask;
+}
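+// Scalar equivalent of the "high edge variance" test, assuming hev_thresh has
+// already been shifted by kThreshShift; hypothetical helper, illustration only:
+//
+//   bool HevScalar(int p1, int p0, int q0, int q1, int hev_thresh) {
+//     return std::abs(p1 - p0) > hev_thresh || std::abs(q1 - q0) > hev_thresh;
+//   }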
+
+inline __m128i CheckOuterThreshF4(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& outer_thresh) {
+ // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh;
+ const __m128i abs_pmq = AbsDiff(p1p0, q1q0);
+ const __m128i a = _mm_adds_epu16(abs_pmq, abs_pmq);
+ const __m128i b = _mm_srli_epi16(abs_pmq, 1);
+ const __m128i c = _mm_adds_epu16(a, _mm_srli_si128(b, 8));
+ return _mm_subs_epu16(c, outer_thresh);
+}
+
+inline __m128i NeedsFilter4(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_abs_qp1mqp =
+ _mm_max_epu16(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 8));
+ const __m128i inner_mask = _mm_subs_epu16(max_abs_qp1mqp, inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi16(a, zero);
+ return b;
+}
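+// A scalar sketch of the combined check above (outer edge step plus inner
+// smoothness), assuming the thresholds are already scaled to the current
+// bitdepth; hypothetical helper, illustration only:
+//
+//   bool NeedsFilter4Scalar(int p1, int p0, int q0, int q1,
+//                           int outer_thresh, int inner_thresh) {
+//     const bool outer_ok =
+//         2 * std::abs(p0 - q0) + std::abs(p1 - q1) / 2 <= outer_thresh;
+//     const bool inner_ok =
+//         std::max(std::abs(p1 - p0), std::abs(q1 - q0)) <= inner_thresh;
+//     return outer_ok && inner_ok;
+//   }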
+
+inline void Filter4(const __m128i& qp1, const __m128i& qp0, __m128i* oqp1,
+ __m128i* oqp0, const __m128i& mask, const __m128i& hev,
+ int bitdepth) {
+ const __m128i t4 = _mm_set1_epi16(4);
+ const __m128i t3 = _mm_set1_epi16(3);
+ const __m128i t80 = _mm_set1_epi16(static_cast<int16_t>(1 << (bitdepth - 1)));
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ const __m128i vmin = _mm_subs_epi16(_mm_setzero_si128(), t80);
+ const __m128i vmax = _mm_subs_epi16(t80, t1);
+ const __m128i ps1 = _mm_subs_epi16(qp1, t80);
+ const __m128i ps0 = _mm_subs_epi16(qp0, t80);
+ const __m128i qs0 = _mm_srli_si128(ps0, 8);
+ const __m128i qs1 = _mm_srli_si128(ps1, 8);
+
+ __m128i a = _mm_subs_epi16(ps1, qs1);
+ a = _mm_and_si128(Clamp(vmin, vmax, a), hev);
+
+ const __m128i x = _mm_subs_epi16(qs0, ps0);
+ a = _mm_adds_epi16(a, x);
+ a = _mm_adds_epi16(a, x);
+ a = _mm_adds_epi16(a, x);
+ a = _mm_and_si128(Clamp(vmin, vmax, a), mask);
+
+ const __m128i a1 = AddShift3(a, t4, vmin, vmax);
+ const __m128i a2 = AddShift3(a, t3, vmin, vmax);
+ const __m128i a3 = _mm_andnot_si128(hev, AddShift1(a1, t1));
+
+ const __m128i ops1 = _mm_adds_epi16(ps1, a3);
+ const __m128i ops0 = _mm_adds_epi16(ps0, a2);
+ const __m128i oqs0 = _mm_subs_epi16(qs0, a1);
+ const __m128i oqs1 = _mm_subs_epi16(qs1, a3);
+
+ __m128i oqps1 = _mm_unpacklo_epi64(ops1, oqs1);
+ __m128i oqps0 = _mm_unpacklo_epi64(ops0, oqs0);
+
+ oqps1 = Clamp(vmin, vmax, oqps1);
+ oqps0 = Clamp(vmin, vmax, oqps0);
+
+ *oqp1 = _mm_adds_epi16(oqps1, t80);
+ *oqp0 = _mm_adds_epi16(oqps0, t80);
+}
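+// The arithmetic above, written out as a scalar sketch in the signed domain
+// (after subtracting t80 = 1 << (bitdepth - 1)); Clip() is a hypothetical
+// helper clamping to [-t80, t80 - 1], and saturation details are omitted:
+//
+//   int a = hev ? Clip(ps1 - qs1) : 0;
+//   a = Clip(a + 3 * (qs0 - ps0));
+//   if (!needs_filter) a = 0;
+//   const int a1 = Clip(a + 4) >> 3;         // subtracted from q0
+//   const int a2 = Clip(a + 3) >> 3;         // added to p0
+//   const int a3 = hev ? 0 : (a1 + 1) >> 1;  // added to p1, subtracted from q1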
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal4(void* dest,
+ ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+ const __m128i p1 = LoadLo8(dst - 2 * stride);
+ const __m128i p0 = LoadLo8(dst - 1 * stride);
+ const __m128i qp0 = LoadHi8(p0, dst + 0 * stride);
+ const __m128i qp1 = LoadHi8(p1, dst + 1 * stride);
+ const __m128i q1q0 = _mm_unpackhi_epi64(qp0, qp1);
+ const __m128i p1p0 = _mm_unpacklo_epi64(qp0, qp1);
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ StoreLo8(dst - 2 * stride, oqp1);
+ StoreLo8(dst - 1 * stride, oqp0);
+ StoreHi8(dst + 0 * stride, oqp0);
+ StoreHi8(dst + 1 * stride, oqp1);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical4(void* dest, ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+ const __m128i x0 = LoadLo8(dst - 2 + 0 * stride);
+ const __m128i x1 = LoadLo8(dst - 2 + 1 * stride);
+ const __m128i x2 = LoadLo8(dst - 2 + 2 * stride);
+ const __m128i x3 = LoadLo8(dst - 2 + 3 * stride);
+ // 00 10 01 11 02 12 03 13
+ const __m128i w0 = _mm_unpacklo_epi16(x0, x1);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w1 = _mm_unpacklo_epi16(x2, x3);
+ // 00 10 20 30 01 11 21 31 p0p1
+ const __m128i a = _mm_unpacklo_epi32(w0, w1);
+ const __m128i p1p0 = _mm_shuffle_epi32(a, 0x4e);
+ // 02 12 22 32 03 13 23 33 q1q0
+ const __m128i q1q0 = _mm_unpackhi_epi32(w0, w1);
+ const __m128i qp1 = _mm_unpackhi_epi64(p1p0, q1q0);
+ const __m128i qp0 = _mm_unpacklo_epi64(p1p0, q1q0);
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ // 00 10 01 11 02 12 03 13
+ const __m128i w2 = _mm_unpacklo_epi16(oqp1, oqp0);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w3 = _mm_unpackhi_epi16(oqp0, oqp1);
+ // 00 10 20 30 01 11 21 31
+ const __m128i op0p1 = _mm_unpacklo_epi32(w2, w3);
+ // 02 12 22 32 03 13 23 33
+ const __m128i oq1q0 = _mm_unpackhi_epi32(w2, w3);
+
+ StoreLo8(dst - 2 + 0 * stride, op0p1);
+ StoreHi8(dst - 2 + 1 * stride, op0p1);
+ StoreLo8(dst - 2 + 2 * stride, oq1q0);
+ StoreHi8(dst - 2 + 3 * stride, oq1q0);
+}
+
+//------------------------------------------------------------------------------
+// 5-tap (chroma) filters
+
+inline __m128i CheckOuterThreshF6(const __m128i& qp1, const __m128i& qp0,
+ const __m128i& outer_thresh) {
+ // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh;
+ const __m128i q1q0 = _mm_unpackhi_epi64(qp0, qp1);
+ const __m128i p1p0 = _mm_unpacklo_epi64(qp0, qp1);
+ return CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+}
+
+inline __m128i NeedsFilter6(const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF6(qp1, qp0, outer_thresh);
+ const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq = _mm_max_epu16(abs_qp2mqp1, abs_qp1mqp0);
+ const __m128i inner_mask = _mm_subs_epu16(
+ _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi16(a, zero);
+ return b;
+}
+
+inline __m128i IsFlat3(const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, const __m128i& flat_thresh) {
+ const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq = _mm_max_epu16(abs_pq2mpq0, abs_qp1mqp0);
+ const __m128i flat_mask = _mm_subs_epu16(
+ _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), flat_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_cmpeq_epi16(flat_mask, zero);
+ return a;
+}
+
+inline void Filter6(const __m128i& qp2, const __m128i& qp1, const __m128i& qp0,
+ __m128i* oqp1, __m128i* oqp0) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i qp2_lo = qp2;
+ const __m128i qp1_lo = qp1;
+ const __m128i qp0_lo = qp0;
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+ __m128i f6_lo;
+ f6_lo =
+ _mm_add_epi16(_mm_add_epi16(qp2_lo, four), _mm_add_epi16(qp2_lo, qp2_lo));
+
+ f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp1_lo), qp1_lo);
+
+ f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp0_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p2 * 3 + p1 * 2 + p0 * 2 + q0
+ // q2 * 3 + q1 * 2 + q0 * 2 + p0
+ *oqp1 = _mm_srli_epi16(f6_lo, 3);
+
+ // p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1
+ // q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1
+ f6_lo = FilterAdd2Sub2(f6_lo, pq0_lo, pq1_lo, qp2_lo, qp2_lo);
+ *oqp0 = _mm_srli_epi16(f6_lo, 3);
+}
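+// Scalar reference for the first output (*oqp1), assuming samples are already
+// in 16-bit lanes; hypothetical helper, illustration only:
+//
+//   uint16_t Filter6P1(int p2, int p1, int p0, int q0) {
+//     return static_cast<uint16_t>((3 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3);
+//   }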
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal6(void* dest,
+ ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ const __m128i p2 = LoadLo8(dst - 3 * stride);
+ const __m128i p1 = LoadLo8(dst - 2 * stride);
+ const __m128i p0 = LoadLo8(dst - 1 * stride);
+ const __m128i q0 = LoadLo8(dst + 0 * stride);
+ const __m128i q1 = LoadLo8(dst + 1 * stride);
+ const __m128i q2 = LoadLo8(dst + 2 * stride);
+
+ const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter6(qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ __m128i oqp1_f6;
+ __m128i oqp0_f6;
+
+ Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+ }
+
+ StoreLo8(dst - 2 * stride, oqp1);
+ StoreLo8(dst - 1 * stride, oqp0);
+ StoreHi8(dst + 0 * stride, oqp0);
+ StoreHi8(dst + 1 * stride, oqp1);
+}
+
+inline void Transpose8x4To4x8(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3, __m128i* d0,
+ __m128i* d1, __m128i* d2, __m128i* d3,
+ __m128i* d4, __m128i* d5, __m128i* d6,
+ __m128i* d7) {
+ // input
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // output
+ // 00 10 20 30 xx xx xx xx
+ // 01 11 21 31 xx xx xx xx
+ // 02 12 22 32 xx xx xx xx
+ // 03 13 23 33 xx xx xx xx
+ // 04 14 24 34 xx xx xx xx
+ // 05 15 25 35 xx xx xx xx
+ // 06 16 26 36 xx xx xx xx
+ // 07 17 27 37 xx xx xx xx
+
+ // 00 10 01 11 02 12 03 13
+ const __m128i w0 = _mm_unpacklo_epi16(x0, x1);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w1 = _mm_unpacklo_epi16(x2, x3);
+ // 04 14 05 15 06 16 07 17
+ const __m128i w2 = _mm_unpackhi_epi16(x0, x1);
+ // 24 34 25 35 26 36 27 37
+ const __m128i w3 = _mm_unpackhi_epi16(x2, x3);
+
+ // 00 10 20 30 01 11 21 31
+ const __m128i ww0 = _mm_unpacklo_epi32(w0, w1);
+ // 04 14 24 34 05 15 25 35
+ const __m128i ww1 = _mm_unpacklo_epi32(w2, w3);
+ // 02 12 22 32 03 13 23 33
+ const __m128i ww2 = _mm_unpackhi_epi32(w0, w1);
+ // 06 16 26 36 07 17 27 37
+ const __m128i ww3 = _mm_unpackhi_epi32(w2, w3);
+
+ // 00 10 20 30 xx xx xx xx
+ *d0 = ww0;
+ // 01 11 21 31 xx xx xx xx
+ *d1 = _mm_srli_si128(ww0, 8);
+ // 02 12 22 32 xx xx xx xx
+ *d2 = ww2;
+ // 03 13 23 33 xx xx xx xx
+ *d3 = _mm_srli_si128(ww2, 8);
+ // 04 14 24 34 xx xx xx xx
+ *d4 = ww1;
+ // 05 15 25 35 xx xx xx xx
+ *d5 = _mm_srli_si128(ww1, 8);
+ // 06 16 26 36 xx xx xx xx
+ *d6 = ww3;
+ // 07 17 27 37 xx xx xx xx
+ *d7 = _mm_srli_si128(ww3, 8);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical6(void* dest, ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ __m128i x0 = LoadUnaligned16(dst - 3 + 0 * stride);
+ __m128i x1 = LoadUnaligned16(dst - 3 + 1 * stride);
+ __m128i x2 = LoadUnaligned16(dst - 3 + 2 * stride);
+ __m128i x3 = LoadUnaligned16(dst - 3 + 3 * stride);
+
+ __m128i p2, p1, p0, q0, q1, q2;
+ __m128i z0, z1; // not used
+
+ Transpose8x4To4x8(x0, x1, x2, x3, &p2, &p1, &p0, &q0, &q1, &q2, &z0, &z1);
+
+ const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter6(qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ __m128i oqp1_f6;
+ __m128i oqp0_f6;
+
+ Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+ }
+
+ // 00 10 01 11 02 12 03 13
+ const __m128i w2 = _mm_unpacklo_epi16(oqp1, oqp0);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w3 = _mm_unpackhi_epi16(oqp0, oqp1);
+ // 00 10 20 30 01 11 21 31
+ const __m128i op0p1 = _mm_unpacklo_epi32(w2, w3);
+ // 02 12 22 32 03 13 23 33
+ const __m128i oq1q0 = _mm_unpackhi_epi32(w2, w3);
+
+ StoreLo8(dst - 2 + 0 * stride, op0p1);
+ StoreHi8(dst - 2 + 1 * stride, op0p1);
+ StoreLo8(dst - 2 + 2 * stride, oq1q0);
+ StoreHi8(dst - 2 + 3 * stride, oq1q0);
+}
+
+//------------------------------------------------------------------------------
+// 7-tap filters
+
+inline __m128i NeedsFilter8(const __m128i& qp3, const __m128i& qp2,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF6(qp1, qp0, outer_thresh);
+ const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq_a = _mm_max_epu16(abs_qp2mqp1, abs_qp1mqp0);
+ const __m128i abs_pq3mpq2 = AbsDiff(qp3, qp2);
+ const __m128i max_pq = _mm_max_epu16(max_pq_a, abs_pq3mpq2);
+ const __m128i inner_mask = _mm_subs_epu16(
+ _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi16(a, zero);
+ return b;
+}
+
+inline __m128i IsFlat4(const __m128i& qp3, const __m128i& qp2,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& flat_thresh) {
+ const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq_a = _mm_max_epu16(abs_pq2mpq0, abs_qp1mqp0);
+ const __m128i abs_pq3mpq0 = AbsDiff(qp3, qp0);
+ const __m128i max_pq = _mm_max_epu16(max_pq_a, abs_pq3mpq0);
+ const __m128i flat_mask = _mm_subs_epu16(
+ _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), flat_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_cmpeq_epi16(flat_mask, zero);
+ return a;
+}
+
+inline void Filter8(const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, __m128i* oqp2, __m128i* oqp1,
+ __m128i* oqp0) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i qp3_lo = qp3;
+ const __m128i qp2_lo = qp2;
+ const __m128i qp1_lo = qp1;
+ const __m128i qp0_lo = qp0;
+ const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+ __m128i f8_lo =
+ _mm_add_epi16(_mm_add_epi16(qp3_lo, four), _mm_add_epi16(qp3_lo, qp3_lo));
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp2_lo), qp2_lo);
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp1_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0
+ // q3 + q3 + q3 + 2 * q2 + q1 + q0 + p0
+ *oqp2 = _mm_srli_epi16(f8_lo, 3);
+
+ // p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1
+ // q3 + q3 + q2 + 2 * q1 + q0 + p0 + p1
+ f8_lo = FilterAdd2Sub2(f8_lo, qp1_lo, pq1_lo, qp3_lo, qp2_lo);
+ *oqp1 = _mm_srli_epi16(f8_lo, 3);
+
+ // p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2
+ // q3 + q2 + q1 + 2 * q0 + p0 + p1 + p2
+ f8_lo = FilterAdd2Sub2(f8_lo, qp0_lo, pq2_lo, qp3_lo, qp1_lo);
+ *oqp0 = _mm_srli_epi16(f8_lo, 3);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal8(void* dest,
+ ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ const __m128i p3 = LoadLo8(dst - 4 * stride);
+ const __m128i p2 = LoadLo8(dst - 3 * stride);
+ const __m128i p1 = LoadLo8(dst - 2 * stride);
+ const __m128i p0 = LoadLo8(dst - 1 * stride);
+ const __m128i q0 = LoadLo8(dst + 0 * stride);
+ const __m128i q1 = LoadLo8(dst + 1 * stride);
+ const __m128i q2 = LoadLo8(dst + 2 * stride);
+ const __m128i q3 = LoadLo8(dst + 3 * stride);
+ const __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+ StoreLo8(dst - 3 * stride, oqp2_f8);
+ StoreHi8(dst + 2 * stride, oqp2_f8);
+ }
+
+ StoreLo8(dst - 2 * stride, oqp1);
+ StoreLo8(dst - 1 * stride, oqp0);
+ StoreHi8(dst + 0 * stride, oqp0);
+ StoreHi8(dst + 1 * stride, oqp1);
+}
+
+inline void TransposeLower4x8To8x4(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3,
+ const __m128i& x4, const __m128i& x5,
+ const __m128i& x6, const __m128i& x7,
+ __m128i* d0, __m128i* d1, __m128i* d2,
+ __m128i* d3) {
+ // input
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+ // output
+ // d0 00 10 20 30 40 50 60 70
+ // d1 01 11 21 31 41 51 61 71
+ // d2 02 12 22 32 42 52 62 72
+ // d3 03 13 23 33 43 53 63 73
+
+ // 00 10 01 11 02 12 03 13
+ const __m128i w0 = _mm_unpacklo_epi16(x0, x1);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w1 = _mm_unpacklo_epi16(x2, x3);
+ // 40 50 41 51 42 52 43 53
+ const __m128i w2 = _mm_unpacklo_epi16(x4, x5);
+ // 60 70 61 71 62 72 63 73
+ const __m128i w3 = _mm_unpacklo_epi16(x6, x7);
+
+ // 00 10 20 30 01 11 21 31
+ const __m128i w4 = _mm_unpacklo_epi32(w0, w1);
+ // 40 50 60 70 41 51 61 71
+ const __m128i w5 = _mm_unpacklo_epi32(w2, w3);
+ // 02 12 22 32 03 13 23 33
+ const __m128i w6 = _mm_unpackhi_epi32(w0, w1);
+ // 42 52 62 72 43 53 63 73
+ const __m128i w7 = _mm_unpackhi_epi32(w2, w3);
+
+ // 00 10 20 30 40 50 60 70
+ *d0 = _mm_unpacklo_epi64(w4, w5);
+ // 01 11 21 31 41 51 61 71
+ *d1 = _mm_unpackhi_epi64(w4, w5);
+ // 02 12 22 32 42 52 62 72
+ *d2 = _mm_unpacklo_epi64(w6, w7);
+ // 03 13 23 33 43 53 63 73
+ *d3 = _mm_unpackhi_epi64(w6, w7);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical8(void* dest, ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ __m128i x0 = LoadUnaligned16(dst - 4 + 0 * stride);
+ __m128i x1 = LoadUnaligned16(dst - 4 + 1 * stride);
+ __m128i x2 = LoadUnaligned16(dst - 4 + 2 * stride);
+ __m128i x3 = LoadUnaligned16(dst - 4 + 3 * stride);
+
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+ Transpose8x4To4x8(x0, x1, x2, x3, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+
+ const __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ p2 = oqp2_f8;
+ q2 = _mm_srli_si128(oqp2_f8, 8);
+ }
+
+ p1 = oqp1;
+ p0 = oqp0;
+ q0 = _mm_srli_si128(oqp0, 8);
+ q1 = _mm_srli_si128(oqp1, 8);
+
+ TransposeLower4x8To8x4(p3, p2, p1, p0, q0, q1, q2, q3, &x0, &x1, &x2, &x3);
+
+ StoreUnaligned16(dst - 4 + 0 * stride, x0);
+ StoreUnaligned16(dst - 4 + 1 * stride, x1);
+ StoreUnaligned16(dst - 4 + 2 * stride, x2);
+ StoreUnaligned16(dst - 4 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 13-tap filters
+
+inline void Filter14(const __m128i& qp6, const __m128i& qp5, const __m128i& qp4,
+ const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, __m128i* oqp5, __m128i* oqp4,
+ __m128i* oqp3, __m128i* oqp2, __m128i* oqp1,
+ __m128i* oqp0) {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i qp6_lo = qp6;
+ const __m128i qp5_lo = qp5;
+ const __m128i qp4_lo = qp4;
+ const __m128i qp3_lo = qp3;
+ const __m128i qp2_lo = qp2;
+ const __m128i qp1_lo = qp1;
+ const __m128i qp0_lo = qp0;
+ const __m128i pq5_lo = _mm_shuffle_epi32(qp5_lo, 0x4e);
+ const __m128i pq4_lo = _mm_shuffle_epi32(qp4_lo, 0x4e);
+ const __m128i pq3_lo = _mm_shuffle_epi32(qp3_lo, 0x4e);
+ const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+ __m128i f14_lo =
+ _mm_add_epi16(eight, _mm_sub_epi16(_mm_slli_epi16(qp6_lo, 3), qp6_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp5_lo),
+ _mm_add_epi16(qp5_lo, qp4_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp4_lo),
+ _mm_add_epi16(qp3_lo, qp2_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp1_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0
+ // q6 * 7 + q5 * 2 + q4 * 2 + q3 + q2 + q1 + q0 + p0
+ *oqp5 = _mm_srli_epi16(f14_lo, 4);
+
+ // p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1
+ // q6 * 5 + q5 * 2 + q4 * 2 + q3 * 2 + q2 + q1 + q0 + p0 + p1
+ f14_lo = FilterAdd2Sub2(f14_lo, qp3_lo, pq1_lo, qp6_lo, qp6_lo);
+ *oqp4 = _mm_srli_epi16(f14_lo, 4);
+
+ // p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2
+ // q6 * 4 + q5 + q4 * 2 + q3 * 2 + q2 * 2 + q1 + q0 + p0 + p1 + p2
+ f14_lo = FilterAdd2Sub2(f14_lo, qp2_lo, pq2_lo, qp6_lo, qp5_lo);
+ *oqp3 = _mm_srli_epi16(f14_lo, 4);
+
+ // p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3
+ // q6 * 3 + q5 + q4 + q3 * 2 + q2 * 2 + q1 * 2 + q0 + p0 + p1 + p2 + p3
+ f14_lo = FilterAdd2Sub2(f14_lo, qp1_lo, pq3_lo, qp6_lo, qp4_lo);
+ *oqp2 = _mm_srli_epi16(f14_lo, 4);
+
+ // p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + q0 + q1 + q2 + q3 + q4
+ // q6 * 2 + q5 + q4 + q3 + q2 * 2 + q1 * 2 + q0 * 2 + p0 + p1 + p2 + p3 + p4
+ f14_lo = FilterAdd2Sub2(f14_lo, qp0_lo, pq4_lo, qp6_lo, qp3_lo);
+ *oqp1 = _mm_srli_epi16(f14_lo, 4);
+
+ // p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + q2 + q3 + q4 + q5
+ // q6 + q5 + q4 + q3 + q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1 + p2 + p3 + p4 + p5
+ f14_lo = FilterAdd2Sub2(f14_lo, pq0_lo, pq5_lo, qp6_lo, qp2_lo);
+ *oqp0 = _mm_srli_epi16(f14_lo, 4);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal14(void* dest,
+ ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ const __m128i p3 = LoadLo8(dst - 4 * stride);
+ const __m128i p2 = LoadLo8(dst - 3 * stride);
+ const __m128i p1 = LoadLo8(dst - 2 * stride);
+ const __m128i p0 = LoadLo8(dst - 1 * stride);
+ const __m128i q0 = LoadLo8(dst + 0 * stride);
+ const __m128i q1 = LoadLo8(dst + 1 * stride);
+ const __m128i q2 = LoadLo8(dst + 2 * stride);
+ const __m128i q3 = LoadLo8(dst + 3 * stride);
+ const __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ const __m128i p6 = LoadLo8(dst - 7 * stride);
+ const __m128i p5 = LoadLo8(dst - 6 * stride);
+ const __m128i p4 = LoadLo8(dst - 5 * stride);
+ const __m128i q4 = LoadLo8(dst + 4 * stride);
+ const __m128i q5 = LoadLo8(dst + 5 * stride);
+ const __m128i q6 = LoadLo8(dst + 6 * stride);
+ const __m128i qp6 = _mm_unpacklo_epi64(p6, q6);
+ const __m128i qp5 = _mm_unpacklo_epi64(p5, q5);
+ const __m128i qp4 = _mm_unpacklo_epi64(p4, q4);
+
+ const __m128i v_isflatouter4_mask =
+ IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+ const __m128i v_flat4_mask_lo = _mm_and_si128(v_mask, v_isflatouter4_mask);
+ const __m128i v_flat4_mask =
+ _mm_unpacklo_epi64(v_flat4_mask_lo, v_flat4_mask_lo);
+
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ if (_mm_test_all_zeros(v_flat4_mask,
+ _mm_cmpeq_epi16(v_flat4_mask, v_flat4_mask)) == 0) {
+ __m128i oqp5_f14;
+ __m128i oqp4_f14;
+ __m128i oqp3_f14;
+ __m128i oqp2_f14;
+ __m128i oqp1_f14;
+ __m128i oqp0_f14;
+
+ Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+ &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+ oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+ oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+ oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+ oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+
+ StoreLo8(dst - 6 * stride, oqp5_f14);
+ StoreLo8(dst - 5 * stride, oqp4_f14);
+ StoreLo8(dst - 4 * stride, oqp3_f14);
+
+ StoreHi8(dst + 3 * stride, oqp3_f14);
+ StoreHi8(dst + 4 * stride, oqp4_f14);
+ StoreHi8(dst + 5 * stride, oqp5_f14);
+ }
+
+ StoreLo8(dst - 3 * stride, oqp2_f8);
+ StoreHi8(dst + 2 * stride, oqp2_f8);
+ }
+
+ StoreLo8(dst - 2 * stride, oqp1);
+ StoreLo8(dst - 1 * stride, oqp0);
+ StoreHi8(dst + 0 * stride, oqp0);
+ StoreHi8(dst + 1 * stride, oqp1);
+}
+
+inline void TransposeUpper4x8To8x4(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3,
+ const __m128i& x4, const __m128i& x5,
+ const __m128i& x6, const __m128i& x7,
+ __m128i* d0, __m128i* d1, __m128i* d2,
+ __m128i* d3) {
+ // input
+ // x0 00 01 02 03 xx xx xx xx
+ // x1 10 11 12 13 xx xx xx xx
+ // x2 20 21 22 23 xx xx xx xx
+ // x3 30 31 32 33 xx xx xx xx
+ // x4 40 41 42 43 xx xx xx xx
+ // x5 50 51 52 53 xx xx xx xx
+ // x6 60 61 62 63 xx xx xx xx
+ // x7 70 71 72 73 xx xx xx xx
+ // output
+ // d0 00 10 20 30 40 50 60 70
+ // d1 01 11 21 31 41 51 61 71
+ // d2 02 12 22 32 42 52 62 72
+ // d3 03 13 23 33 43 53 63 73
+
+ // 00 10 01 11 02 12 03 13
+ const __m128i w0 = _mm_unpackhi_epi16(x0, x1);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w1 = _mm_unpackhi_epi16(x2, x3);
+ // 40 50 41 51 42 52 43 53
+ const __m128i w2 = _mm_unpackhi_epi16(x4, x5);
+ // 60 70 61 71 62 72 63 73
+ const __m128i w3 = _mm_unpackhi_epi16(x6, x7);
+
+ // 00 10 20 30 01 11 21 31
+ const __m128i w4 = _mm_unpacklo_epi32(w0, w1);
+ // 40 50 60 70 41 51 61 71
+ const __m128i w5 = _mm_unpacklo_epi32(w2, w3);
+ // 02 12 22 32 03 13 23 33
+ const __m128i w6 = _mm_unpackhi_epi32(w0, w1);
+ // 42 52 62 72 43 53 63 73
+ const __m128i w7 = _mm_unpackhi_epi32(w2, w3);
+
+ // 00 10 20 30 40 50 60 70
+ *d0 = _mm_unpacklo_epi64(w4, w5);
+ // 01 11 21 31 41 51 61 71
+ *d1 = _mm_unpackhi_epi64(w4, w5);
+ // 02 12 22 32 42 52 62 72
+ *d2 = _mm_unpacklo_epi64(w6, w7);
+ // 03 13 23 33 43 53 63 73
+ *d3 = _mm_unpackhi_epi64(w6, w7);
+}
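A sketch of the qp packing that these transposes undo (illustration only, not
part of the patch): throughout this filter a single register carries the p side
of the edge in its low 64 bits and the q side in its high 64 bits, e.g.

  inline __m128i PackQP(const __m128i p_side, const __m128i q_side) {
    return _mm_unpacklo_epi64(p_side, q_side);  // p0..p3 | q0..q3, 16-bit lanes
  }

TransposeLower4x8To8x4() reads the low (p) halves back out as rows for the left
eight columns, while TransposeUpper4x8To8x4() does the same for the high (q)
halves and the right eight columns.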
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical14(void* dest, ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+ //
+ // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
+ // 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
+ // 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
+ // 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
+
+ __m128i x0 = LoadUnaligned16(dst - 8 + 0 * stride);
+ __m128i x1 = LoadUnaligned16(dst - 8 + 1 * stride);
+ __m128i x2 = LoadUnaligned16(dst - 8 + 2 * stride);
+ __m128i x3 = LoadUnaligned16(dst - 8 + 3 * stride);
+
+ __m128i p7, p6, p5, p4, p3, p2, p1, p0;
+ __m128i q7, q6, q5, q4, q3, q2, q1, q0;
+
+ Transpose8x4To4x8(x0, x1, x2, x3, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0);
+
+ x0 = LoadUnaligned16(dst - 8 + 8 + 0 * stride);
+ x1 = LoadUnaligned16(dst - 8 + 8 + 1 * stride);
+ x2 = LoadUnaligned16(dst - 8 + 8 + 2 * stride);
+ x3 = LoadUnaligned16(dst - 8 + 8 + 3 * stride);
+
+ Transpose8x4To4x8(x0, x1, x2, x3, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
+
+ __m128i qp7 = _mm_unpacklo_epi64(p7, q7);
+ __m128i qp6 = _mm_unpacklo_epi64(p6, q6);
+ __m128i qp5 = _mm_unpacklo_epi64(p5, q5);
+ __m128i qp4 = _mm_unpacklo_epi64(p4, q4);
+ __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+ __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ const __m128i v_isflatouter4_mask =
+ IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+ const __m128i v_flat4_mask_lo = _mm_and_si128(v_mask, v_isflatouter4_mask);
+ const __m128i v_flat4_mask =
+ _mm_unpacklo_epi64(v_flat4_mask_lo, v_flat4_mask_lo);
+
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ if (_mm_test_all_zeros(v_flat4_mask,
+ _mm_cmpeq_epi16(v_flat4_mask, v_flat4_mask)) == 0) {
+ __m128i oqp5_f14;
+ __m128i oqp4_f14;
+ __m128i oqp3_f14;
+ __m128i oqp2_f14;
+ __m128i oqp1_f14;
+ __m128i oqp0_f14;
+
+ Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+ &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+ oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+ oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+ oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+ oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+ qp3 = oqp3_f14;
+ qp4 = oqp4_f14;
+ qp5 = oqp5_f14;
+ }
+ qp2 = oqp2_f8;
+ }
+
+ TransposeLower4x8To8x4(qp7, qp6, qp5, qp4, qp3, qp2, oqp1, oqp0, &x0, &x1,
+ &x2, &x3);
+
+ StoreUnaligned16(dst - 8 + 0 * stride, x0);
+ StoreUnaligned16(dst - 8 + 1 * stride, x1);
+ StoreUnaligned16(dst - 8 + 2 * stride, x2);
+ StoreUnaligned16(dst - 8 + 3 * stride, x3);
+
+ TransposeUpper4x8To8x4(oqp0, oqp1, qp2, qp3, qp4, qp5, qp6, qp7, &x0, &x1,
+ &x2, &x3);
+
+ StoreUnaligned16(dst - 8 + 8 + 0 * stride, x0);
+ StoreUnaligned16(dst - 8 + 8 + 1 * stride, x1);
+ StoreUnaligned16(dst - 8 + 8 + 2 * stride, x2);
+ StoreUnaligned16(dst - 8 + 8 + 3 * stride, x3);
+}
+
+using Defs10bpp = LoopFilterFuncs_SSE4_1<kBitdepth10>;
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal4;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal6;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal8;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal14;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical4;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical6;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical8;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical14;
+#endif
+}
+#endif
+} // namespace
+} // namespace high_bitdepth
+
+void LoopFilterInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void LoopFilterInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/loop_filter_sse4.h b/src/dsp/x86/loop_filter_sse4.h
new file mode 100644
index 0000000..4795d8b
--- /dev/null
+++ b/src/dsp/x86/loop_filter_sse4.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_filters, see the defines below for specifics. This
+// function is not thread-safe.
+void LoopFilterInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If SSE4.1 is enabled and the baseline isn't already set by a higher level of
+// optimization, signal that the SSE4.1 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical
+#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical
+#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical
+#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical
+#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical
+#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical
+#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical
+#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical
+#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_
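A hypothetical illustration of the #ifndef guards above (the AVX2 define below
is invented for the example; no AVX2 loop filter exists in this patch): if an
earlier header had already claimed an entry, e.g.

  #define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal LIBGAV1_CPU_AVX2  // hypothetical

then the #ifndef in this header leaves that value untouched and, as I read the
DSP_ENABLED_8BPP_SSE4_1(...) checks in Init8bpp(), the corresponding SSE4.1
function would not be installed into the dispatch table.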
diff --git a/src/dsp/x86/loop_restoration_10bit_avx2.cc b/src/dsp/x86/loop_restoration_10bit_avx2.cc
new file mode 100644
index 0000000..702bdea
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_10bit_avx2.cc
@@ -0,0 +1,592 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline void WienerHorizontalClip(const __m256i s[2],
+ int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit = (offset << 2) - 1;
+ const __m256i offsets = _mm256_set1_epi16(-offset);
+ const __m256i limits = _mm256_set1_epi16(limit - offset);
+ const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsHorizontal - 1));
+ const __m256i sum0 = _mm256_add_epi32(s[0], round);
+ const __m256i sum1 = _mm256_add_epi32(s[1], round);
+ const __m256i rounded_sum0 =
+ _mm256_srai_epi32(sum0, kInterRoundBitsHorizontal);
+ const __m256i rounded_sum1 =
+ _mm256_srai_epi32(sum1, kInterRoundBitsHorizontal);
+ const __m256i rounded_sum = _mm256_packs_epi32(rounded_sum0, rounded_sum1);
+ const __m256i d0 = _mm256_max_epi16(rounded_sum, offsets);
+ const __m256i d1 = _mm256_min_epi16(d0, limits);
+ StoreAligned32(wiener_buffer, d1);
+}
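A worked instance of the clip window above, assuming the usual libgav1
constants (kWienerFilterBits == 7, kInterRoundBitsHorizontal == 3):

  // offset = 1 << (10 + 7 - 3 - 1) = 8192
  // limit  = (8192 << 2) - 1       = 32767
  // so the stored int16_t values lie in [-offset, limit - offset]
  //                                   = [-8192, 24575].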
+
+inline void WienerHorizontalTap7Kernel(const __m256i s[7],
+ const __m256i filter[2],
+ int16_t* const wiener_buffer) {
+ const __m256i s06 = _mm256_add_epi16(s[0], s[6]);
+ const __m256i s15 = _mm256_add_epi16(s[1], s[5]);
+ const __m256i s24 = _mm256_add_epi16(s[2], s[4]);
+ const __m256i ss0 = _mm256_unpacklo_epi16(s06, s15);
+ const __m256i ss1 = _mm256_unpackhi_epi16(s06, s15);
+ const __m256i ss2 = _mm256_unpacklo_epi16(s24, s[3]);
+ const __m256i ss3 = _mm256_unpackhi_epi16(s24, s[3]);
+ __m256i madds[4];
+ madds[0] = _mm256_madd_epi16(ss0, filter[0]);
+ madds[1] = _mm256_madd_epi16(ss1, filter[0]);
+ madds[2] = _mm256_madd_epi16(ss2, filter[1]);
+ madds[3] = _mm256_madd_epi16(ss3, filter[1]);
+ madds[0] = _mm256_add_epi32(madds[0], madds[2]);
+ madds[1] = _mm256_add_epi32(madds[1], madds[3]);
+ WienerHorizontalClip(madds, wiener_buffer);
+}
+
+inline void WienerHorizontalTap5Kernel(const __m256i s[5], const __m256i filter,
+ int16_t* const wiener_buffer) {
+ const __m256i s04 = _mm256_add_epi16(s[0], s[4]);
+ const __m256i s13 = _mm256_add_epi16(s[1], s[3]);
+ const __m256i s2d = _mm256_add_epi16(s[2], s[2]);
+ const __m256i s0m = _mm256_sub_epi16(s04, s2d);
+ const __m256i s1m = _mm256_sub_epi16(s13, s2d);
+ const __m256i ss0 = _mm256_unpacklo_epi16(s0m, s1m);
+ const __m256i ss1 = _mm256_unpackhi_epi16(s0m, s1m);
+ __m256i madds[2];
+ madds[0] = _mm256_madd_epi16(ss0, filter);
+ madds[1] = _mm256_madd_epi16(ss1, filter);
+ const __m256i s2_lo = _mm256_unpacklo_epi16(s[2], _mm256_setzero_si256());
+ const __m256i s2_hi = _mm256_unpackhi_epi16(s[2], _mm256_setzero_si256());
+ const __m256i s2x128_lo = _mm256_slli_epi32(s2_lo, 7);
+ const __m256i s2x128_hi = _mm256_slli_epi32(s2_hi, 7);
+ madds[0] = _mm256_add_epi32(madds[0], s2x128_lo);
+ madds[1] = _mm256_add_epi32(madds[1], s2x128_hi);
+ WienerHorizontalClip(madds, wiener_buffer);
+}
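Why the 5-tap kernel can fold away its centre coefficient (a sketch, assuming
the AV1 Wiener normalization in which the symmetric taps sum to 128 and the two
outer taps are zero here): the centre tap equals 128 - 2 * (c1 + c2), so

  // c1*(s0 + s4) + c2*(s1 + s3) + (128 - 2*(c1 + c2))*s2
  //   == c1*(s0 + s4 - 2*s2) + c2*(s1 + s3 - 2*s2) + 128*s2,
  // i.e. the madd over (s0m, s1m) plus the (s2 << 7) term added above.
  inline int WienerTap5Scalar(const int16_t s[5], const int c1, const int c2) {
    return c1 * (s[0] + s[4] - 2 * s[2]) + c2 * (s[1] + s[3] - 2 * s[2]) +
           128 * s[2];
  }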
+
+inline void WienerHorizontalTap3Kernel(const __m256i s[3], const __m256i filter,
+ int16_t* const wiener_buffer) {
+ const __m256i s02 = _mm256_add_epi16(s[0], s[2]);
+ const __m256i ss0 = _mm256_unpacklo_epi16(s02, s[1]);
+ const __m256i ss1 = _mm256_unpackhi_epi16(s02, s[1]);
+ __m256i madds[2];
+ madds[0] = _mm256_madd_epi16(ss0, filter);
+ madds[1] = _mm256_madd_epi16(ss1, filter);
+ WienerHorizontalClip(madds, wiener_buffer);
+}
+
+inline void WienerHorizontalTap7(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i* const coefficients,
+ int16_t** const wiener_buffer) {
+ __m256i filter[2];
+ filter[0] = _mm256_shuffle_epi32(*coefficients, 0x0);
+ filter[1] = _mm256_shuffle_epi32(*coefficients, 0x55);
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i s[7];
+ s[0] = LoadUnaligned32(src + x + 0);
+ s[1] = LoadUnaligned32(src + x + 1);
+ s[2] = LoadUnaligned32(src + x + 2);
+ s[3] = LoadUnaligned32(src + x + 3);
+ s[4] = LoadUnaligned32(src + x + 4);
+ s[5] = LoadUnaligned32(src + x + 5);
+ s[6] = LoadUnaligned32(src + x + 6);
+ WienerHorizontalTap7Kernel(s, filter, *wiener_buffer + x);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap5(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i* const coefficients,
+ int16_t** const wiener_buffer) {
+ const __m256i filter =
+ _mm256_shuffle_epi8(*coefficients, _mm256_set1_epi32(0x05040302));
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i s[5];
+ s[0] = LoadUnaligned32(src + x + 0);
+ s[1] = LoadUnaligned32(src + x + 1);
+ s[2] = LoadUnaligned32(src + x + 2);
+ s[3] = LoadUnaligned32(src + x + 3);
+ s[4] = LoadUnaligned32(src + x + 4);
+ WienerHorizontalTap5Kernel(s, filter, *wiener_buffer + x);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i* const coefficients,
+ int16_t** const wiener_buffer) {
+ const auto filter = _mm256_shuffle_epi32(*coefficients, 0x55);
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i s[3];
+ s[0] = LoadUnaligned32(src + x + 0);
+ s[1] = LoadUnaligned32(src + x + 1);
+ s[2] = LoadUnaligned32(src + x + 2);
+ WienerHorizontalTap3Kernel(s, filter, *wiener_buffer + x);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap1(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ const __m256i s0 = LoadUnaligned32(src + x);
+ const __m256i d0 = _mm256_slli_epi16(s0, 4);
+ StoreAligned32(*wiener_buffer + x, d0);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
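In this pass-through case all three outer taps are zero, so only the derived
centre tap remains; the plain left shift by 4 keeps the output in the same
fixed-point domain as the filtered cases, assuming the taps sum to 128 and
kInterRoundBitsHorizontal == 3, since (128 * s) >> 3 == s << 4.

  static_assert((128 >> 3) == (1 << 4),
                "pass-through scale matches the filtered-path scale");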
+
+inline __m256i WienerVertical7(const __m256i a[4], const __m256i filter[4]) {
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i madd2 = _mm256_madd_epi16(a[2], filter[2]);
+ const __m256i madd3 = _mm256_madd_epi16(a[3], filter[3]);
+ const __m256i madd01 = _mm256_add_epi32(madd0, madd1);
+ const __m256i madd23 = _mm256_add_epi32(madd2, madd3);
+ const __m256i sum = _mm256_add_epi32(madd01, madd23);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical5(const __m256i a[3], const __m256i filter[3]) {
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i madd2 = _mm256_madd_epi16(a[2], filter[2]);
+ const __m256i madd01 = _mm256_add_epi32(madd0, madd1);
+ const __m256i sum = _mm256_add_epi32(madd01, madd2);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical3(const __m256i a[2], const __m256i filter[2]) {
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i sum = _mm256_add_epi32(madd0, madd1);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVerticalClip(const __m256i s[2]) {
+ const __m256i d = _mm256_packus_epi32(s[0], s[1]);
+ return _mm256_min_epu16(d, _mm256_set1_epi16(1023));
+}
+
+inline __m256i WienerVerticalFilter7(const __m256i a[7],
+ const __m256i filter[2]) {
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m256i b[4], c[2];
+ b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm256_unpacklo_epi16(a[2], a[3]);
+ b[2] = _mm256_unpacklo_epi16(a[4], a[5]);
+ b[3] = _mm256_unpacklo_epi16(a[6], round);
+ c[0] = WienerVertical7(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm256_unpackhi_epi16(a[2], a[3]);
+ b[2] = _mm256_unpackhi_epi16(a[4], a[5]);
+ b[3] = _mm256_unpackhi_epi16(a[6], round);
+ c[1] = WienerVertical7(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalFilter5(const __m256i a[5],
+ const __m256i filter[3]) {
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m256i b[3], c[2];
+ b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm256_unpacklo_epi16(a[2], a[3]);
+ b[2] = _mm256_unpacklo_epi16(a[4], round);
+ c[0] = WienerVertical5(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm256_unpackhi_epi16(a[2], a[3]);
+ b[2] = _mm256_unpackhi_epi16(a[4], round);
+ c[1] = WienerVertical5(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalFilter3(const __m256i a[3],
+ const __m256i filter[2]) {
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m256i b[2], c[2];
+ b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm256_unpacklo_epi16(a[2], round);
+ c[0] = WienerVertical3(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm256_unpackhi_epi16(a[2], round);
+ c[1] = WienerVertical3(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i a[7]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+ a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+ a[6] = LoadAligned32(wiener_buffer + 6 * wiener_stride);
+ return WienerVerticalFilter7(a, filter);
+}
+
+inline __m256i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[3], __m256i a[5]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+ return WienerVerticalFilter5(a, filter);
+}
+
+inline __m256i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i a[3]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i d[2]) {
+ __m256i a[8];
+ d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[7] = LoadAligned32(wiener_buffer + 7 * wiener_stride);
+ d[1] = WienerVerticalFilter7(a + 1, filter);
+}
+
+inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[3], __m256i d[2]) {
+ __m256i a[6];
+ d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+ d[1] = WienerVerticalFilter5(a + 1, filter);
+}
+
+inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i d[2]) {
+ __m256i a[4];
+ d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ d[1] = WienerVerticalFilter3(a + 1, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[4], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+ __m256i filter[4];
+ filter[0] = _mm256_shuffle_epi32(c, 0x0);
+ filter[1] = _mm256_shuffle_epi32(c, 0x55);
+ filter[2] = _mm256_shuffle_epi8(c, _mm256_set1_epi32(0x03020504));
+ filter[3] =
+ _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2];
+ WienerVerticalTap7Kernel2(wiener_buffer + x, width, filter, d);
+ StoreUnaligned32(dst + x, d[0]);
+ StoreUnaligned32(dst + dst_stride + x, d[1]);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[7];
+ const __m256i d =
+ WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+ StoreUnaligned32(dst + x, d);
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[3], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+ __m256i filter[3];
+ filter[0] = _mm256_shuffle_epi32(c, 0x0);
+ filter[1] = _mm256_shuffle_epi8(c, _mm256_set1_epi32(0x03020504));
+ filter[2] =
+ _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2];
+ WienerVerticalTap5Kernel2(wiener_buffer + x, width, filter, d);
+ StoreUnaligned32(dst + x, d[0]);
+ StoreUnaligned32(dst + dst_stride + x, d[1]);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[5];
+ const __m256i d =
+ WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+ StoreUnaligned32(dst + x, d);
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[2], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ __m256i filter[2];
+ filter[0] =
+ _mm256_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+ filter[1] =
+ _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2][2];
+ WienerVerticalTap3Kernel2(wiener_buffer + x, width, filter, d[0]);
+ StoreUnaligned32(dst + x, d[0][0]);
+ StoreUnaligned32(dst + dst_stride + x, d[0][1]);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[3];
+ const __m256i d =
+ WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+ StoreUnaligned32(dst + x, d);
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint16_t* const dst) {
+ const __m256i a = LoadAligned32(wiener_buffer);
+ const __m256i b = _mm256_add_epi16(a, _mm256_set1_epi16(8));
+ const __m256i c = _mm256_srai_epi16(b, 4);
+ const __m256i d = _mm256_max_epi16(c, _mm256_setzero_si256());
+ const __m256i e = _mm256_min_epi16(d, _mm256_set1_epi16(1023));
+ StoreUnaligned32(dst, e);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint16_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ x += 16;
+ } while (x < width);
+ }
+}
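A scalar equivalent of WienerVerticalTap1Kernel() above (sketch only): it
undoes the << 4 applied by the horizontal pass, with rounding, and clamps to
the 10-bit range.

  inline uint16_t WienerVerticalTap1Scalar(const int16_t x) {
    const int y = (x + 8) >> 4;  // round and rescale
    return static_cast<uint16_t>(std::min(std::max(y, 0), 1023));  // 10-bit clamp
  }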
+
+void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info,
+ const void* const source, const void* const top_border,
+ const void* const bottom_border, const ptrdiff_t stride,
+ const int width, const int height,
+ RestorationBuffer* const restoration_buffer,
+ void* const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 16);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 13 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+  // Horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint16_t*>(source);
+ const auto* const top = static_cast<const uint16_t*>(top_border);
+ const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+ const __m128i c =
+ LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+ const __m256i coefficients_horizontal = _mm256_broadcastq_epi64(c);
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
+ wiener_stride, height_extra, &coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
+ wiener_stride, height_extra, &coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
+ wiener_stride, height_extra, &coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
+ wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ }
+
+  // Vertical filtering.
+ // Over-writes up to 15 values.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ auto* dst = static_cast<uint16_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, we can
+    // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical + 1, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical + 2, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
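A worked pass through the row bookkeeping above (assuming kWienerFilterTaps ==
7): number_rows_to_skip is max(vertical leading zeros, 1), so

  // vertical leading zeros 0 or 1: skip 1 row  -> height_horizontal = height + 4, height_extra = 2
  // vertical leading zeros 2:      skip 2 rows -> height_horizontal = height + 2, height_extra = 1
  // vertical leading zeros 3:      skip 3 rows -> height_horizontal = height,     height_extra = 0

which is why the assert bounds height_extra by 2 and why each of the top and
bottom border calls filters height_extra rows.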
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_AVX2(WienerFilter)
+ dsp->loop_restorations[0] = WienerFilter_AVX2;
+#endif
+}
+
+} // namespace
+
+void LoopRestorationInit10bpp_AVX2() { Init10bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !(LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit10bpp_AVX2() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/x86/loop_restoration_10bit_sse4.cc b/src/dsp/x86/loop_restoration_10bit_sse4.cc
new file mode 100644
index 0000000..0598435
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_10bit_sse4.cc
@@ -0,0 +1,551 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline void WienerHorizontalClip(const __m128i s[2],
+ int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit = (offset << 2) - 1;
+ const __m128i offsets = _mm_set1_epi16(-offset);
+ const __m128i limits = _mm_set1_epi16(limit - offset);
+ const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsHorizontal - 1));
+ const __m128i sum0 = _mm_add_epi32(s[0], round);
+ const __m128i sum1 = _mm_add_epi32(s[1], round);
+ const __m128i rounded_sum0 = _mm_srai_epi32(sum0, kInterRoundBitsHorizontal);
+ const __m128i rounded_sum1 = _mm_srai_epi32(sum1, kInterRoundBitsHorizontal);
+ const __m128i rounded_sum = _mm_packs_epi32(rounded_sum0, rounded_sum1);
+ const __m128i d0 = _mm_max_epi16(rounded_sum, offsets);
+ const __m128i d1 = _mm_min_epi16(d0, limits);
+ StoreAligned16(wiener_buffer, d1);
+}
+
+inline void WienerHorizontalTap7(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ __m128i filter[2];
+ filter[0] = _mm_shuffle_epi32(coefficients, 0x0);
+ filter[1] = _mm_shuffle_epi32(coefficients, 0x55);
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[7], madds[4];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ s[3] = LoadUnaligned16(src + x + 3);
+ s[4] = LoadUnaligned16(src + x + 4);
+ s[5] = LoadUnaligned16(src + x + 5);
+ s[6] = LoadUnaligned16(src + x + 6);
+ const __m128i s06 = _mm_add_epi16(s[0], s[6]);
+ const __m128i s15 = _mm_add_epi16(s[1], s[5]);
+ const __m128i s24 = _mm_add_epi16(s[2], s[4]);
+ const __m128i ss0 = _mm_unpacklo_epi16(s06, s15);
+ const __m128i ss1 = _mm_unpackhi_epi16(s06, s15);
+ const __m128i ss2 = _mm_unpacklo_epi16(s24, s[3]);
+ const __m128i ss3 = _mm_unpackhi_epi16(s24, s[3]);
+ madds[0] = _mm_madd_epi16(ss0, filter[0]);
+ madds[1] = _mm_madd_epi16(ss1, filter[0]);
+ madds[2] = _mm_madd_epi16(ss2, filter[1]);
+ madds[3] = _mm_madd_epi16(ss3, filter[1]);
+ madds[0] = _mm_add_epi32(madds[0], madds[2]);
+ madds[1] = _mm_add_epi32(madds[1], madds[3]);
+ WienerHorizontalClip(madds, *wiener_buffer + x);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap5(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const __m128i filter =
+ _mm_shuffle_epi8(coefficients, _mm_set1_epi32(0x05040302));
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[5], madds[2];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ s[3] = LoadUnaligned16(src + x + 3);
+ s[4] = LoadUnaligned16(src + x + 4);
+ const __m128i s04 = _mm_add_epi16(s[0], s[4]);
+ const __m128i s13 = _mm_add_epi16(s[1], s[3]);
+ const __m128i s2d = _mm_add_epi16(s[2], s[2]);
+ const __m128i s0m = _mm_sub_epi16(s04, s2d);
+ const __m128i s1m = _mm_sub_epi16(s13, s2d);
+ const __m128i ss0 = _mm_unpacklo_epi16(s0m, s1m);
+ const __m128i ss1 = _mm_unpackhi_epi16(s0m, s1m);
+ madds[0] = _mm_madd_epi16(ss0, filter);
+ madds[1] = _mm_madd_epi16(ss1, filter);
+ const __m128i s2_lo = _mm_unpacklo_epi16(s[2], _mm_setzero_si128());
+ const __m128i s2_hi = _mm_unpackhi_epi16(s[2], _mm_setzero_si128());
+ const __m128i s2x128_lo = _mm_slli_epi32(s2_lo, 7);
+ const __m128i s2x128_hi = _mm_slli_epi32(s2_hi, 7);
+ madds[0] = _mm_add_epi32(madds[0], s2x128_lo);
+ madds[1] = _mm_add_epi32(madds[1], s2x128_hi);
+ WienerHorizontalClip(madds, *wiener_buffer + x);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const auto filter = _mm_shuffle_epi32(coefficients, 0x55);
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[3], madds[2];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ const __m128i s02 = _mm_add_epi16(s[0], s[2]);
+ const __m128i ss0 = _mm_unpacklo_epi16(s02, s[1]);
+ const __m128i ss1 = _mm_unpackhi_epi16(s02, s[1]);
+ madds[0] = _mm_madd_epi16(ss0, filter);
+ madds[1] = _mm_madd_epi16(ss1, filter);
+ WienerHorizontalClip(madds, *wiener_buffer + x);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap1(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ const __m128i s = LoadUnaligned16(src + x);
+ const __m128i d = _mm_slli_epi16(s, 4);
+ StoreAligned16(*wiener_buffer + x, d);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline __m128i WienerVertical7(const __m128i a[4], const __m128i filter[4]) {
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]);
+ const __m128i madd3 = _mm_madd_epi16(a[3], filter[3]);
+ const __m128i madd01 = _mm_add_epi32(madd0, madd1);
+ const __m128i madd23 = _mm_add_epi32(madd2, madd3);
+ const __m128i sum = _mm_add_epi32(madd01, madd23);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical5(const __m128i a[3], const __m128i filter[3]) {
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]);
+ const __m128i madd01 = _mm_add_epi32(madd0, madd1);
+ const __m128i sum = _mm_add_epi32(madd01, madd2);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical3(const __m128i a[2], const __m128i filter[2]) {
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i sum = _mm_add_epi32(madd0, madd1);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVerticalClip(const __m128i s[2]) {
+ const __m128i d = _mm_packus_epi32(s[0], s[1]);
+ return _mm_min_epu16(d, _mm_set1_epi16(1023));
+}
+
+inline __m128i WienerVerticalFilter7(const __m128i a[7],
+ const __m128i filter[2]) {
+ const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m128i b[4], c[2];
+ b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm_unpacklo_epi16(a[2], a[3]);
+ b[2] = _mm_unpacklo_epi16(a[4], a[5]);
+ b[3] = _mm_unpacklo_epi16(a[6], round);
+ c[0] = WienerVertical7(b, filter);
+ b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm_unpackhi_epi16(a[2], a[3]);
+ b[2] = _mm_unpackhi_epi16(a[4], a[5]);
+ b[3] = _mm_unpackhi_epi16(a[6], round);
+ c[1] = WienerVertical7(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalFilter5(const __m128i a[5],
+ const __m128i filter[3]) {
+ const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m128i b[3], c[2];
+ b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm_unpacklo_epi16(a[2], a[3]);
+ b[2] = _mm_unpacklo_epi16(a[4], round);
+ c[0] = WienerVertical5(b, filter);
+ b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm_unpackhi_epi16(a[2], a[3]);
+ b[2] = _mm_unpackhi_epi16(a[4], round);
+ c[1] = WienerVertical5(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalFilter3(const __m128i a[3],
+ const __m128i filter[2]) {
+ const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m128i b[2], c[2];
+ b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm_unpacklo_epi16(a[2], round);
+ c[0] = WienerVertical3(b, filter);
+ b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm_unpackhi_epi16(a[2], round);
+ c[1] = WienerVertical3(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i a[7]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+ a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
+ a[6] = LoadAligned16(wiener_buffer + 6 * wiener_stride);
+ return WienerVerticalFilter7(a, filter);
+}
+
+inline __m128i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[3], __m128i a[5]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+ return WienerVerticalFilter5(a, filter);
+}
+
+inline __m128i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i a[3]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[4], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i c = LoadLo8(coefficients);
+ __m128i filter[4];
+ filter[0] = _mm_shuffle_epi32(c, 0x0);
+ filter[1] = _mm_shuffle_epi32(c, 0x55);
+ filter[2] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504));
+ filter[3] =
+ _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[8], d[2];
+ d[0] = WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+ a[7] = LoadAligned16(wiener_buffer + x + 7 * width);
+ d[1] = WienerVerticalFilter7(a + 1, filter);
+ StoreAligned16(dst + x, d[0]);
+ StoreAligned16(dst + dst_stride + x, d[1]);
+ x += 8;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[7];
+ const __m128i d =
+ WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+ StoreAligned16(dst + x, d);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[3], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i c = LoadLo8(coefficients);
+ __m128i filter[3];
+ filter[0] = _mm_shuffle_epi32(c, 0x0);
+ filter[1] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504));
+ filter[2] =
+ _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[6], d[2];
+ d[0] = WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+ a[5] = LoadAligned16(wiener_buffer + x + 5 * width);
+ d[1] = WienerVerticalFilter5(a + 1, filter);
+ StoreAligned16(dst + x, d[0]);
+ StoreAligned16(dst + dst_stride + x, d[1]);
+ x += 8;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[5];
+ const __m128i d =
+ WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+ StoreAligned16(dst + x, d);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[2], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ __m128i filter[2];
+ filter[0] = _mm_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+ filter[1] =
+ _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[4], d[2];
+ d[0] = WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+ a[3] = LoadAligned16(wiener_buffer + x + 3 * width);
+ d[1] = WienerVerticalFilter3(a + 1, filter);
+ StoreAligned16(dst + x, d[0]);
+ StoreAligned16(dst + dst_stride + x, d[1]);
+ x += 8;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[3];
+ const __m128i d =
+ WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+ StoreAligned16(dst + x, d);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint16_t* const dst) {
+ const __m128i a = LoadAligned16(wiener_buffer);
+ const __m128i b = _mm_add_epi16(a, _mm_set1_epi16(8));
+ const __m128i c = _mm_srai_epi16(b, 4);
+ const __m128i d = _mm_max_epi16(c, _mm_setzero_si128());
+ const __m128i e = _mm_min_epi16(d, _mm_set1_epi16(1023));
+ StoreAligned16(dst, e);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint16_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+ x += 8;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info,
+ const void* const source, const void* const top_border,
+ const void* const bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height,
+ RestorationBuffer* const restoration_buffer,
+ void* const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 16);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 13 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+  // Horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint16_t*>(source);
+ const auto* const top = static_cast<const uint16_t*>(top_border);
+ const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+ const __m128i coefficients_horizontal =
+ LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
+ wiener_stride, height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
+ wiener_stride, height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
+ wiener_stride, height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
+ wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ }
+
+  // Vertical filtering.
+ // Over-writes up to 15 values.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ auto* dst = static_cast<uint16_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, we can
+    // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical + 1, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical + 2, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(WienerFilter)
+ dsp->loop_restorations[0] = WienerFilter_SSE4_1;
+#else
+ static_cast<void>(WienerFilter_SSE4_1);
+#endif
+}
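The #else branch above appears to exist only to keep the unused-function
warning quiet when the SSE4.1 kernel is compiled but not registered; discarding
the name in a void cast is the usual idiom:

  static_cast<void>(some_unused_symbol);  // hypothetical; marks it intentionally unused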
+
+} // namespace
+
+void LoopRestorationInit10bpp_SSE4_1() { Init10bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !(LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit10bpp_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/x86/loop_restoration_avx2.cc b/src/dsp/x86/loop_restoration_avx2.cc
new file mode 100644
index 0000000..7ae7c90
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_avx2.cc
@@ -0,0 +1,2902 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+inline void WienerHorizontalClip(const __m256i s[2], const __m256i s_3x128,
+ int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit =
+ (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
+ const __m256i offsets = _mm256_set1_epi16(-offset);
+ const __m256i limits = _mm256_set1_epi16(limit - offset);
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsHorizontal - 1));
+ // The sum range here is [-128 * 255, 90 * 255].
+ const __m256i madd = _mm256_add_epi16(s[0], s[1]);
+ const __m256i sum = _mm256_add_epi16(madd, round);
+ const __m256i rounded_sum0 =
+ _mm256_srai_epi16(sum, kInterRoundBitsHorizontal);
+ // Add back scaled down offset correction.
+ const __m256i rounded_sum1 = _mm256_add_epi16(rounded_sum0, s_3x128);
+ const __m256i d0 = _mm256_max_epi16(rounded_sum1, offsets);
+ const __m256i d1 = _mm256_min_epi16(d0, limits);
+ StoreAligned32(wiener_buffer, d1);
+}
+
+// Using _mm256_alignr_epi8() is about 8% faster than loading each shifted
+// window and unpacking it, because the compiler generates redundant code for
+// the load-and-unpack approach.
+inline void WienerHorizontalTap7Kernel(const __m256i s[2],
+ const __m256i filter[4],
+ int16_t* const wiener_buffer) {
+ const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1);
+ const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5);
+ const auto s45 = _mm256_alignr_epi8(s[1], s[0], 9);
+ const auto s67 = _mm256_alignr_epi8(s[1], s[0], 13);
+ __m256i madds[4];
+ madds[0] = _mm256_maddubs_epi16(s01, filter[0]);
+ madds[1] = _mm256_maddubs_epi16(s23, filter[1]);
+ madds[2] = _mm256_maddubs_epi16(s45, filter[2]);
+ madds[3] = _mm256_maddubs_epi16(s67, filter[3]);
+ madds[0] = _mm256_add_epi16(madds[0], madds[2]);
+ madds[1] = _mm256_add_epi16(madds[1], madds[3]);
+ const __m256i s_3x128 = _mm256_slli_epi16(_mm256_srli_epi16(s23, 8),
+ 7 - kInterRoundBitsHorizontal);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap5Kernel(const __m256i s[2],
+ const __m256i filter[3],
+ int16_t* const wiener_buffer) {
+ const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1);
+ const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5);
+ const auto s45 = _mm256_alignr_epi8(s[1], s[0], 9);
+ __m256i madds[3];
+ madds[0] = _mm256_maddubs_epi16(s01, filter[0]);
+ madds[1] = _mm256_maddubs_epi16(s23, filter[1]);
+ madds[2] = _mm256_maddubs_epi16(s45, filter[2]);
+ madds[0] = _mm256_add_epi16(madds[0], madds[2]);
+ const __m256i s_3x128 = _mm256_srli_epi16(_mm256_slli_epi16(s23, 8),
+ kInterRoundBitsHorizontal + 1);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap3Kernel(const __m256i s[2],
+ const __m256i filter[2],
+ int16_t* const wiener_buffer) {
+ const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1);
+ const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5);
+ __m256i madds[2];
+ madds[0] = _mm256_maddubs_epi16(s01, filter[0]);
+ madds[1] = _mm256_maddubs_epi16(s23, filter[1]);
+ const __m256i s_3x128 = _mm256_slli_epi16(_mm256_srli_epi16(s01, 8),
+ 7 - kInterRoundBitsHorizontal);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i coefficients,
+ int16_t** const wiener_buffer) {
+ __m256i filter[4];
+ filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0100));
+ filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302));
+ filter[2] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0102));
+ filter[3] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x8000));
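+ // Each filter[i] broadcasts one pair of adjacent 8-bit taps so that
+ // _mm256_maddubs_epi16() applies two taps to two neighboring (duplicated)
+ // source bytes at once. The pairs are roughly (c0, c1), (c2, c3), (c2, c1)
+ // and (c0, 0), which together form the symmetric 7-tap kernel
+ // c0 c1 c2 c3 c2 c1 c0; the 0x80 shuffle index zeroes the unused slot.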
+ for (int y = height; y != 0; --y) {
+ __m256i s = LoadUnaligned32(src);
+ __m256i ss[4];
+ ss[0] = _mm256_unpacklo_epi8(s, s);
+ ptrdiff_t x = 0;
+ do {
+ ss[1] = _mm256_unpackhi_epi8(s, s);
+ s = LoadUnaligned32(src + x + 32);
+ ss[3] = _mm256_unpacklo_epi8(s, s);
+ ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21);
+ WienerHorizontalTap7Kernel(ss + 0, filter, *wiener_buffer + x + 0);
+ WienerHorizontalTap7Kernel(ss + 1, filter, *wiener_buffer + x + 16);
+ ss[0] = ss[3];
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i coefficients,
+ int16_t** const wiener_buffer) {
+ __m256i filter[3];
+ filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0201));
+ filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0203));
+ filter[2] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x8001));
+ for (int y = height; y != 0; --y) {
+ __m256i s = LoadUnaligned32(src);
+ __m256i ss[4];
+ ss[0] = _mm256_unpacklo_epi8(s, s);
+ ptrdiff_t x = 0;
+ do {
+ ss[1] = _mm256_unpackhi_epi8(s, s);
+ s = LoadUnaligned32(src + x + 32);
+ ss[3] = _mm256_unpacklo_epi8(s, s);
+ ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21);
+ WienerHorizontalTap5Kernel(ss + 0, filter, *wiener_buffer + x + 0);
+ WienerHorizontalTap5Kernel(ss + 1, filter, *wiener_buffer + x + 16);
+ ss[0] = ss[3];
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i coefficients,
+ int16_t** const wiener_buffer) {
+ __m256i filter[2];
+ filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302));
+ filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x8002));
+ for (int y = height; y != 0; --y) {
+ __m256i s = LoadUnaligned32(src);
+ __m256i ss[4];
+ ss[0] = _mm256_unpacklo_epi8(s, s);
+ ptrdiff_t x = 0;
+ do {
+ ss[1] = _mm256_unpackhi_epi8(s, s);
+ s = LoadUnaligned32(src + x + 32);
+ ss[3] = _mm256_unpacklo_epi8(s, s);
+ ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21);
+ WienerHorizontalTap3Kernel(ss + 0, filter, *wiener_buffer + x + 0);
+ WienerHorizontalTap3Kernel(ss + 1, filter, *wiener_buffer + x + 16);
+ ss[0] = ss[3];
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
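+// When only the center tap remains it is effectively 128 (the Wiener taps sum
+// to 128), so for 8bpp the horizontal pass below reduces to
+// (s * 128) >> kInterRoundBitsHorizontal, i.e. s << 4, and needs no clipping.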
+inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ const __m256i s = LoadUnaligned32(src + x);
+ const __m256i s0 = _mm256_unpacklo_epi8(s, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi8(s, _mm256_setzero_si256());
+ __m256i d[2];
+ d[0] = _mm256_slli_epi16(s0, 4);
+ d[1] = _mm256_slli_epi16(s1, 4);
+ StoreAligned64(*wiener_buffer + x, d);
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline __m256i WienerVertical7(const __m256i a[2], const __m256i filter[2]) {
+ const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i sum0 = _mm256_add_epi32(round, madd0);
+ const __m256i sum1 = _mm256_add_epi32(sum0, madd1);
+ return _mm256_srai_epi32(sum1, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical5(const __m256i a[2], const __m256i filter[2]) {
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i sum = _mm256_add_epi32(madd0, madd1);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical3(const __m256i a, const __m256i filter) {
+ const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+ const __m256i madd = _mm256_madd_epi16(a, filter);
+ const __m256i sum = _mm256_add_epi32(round, madd);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVerticalFilter7(const __m256i a[7],
+ const __m256i filter[2]) {
+ __m256i b[2];
+ const __m256i a06 = _mm256_add_epi16(a[0], a[6]);
+ const __m256i a15 = _mm256_add_epi16(a[1], a[5]);
+ const __m256i a24 = _mm256_add_epi16(a[2], a[4]);
+ b[0] = _mm256_unpacklo_epi16(a06, a15);
+ b[1] = _mm256_unpacklo_epi16(a24, a[3]);
+ const __m256i sum0 = WienerVertical7(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a06, a15);
+ b[1] = _mm256_unpackhi_epi16(a24, a[3]);
+ const __m256i sum1 = WienerVertical7(b, filter);
+ return _mm256_packs_epi32(sum0, sum1);
+}
+
+inline __m256i WienerVerticalFilter5(const __m256i a[5],
+ const __m256i filter[2]) {
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m256i b[2];
+ const __m256i a04 = _mm256_add_epi16(a[0], a[4]);
+ const __m256i a13 = _mm256_add_epi16(a[1], a[3]);
+ b[0] = _mm256_unpacklo_epi16(a04, a13);
+ b[1] = _mm256_unpacklo_epi16(a[2], round);
+ const __m256i sum0 = WienerVertical5(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a04, a13);
+ b[1] = _mm256_unpackhi_epi16(a[2], round);
+ const __m256i sum1 = WienerVertical5(b, filter);
+ return _mm256_packs_epi32(sum0, sum1);
+}
+
+inline __m256i WienerVerticalFilter3(const __m256i a[3], const __m256i filter) {
+ __m256i b;
+ const __m256i a02 = _mm256_add_epi16(a[0], a[2]);
+ b = _mm256_unpacklo_epi16(a02, a[1]);
+ const __m256i sum0 = WienerVertical3(b, filter);
+ b = _mm256_unpackhi_epi16(a02, a[1]);
+ const __m256i sum1 = WienerVertical3(b, filter);
+ return _mm256_packs_epi32(sum0, sum1);
+}
+
+inline __m256i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i a[7]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+ a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+ a[6] = LoadAligned32(wiener_buffer + 6 * wiener_stride);
+ return WienerVerticalFilter7(a, filter);
+}
+
+inline __m256i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i a[5]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+ return WienerVerticalFilter5(a, filter);
+}
+
+inline __m256i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter, __m256i a[3]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i d[2]) {
+ __m256i a[8];
+ d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[7] = LoadAligned32(wiener_buffer + 7 * wiener_stride);
+ d[1] = WienerVerticalFilter7(a + 1, filter);
+}
+
+inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i d[2]) {
+ __m256i a[6];
+ d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+ d[1] = WienerVerticalFilter5(a + 1, filter);
+}
+
+inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter, __m256i d[2]) {
+ __m256i a[4];
+ d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ d[1] = WienerVerticalFilter3(a + 1, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[4], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+ __m256i filter[2];
+ filter[0] = _mm256_shuffle_epi32(c, 0x0);
+ filter[1] = _mm256_shuffle_epi32(c, 0x55);
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2][2];
+ WienerVerticalTap7Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap7Kernel2(wiener_buffer + x + 16, width, filter, d[1]);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0]));
+ StoreUnaligned32(dst + dst_stride + x,
+ _mm256_packus_epi16(d[0][1], d[1][1]));
+ x += 32;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[7];
+ const __m256i d0 =
+ WienerVerticalTap7Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m256i d1 =
+ WienerVerticalTap7Kernel(wiener_buffer + x + 16, width, filter, a);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ x += 32;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[3], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i c = _mm256_broadcastd_epi32(Load4(coefficients));
+ __m256i filter[2];
+ filter[0] = _mm256_shuffle_epi32(c, 0);
+ filter[1] =
+ _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[2]));
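+ // filter[1] pairs the center tap with a multiplier of 1, so the madd in
+ // WienerVertical5() both applies the center tap and adds the rounding
+ // constant that WienerVerticalFilter5() interleaves into its second operand.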
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2][2];
+ WienerVerticalTap5Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap5Kernel2(wiener_buffer + x + 16, width, filter, d[1]);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0]));
+ StoreUnaligned32(dst + dst_stride + x,
+ _mm256_packus_epi16(d[0][1], d[1][1]));
+ x += 32;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[5];
+ const __m256i d0 =
+ WienerVerticalTap5Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m256i d1 =
+ WienerVerticalTap5Kernel(wiener_buffer + x + 16, width, filter, a);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ x += 32;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[2], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i filter =
+ _mm256_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2][2];
+ WienerVerticalTap3Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap3Kernel2(wiener_buffer + x + 16, width, filter, d[1]);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0]));
+ StoreUnaligned32(dst + dst_stride + x,
+ _mm256_packus_epi16(d[0][1], d[1][1]));
+ x += 32;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[3];
+ const __m256i d0 =
+ WienerVerticalTap3Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m256i d1 =
+ WienerVerticalTap3Kernel(wiener_buffer + x + 16, width, filter, a);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ x += 32;
+ } while (x < width);
+ }
+}
+
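+// With only the center tap (128) left, the vertical pass is
+// (a * 128 + (1 << (kInterRoundBitsVertical - 1))) >> kInterRoundBitsVertical,
+// which for 8bpp simplifies to (a + 8) >> 4 as computed below.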
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint8_t* const dst) {
+ const __m256i a0 = LoadAligned32(wiener_buffer + 0);
+ const __m256i a1 = LoadAligned32(wiener_buffer + 16);
+ const __m256i b0 = _mm256_add_epi16(a0, _mm256_set1_epi16(8));
+ const __m256i b1 = _mm256_add_epi16(a1, _mm256_set1_epi16(8));
+ const __m256i c0 = _mm256_srai_epi16(b0, 4);
+ const __m256i c1 = _mm256_srai_epi16(b1, 4);
+ const __m256i d = _mm256_packus_epi16(c0, c1);
+ StoreUnaligned32(dst, d);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+ x += 32;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ x += 32;
+ } while (x < width);
+ }
+}
+
+void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info,
+ const void* const source, const void* const top_border,
+ const void* const bottom_border, const ptrdiff_t stride,
+ const int width, const int height,
+ RestorationBuffer* const restoration_buffer,
+ void* const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 32);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 13 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+ // horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* const top = static_cast<const uint8_t*>(top_border);
+ const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
+ const __m128i c =
+ LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+ // In order to keep the horizontal pass intermediate values within 16 bits we
+ // offset |filter[3]| by 128. The 128 offset will be added back in the loop.
+ __m128i c_horizontal =
+ _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0));
+ c_horizontal = _mm_packs_epi16(c_horizontal, c_horizontal);
+ const __m256i coefficients_horizontal = _mm256_broadcastd_epi32(c_horizontal);
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
+ wiener_stride, height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
+ wiener_stride, height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
+ wiener_stride, height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
+ wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ }
+
+ // vertical filtering.
+ // Over-writes up to 15 values.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ auto* dst = static_cast<uint8_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+ // Because the top row of |source| is a duplicate of the second row, and the
+ // bottom row of |source| is a duplicate of the row above it, we can duplicate
+ // the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical + 1, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical + 2, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+constexpr int kSumOffset = 24;
+
+// Each SIMD load over-reads (SIMD register width in bytes) - (width % 16) -
+// 2 * padding pixels, where the padding is 3 pixels for Pass 1 and 2 pixels
+// for Pass 2. The SIMD register width is 16 bytes for SSE4.1 and 32 bytes for
+// AVX2.
+constexpr int kOverreadInBytesPass1_128 = 10;
+constexpr int kOverreadInBytesPass2_128 = 12;
+constexpr int kOverreadInBytesPass1_256 = kOverreadInBytesPass1_128 + 16;
+constexpr int kOverreadInBytesPass2_256 = kOverreadInBytesPass2_128 + 16;
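+// For example, kOverreadInBytesPass1_128 == 16 - 2 * 3 and
+// kOverreadInBytesPass2_128 == 16 - 2 * 2; the _256 variants add 16 for the
+// extra bytes of a 32-byte load.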
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned32x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m256i dst[2]) {
+ dst[0] = LoadAligned32(src[0] + x);
+ dst[1] = LoadAligned32(src[1] + x);
+}
+
+inline void LoadAligned32x2U16Msan(const uint16_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[2]) {
+ dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+ dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+ dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned32x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m256i dst[3]) {
+ dst[0] = LoadAligned32(src[0] + x);
+ dst[1] = LoadAligned32(src[1] + x);
+ dst[2] = LoadAligned32(src[2] + x);
+}
+
+inline void LoadAligned32x3U16Msan(const uint16_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[3]) {
+ dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+ dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+ dst[2] = LoadAligned32Msan(src[2] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+ dst[0] = LoadAligned16(src + 0);
+ dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m256i dst[2][2]) {
+ LoadAligned64(src[0] + x, dst[0]);
+ LoadAligned64(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32Msan(const uint32_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[2][2]) {
+ LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+ LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+ LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m256i dst[3][2]) {
+ LoadAligned64(src[0] + x, dst[0]);
+ LoadAligned64(src[1] + x, dst[1]);
+ LoadAligned64(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32Msan(const uint32_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[3][2]) {
+ LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+ LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+ LoadAligned64Msan(src[2] + x, sizeof(**src) * (x + 16 - border), dst[2]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 4, src[1]);
+}
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers may generate very inefficient code for them, and
+// the whole decoder can end up about 15% slower.
+
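+// The Vaddl*/Vaddw*/Vmull*/Vrshr* helpers below mirror the corresponding NEON
+// idioms (widening add, widening multiply, rounding shift), implemented here
+// with unpacks against zero and plain SSE/AVX add, madd and shift
+// instructions.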
+inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(s0, s1);
+}
+
+inline __m256i VaddlLo8(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpacklo_epi8(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(s0, s1);
+}
+
+inline __m256i VaddlHi8(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpackhi_epi8(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(s0, s1);
+}
+
+inline __m256i VaddlLo16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+ return _mm256_add_epi32(s0, s1);
+}
+
+inline __m128i VaddlHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(s0, s1);
+}
+
+inline __m256i VaddlHi16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+ return _mm256_add_epi32(s0, s1);
+}
+
+inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(src0, s1);
+}
+
+inline __m256i VaddwLo8(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(src0, s1);
+}
+
+inline __m256i VaddwHi8(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(src0, s1);
+}
+
+inline __m256i VaddwLo16(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+ return _mm256_add_epi32(src0, s1);
+}
+
+inline __m128i VaddwHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(src0, s1);
+}
+
+inline __m256i VaddwHi16(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+ return _mm256_add_epi32(src0, s1);
+}
+
+// Using VgetLane16() can save a sign extension instruction.
+template <int n>
+inline int VgetLane16(__m256i src) {
+ return _mm256_extract_epi16(src, n);
+}
+
+template <int n>
+inline int VgetLane8(__m256i src) {
+ return _mm256_extract_epi8(src, n);
+}
+
+inline __m256i VmullNLo8(const __m256i src0, const int src1) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m256i VmullNHi8(const __m256i src0, const int src1) {
+ const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullLo16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullHi16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m256i VrshrS32(const __m256i src0, const int src1) {
+ const __m256i sum =
+ _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+ return _mm256_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srli_epi32(sum, src1);
+}
+
+inline __m256i VrshrU32(const __m256i src0, const int src1) {
+ const __m256i sum =
+ _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+ return _mm256_srli_epi32(sum, src1);
+}
+
+inline __m128i SquareLo8(const __m128i src) {
+ const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128());
+ return _mm_mullo_epi16(s, s);
+}
+
+inline __m256i SquareLo8(const __m256i src) {
+ const __m256i s = _mm256_unpacklo_epi8(src, _mm256_setzero_si256());
+ return _mm256_mullo_epi16(s, s);
+}
+
+inline __m128i SquareHi8(const __m128i src) {
+ const __m128i s = _mm_unpackhi_epi8(src, _mm_setzero_si128());
+ return _mm_mullo_epi16(s, s);
+}
+
+inline __m256i SquareHi8(const __m256i src) {
+ const __m256i s = _mm256_unpackhi_epi8(src, _mm256_setzero_si256());
+ return _mm256_mullo_epi16(s, s);
+}
+
+inline void Prepare3Lo8(const __m128i src, __m128i dst[3]) {
+ dst[0] = src;
+ dst[1] = _mm_srli_si128(src, 1);
+ dst[2] = _mm_srli_si128(src, 2);
+}
+
+inline void Prepare3_8(const __m256i src[2], __m256i dst[3]) {
+ dst[0] = _mm256_alignr_epi8(src[1], src[0], 0);
+ dst[1] = _mm256_alignr_epi8(src[1], src[0], 1);
+ dst[2] = _mm256_alignr_epi8(src[1], src[0], 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare3_16(const __m256i src[2], __m256i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm256_alignr_epi8(src[1], src[0], 2);
+ dst[2] = _mm256_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare5Lo8(const __m128i src, __m128i dst[5]) {
+ dst[0] = src;
+ dst[1] = _mm_srli_si128(src, 1);
+ dst[2] = _mm_srli_si128(src, 2);
+ dst[3] = _mm_srli_si128(src, 3);
+ dst[4] = _mm_srli_si128(src, 4);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_16(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+ dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_16(const __m256i src[2], __m256i dst[5]) {
+ Prepare3_16(src, dst);
+ dst[3] = _mm256_alignr_epi8(src[1], src[0], 6);
+ dst[4] = _mm256_alignr_epi8(src[1], src[0], 8);
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi16(src0, src1);
+ return _mm_add_epi16(sum, src2);
+}
+
+inline __m256i Sum3_16(const __m256i src0, const __m256i src1,
+ const __m256i src2) {
+ const __m256i sum = _mm256_add_epi16(src0, src1);
+ return _mm256_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m256i Sum3_16(const __m256i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi32(src0, src1);
+ return _mm_add_epi32(sum, src2);
+}
+
+inline __m256i Sum3_32(const __m256i src0, const __m256i src1,
+ const __m256i src2) {
+ const __m256i sum = _mm256_add_epi32(src0, src1);
+ return _mm256_add_epi32(sum, src2);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline void Sum3_32(const __m256i src[3][2], __m256i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m128i Sum3WLo16(const __m128i src[3]) {
+ const __m128i sum = VaddlLo8(src[0], src[1]);
+ return VaddwLo8(sum, src[2]);
+}
+
+inline __m256i Sum3WLo16(const __m256i src[3]) {
+ const __m256i sum = VaddlLo8(src[0], src[1]);
+ return VaddwLo8(sum, src[2]);
+}
+
+inline __m256i Sum3WHi16(const __m256i src[3]) {
+ const __m256i sum = VaddlHi8(src[0], src[1]);
+ return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum3WLo32(const __m128i src[3]) {
+ const __m128i sum = VaddlLo16(src[0], src[1]);
+ return VaddwLo16(sum, src[2]);
+}
+
+inline __m256i Sum3WLo32(const __m256i src[3]) {
+ const __m256i sum = VaddlLo16(src[0], src[1]);
+ return VaddwLo16(sum, src[2]);
+}
+
+inline __m128i Sum3WHi32(const __m128i src[3]) {
+ const __m128i sum = VaddlHi16(src[0], src[1]);
+ return VaddwHi16(sum, src[2]);
+}
+
+inline __m256i Sum3WHi32(const __m256i src[3]) {
+ const __m256i sum = VaddlHi16(src[0], src[1]);
+ return VaddwHi16(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+ const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+ const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m256i Sum5_16(const __m256i src[5]) {
+ const __m256i sum01 = _mm256_add_epi16(src[0], src[1]);
+ const __m256i sum23 = _mm256_add_epi16(src[2], src[3]);
+ const __m256i sum = _mm256_add_epi16(sum01, sum23);
+ return _mm256_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+ const __m128i* const src2, const __m128i* const src3,
+ const __m128i* const src4) {
+ const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+ const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+ const __m128i sum = _mm_add_epi32(sum01, sum23);
+ return _mm_add_epi32(sum, *src4);
+}
+
+inline __m256i Sum5_32(const __m256i* const src0, const __m256i* const src1,
+ const __m256i* const src2, const __m256i* const src3,
+ const __m256i* const src4) {
+ const __m256i sum01 = _mm256_add_epi32(*src0, *src1);
+ const __m256i sum23 = _mm256_add_epi32(*src2, *src3);
+ const __m256i sum = _mm256_add_epi32(sum01, sum23);
+ return _mm256_add_epi32(sum, *src4);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline void Sum5_32(const __m256i src[5][2], __m256i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum5WLo16(const __m128i src[5]) {
+ const __m128i sum01 = VaddlLo8(src[0], src[1]);
+ const __m128i sum23 = VaddlLo8(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return VaddwLo8(sum, src[4]);
+}
+
+inline __m256i Sum5WLo16(const __m256i src[5]) {
+ const __m256i sum01 = VaddlLo8(src[0], src[1]);
+ const __m256i sum23 = VaddlLo8(src[2], src[3]);
+ const __m256i sum = _mm256_add_epi16(sum01, sum23);
+ return VaddwLo8(sum, src[4]);
+}
+
+inline __m256i Sum5WHi16(const __m256i src[5]) {
+ const __m256i sum01 = VaddlHi8(src[0], src[1]);
+ const __m256i sum23 = VaddlHi8(src[2], src[3]);
+ const __m256i sum = _mm256_add_epi16(sum01, sum23);
+ return VaddwHi8(sum, src[4]);
+}
+
+inline __m128i Sum3Horizontal(const __m128i src) {
+ __m128i s[3];
+ Prepare3Lo8(src, s);
+ return Sum3WLo16(s);
+}
+
+inline void Sum3Horizontal(const uint8_t* const src,
+ const ptrdiff_t over_read_in_bytes, __m256i dst[2]) {
+ __m256i s[3];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2);
+ dst[0] = Sum3WLo16(s);
+ dst[1] = Sum3WHi16(s);
+}
+
+inline void Sum3WHorizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum3WLo32(s);
+ dst[1] = Sum3WHi32(s);
+}
+
+inline void Sum3WHorizontal(const __m256i src[2], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum3WLo32(s);
+ dst[1] = Sum3WHi32(s);
+}
+
+inline __m128i Sum5Horizontal(const __m128i src) {
+ __m128i s[5];
+ Prepare5Lo8(src, s);
+ return Sum5WLo16(s);
+}
+
+inline void Sum5Horizontal(const uint8_t* const src,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i* const dst0, __m256i* const dst1) {
+ __m256i s[5];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2);
+ s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 3);
+ s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 4);
+ *dst0 = Sum5WLo16(s);
+ *dst1 = Sum5WHi16(s);
+}
+
+inline void Sum5WHorizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ const __m128i sum01_lo = VaddlLo16(s[0], s[1]);
+ const __m128i sum23_lo = VaddlLo16(s[2], s[3]);
+ const __m128i sum0123_lo = _mm_add_epi32(sum01_lo, sum23_lo);
+ dst[0] = VaddwLo16(sum0123_lo, s[4]);
+ const __m128i sum01_hi = VaddlHi16(s[0], s[1]);
+ const __m128i sum23_hi = VaddlHi16(s[2], s[3]);
+ const __m128i sum0123_hi = _mm_add_epi32(sum01_hi, sum23_hi);
+ dst[1] = VaddwHi16(sum0123_hi, s[4]);
+}
+
+inline void Sum5WHorizontal(const __m256i src[2], __m256i dst[2]) {
+ __m256i s[5];
+ Prepare5_16(src, s);
+ const __m256i sum01_lo = VaddlLo16(s[0], s[1]);
+ const __m256i sum23_lo = VaddlLo16(s[2], s[3]);
+ const __m256i sum0123_lo = _mm256_add_epi32(sum01_lo, sum23_lo);
+ dst[0] = VaddwLo16(sum0123_lo, s[4]);
+ const __m256i sum01_hi = VaddlHi16(s[0], s[1]);
+ const __m256i sum23_hi = VaddlHi16(s[2], s[3]);
+ const __m256i sum0123_hi = _mm256_add_epi32(sum01_hi, sum23_hi);
+ dst[1] = VaddwHi16(sum0123_hi, s[4]);
+}
+
+void SumHorizontalLo(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = VaddlLo16(src[0], src[4]);
+ *row_sq3 = Sum3WLo32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalLo(const __m256i src[5], __m256i* const row_sq3,
+ __m256i* const row_sq5) {
+ const __m256i sum04 = VaddlLo16(src[0], src[4]);
+ *row_sq3 = Sum3WLo32(src + 1);
+ *row_sq5 = _mm256_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalHi(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = VaddlHi16(src[0], src[4]);
+ *row_sq3 = Sum3WHi32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalHi(const __m256i src[5], __m256i* const row_sq3,
+ __m256i* const row_sq5) {
+ const __m256i sum04 = VaddlHi16(src[0], src[4]);
+ *row_sq3 = Sum3WHi32(src + 1);
+ *row_sq5 = _mm256_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalLo(const __m128i src, __m128i* const row3,
+ __m128i* const row5) {
+ __m128i s[5];
+ Prepare5Lo8(src, s);
+ const __m128i sum04 = VaddlLo8(s[0], s[4]);
+ *row3 = Sum3WLo16(s + 1);
+ *row5 = _mm_add_epi16(sum04, *row3);
+}
+
+inline void SumHorizontal(const uint8_t* const src,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i* const row3_0, __m256i* const row3_1,
+ __m256i* const row5_0, __m256i* const row5_1) {
+ __m256i s[5];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2);
+ s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 3);
+ s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 4);
+ const __m256i sum04_lo = VaddlLo8(s[0], s[4]);
+ const __m256i sum04_hi = VaddlHi8(s[0], s[4]);
+ *row3_0 = Sum3WLo16(s + 1);
+ *row3_1 = Sum3WHi16(s + 1);
+ *row5_0 = _mm256_add_epi16(sum04_lo, *row3_0);
+ *row5_1 = _mm256_add_epi16(sum04_hi, *row3_1);
+}
+
+inline void SumHorizontal(const __m128i src[2], __m128i* const row_sq3_0,
+ __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+ __m128i* const row_sq5_1) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+ SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
+
+inline void SumHorizontal(const __m256i src[2], __m256i* const row_sq3_0,
+ __m256i* const row_sq3_1, __m256i* const row_sq5_0,
+ __m256i* const row_sq5_1) {
+ __m256i s[5];
+ Prepare5_16(src, s);
+ SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+ SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
+
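+// Sum343*: 3 * (s0 + s1 + s2) + s1 == 3 * s0 + 4 * s1 + 3 * s2, the 3-4-3
+// weighting implied by the name.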
+inline __m256i Sum343Lo(const __m256i ma3[3]) {
+ const __m256i sum = Sum3WLo16(ma3);
+ const __m256i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343Hi(const __m256i ma3[3]) {
+ const __m256i sum = Sum3WHi16(ma3);
+ const __m256i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343WLo(const __m256i src[3]) {
+ const __m256i sum = Sum3WLo32(src);
+ const __m256i sum3 = Sum3_32(sum, sum, sum);
+ return VaddwLo16(sum3, src[1]);
+}
+
+inline __m256i Sum343WHi(const __m256i src[3]) {
+ const __m256i sum = Sum3WHi32(src);
+ const __m256i sum3 = Sum3_32(sum, sum, sum);
+ return VaddwHi16(sum3, src[1]);
+}
+
+inline void Sum343W(const __m256i src[2], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum343WLo(s);
+ dst[1] = Sum343WHi(s);
+}
+
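+// Sum565*: 5 * (s0 + s1 + s2) + s1 == 5 * s0 + 6 * s1 + 5 * s2, the 5-6-5
+// weighting implied by the name.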
+inline __m256i Sum565Lo(const __m256i src[3]) {
+ const __m256i sum = Sum3WLo16(src);
+ const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+ const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+ return VaddwLo8(sum5, src[1]);
+}
+
+inline __m256i Sum565Hi(const __m256i src[3]) {
+ const __m256i sum = Sum3WHi16(src);
+ const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+ const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+ return VaddwHi8(sum5, src[1]);
+}
+
+inline __m256i Sum565WLo(const __m256i src[3]) {
+ const __m256i sum = Sum3WLo32(src);
+ const __m256i sum4 = _mm256_slli_epi32(sum, 2);
+ const __m256i sum5 = _mm256_add_epi32(sum4, sum);
+ return VaddwLo16(sum5, src[1]);
+}
+
+inline __m256i Sum565WHi(const __m256i src[3]) {
+ const __m256i sum = Sum3WHi32(src);
+ const __m256i sum4 = _mm256_slli_epi32(sum, 2);
+ const __m256i sum5 = _mm256_add_epi32(sum4, sum);
+ return VaddwHi16(sum5, src[1]);
+}
+
+inline void Sum565W(const __m256i src[2], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum565WLo(s);
+ dst[1] = Sum565WHi(s);
+}
+
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ int y = 2;
+ do {
+ const __m128i s0 =
+ LoadUnaligned16Msan(src, kOverreadInBytesPass1_128 - width);
+ __m128i sq_128[2];
+ __m256i sq[3];
+ __m128i s3, s5, sq3[2], sq5[2];
+ sq_128[0] = SquareLo8(s0);
+ sq_128[1] = SquareHi8(s0);
+ SumHorizontalLo(s0, &s3, &s5);
+ StoreAligned16(sum3, s3);
+ StoreAligned16(sum5, s5);
+ SumHorizontal(sq_128, &sq3[0], &sq3[1], &sq5[0], &sq5[1]);
+ StoreAligned32U32(square_sum3, sq3);
+ StoreAligned32U32(square_sum5, sq5);
+ src += 8;
+ sum3 += 8;
+ sum5 += 8;
+ square_sum3 += 8;
+ square_sum5 += 8;
+ sq[0] = SetrM128i(sq_128[1], sq_128[1]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m256i row3[2], row5[2], row_sq3[2], row_sq5[2];
+ const __m256i s = LoadUnaligned32Msan(
+ src + 8, sum_width - x + 16 + kOverreadInBytesPass1_256 - width);
+ sq[1] = SquareLo8(s);
+ sq[2] = SquareHi8(s);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ SumHorizontal(src, sum_width - x + 8 + kOverreadInBytesPass1_256 - width,
+ &row3[0], &row3[1], &row5[0], &row5[1]);
+ StoreAligned64(sum3, row3);
+ StoreAligned64(sum5, row5);
+ SumHorizontal(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+ StoreAligned64(square_sum3 + 0, row_sq3);
+ StoreAligned64(square_sum5 + 0, row_sq5);
+ SumHorizontal(sq + 1, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+ StoreAligned64(square_sum3 + 16, row_sq3);
+ StoreAligned64(square_sum5 + 16, row_sq5);
+ sq[0] = sq[2];
+ src += 32;
+ sum3 += 32;
+ sum5 += 32;
+ square_sum3 += 32;
+ square_sum5 += 32;
+ x -= 32;
+ } while (x != 0);
+ src += src_stride - sum_width - 8;
+ sum3 += sum_stride - sum_width - 8;
+ sum5 += sum_stride - sum_width - 8;
+ square_sum3 += sum_stride - sum_width - 8;
+ square_sum5 += sum_stride - sum_width - 8;
+ } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sums,
+ uint32_t* square_sums) {
+ static_assert(size == 3 || size == 5, "");
+ int kOverreadInBytes_128, kOverreadInBytes_256;
+ if (size == 3) {
+ kOverreadInBytes_128 = kOverreadInBytesPass2_128;
+ kOverreadInBytes_256 = kOverreadInBytesPass2_256;
+ } else {
+ kOverreadInBytes_128 = kOverreadInBytesPass1_128;
+ kOverreadInBytes_256 = kOverreadInBytesPass1_256;
+ }
+ int y = 2;
+ do {
+ const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytes_128 - width);
+ __m128i ss, sq_128[2], sqs[2];
+ __m256i sq[3];
+ sq_128[0] = SquareLo8(s);
+ sq_128[1] = SquareHi8(s);
+ if (size == 3) {
+ ss = Sum3Horizontal(s);
+ Sum3WHorizontal(sq_128, sqs);
+ } else {
+ ss = Sum5Horizontal(s);
+ Sum5WHorizontal(sq_128, sqs);
+ }
+ StoreAligned16(sums, ss);
+ StoreAligned32U32(square_sums, sqs);
+ src += 8;
+ sums += 8;
+ square_sums += 8;
+ sq[0] = SetrM128i(sq_128[1], sq_128[1]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m256i row[2], row_sq[4];
+ const __m256i s = LoadUnaligned32Msan(
+ src + 8, sum_width - x + 16 + kOverreadInBytes_256 - width);
+ sq[1] = SquareLo8(s);
+ sq[2] = SquareHi8(s);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ if (size == 3) {
+ Sum3Horizontal(src, sum_width - x + 8 + kOverreadInBytes_256 - width,
+ row);
+ Sum3WHorizontal(sq + 0, row_sq + 0);
+ Sum3WHorizontal(sq + 1, row_sq + 2);
+ } else {
+ Sum5Horizontal(src, sum_width - x + 8 + kOverreadInBytes_256 - width,
+ &row[0], &row[1]);
+ Sum5WHorizontal(sq + 0, row_sq + 0);
+ Sum5WHorizontal(sq + 1, row_sq + 2);
+ }
+ StoreAligned64(sums, row);
+ StoreAligned64(square_sums + 0, row_sq + 0);
+ StoreAligned64(square_sums + 16, row_sq + 2);
+ sq[0] = sq[2];
+ src += 32;
+ sums += 32;
+ square_sums += 32;
+ x -= 32;
+ } while (x != 0);
+ src += src_stride - sum_width - 8;
+ sums += sum_stride - sum_width - 8;
+ square_sums += sum_stride - sum_width - 8;
+ } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m128i dxd = _mm_madd_epi16(sum, sum);
+ // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
+ // Some compilers could do this for us but we make this explicit.
+ // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
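+ // n == 9: a * 9 == a + (a << 3); n == 25: a * 25 == a + (a << 3) + (a << 4).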
+ __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+ const __m128i sub = _mm_sub_epi32(axn, dxd);
+ const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+ const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
+ const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
+ const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+ const __m128i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+ return _mm_packus_epi32(z0, z1);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m256i dxd = _mm256_madd_epi16(sum, sum);
+ // _mm256_mullo_epi32() has high latency. Using shifts and additions instead.
+ // Some compilers could do this for us but we make this explicit.
+ // return _mm256_mullo_epi32(sum_sq, _mm256_set1_epi32(n));
+ __m256i axn = _mm256_add_epi32(sum_sq, _mm256_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm256_add_epi32(axn, _mm256_slli_epi32(sum_sq, 4));
+ const __m256i sub = _mm256_sub_epi32(axn, dxd);
+ const __m256i p = _mm256_max_epi32(sub, _mm256_setzero_si256());
+ const __m256i pxs = _mm256_mullo_epi32(p, _mm256_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m256i sum_lo = _mm256_unpacklo_epi16(sum, _mm256_setzero_si256());
+ const __m256i sum_hi = _mm256_unpackhi_epi16(sum, _mm256_setzero_si256());
+ const __m256i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+ const __m256i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+ return _mm256_packus_epi32(z0, z1);
+}
+
+template <int n>
+inline __m128i CalculateB(const __m128i sum, const __m128i ma) {
+ static_assert(n == 9 || n == 25, "");
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
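+ // With kSgrProjReciprocalBits == 12, one_over_n is 164 for n == 25 and 455
+ // for n == 9, matching the table-lookup comments below.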
+ const __m128i m0 = VmullLo16(ma, sum);
+ const __m128i m1 = VmullHi16(ma, sum);
+ const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+ const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+ const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+ const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+ return _mm_packus_epi32(b_lo, b_hi);
+}
+
+template <int n>
+inline __m256i CalculateB(const __m256i sum, const __m256i ma) {
+ static_assert(n == 9 || n == 25, "");
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+ const __m256i m0 = VmullLo16(ma, sum);
+ const __m256i m1 = VmullHi16(ma, sum);
+ const __m256i m2 = _mm256_mullo_epi32(m0, _mm256_set1_epi32(one_over_n));
+ const __m256i m3 = _mm256_mullo_epi32(m1, _mm256_set1_epi32(one_over_n));
+ const __m256i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+ const __m256i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+ return _mm256_packus_epi32(b_lo, b_hi);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex5(const __m256i s5[5], const __m256i sq5[5][2],
+ const uint32_t scale, __m256i* const sum,
+ __m256i* const index) {
+ __m256i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m256i s3[3], const __m256i sq3[3][2],
+ const uint32_t scale, __m256i* const sum,
+ __m256i* const index) {
+ __m256i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+ __m128i* const ma, __m128i* const b) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i idx = _mm_packus_epi16(index, index);
+ // The store and load are not actually emitted: the compiler keeps |temp| in
+ // a 64-bit general-purpose register, which is faster than using
+ // _mm_extract_epi8().
+ uint8_t temp[8];
+ StoreLo8(temp, idx);
+ *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], 1);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], 2);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], 3);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], 4);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], 5);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], 6);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], 7);
+ // b = ma * b * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m128i maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ *b = CalculateB<n>(sum, maq);
+}
+
+// Repeat the first 48 elements in kSgrMaLookup with a period of 16.
+alignas(32) constexpr uint8_t kSgrMaLookupAvx2[96] = {
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+ 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8,
+ 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8,
+ 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5,
+ 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5};
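+// The repetition matters because _mm256_shuffle_epi8() only shuffles within
+// each 128-bit lane: duplicating every 16-byte chunk into both lanes lets one
+// 32-byte load act as the same 16-entry lookup table in both lanes.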
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// to get value 0 as the shuffle result. The most significant bit 1 comes
+// either from the comparison instruction or from the sign bit of the index.
+inline __m256i ShuffleIndex(const __m256i table, const __m256i index) {
+ __m256i mask;
+ mask = _mm256_cmpgt_epi8(index, _mm256_set1_epi8(15));
+ mask = _mm256_or_si256(mask, index);
+ return _mm256_shuffle_epi8(table, mask);
+}
+
+inline __m256i AdjustValue(const __m256i value, const __m256i index,
+ const int threshold) {
+ const __m256i thresholds = _mm256_set1_epi8(threshold - 128);
+ const __m256i offset = _mm256_cmpgt_epi8(index, thresholds);
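+ // offset is 0 or -1 per byte, so the add decrements exactly the bytes whose
+ // index exceeds the threshold.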
+ return _mm256_add_epi8(value, offset);
+}
+
+template <int n>
+inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2],
+ __m256i ma[3], __m256i b[2]) {
+ static_assert(n == 9 || n == 25, "");
+ // Use table lookup to read elements whose indices are less than 48.
+ const __m256i c0 = LoadAligned32(kSgrMaLookupAvx2 + 0 * 32);
+ const __m256i c1 = LoadAligned32(kSgrMaLookupAvx2 + 1 * 32);
+ const __m256i c2 = LoadAligned32(kSgrMaLookupAvx2 + 2 * 32);
+ const __m256i indices = _mm256_packus_epi16(index[0], index[1]);
+ __m256i idx, mas;
+ // Clip idx to 127 to apply signed comparison instructions.
+ idx = _mm256_min_epu8(indices, _mm256_set1_epi8(127));
+ // Elements whose indices are 48 or larger are left as 0 by the shuffles
+ // below.
+ // Get shuffle results for indices in range [0, 15].
+ mas = ShuffleIndex(c0, idx);
+ // Get shuffle results for indices in range [16, 31].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+ const __m256i res1 = ShuffleIndex(c1, idx);
+ // Use OR instruction to combine shuffle results together.
+ mas = _mm256_or_si256(mas, res1);
+ // Get shuffle results for indices in range [32, 47].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+ const __m256i res2 = ShuffleIndex(c2, idx);
+ mas = _mm256_or_si256(mas, res2);
+
+ // For elements whose indices are larger than 47, the lookup values change
+ // only rarely as the index increases, so we use comparison and arithmetic
+ // operations to calculate them.
+ // Add -128 to apply signed comparison instructions.
+ idx = _mm256_add_epi8(indices, _mm256_set1_epi8(-128));
+ // Elements whose indices are larger than 47 (still 0 here) are set to 5.
+ mas = _mm256_max_epu8(mas, _mm256_set1_epi8(5));
+ mas = AdjustValue(mas, idx, 55); // 55 is the last index whose value is 5.
+ mas = AdjustValue(mas, idx, 72); // 72 is the last index whose value is 4.
+ mas = AdjustValue(mas, idx, 101); // 101 is the last index whose value is 3.
+ mas = AdjustValue(mas, idx, 169); // 169 is the last index whose value is 2.
+ mas = AdjustValue(mas, idx, 254); // 254 is the last index whose value is 1.
+
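+  // _mm256_permute2x128_si256(a, b, 0x21) returns the upper 128-bit lane of
+  // |a| as the lower lane of the result and the lower lane of |b| as the
+  // upper lane; it is used throughout this file to stitch adjacent blocks.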
+ ma[2] = _mm256_permute4x64_epi64(mas, 0x93); // 32-39 8-15 16-23 24-31
+ ma[0] = _mm256_blend_epi32(ma[0], ma[2], 0xfc); // 0-7 8-15 16-23 24-31
+ ma[1] = _mm256_permute2x128_si256(ma[0], ma[2], 0x21);
+
+ // b = ma * b * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m256i maq0 = _mm256_unpackhi_epi8(ma[0], _mm256_setzero_si256());
+ const __m256i maq1 = _mm256_unpacklo_epi8(ma[1], _mm256_setzero_si256());
+ b[0] = CalculateB<n>(sum[0], maq0);
+ b[1] = CalculateB<n>(sum[1], maq1);
+}
+
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i* const b) {
+ __m128i sum, index;
+ CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+ LookupIntermediate<25>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i* const b) {
+ __m128i sum, index;
+ CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+ LookupIntermediate<9>(sum, index, ma, b);
+}
+
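+// Stores the 4,4,4 and 3,4,3 weighted sums of three adjacent |b3| values:
+// |sum_b444| = 4 * (b[0] + b[1] + b[2]) and
+// |sum_b343| = 3 * b[0] + 4 * b[1] + 3 * b[2],
+// which is presumably where the 343/444 naming in this file comes from.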
+inline void Store343_444(const __m256i b3[2], const ptrdiff_t x,
+ __m256i sum_b343[2], __m256i sum_b444[2],
+ uint32_t* const b343, uint32_t* const b444) {
+ __m256i b[3], sum_b111[2];
+ Prepare3_16(b3, b);
+ sum_b111[0] = Sum3WLo32(b);
+ sum_b111[1] = Sum3WHi32(b);
+ sum_b444[0] = _mm256_slli_epi32(sum_b111[0], 2);
+ sum_b444[1] = _mm256_slli_epi32(sum_b111[1], 2);
+ StoreAligned64(b444 + x, sum_b444);
+ sum_b343[0] = _mm256_sub_epi32(sum_b444[0], sum_b111[0]);
+ sum_b343[1] = _mm256_sub_epi32(sum_b444[1], sum_b111[1]);
+ sum_b343[0] = VaddwLo16(sum_b343[0], b[1]);
+ sum_b343[1] = VaddwHi16(sum_b343[1], b[1]);
+ StoreAligned64(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i* const sum_ma444, __m256i sum_b343[2],
+ __m256i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m256i sum_ma111 = Sum3WLo16(ma3);
+ *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+ StoreAligned32(ma444 + x, *sum_ma444);
+ const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+ StoreAligned32(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i* const sum_ma444, __m256i sum_b343[2],
+ __m256i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m256i sum_ma111 = Sum3WHi16(ma3);
+ *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+ StoreAligned32(ma444 + x, *sum_ma444);
+ const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+ StoreAligned32(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma444, sum_b444[2];
+ Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma444, sum_b444[2];
+ Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma343, sum_b343[2];
+ Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma343, sum_b343[2];
+ Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ const __m128i s[2][3], const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s5[2][5], sq5[5][2];
+ sq[0][1] = SquareHi8(s[0][0]);
+ sq[1][1] = SquareHi8(s[1][0]);
+ s5[0][3] = Sum5Horizontal(s[0][0]);
+ StoreAligned16(sum5[3], s5[0][3]);
+ s5[0][4] = Sum5Horizontal(s[1][0]);
+ StoreAligned16(sum5[4], s5[0][4]);
+ Sum5WHorizontal(sq[0], sq5[3]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ Sum5WHorizontal(sq[1], sq5[4]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x3U16(sum5, 0, s5[0]);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const uint8_t* const src0, const uint8_t* const src1,
+ const ptrdiff_t over_read_in_bytes, const ptrdiff_t sum_width,
+ const ptrdiff_t x, const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m256i sq[2][3], __m256i ma[3],
+ __m256i b[3]) {
+ const __m256i s0 = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 8);
+ const __m256i s1 = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 8);
+ __m256i s5[2][5], sq5[5][2], sum[2], index[2];
+ sq[0][1] = SquareLo8(s0);
+ sq[0][2] = SquareHi8(s0);
+ sq[1][1] = SquareLo8(s1);
+ sq[1][2] = SquareHi8(s1);
+ sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+ sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+ Sum5Horizontal(src0, over_read_in_bytes, &s5[0][3], &s5[1][3]);
+ Sum5Horizontal(src1, over_read_in_bytes, &s5[0][4], &s5[1][4]);
+ StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+ StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+ Sum5WHorizontal(sq[0], sq5[3]);
+ StoreAligned64(square_sum5[3] + x, sq5[3]);
+ Sum5WHorizontal(sq[1], sq5[4]);
+ StoreAligned64(square_sum5[4] + x, sq5[4]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+ Sum5WHorizontal(sq[0] + 1, sq5[3]);
+ StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+ Sum5WHorizontal(sq[1] + 1, sq5[4]);
+ StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+ CalculateIntermediate<25>(sum, index, ma, b + 1);
+ b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ const __m128i s, const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s5[5], sq5[5][2];
+ sq[1] = SquareHi8(s);
+ s5[3] = s5[4] = Sum5Horizontal(s);
+ Sum5WHorizontal(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+ const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t sum_width, const ptrdiff_t x, const uint32_t scale,
+ const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+ __m256i sq[3], __m256i ma[3], __m256i b[3]) {
+ const __m256i s = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+ __m256i s5[2][5], sq5[5][2], sum[2], index[2];
+ sq[1] = SquareLo8(s);
+ sq[2] = SquareHi8(s);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ Sum5Horizontal(src, over_read_in_bytes, &s5[0][3], &s5[1][3]);
+ s5[0][4] = s5[0][3];
+ s5[1][4] = s5[1][3];
+ Sum5WHorizontal(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+ Sum5WHorizontal(sq + 1, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+ CalculateIntermediate<25>(sum, index, ma, b + 1);
+ b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ const __m128i s, const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s3[3], sq3[3][2];
+ sq[1] = SquareHi8(s);
+ s3[2] = Sum3Horizontal(s);
+ StoreAligned16(sum3[2], s3[2]);
+ Sum3WHorizontal(sq, sq3[2]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t x, const ptrdiff_t sum_width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], __m256i sq[3],
+ __m256i ma[3], __m256i b[3]) {
+ const __m256i s = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+ __m256i s3[4], sq3[3][2], sum[2], index[2];
+ sq[1] = SquareLo8(s);
+ sq[2] = SquareHi8(s);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ Sum3Horizontal(src, over_read_in_bytes, s3 + 2);
+ StoreAligned64(sum3[2] + x, s3 + 2);
+ Sum3WHorizontal(sq + 0, sq3[2]);
+ StoreAligned64(square_sum3[2] + x, sq3[2]);
+ LoadAligned32x2U16(sum3, x, s3);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+ Sum3WHorizontal(sq + 1, sq3[2]);
+ StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3 + 1);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+ CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+ CalculateIntermediate<9>(sum, index, ma, b + 1);
+ b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ const __m128i s[2], const uint16_t scales[2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], __m128i sq[2][2], __m128i ma3[2],
+ __m128i b3[2], __m128i* const ma5, __m128i* const b5) {
+ __m128i s3[4], s5[5], sq3[4][2], sq5[5][2];
+ sq[0][1] = SquareHi8(s[0]);
+ sq[1][1] = SquareHi8(s[1]);
+ SumHorizontalLo(s[0], &s3[2], &s5[3]);
+ SumHorizontalLo(s[1], &s3[3], &s5[4]);
+ StoreAligned16(sum3[2], s3[2]);
+ StoreAligned16(sum3[3], s3[3]);
+ StoreAligned16(sum5[3], s5[3]);
+ StoreAligned16(sum5[4], s5[4]);
+ SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3], sq3[3]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+  // Note: in the SSE4_1 version, CalculateIntermediate() is called in place of
+  // the slow LookupIntermediate() when 16 intermediate data points are
+  // calculated at once. For AVX2, however, the compiler generates even slower
+  // code for that approach, so we keep using CalculateIntermediate3().
+ CalculateIntermediate3(s3 + 0, sq3 + 0, scales[1], &ma3[0], &b3[0]);
+ CalculateIntermediate3(s3 + 1, sq3 + 1, scales[1], &ma3[1], &b3[1]);
+ CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1,
+ const ptrdiff_t over_read_in_bytes, const ptrdiff_t x,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, __m256i sq[2][3], __m256i ma3[2][3],
+ __m256i b3[2][5], __m256i ma5[3], __m256i b5[5]) {
+ const __m256i s0 = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 8);
+ const __m256i s1 = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 8);
+ __m256i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sq3t[4][2], sq5t[5][2],
+ sum_3[2][2], index_3[2][2], sum_5[2], index_5[2];
+ sq[0][1] = SquareLo8(s0);
+ sq[0][2] = SquareHi8(s0);
+ sq[1][1] = SquareLo8(s1);
+ sq[1][2] = SquareHi8(s1);
+ sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+ sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+ SumHorizontal(src0, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+ &s5[1][3]);
+ SumHorizontal(src1, over_read_in_bytes, &s3[0][3], &s3[1][3], &s5[0][4],
+ &s5[1][4]);
+ StoreAligned32(sum3[2] + x + 0, s3[0][2]);
+ StoreAligned32(sum3[2] + x + 16, s3[1][2]);
+ StoreAligned32(sum3[3] + x + 0, s3[0][3]);
+ StoreAligned32(sum3[3] + x + 16, s3[1][3]);
+ StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+ StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+ SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned64(square_sum3[2] + x, sq3[2]);
+ StoreAligned64(square_sum5[3] + x, sq5[3]);
+ StoreAligned64(square_sum3[3] + x, sq3[3]);
+ StoreAligned64(square_sum5[4] + x, sq5[4]);
+ LoadAligned32x2U16(sum3, x, s3[0]);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0][0], &index_3[0][0]);
+ CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum_3[1][0],
+ &index_3[1][0]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+ SumHorizontal(sq[0] + 1, &sq3t[2][0], &sq3t[2][1], &sq5t[3][0], &sq5t[3][1]);
+ SumHorizontal(sq[1] + 1, &sq3t[3][0], &sq3t[3][1], &sq5t[4][0], &sq5t[4][1]);
+ StoreAligned64(square_sum3[2] + x + 16, sq3t[2]);
+ StoreAligned64(square_sum5[3] + x + 16, sq5t[3]);
+ StoreAligned64(square_sum3[3] + x + 16, sq3t[3]);
+ StoreAligned64(square_sum5[4] + x + 16, sq5t[4]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3t);
+ CalculateSumAndIndex3(s3[1], sq3t, scales[1], &sum_3[0][1], &index_3[0][1]);
+ CalculateSumAndIndex3(s3[1] + 1, sq3t + 1, scales[1], &sum_3[1][1],
+ &index_3[1][1]);
+ CalculateIntermediate<9>(sum_3[0], index_3[0], ma3[0], b3[0] + 1);
+ CalculateIntermediate<9>(sum_3[1], index_3[1], ma3[1], b3[1] + 1);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5t);
+ CalculateSumAndIndex5(s5[1], sq5t, scales[0], &sum_5[1], &index_5[1]);
+ CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1);
+ b3[0][0] = _mm256_permute2x128_si256(b3[0][0], b3[0][2], 0x21);
+ b3[1][0] = _mm256_permute2x128_si256(b3[1][0], b3[1][2], 0x21);
+ b5[0] = _mm256_permute2x128_si256(b5[0], b5[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ const __m128i s, const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma3,
+ __m128i* const ma5, __m128i* const b3, __m128i* const b5) {
+ __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+ sq[1] = SquareHi8(s);
+ SumHorizontalLo(s, &s3[2], &s5[3]);
+ SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, 0, s5);
+ s5[4] = s5[3];
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+ const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t sum_width, const ptrdiff_t x, const uint16_t scales[2],
+ const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+ __m256i sq[6], __m256i ma3[2], __m256i ma5[2], __m256i b3[5],
+ __m256i b5[5]) {
+ const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+ __m256i s3[2][3], s5[2][5], sq3[4][2], sq3t[4][2], sq5[5][2], sq5t[5][2],
+ sum_3[2], index_3[2], sum_5[2], index_5[2];
+ sq[1] = SquareLo8(s0);
+ sq[2] = SquareHi8(s0);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ SumHorizontal(src, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+ &s5[1][3]);
+ SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned32x2U16(sum3, x, s3[0]);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0], &index_3[0]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ s5[0][4] = s5[0][3];
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+ SumHorizontal(sq + 1, &sq3t[2][0], &sq3t[2][1], &sq5t[3][0], &sq5t[3][1]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3t);
+ CalculateSumAndIndex3(s3[1], sq3t, scales[1], &sum_3[1], &index_3[1]);
+ CalculateIntermediate<9>(sum_3, index_3, ma3, b3 + 1);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ s5[1][4] = s5[1][3];
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5t);
+ sq5t[4][0] = sq5t[3][0];
+ sq5t[4][1] = sq5t[3][1];
+ CalculateSumAndIndex5(s5[1], sq5t, scales[0], &sum_5[1], &index_5[1]);
+ CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1);
+ b3[0] = _mm256_permute2x128_si256(b3[0], b3[2], 0x21);
+ b5[0] = _mm256_permute2x128_si256(b5[0], b5[2], 0x21);
+}
+
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+ const uint8_t* const src1, const int width,
+ const uint32_t scale,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* ma565,
+ uint32_t* b565) {
+ __m128i ma0, b0, s[2][3], sq_128[2][2];
+ __m256i mas[3], sq[2][3], bs[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+ sq_128[0][0] = SquareLo8(s[0][0]);
+ sq_128[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, &b0);
+ sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+ sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma5[3], ma[2], b[4];
+ BoxFilterPreProcess5(src0 + x + 8, src1 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma5);
+ ma[0] = Sum565Lo(ma5);
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned64(ma565, ma);
+ Sum565W(bs + 0, b + 0);
+ Sum565W(bs + 1, b + 2);
+ StoreAligned64(b565, b + 0);
+ StoreAligned64(b565 + 16, b + 2);
+ sq[0][0] = sq[0][2];
+ sq[1][0] = sq[1][2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ ma565 += 32;
+ b565 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+ const uint8_t* const src, const int width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+ uint32_t* b444) {
+ __m128i ma0, sq_128[2], b0;
+ __m256i mas[3], sq[3], bs[3];
+ const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytesPass2_128 - width);
+ sq_128[0] = SquareLo8(s);
+ BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq_128, &ma0, &b0);
+ sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma3[3];
+ BoxFilterPreProcess3(src + x + 8, x + 8 + kOverreadInBytesPass2_256 - width,
+ x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ Prepare3_8(mas, ma3);
+ if (calculate444) { // NOLINT(readability-simplify-boolean-expr)
+ Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444Hi(ma3, bs + 1, 16, ma343, ma444, b343, b444);
+ ma444 += 32;
+ b444 += 32;
+ } else {
+ __m256i ma[2], b[4];
+ ma[0] = Sum343Lo(ma3);
+ ma[1] = Sum343Hi(ma3);
+ StoreAligned64(ma343, ma);
+ Sum343W(bs + 0, b + 0);
+ Sum343W(bs + 1, b + 2);
+ StoreAligned64(b343 + 0, b + 0);
+ StoreAligned64(b343 + 16, b + 2);
+ }
+ sq[0] = sq[2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ ma343 += 32;
+ b343 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1, const int width,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[2], uint16_t* ma565, uint32_t* const b343[4],
+ uint32_t* const b444[2], uint32_t* b565) {
+ __m128i s[2], ma3_128[2], ma5_0, sq_128[2][2], b3_128[2], b5_0;
+ __m256i ma3[2][3], ma5[3], sq[2][3], b3[2][5], b5[5];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ s[1] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+ sq_128[0][0] = SquareLo8(s[0]);
+ sq_128[1][0] = SquareLo8(s[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+ ma3_128, b3_128, &ma5_0, &b5_0);
+ sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+ sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+ ma3[0][0] = SetrM128i(ma3_128[0], ma3_128[0]);
+ ma3[1][0] = SetrM128i(ma3_128[1], ma3_128[1]);
+ ma5[0] = SetrM128i(ma5_0, ma5_0);
+ b3[0][0] = SetrM128i(b3_128[0], b3_128[0]);
+ b3[1][0] = SetrM128i(b3_128[1], b3_128[1]);
+ b5[0] = SetrM128i(b5_0, b5_0);
+
+ int x = 0;
+ do {
+ __m256i ma[2], b[4], ma3x[3], ma5x[3];
+ BoxFilterPreProcess(src0 + x + 8, src1 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width, x + 8,
+ scales, sum3, sum5, square_sum3, square_sum5, sum_width,
+ sq, ma3, b3, ma5, b5);
+ Prepare3_8(ma3[0], ma3x);
+ ma[0] = Sum343Lo(ma3x);
+ ma[1] = Sum343Hi(ma3x);
+ StoreAligned64(ma343[0] + x, ma);
+ Sum343W(b3[0], b);
+ StoreAligned64(b343[0] + x, b);
+ Sum565W(b5, b);
+ StoreAligned64(b565, b);
+ Prepare3_8(ma3[1], ma3x);
+ Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444[0], b343[1], b444[0]);
+ Store343_444Hi(ma3x, b3[1] + 1, x + 16, ma343[1], ma444[0], b343[1],
+ b444[0]);
+ Prepare3_8(ma5, ma5x);
+ ma[0] = Sum565Lo(ma5x);
+ ma[1] = Sum565Hi(ma5x);
+ StoreAligned64(ma565, ma);
+ Sum343W(b3[0] + 1, b);
+ StoreAligned64(b343[0] + x + 16, b);
+ Sum565W(b5 + 1, b);
+ StoreAligned64(b565 + 16, b);
+ sq[0][0] = sq[0][2];
+ sq[1][0] = sq[1][2];
+ ma3[0][0] = ma3[0][2];
+ ma3[1][0] = ma3[1][2];
+ ma5[0] = ma5[2];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ ma565 += 32;
+ b565 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+template <int shift>
+inline __m256i FilterOutput(const __m256i ma_x_src, const __m256i b) {
+ // ma: 255 * 32 = 8160 (13 bits)
+ // b: 65088 * 32 = 2082816 (21 bits)
+ // v: b - ma * 255 (22 bits)
+ const __m256i v = _mm256_sub_epi32(b, ma_x_src);
+ // kSgrProjSgrBits = 8
+ // kSgrProjRestoreBits = 4
+ // shift = 4 or 5
+ // v >> 8 or 9 (13 bits)
+ return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m256i CalculateFilteredOutput(const __m256i src, const __m256i ma,
+ const __m256i b[2]) {
+ const __m256i ma_x_src_lo = VmullLo16(ma, src);
+ const __m256i ma_x_src_hi = VmullHi16(ma, src);
+ const __m256i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+ const __m256i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+ return _mm256_packs_epi32(dst_lo, dst_hi); // 13 bits
+}
+
+inline __m256i CalculateFilteredOutputPass1(const __m256i src, __m256i ma[2],
+ __m256i b[2][2]) {
+ const __m256i ma_sum = _mm256_add_epi16(ma[0], ma[1]);
+ __m256i b_sum[2];
+ b_sum[0] = _mm256_add_epi32(b[0][0], b[1][0]);
+ b_sum[1] = _mm256_add_epi32(b[0][1], b[1][1]);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i CalculateFilteredOutputPass2(const __m256i src, __m256i ma[3],
+ __m256i b[3][2]) {
+ const __m256i ma_sum = Sum3_16(ma);
+ __m256i b_sum[2];
+ Sum3_32(b, b_sum);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i SelfGuidedFinal(const __m256i src, const __m256i v[2]) {
+ const __m256i v_lo =
+ VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m256i v_hi =
+ VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m256i vv = _mm256_packs_epi32(v_lo, v_hi);
+ return _mm256_add_epi16(src, vv);
+}
+
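+// Each 32-bit lane of |w0_w2| packs w0 in its low 16 bits and w2 in its high
+// 16 bits; after interleaving the two filter outputs as 16-bit pairs,
+// _mm256_madd_epi16() yields w0 * filter[0] + w2 * filter[1] per 32-bit lane.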
+inline __m256i SelfGuidedDoubleMultiplier(const __m256i src,
+ const __m256i filter[2], const int w0,
+ const int w2) {
+ __m256i v[2];
+ const __m256i w0_w2 =
+ _mm256_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+ const __m256i f_lo = _mm256_unpacklo_epi16(filter[0], filter[1]);
+ const __m256i f_hi = _mm256_unpackhi_epi16(filter[0], filter[1]);
+ v[0] = _mm256_madd_epi16(w0_w2, f_lo);
+ v[1] = _mm256_madd_epi16(w0_w2, f_hi);
+ return SelfGuidedFinal(src, v);
+}
+
+inline __m256i SelfGuidedSingleMultiplier(const __m256i src,
+ const __m256i filter, const int w0) {
+ // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+ __m256i v[2];
+ v[0] = VmullNLo8(filter, w0);
+ v[1] = VmullNHi8(filter, w0);
+ return SelfGuidedFinal(src, v);
+}
+
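+// Filters two output rows per call for the first (radius 2) pass. The first
+// output row combines the previous and the newly computed ma565/b565 sums,
+// while the second output row reuses only the new sums, with a smaller shift.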
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+ const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+ uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i ma0, b0, s[2][3], sq_128[2][2];
+ __m256i mas[3], sq[2][3], bs[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+ sq_128[0][0] = SquareLo8(s[0][0]);
+ sq_128[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, &b0);
+ sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+ sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma[3], ma3[3], b[2][2][2];
+ BoxFilterPreProcess5(src0 + x + 8, src1 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma3);
+ ma[1] = Sum565Lo(ma3);
+ ma[2] = Sum565Hi(ma3);
+ StoreAligned64(ma565[1] + x, ma + 1);
+ Sum565W(bs + 0, b[0][1]);
+ Sum565W(bs + 1, b[1][1]);
+ StoreAligned64(b565[1] + x + 0, b[0][1]);
+ StoreAligned64(b565[1] + x + 16, b[1][1]);
+ const __m256i sr0 = LoadUnaligned32(src + x);
+ const __m256i sr1 = LoadUnaligned32(src + stride + x);
+ const __m256i sr0_lo = _mm256_unpacklo_epi8(sr0, _mm256_setzero_si256());
+ const __m256i sr1_lo = _mm256_unpacklo_epi8(sr1, _mm256_setzero_si256());
+ ma[0] = LoadAligned32(ma565[0] + x);
+ LoadAligned64(b565[0] + x, b[0][0]);
+ const __m256i p00 = CalculateFilteredOutputPass1(sr0_lo, ma, b[0]);
+ const __m256i p01 = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[0][1]);
+ const __m256i d00 = SelfGuidedSingleMultiplier(sr0_lo, p00, w0);
+ const __m256i d10 = SelfGuidedSingleMultiplier(sr1_lo, p01, w0);
+ const __m256i sr0_hi = _mm256_unpackhi_epi8(sr0, _mm256_setzero_si256());
+ const __m256i sr1_hi = _mm256_unpackhi_epi8(sr1, _mm256_setzero_si256());
+ ma[1] = LoadAligned32(ma565[0] + x + 16);
+ LoadAligned64(b565[0] + x + 16, b[1][0]);
+ const __m256i p10 = CalculateFilteredOutputPass1(sr0_hi, ma + 1, b[1]);
+ const __m256i p11 = CalculateFilteredOutput<4>(sr1_hi, ma[2], b[1][1]);
+ const __m256i d01 = SelfGuidedSingleMultiplier(sr0_hi, p10, w0);
+ const __m256i d11 = SelfGuidedSingleMultiplier(sr1_hi, p11, w0);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d00, d01));
+ StoreUnaligned32(dst + stride + x, _mm256_packus_epi16(d10, d11));
+ sq[0][0] = sq[0][2];
+ sq[1][0] = sq[1][2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ x += 32;
+ } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+ uint32_t* b565, uint8_t* const dst) {
+ const __m128i s0 =
+ LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ __m128i ma0, b0, sq_128[2];
+ __m256i mas[3], sq[3], bs[3];
+ sq_128[0] = SquareLo8(s0);
+ BoxFilterPreProcess5LastRowLo(s0, scale, sum5, square_sum5, sq_128, &ma0,
+ &b0);
+ sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma[3], ma5[3], b[2][2];
+ BoxFilterPreProcess5LastRow(
+ src0 + x + 8, x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ ma[2] = Sum565Hi(ma5);
+ Sum565W(bs + 0, b[1]);
+ const __m256i sr = LoadUnaligned32(src + x);
+ const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+ const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+ ma[0] = LoadAligned32(ma565);
+ LoadAligned64(b565 + 0, b[0]);
+ const __m256i p0 = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[1] = LoadAligned32(ma565 + 16);
+ LoadAligned64(b565 + 16, b[0]);
+ Sum565W(bs + 1, b[1]);
+ const __m256i p1 = CalculateFilteredOutputPass1(sr_hi, ma + 1, b);
+ const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ sq[0] = sq[2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ ma565 += 32;
+ b565 += 32;
+ x += 32;
+ } while (x < width);
+}
+
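+// Filters one output row per call for the second (radius 1) pass, summing a
+// previous 343 row, a previous 444 row and the newly stored 343 sums inside
+// CalculateFilteredOutputPass2().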
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+ uint32_t* const b444[2], uint8_t* const dst) {
+ const __m128i s0 =
+ LoadUnaligned16Msan(src0, kOverreadInBytesPass2_128 - width);
+ __m128i ma0, b0, sq_128[2];
+ __m256i mas[3], sq[3], bs[3];
+ sq_128[0] = SquareLo8(s0);
+ BoxFilterPreProcess3Lo(s0, scale, sum3, square_sum3, sq_128, &ma0, &b0);
+ sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma[4], b[4][2], ma3[3];
+ BoxFilterPreProcess3(src0 + x + 8,
+ x + 8 + kOverreadInBytesPass2_256 - width, x + 8,
+ sum_width, scale, sum3, square_sum3, sq, mas, bs);
+ Prepare3_8(mas, ma3);
+ Store343_444Lo(ma3, bs + 0, x + 0, &ma[2], b[2], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ Store343_444Hi(ma3, bs + 1, x + 16, &ma[3], b[3], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const __m256i sr = LoadUnaligned32(src + x);
+ const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+ const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+ ma[0] = LoadAligned32(ma343[0] + x);
+ ma[1] = LoadAligned32(ma444[0] + x);
+ LoadAligned64(b343[0] + x, b[0]);
+ LoadAligned64(b444[0] + x, b[1]);
+ const __m256i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ ma[1] = LoadAligned32(ma343[0] + x + 16);
+ ma[2] = LoadAligned32(ma444[0] + x + 16);
+ LoadAligned64(b343[0] + x + 16, b[1]);
+ LoadAligned64(b444[0] + x + 16, b[2]);
+ const __m256i p1 = CalculateFilteredOutputPass2(sr_hi, ma + 1, b + 1);
+ const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ sq[0] = sq[2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ x += 32;
+ } while (x < width);
+}
+
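+// Runs both passes over two rows at a time when both radii are non-zero and
+// blends the two pass outputs with SelfGuidedDoubleMultiplier().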
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+ uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i s[2], ma3_128[2], ma5_0, sq_128[2][2], b3_128[2], b5_0;
+ __m256i ma3[2][3], ma5[3], sq[2][3], b3[2][5], b5[5];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ s[1] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+ sq_128[0][0] = SquareLo8(s[0]);
+ sq_128[1][0] = SquareLo8(s[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+ ma3_128, b3_128, &ma5_0, &b5_0);
+ sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+ sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+ ma3[0][0] = SetrM128i(ma3_128[0], ma3_128[0]);
+ ma3[1][0] = SetrM128i(ma3_128[1], ma3_128[1]);
+ ma5[0] = SetrM128i(ma5_0, ma5_0);
+ b3[0][0] = SetrM128i(b3_128[0], b3_128[0]);
+ b3[1][0] = SetrM128i(b3_128[1], b3_128[1]);
+ b5[0] = SetrM128i(b5_0, b5_0);
+
+ int x = 0;
+ do {
+ __m256i ma[3][3], mat[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
+ BoxFilterPreProcess(src0 + x + 8, src1 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width, x + 8,
+ scales, sum3, sum5, square_sum3, square_sum5, sum_width,
+ sq, ma3, b3, ma5, b5);
+ Prepare3_8(ma3[0], ma3x[0]);
+ Prepare3_8(ma3[1], ma3x[1]);
+ Prepare3_8(ma5, ma5x);
+ Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+ ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+ b343[3], b444[2]);
+ ma[0][1] = Sum565Lo(ma5x);
+ ma[0][2] = Sum565Hi(ma5x);
+ mat[0][1] = ma[0][2];
+ StoreAligned64(ma565[1] + x, ma[0] + 1);
+ Sum565W(b5, b[0][1]);
+ StoreAligned64(b565[1] + x, b[0][1]);
+ const __m256i sr0 = LoadUnaligned32(src + x);
+ const __m256i sr1 = LoadUnaligned32(src + stride + x);
+ const __m256i sr0_lo = _mm256_unpacklo_epi8(sr0, _mm256_setzero_si256());
+ const __m256i sr1_lo = _mm256_unpacklo_epi8(sr1, _mm256_setzero_si256());
+ ma[0][0] = LoadAligned32(ma565[0] + x);
+ LoadAligned64(b565[0] + x, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned32(ma343[0] + x);
+ ma[1][1] = LoadAligned32(ma444[0] + x);
+ LoadAligned64(b343[0] + x, b[1][0]);
+ LoadAligned64(b444[0] + x, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+ const __m256i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+ ma[2][0] = LoadAligned32(ma343[1] + x);
+ LoadAligned64(b343[1] + x, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+ const __m256i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+ Sum565W(b5 + 1, b[0][1]);
+ StoreAligned64(b565[1] + x + 16, b[0][1]);
+ Store343_444Hi(ma3x[0], b3[0] + 1, x + 16, &mat[1][2], &mat[2][1], b[1][2],
+ b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Hi(ma3x[1], b3[1] + 1, x + 16, &mat[2][2], b[2][2], ma343[3],
+ ma444[2], b343[3], b444[2]);
+ const __m256i sr0_hi = _mm256_unpackhi_epi8(sr0, _mm256_setzero_si256());
+ const __m256i sr1_hi = _mm256_unpackhi_epi8(sr1, _mm256_setzero_si256());
+ mat[0][0] = LoadAligned32(ma565[0] + x + 16);
+ LoadAligned64(b565[0] + x + 16, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_hi, mat[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_hi, mat[0][1], b[0][1]);
+ mat[1][0] = LoadAligned32(ma343[0] + x + 16);
+ mat[1][1] = LoadAligned32(ma444[0] + x + 16);
+ LoadAligned64(b343[0] + x + 16, b[1][0]);
+ LoadAligned64(b444[0] + x + 16, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_hi, mat[1], b[1]);
+ const __m256i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+ mat[2][0] = LoadAligned32(ma343[1] + x + 16);
+ LoadAligned64(b343[1] + x + 16, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_hi, mat[2], b[2]);
+ const __m256i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d00, d01));
+ StoreUnaligned32(dst + stride + x, _mm256_packus_epi16(d10, d11));
+ sq[0][0] = sq[0][2];
+ sq[1][0] = sq[1][2];
+ ma3[0][0] = ma3[0][2];
+ ma3[1][0] = ma3[1][2];
+ ma5[0] = ma5[2];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ x += 32;
+ } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+ const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343[4], uint16_t* const ma444[3],
+ uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
+ uint32_t* const b565[2], uint8_t* const dst) {
+ const __m128i s0 =
+ LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ __m128i ma3_0, ma5_0, b3_0, b5_0, sq_128[2];
+ __m256i ma3[3], ma5[3], sq[3], b3[3], b5[3];
+ sq_128[0] = SquareLo8(s0);
+ BoxFilterPreProcessLastRowLo(s0, scales, sum3, sum5, square_sum3, square_sum5,
+ sq_128, &ma3_0, &ma5_0, &b3_0, &b5_0);
+ sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+ ma3[0] = SetrM128i(ma3_0, ma3_0);
+ ma5[0] = SetrM128i(ma5_0, ma5_0);
+ b3[0] = SetrM128i(b3_0, b3_0);
+ b5[0] = SetrM128i(b5_0, b5_0);
+
+ int x = 0;
+ do {
+ __m256i ma[3], mat[3], b[3][2], p[2], ma3x[3], ma5x[3];
+ BoxFilterPreProcessLastRow(src0 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width,
+ sum_width, x + 8, scales, sum3, sum5,
+ square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+ Prepare3_8(ma3, ma3x);
+ Prepare3_8(ma5, ma5x);
+ ma[1] = Sum565Lo(ma5x);
+ Sum565W(b5, b[1]);
+ ma[2] = Sum343Lo(ma3x);
+ Sum343W(b3, b[2]);
+ const __m256i sr = LoadUnaligned32(src + x);
+ const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+ ma[0] = LoadAligned32(ma565[0] + x);
+ LoadAligned64(b565[0] + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[0] = LoadAligned32(ma343[0] + x);
+ ma[1] = LoadAligned32(ma444[0] + x);
+ LoadAligned64(b343[0] + x, b[0]);
+ LoadAligned64(b444[0] + x, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ const __m256i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+ mat[1] = Sum565Hi(ma5x);
+ Sum565W(b5 + 1, b[1]);
+ mat[2] = Sum343Hi(ma3x);
+ Sum343W(b3 + 1, b[2]);
+ const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+ mat[0] = LoadAligned32(ma565[0] + x + 16);
+ LoadAligned64(b565[0] + x + 16, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_hi, mat, b);
+ mat[0] = LoadAligned32(ma343[0] + x + 16);
+ mat[1] = LoadAligned32(ma444[0] + x + 16);
+ LoadAligned64(b343[0] + x + 16, b[0]);
+ LoadAligned64(b444[0] + x + 16, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_hi, mat, b);
+ const __m256i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ sq[0] = sq[2];
+ ma3[0] = ma3[2];
+ ma5[0] = ma5[2];
+ b3[0] = b3[2];
+ b5[0] = b5[2];
+ x += 32;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+ const RestorationUnitInfo& restoration_info, const uint8_t* src,
+ const uint8_t* const top_border, const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3 + kSumOffset;
+ square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum(top_border, stride, width, sum_stride, temp_stride, sum3[0], sum5[1],
+ square_sum3[0], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+ square_sum5, sum_width, ma343, ma444, ma565[0], b343,
+ b444, b565[0]);
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
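+  // The main loop below filters two rows per iteration. The row sum and the
+  // intermediate ma/b buffers act as ring buffers and are rotated by two rows
+  // at each step so that previously computed rows are reused.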
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+ square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+ b444, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxFilterLastRow(src + 3, bottom_border + stride, width, sum_width, scales,
+ w0, w2, sum3, sum5, square_sum3, square_sum5, ma343, ma444,
+ ma565, b343, b444, b565, dst);
+ }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src,
+ const uint8_t* const top_border,
+ const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height, SgrBuffer* const sgr_buffer,
+ uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<5>(top_border, stride, width, sum_stride, temp_stride, sum5[1],
+ square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+ ma565[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+ square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+ sum_width, scale, w0, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxFilterPass1LastRow(src, bottom_border + stride, width, sum_width, scale,
+ w0, sum5, square_sum5, ma565[0], b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src,
+ const uint8_t* const top_border,
+ const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height, SgrBuffer* const sgr_buffer,
+ uint8_t* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3 + kSumOffset;
+ square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<3>(top_border, stride, width, sum_stride, temp_stride, sum3[0],
+ square_sum3[0]);
+ BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+ sum_width, ma343[0], nullptr, b343[0],
+ nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const uint8_t* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += stride;
+ }
+ BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+ ma343[1], ma444[0], b343[1], b444[0]);
+
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ int y = std::min(height, 2);
+ src += 2;
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
+
+// If |width| is not a multiple of 8, up to 7 more pixels are written to |dest|
+// at the end of each row. It is safe to overwrite the output as it will not be
+// part of the visible frame.
+void SelfGuidedFilter_AVX2(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const void* const top_border, const void* const bottom_border,
+ const ptrdiff_t stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* top = static_cast<const uint8_t*>(top_border);
+ const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+ auto* const dst = static_cast<uint8_t*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3,
+ stride, width, height, sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2,
+ stride, width, height, sgr_buffer, dst);
+ } else {
+ BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride,
+ width, height, sgr_buffer, dst);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_AVX2(WienerFilter)
+ dsp->loop_restorations[0] = WienerFilter_AVX2;
+#endif
+#if DSP_ENABLED_8BPP_AVX2(SelfGuidedFilter)
+ dsp->loop_restorations[1] = SelfGuidedFilter_AVX2;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void LoopRestorationInit_AVX2() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_AVX2
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit_AVX2() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_AVX2
diff --git a/src/dsp/x86/loop_restoration_avx2.h b/src/dsp/x86/loop_restoration_avx2.h
new file mode 100644
index 0000000..d80227c
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_avx2.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_restorations, see the defines below for specifics.
+// These functions are not thread-safe.
+void LoopRestorationInit_AVX2();
+void LoopRestorationInit10bpp_AVX2();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If avx2 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the avx2 implementation should be used.
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_WienerFilter
+#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_AVX2
+#endif
+
+#endif // LIBGAV1_TARGETING_AVX2
+
+#endif // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
diff --git a/src/dsp/x86/loop_restoration_sse4.cc b/src/dsp/x86/loop_restoration_sse4.cc
new file mode 100644
index 0000000..24f5ad2
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_sse4.cc
@@ -0,0 +1,2549 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+inline void WienerHorizontalClip(const __m128i s[2], const __m128i s_3x128,
+ int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit =
+ (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
+ const __m128i offsets = _mm_set1_epi16(-offset);
+ const __m128i limits = _mm_set1_epi16(limit - offset);
+ // The sum range here is [-128 * 255 + 4, 90 * 255 + 4].
+ const __m128i sum = _mm_add_epi16(s[0], s[1]);
+ const __m128i rounded_sum0 = _mm_srai_epi16(sum, kInterRoundBitsHorizontal);
+ // Add back scaled down offset correction.
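+  // |s_3x128| is the center sample multiplied by 128 and scaled down by the
+  // horizontal rounding shift; presumably the 128 was removed from the center
+  // tap so that the packed taps fit the signed range of _mm_maddubs_epi16().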
+ const __m128i rounded_sum1 = _mm_add_epi16(rounded_sum0, s_3x128);
+ const __m128i d0 = _mm_max_epi16(rounded_sum1, offsets);
+ const __m128i d1 = _mm_min_epi16(d0, limits);
+ StoreAligned16(wiener_buffer, d1);
+}
+
+inline void WienerHorizontalTap7Kernel(const __m128i s[4],
+ const __m128i filter[4],
+ int16_t* const wiener_buffer) {
+ __m128i madds[4];
+ madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
+ madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
+ madds[2] = _mm_maddubs_epi16(s[2], filter[2]);
+ madds[3] = _mm_maddubs_epi16(s[3], filter[3]);
+ madds[0] = _mm_add_epi16(madds[0], madds[2]);
+ madds[1] = _mm_add_epi16(madds[1], madds[3]);
+ const __m128i s_3x128 =
+ _mm_slli_epi16(_mm_srli_epi16(s[1], 8), 7 - kInterRoundBitsHorizontal);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap5Kernel(const __m128i s[5],
+ const __m128i filter[3],
+ int16_t* const wiener_buffer) {
+ __m128i madds[3];
+ madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
+ madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
+ madds[2] = _mm_maddubs_epi16(s[2], filter[2]);
+ madds[0] = _mm_add_epi16(madds[0], madds[2]);
+ const __m128i s_3x128 =
+ _mm_srli_epi16(_mm_slli_epi16(s[1], 8), kInterRoundBitsHorizontal + 1);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap3Kernel(const __m128i s[2],
+ const __m128i filter[2],
+ int16_t* const wiener_buffer) {
+ __m128i madds[2];
+ madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
+ madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
+ const __m128i s_3x128 =
+ _mm_slli_epi16(_mm_srli_epi16(s[0], 8), 7 - kInterRoundBitsHorizontal);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+// Loading all the shifted source vectors and unpacking is about 7% faster
+// than using _mm_alignr_epi8().
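+// Each s[k] below holds the source shifted by k bytes; unpacking adjacent
+// pairs interleaves the pixels so that _mm_maddubs_epi16() can apply two
+// filter taps per 16-bit lane.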
+inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int coefficient0,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
+ __m128i filter[4];
+ filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0200));
+ filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
+ filter[2] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0204));
+ filter[3] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient0));
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[7], ss[4];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ s[3] = LoadUnaligned16(src + x + 3);
+ s[4] = LoadUnaligned16(src + x + 4);
+ s[5] = LoadUnaligned16(src + x + 5);
+ s[6] = LoadUnaligned16(src + x + 6);
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ ss[3] = _mm_unpacklo_epi8(s[6], round);
+ WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 0);
+ ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+ ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
+ ss[2] = _mm_unpackhi_epi8(s[4], s[5]);
+ ss[3] = _mm_unpackhi_epi8(s[6], round);
+ WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 8);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int coefficient1,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
+ __m128i filter[3];
+ filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0402));
+ filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0406));
+ filter[2] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient1));
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[5], ss[3];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ s[3] = LoadUnaligned16(src + x + 3);
+ s[4] = LoadUnaligned16(src + x + 4);
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ ss[2] = _mm_unpacklo_epi8(s[4], round);
+ WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 0);
+ ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+ ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
+ ss[2] = _mm_unpackhi_epi8(s[4], round);
+ WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 8);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int coefficient2,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
+ __m128i filter[2];
+ filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
+ filter[1] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient2));
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[3], ss[2];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ ss[1] = _mm_unpacklo_epi8(s[2], round);
+ WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 0);
+ ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+ ss[1] = _mm_unpackhi_epi8(s[2], round);
+ WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 8);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
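+// When only the center tap (128) is nonzero, the filter is a scaled copy, so
+// the source is shifted left by 4 to match the intermediate scale of the
+// other horizontal paths (128 >> kInterRoundBitsHorizontal == 16, assuming
+// kInterRoundBitsHorizontal == 3).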
+inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ const __m128i s = LoadUnaligned16(src + x);
+ const __m128i s0 = _mm_unpacklo_epi8(s, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi8(s, _mm_setzero_si128());
+ const __m128i d0 = _mm_slli_epi16(s0, 4);
+ const __m128i d1 = _mm_slli_epi16(s1, 4);
+ StoreAligned16(*wiener_buffer + x + 0, d0);
+ StoreAligned16(*wiener_buffer + x + 8, d1);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline __m128i WienerVertical7(const __m128i a[2], const __m128i filter[2]) {
+ const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i sum0 = _mm_add_epi32(round, madd0);
+ const __m128i sum1 = _mm_add_epi32(sum0, madd1);
+ return _mm_srai_epi32(sum1, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical5(const __m128i a[2], const __m128i filter[2]) {
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i sum = _mm_add_epi32(madd0, madd1);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical3(const __m128i a, const __m128i filter) {
+ const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+ const __m128i madd = _mm_madd_epi16(a, filter);
+ const __m128i sum = _mm_add_epi32(round, madd);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVerticalFilter7(const __m128i a[7],
+ const __m128i filter[2]) {
+ __m128i b[2];
+ const __m128i a06 = _mm_add_epi16(a[0], a[6]);
+ const __m128i a15 = _mm_add_epi16(a[1], a[5]);
+ const __m128i a24 = _mm_add_epi16(a[2], a[4]);
+ b[0] = _mm_unpacklo_epi16(a06, a15);
+ b[1] = _mm_unpacklo_epi16(a24, a[3]);
+ const __m128i sum0 = WienerVertical7(b, filter);
+ b[0] = _mm_unpackhi_epi16(a06, a15);
+ b[1] = _mm_unpackhi_epi16(a24, a[3]);
+ const __m128i sum1 = WienerVertical7(b, filter);
+ return _mm_packs_epi32(sum0, sum1);
+}
+
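+// The vertical rounding constant is interleaved with the center tap here and
+// multiplied by the 1 packed into the upper half of filter[1] (see
+// WienerVerticalTap5() below), so WienerVertical5() needs no explicit
+// rounding add.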
+inline __m128i WienerVerticalFilter5(const __m128i a[5],
+ const __m128i filter[2]) {
+ const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m128i b[2];
+ const __m128i a04 = _mm_add_epi16(a[0], a[4]);
+ const __m128i a13 = _mm_add_epi16(a[1], a[3]);
+ b[0] = _mm_unpacklo_epi16(a04, a13);
+ b[1] = _mm_unpacklo_epi16(a[2], round);
+ const __m128i sum0 = WienerVertical5(b, filter);
+ b[0] = _mm_unpackhi_epi16(a04, a13);
+ b[1] = _mm_unpackhi_epi16(a[2], round);
+ const __m128i sum1 = WienerVertical5(b, filter);
+ return _mm_packs_epi32(sum0, sum1);
+}
+
+inline __m128i WienerVerticalFilter3(const __m128i a[3], const __m128i filter) {
+ __m128i b;
+ const __m128i a02 = _mm_add_epi16(a[0], a[2]);
+ b = _mm_unpacklo_epi16(a02, a[1]);
+ const __m128i sum0 = WienerVertical3(b, filter);
+ b = _mm_unpackhi_epi16(a02, a[1]);
+ const __m128i sum1 = WienerVertical3(b, filter);
+ return _mm_packs_epi32(sum0, sum1);
+}
+
+inline __m128i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i a[7]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+ a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
+ a[6] = LoadAligned16(wiener_buffer + 6 * wiener_stride);
+ return WienerVerticalFilter7(a, filter);
+}
+
+inline __m128i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i a[5]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+ return WienerVerticalFilter5(a, filter);
+}
+
+inline __m128i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter, __m128i a[3]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i d[2]) {
+ __m128i a[8];
+ d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[7] = LoadAligned16(wiener_buffer + 7 * wiener_stride);
+ d[1] = WienerVerticalFilter7(a + 1, filter);
+}
+
+inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i d[2]) {
+ __m128i a[6];
+ d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
+ d[1] = WienerVerticalFilter5(a + 1, filter);
+}
+
+inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter, __m128i d[2]) {
+ __m128i a[4];
+ d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ d[1] = WienerVerticalFilter3(a + 1, filter);
+}
+
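+// The vertical passes below write two output rows per loop iteration (the
+// *Kernel2 helpers load one extra |wiener_buffer| row and reuse the rest) and
+// fall back to a single-row pass when |height| is odd.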
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[4], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i c = LoadLo8(coefficients);
+ __m128i filter[2];
+ filter[0] = _mm_shuffle_epi32(c, 0x0);
+ filter[1] = _mm_shuffle_epi32(c, 0x55);
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i d[2][2];
+ WienerVerticalTap7Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap7Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
+ StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
+ StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[7];
+ const __m128i d0 =
+ WienerVerticalTap7Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m128i d1 =
+ WienerVerticalTap7Kernel(wiener_buffer + x + 8, width, filter, a);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[3], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i c = Load4(coefficients);
+ __m128i filter[2];
+ filter[0] = _mm_shuffle_epi32(c, 0);
+ filter[1] =
+ _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[2]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i d[2][2];
+ WienerVerticalTap5Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap5Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
+ StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
+ StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[5];
+ const __m128i d0 =
+ WienerVerticalTap5Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m128i d1 =
+ WienerVerticalTap5Kernel(wiener_buffer + x + 8, width, filter, a);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[2], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i filter =
+ _mm_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i d[2][2];
+ WienerVerticalTap3Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap3Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
+ StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
+ StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[3];
+ const __m128i d0 =
+ WienerVerticalTap3Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m128i d1 =
+ WienerVerticalTap3Kernel(wiener_buffer + x + 8, width, filter, a);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint8_t* const dst) {
+ const __m128i a0 = LoadAligned16(wiener_buffer + 0);
+ const __m128i a1 = LoadAligned16(wiener_buffer + 8);
+ const __m128i b0 = _mm_add_epi16(a0, _mm_set1_epi16(8));
+ const __m128i b1 = _mm_add_epi16(a1, _mm_set1_epi16(8));
+ const __m128i c0 = _mm_srai_epi16(b0, 4);
+ const __m128i c1 = _mm_srai_epi16(b1, 4);
+ const __m128i d = _mm_packus_epi16(c0, c1);
+ StoreAligned16(dst, d);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ x += 16;
+ } while (x < width);
+ }
+}
+
+void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info,
+ const void* const source, const void* const top_border,
+ const void* const bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height,
+ RestorationBuffer* const restoration_buffer,
+ void* const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 16);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 13 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+ // horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* const top = static_cast<const uint8_t*>(top_border);
+ const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
+ const int16_t* const filter_horizontal =
+ restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
+ const __m128i c = LoadLo8(filter_horizontal);
+ // In order to keep the horizontal pass intermediate values within 16 bits we
+ // offset |filter[3]| by 128. The 128 offset will be added back in the loop.
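+  // Concretely, the center tap is applied as (coefficient - 128) and the
+  // missing 128 * pixel term is restored after the rounding shift via the
+  // |s_3x128| argument of WienerHorizontalClip().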
+ const __m128i coefficients_horizontal =
+ _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0));
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
+ wiener_stride, height_extra, filter_horizontal[0],
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ filter_horizontal[0], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ filter_horizontal[0], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
+ wiener_stride, height_extra, filter_horizontal[1],
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ filter_horizontal[1], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ filter_horizontal[1], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
+ wiener_stride, height_extra, filter_horizontal[2],
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ filter_horizontal[2], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ filter_horizontal[2], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
+ wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ }
+
+ // vertical filtering.
+ // Over-writes up to 15 values.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ auto* dst = static_cast<uint8_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, we can
+    // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical + 1, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical + 2, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+// SIMD overreads 16 - (width % 16) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
+constexpr int kOverreadInBytesPass1 = 10;
+constexpr int kOverreadInBytesPass2 = 12;
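+// In the worst case (width a multiple of 16) this is 16 - 2 * 3 = 10 bytes
+// for Pass 1 and 16 - 2 * 2 = 12 bytes for Pass 2, hence the constants above.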
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned16x2U16Msan(const uint16_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+ dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+ dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned16x3U16Msan(const uint16_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+ dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+ dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+ dst[0] = LoadAligned16(src + 0);
+ dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x,
+ const ptrdiff_t border, __m128i dst[2]) {
+ dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border));
+ dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border));
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned32x2U32Msan(const uint32_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[2][2]) {
+ LoadAligned32U32Msan(src[0], x, border, dst[0]);
+ LoadAligned32U32Msan(src[1], x, border, dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+ LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned32x3U32Msan(const uint32_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[3][2]) {
+ LoadAligned32U32Msan(src[0], x, border, dst[0]);
+ LoadAligned32U32Msan(src[1], x, border, dst[1]);
+ LoadAligned32U32Msan(src[2], x, border, dst[2]);
+}
+
+inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 8, src[1]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 4, src[1]);
+}
+
+inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) {
+ StoreAligned32U32(dst + 0, src + 0);
+ StoreAligned32U32(dst + 8, src + 2);
+}
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers may generate super inefficient code and the whole
+// decoder could be 15% slower.
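+// The Vaddl*/Vaddw*/Vmull* helpers below mirror the ARM NEON intrinsics they
+// are named after, widening via an unpack with zero before the add or
+// multiply; VrshrS32()/VrshrU32() are rounding right shifts.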
+
+inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlHi8(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi8(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(s0, s1);
+}
+
+inline __m128i VaddlHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(s0, s1);
+}
+
+inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwHi8(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(src0, s1);
+}
+
+inline __m128i VaddwHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(src0, s1);
+}
+
+inline __m128i VmullNLo8(const __m128i src0, const int src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullNHi8(const __m128i src0, const int src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VrshrS32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srli_epi32(sum, src1);
+}
+
+inline __m128i SquareLo8(const __m128i src) {
+ const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128());
+ return _mm_mullo_epi16(s, s);
+}
+
+inline __m128i SquareHi8(const __m128i src) {
+ const __m128i s = _mm_unpackhi_epi8(src, _mm_setzero_si128());
+ return _mm_mullo_epi16(s, s);
+}
+
+inline void Prepare3Lo8(const __m128i src, __m128i dst[3]) {
+ dst[0] = src;
+ dst[1] = _mm_srli_si128(src, 1);
+ dst[2] = _mm_srli_si128(src, 2);
+}
+
+template <int offset>
+inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+ dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare5Lo8(const __m128i src, __m128i dst[5]) {
+ dst[0] = src;
+ dst[1] = _mm_srli_si128(src, 1);
+ dst[2] = _mm_srli_si128(src, 2);
+ dst[3] = _mm_srli_si128(src, 3);
+ dst[4] = _mm_srli_si128(src, 4);
+}
+
+template <int offset>
+inline void Prepare5_8(const __m128i src[2], __m128i dst[5]) {
+ dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+ dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], offset + 3);
+ dst[4] = _mm_alignr_epi8(src[1], src[0], offset + 4);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_16(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+ dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi16(src0, src1);
+ return _mm_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi32(src0, src1);
+ return _mm_add_epi32(sum, src2);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m128i Sum3WLo16(const __m128i src[3]) {
+ const __m128i sum = VaddlLo8(src[0], src[1]);
+ return VaddwLo8(sum, src[2]);
+}
+
+inline __m128i Sum3WHi16(const __m128i src[3]) {
+ const __m128i sum = VaddlHi8(src[0], src[1]);
+ return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum3WLo32(const __m128i src[3]) {
+ const __m128i sum = VaddlLo16(src[0], src[1]);
+ return VaddwLo16(sum, src[2]);
+}
+
+inline __m128i Sum3WHi32(const __m128i src[3]) {
+ const __m128i sum = VaddlHi16(src[0], src[1]);
+ return VaddwHi16(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+ const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+ const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+ const __m128i* const src2, const __m128i* const src3,
+ const __m128i* const src4) {
+ const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+ const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+ const __m128i sum = _mm_add_epi32(sum01, sum23);
+ return _mm_add_epi32(sum, *src4);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum5WLo16(const __m128i src[5]) {
+ const __m128i sum01 = VaddlLo8(src[0], src[1]);
+ const __m128i sum23 = VaddlLo8(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return VaddwLo8(sum, src[4]);
+}
+
+inline __m128i Sum5WHi16(const __m128i src[5]) {
+ const __m128i sum01 = VaddlHi8(src[0], src[1]);
+ const __m128i sum23 = VaddlHi8(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return VaddwHi8(sum, src[4]);
+}
+
+inline __m128i Sum3Horizontal(const __m128i src) {
+ __m128i s[3];
+ Prepare3Lo8(src, s);
+ return Sum3WLo16(s);
+}
+
+template <int offset>
+inline void Sum3Horizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_8<offset>(src, s);
+ dst[0] = Sum3WLo16(s);
+ dst[1] = Sum3WHi16(s);
+}
+
+inline void Sum3WHorizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum3WLo32(s);
+ dst[1] = Sum3WHi32(s);
+}
+
+inline __m128i Sum5Horizontal(const __m128i src) {
+ __m128i s[5];
+ Prepare5Lo8(src, s);
+ return Sum5WLo16(s);
+}
+
+template <int offset>
+inline void Sum5Horizontal(const __m128i src[2], __m128i* const dst0,
+ __m128i* const dst1) {
+ __m128i s[5];
+ Prepare5_8<offset>(src, s);
+ *dst0 = Sum5WLo16(s);
+ *dst1 = Sum5WHi16(s);
+}
+
+inline void Sum5WHorizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ const __m128i sum01_lo = VaddlLo16(s[0], s[1]);
+ const __m128i sum23_lo = VaddlLo16(s[2], s[3]);
+ const __m128i sum0123_lo = _mm_add_epi32(sum01_lo, sum23_lo);
+ dst[0] = VaddwLo16(sum0123_lo, s[4]);
+ const __m128i sum01_hi = VaddlHi16(s[0], s[1]);
+ const __m128i sum23_hi = VaddlHi16(s[2], s[3]);
+ const __m128i sum0123_hi = _mm_add_epi32(sum01_hi, sum23_hi);
+ dst[1] = VaddwHi16(sum0123_hi, s[4]);
+}
+
+void SumHorizontalLo(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = VaddlLo16(src[0], src[4]);
+ *row_sq3 = Sum3WLo32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalHi(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = VaddlHi16(src[0], src[4]);
+ *row_sq3 = Sum3WHi32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalLo(const __m128i src, __m128i* const row3,
+ __m128i* const row5) {
+ __m128i s[5];
+ Prepare5Lo8(src, s);
+ const __m128i sum04 = VaddlLo8(s[0], s[4]);
+ *row3 = Sum3WLo16(s + 1);
+ *row5 = _mm_add_epi16(sum04, *row3);
+}
+
+template <int offset>
+void SumHorizontal(const __m128i src[2], __m128i* const row3_0,
+ __m128i* const row3_1, __m128i* const row5_0,
+ __m128i* const row5_1) {
+ __m128i s[5];
+ Prepare5_8<offset>(src, s);
+ const __m128i sum04_lo = VaddlLo8(s[0], s[4]);
+ const __m128i sum04_hi = VaddlHi8(s[0], s[4]);
+ *row3_0 = Sum3WLo16(s + 1);
+ *row3_1 = Sum3WHi16(s + 1);
+ *row5_0 = _mm_add_epi16(sum04_lo, *row3_0);
+ *row5_1 = _mm_add_epi16(sum04_hi, *row3_1);
+}
+
+inline void SumHorizontal(const __m128i src[2], __m128i* const row_sq3_0,
+ __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+ __m128i* const row_sq5_1) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+ SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
+
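+// The Sum343* and Sum565* helpers compute the weighted cross sums used by the
+// self-guided filter: for inputs a, b, c they return 3 * (a + b + c) + b =
+// 3a + 4b + 3c and 5 * (a + b + c) + b = 5a + 6b + 5c respectively.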
+inline __m128i Sum343Lo(const __m128i ma3[3]) {
+ const __m128i sum = Sum3WLo16(ma3);
+ const __m128i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343Hi(const __m128i ma3[3]) {
+ const __m128i sum = Sum3WHi16(ma3);
+ const __m128i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343WLo(const __m128i src[3]) {
+ const __m128i sum = Sum3WLo32(src);
+ const __m128i sum3 = Sum3_32(sum, sum, sum);
+ return VaddwLo16(sum3, src[1]);
+}
+
+inline __m128i Sum343WHi(const __m128i src[3]) {
+ const __m128i sum = Sum3WHi32(src);
+ const __m128i sum3 = Sum3_32(sum, sum, sum);
+ return VaddwHi16(sum3, src[1]);
+}
+
+inline void Sum343W(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum343WLo(s);
+ dst[1] = Sum343WHi(s);
+}
+
+inline __m128i Sum565Lo(const __m128i src[3]) {
+ const __m128i sum = Sum3WLo16(src);
+ const __m128i sum4 = _mm_slli_epi16(sum, 2);
+ const __m128i sum5 = _mm_add_epi16(sum4, sum);
+ return VaddwLo8(sum5, src[1]);
+}
+
+inline __m128i Sum565Hi(const __m128i src[3]) {
+ const __m128i sum = Sum3WHi16(src);
+ const __m128i sum4 = _mm_slli_epi16(sum, 2);
+ const __m128i sum5 = _mm_add_epi16(sum4, sum);
+ return VaddwHi8(sum5, src[1]);
+}
+
+inline __m128i Sum565WLo(const __m128i src[3]) {
+ const __m128i sum = Sum3WLo32(src);
+ const __m128i sum4 = _mm_slli_epi32(sum, 2);
+ const __m128i sum5 = _mm_add_epi32(sum4, sum);
+ return VaddwLo16(sum5, src[1]);
+}
+
+inline __m128i Sum565WHi(const __m128i src[3]) {
+ const __m128i sum = Sum3WHi32(src);
+ const __m128i sum4 = _mm_slli_epi32(sum, 2);
+ const __m128i sum5 = _mm_add_epi32(sum4, sum);
+ return VaddwHi16(sum5, src[1]);
+}
+
+inline void Sum565W(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum565WLo(s);
+ dst[1] = Sum565WHi(s);
+}
+
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ int y = 2;
+ do {
+ __m128i s[2], sq[3];
+ s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass1 - width);
+ sq[0] = SquareLo8(s[0]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m128i row3[2], row5[2], row_sq3[2], row_sq5[2];
+ x -= 16;
+ src += 16;
+ s[1] = LoadUnaligned16Msan(src,
+ sum_width - x + kOverreadInBytesPass1 - width);
+ sq[1] = SquareHi8(s[0]);
+ sq[2] = SquareLo8(s[1]);
+ SumHorizontal<0>(s, &row3[0], &row3[1], &row5[0], &row5[1]);
+ StoreAligned32U16(sum3, row3);
+ StoreAligned32U16(sum5, row5);
+ SumHorizontal(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+ StoreAligned32U32(square_sum3 + 0, row_sq3);
+ StoreAligned32U32(square_sum5 + 0, row_sq5);
+ SumHorizontal(sq + 1, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+ StoreAligned32U32(square_sum3 + 8, row_sq3);
+ StoreAligned32U32(square_sum5 + 8, row_sq5);
+ s[0] = s[1];
+ sq[0] = sq[2];
+ sum3 += 16;
+ sum5 += 16;
+ square_sum3 += 16;
+ square_sum5 += 16;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sum3 += sum_stride - sum_width;
+ sum5 += sum_stride - sum_width;
+ square_sum3 += sum_stride - sum_width;
+ square_sum5 += sum_stride - sum_width;
+ } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sums,
+ uint32_t* square_sums) {
+ static_assert(size == 3 || size == 5, "");
+ constexpr int kOverreadInBytes =
+ (size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2;
+ int y = 2;
+ do {
+ __m128i s[2], sq[3];
+ s[0] = LoadUnaligned16Msan(src, kOverreadInBytes - width);
+ sq[0] = SquareLo8(s[0]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m128i row[2], row_sq[4];
+ x -= 16;
+ src += 16;
+ s[1] = LoadUnaligned16Msan(src, sum_width - x + kOverreadInBytes - width);
+ sq[1] = SquareHi8(s[0]);
+ sq[2] = SquareLo8(s[1]);
+ if (size == 3) {
+ Sum3Horizontal<0>(s, row);
+ Sum3WHorizontal(sq + 0, row_sq + 0);
+ Sum3WHorizontal(sq + 1, row_sq + 2);
+ } else {
+ Sum5Horizontal<0>(s, &row[0], &row[1]);
+ Sum5WHorizontal(sq + 0, row_sq + 0);
+ Sum5WHorizontal(sq + 1, row_sq + 2);
+ }
+ StoreAligned32U16(sums, row);
+ StoreAligned64U32(square_sums, row_sq);
+ s[0] = s[1];
+ sq[0] = sq[2];
+ sums += 16;
+ square_sums += 16;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sums += sum_stride - sum_width;
+ square_sums += sum_stride - sum_width;
+ } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m128i dxd = _mm_madd_epi16(sum, sum);
+  // _mm_mullo_epi32() has high latency. Use shifts and additions instead.
+ // Some compilers could do this for us but we make this explicit.
+ // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
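+  // 9 * x == (x << 3) + x; for n == 25 an extra (x << 4) is added since
+  // 25 * x == 9 * x + 16 * x.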
+ __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+ const __m128i sub = _mm_sub_epi32(axn, dxd);
+ const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+ const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
+ const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
+ const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+ const __m128i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+ return _mm_packus_epi32(z0, z1);
+}
+
+template <int n>
+inline __m128i CalculateB(const __m128i sum, const __m128i ma) {
+ static_assert(n == 9 || n == 25, "");
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
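+  // With kSgrProjReciprocalBits == 12, |one_over_n| is 164 for n == 25 and 455
+  // for n == 9.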
+ const __m128i m0 = VmullLo16(ma, sum);
+ const __m128i m1 = VmullHi16(ma, sum);
+ const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+ const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+ const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+ const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+ return _mm_packus_epi32(b_lo, b_hi);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n, int offset>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+ __m128i* const ma, __m128i* const b) {
+ static_assert(n == 9 || n == 25, "");
+ static_assert(offset == 0 || offset == 8, "");
+ const __m128i idx = _mm_packus_epi16(index, index);
+  // The values are not actually stored and reloaded; the compiler keeps them
+  // in a 64-bit general-purpose register, which is faster than using
+  // _mm_extract_epi8().
+ uint8_t temp[8];
+ StoreLo8(temp, idx);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7);
+ // b = ma * b * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ __m128i maq;
+ if (offset == 0) {
+ maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ } else {
+ maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+ }
+ *b = CalculateB<n>(sum, maq);
+}
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// to get value 0 as the shuffle result. The most significant bit 1 comes
+// either from the comparison instruction or from the sign bit of the index.
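+// For example, a control byte of 20 is greater than 15, so the comparison
+// supplies the high bit and _mm_shuffle_epi8() zeroes that lane; a control
+// byte of -6 (0xfa) already has its sign bit set and is zeroed the same way.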
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
+ __m128i mask;
+ mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
+ mask = _mm_or_si128(mask, index);
+ return _mm_shuffle_epi8(table, mask);
+}
+
+inline __m128i AdjustValue(const __m128i value, const __m128i index,
+ const int threshold) {
+ const __m128i thresholds = _mm_set1_epi8(threshold - 128);
+ const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
+ return _mm_add_epi8(value, offset);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+ __m128i* const ma, __m128i* const b0,
+ __m128i* const b1) {
+  // Use a table lookup to read elements whose indices are less than 48.
+ const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
+ const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
+ const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
+ const __m128i indices = _mm_packus_epi16(index[0], index[1]);
+ __m128i idx;
+  // Clip idx to 127 to apply signed comparison instructions.
+ idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
+  // Elements whose indices are larger than 47 are set to 0 by the shuffles;
+  // elements in [0, 47] are read from the table.
+ // Get shuffle results for indices in range [0, 15].
+ *ma = ShuffleIndex(c0, idx);
+ // Get shuffle results for indices in range [16, 31].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res1 = ShuffleIndex(c1, idx);
+ // Use OR instruction to combine shuffle results together.
+ *ma = _mm_or_si128(*ma, res1);
+ // Get shuffle results for indices in range [32, 47].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res2 = ShuffleIndex(c2, idx);
+ *ma = _mm_or_si128(*ma, res2);
+
+  // For elements whose indices are larger than 47, the values change only
+  // rarely as the index increases, so we use comparison and arithmetic
+  // operations to calculate them.
+  // Add -128 to apply signed comparison instructions.
+ idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
+ *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
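+  // _mm_cmpgt_epi8() yields -1 in lanes where the index exceeds the
+  // threshold, so each AdjustValue() call below decrements |ma| by 1 once the
+  // index passes that threshold, stepping the value from 5 down to 1 (and to
+  // 0 for index 255).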
+  *ma = AdjustValue(*ma, idx, 55);   // 55 is the last index whose value is 5.
+  *ma = AdjustValue(*ma, idx, 72);   // 72 is the last index whose value is 4.
+  *ma = AdjustValue(*ma, idx, 101);  // 101 is the last index whose value is 3.
+  *ma = AdjustValue(*ma, idx, 169);  // 169 is the last index whose value is 2.
+  *ma = AdjustValue(*ma, idx, 254);  // 254 is the last index whose value is 1.
+
+ // b = ma * b * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ *b0 = CalculateB<9>(sum[0], maq0);
+ const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+ *b1 = CalculateB<9>(sum[1], maq1);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+ __m128i ma[2], __m128i b[2]) {
+ __m128i mas;
+ CalculateIntermediate(sum, index, &mas, &b[0], &b[1]);
+ ma[0] = _mm_unpacklo_epi64(ma[0], mas);
+ ma[1] = _mm_srli_si128(mas, 8);
+}
+
+// Note: Replacing the slow LookupIntermediate() with CalculateIntermediate()
+// when calculating 16 intermediate data points has been tried, but the
+// compiler generated even slower code.
+template <int offset>
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i* const b) {
+ static_assert(offset == 0 || offset == 8, "");
+ __m128i sum, index;
+ CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+ LookupIntermediate<25, offset>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i* const b) {
+ __m128i sum, index;
+ CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+ LookupIntermediate<9, 0>(sum, index, ma, b);
+}
+
+inline void Store343_444(const __m128i b3[2], const ptrdiff_t x,
+ __m128i sum_b343[2], __m128i sum_b444[2],
+ uint32_t* const b343, uint32_t* const b444) {
+ __m128i b[3], sum_b111[2];
+ Prepare3_16(b3, b);
+ sum_b111[0] = Sum3WLo32(b);
+ sum_b111[1] = Sum3WHi32(b);
+ sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2);
+ sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2);
+ StoreAligned32U32(b444 + x, sum_b444);
+ sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]);
+ sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]);
+ sum_b343[0] = VaddwLo16(sum_b343[0], b[1]);
+ sum_b343[1] = VaddwHi16(sum_b343[1], b[1]);
+ StoreAligned32U32(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i* const sum_ma444, __m128i sum_b343[2],
+ __m128i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m128i sum_ma111 = Sum3WLo16(ma3);
+ *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+ StoreAligned16(ma444 + x, *sum_ma444);
+ const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+ StoreAligned16(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i* const sum_ma444, __m128i sum_b343[2],
+ __m128i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m128i sum_ma111 = Sum3WHi16(ma3);
+ *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+ StoreAligned16(ma444 + x, *sum_ma444);
+ const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+ StoreAligned16(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma444, sum_b444[2];
+ Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma444, sum_b444[2];
+ Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma343, sum_b343[2];
+ Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma343, sum_b343[2];
+ Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ const __m128i s[2][2], const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s5[2][5], sq5[5][2];
+ sq[0][1] = SquareHi8(s[0][0]);
+ sq[1][1] = SquareHi8(s[1][0]);
+ s5[0][3] = Sum5Horizontal(s[0][0]);
+ StoreAligned16(sum5[3], s5[0][3]);
+ s5[0][4] = Sum5Horizontal(s[1][0]);
+ StoreAligned16(sum5[4], s5[0][4]);
+ Sum5WHorizontal(sq[0], sq5[3]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ Sum5WHorizontal(sq[1], sq5[4]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x3U16(sum5, 0, s5[0]);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const __m128i s[2][2], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma[2],
+ __m128i b[3]) {
+ __m128i s5[2][5], sq5[5][2];
+ sq[0][2] = SquareLo8(s[0][1]);
+ sq[1][2] = SquareLo8(s[1][1]);
+ Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]);
+ StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+ Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]);
+ StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+ Sum5WHorizontal(sq[0] + 1, sq5[3]);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ Sum5WHorizontal(sq[1] + 1, sq5[4]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]);
+
+ sq[0][3] = SquareHi8(s[0][1]);
+ sq[1][3] = SquareHi8(s[1][1]);
+ Sum5WHorizontal(sq[0] + 2, sq5[3]);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ Sum5WHorizontal(sq[1] + 2, sq5[4]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ const __m128i s, const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s5[5], sq5[5][2];
+ sq[1] = SquareHi8(s);
+ s5[3] = s5[4] = Sum5Horizontal(s);
+ Sum5WHorizontal(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+ const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma[2],
+ __m128i b[3]) {
+ __m128i s5[2][5], sq5[5][2];
+ sq[2] = SquareLo8(s[1]);
+ Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]);
+ s5[0][4] = s5[0][3];
+ s5[1][4] = s5[1][3];
+ Sum5WHorizontal(sq + 1, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]);
+
+ sq[3] = SquareHi8(s[1]);
+ Sum5WHorizontal(sq + 2, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ const __m128i s, const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s3[3], sq3[3][2];
+ sq[1] = SquareHi8(s);
+ s3[2] = Sum3Horizontal(s);
+ StoreAligned16(sum3[2], s3[2]);
+ Sum3WHorizontal(sq, sq3[2]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const __m128i s[2], const ptrdiff_t x, const ptrdiff_t sum_width,
+ const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[4], __m128i ma[2],
+ __m128i b[3]) {
+ __m128i s3[4], sq3[3][2], sum[2], index[2];
+ sq[2] = SquareLo8(s[1]);
+ Sum3Horizontal<8>(s, s3 + 2);
+ StoreAligned32U16(sum3[2] + x, s3 + 2);
+ Sum3WHorizontal(sq + 1, sq3[2]);
+ StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]);
+ LoadAligned16x2U16(sum3, x, s3);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+ sq[3] = SquareHi8(s[1]);
+ Sum3WHorizontal(sq + 2, sq3[2]);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, ma, b + 1);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ const __m128i s[2][2], const uint16_t scales[2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma3[2][2],
+ __m128i b3[2][3], __m128i* const ma5, __m128i* const b5) {
+ __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
+ sq[0][1] = SquareHi8(s[0][0]);
+ sq[1][1] = SquareHi8(s[1][0]);
+ SumHorizontalLo(s[0][0], &s3[2], &s5[3]);
+ SumHorizontalLo(s[1][0], &s3[3], &s5[4]);
+ StoreAligned16(sum3[2], s3[2]);
+ StoreAligned16(sum3[3], s3[3]);
+ StoreAligned16(sum5[3], s5[3]);
+ StoreAligned16(sum5[4], s5[4]);
+ SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3], sq3[3]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
+ CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, &ma3[0][0], &b3[0][0], &b3[1][0]);
+ ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const __m128i s[2][2], const ptrdiff_t x, const uint16_t scales[2],
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, __m128i sq[2][4], __m128i ma3[2][2],
+ __m128i b3[2][3], __m128i ma5[2], __m128i b5[3]) {
+ __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2];
+ SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ StoreAligned16(sum3[2] + x + 0, s3[0][2]);
+ StoreAligned16(sum3[2] + x + 8, s3[1][2]);
+ StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+ SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+ StoreAligned16(sum3[3] + x + 0, s3[0][3]);
+ StoreAligned16(sum3[3] + x + 8, s3[1][3]);
+ StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+ sq[0][2] = SquareLo8(s[0][1]);
+ sq[1][2] = SquareLo8(s[1][1]);
+ SumHorizontal(sq[0] + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ SumHorizontal(sq[1] + 1, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3] + x, sq3[3]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]);
+ CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0],
+ &index[1][0]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[1]);
+
+ sq[0][3] = SquareHi8(s[0][1]);
+ sq[1][3] = SquareHi8(s[1][1]);
+ SumHorizontal(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ SumHorizontal(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]);
+ CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1],
+ &index[1][1]);
+ CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 1);
+ CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 1);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[2]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ const __m128i s, const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma3,
+ __m128i* const ma5, __m128i* const b3, __m128i* const b5) {
+ __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+ sq[1] = SquareHi8(s);
+ SumHorizontalLo(s, &s3[2], &s5[3]);
+ SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, 0, s5);
+ s5[4] = s5[3];
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+ const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma3[2],
+ __m128i ma5[2], __m128i b3[3], __m128i b5[3]) {
+ __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2];
+ sq[2] = SquareLo8(s[1]);
+ SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ SumHorizontal(sq + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ s5[0][4] = s5[0][3];
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 1);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
+
+ sq[3] = SquareHi8(s[1]);
+ SumHorizontal(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ s5[1][4] = s5[1][3];
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 2);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, ma3, b3 + 1);
+}
+
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+ const uint8_t* const src1, const int width,
+ const uint32_t scale,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* ma565,
+ uint32_t* b565) {
+ __m128i s[2][2], mas[2], sq[2][4], bs[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ __m128i ma5[3], ma[2], b[4];
+ s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+ bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[0] = Sum565Lo(ma5);
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned32U16(ma565, ma);
+ Sum565W(bs + 0, b + 0);
+ Sum565W(bs + 1, b + 2);
+ StoreAligned64U32(b565, b);
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+ const uint8_t* const src, const int width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+ uint32_t* b444) {
+ __m128i s[2], mas[2], sq[4], bs[3];
+ s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass2 - width);
+ sq[0] = SquareLo8(s[0]);
+ BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ s[1] = LoadUnaligned16Msan(src + x + 16,
+ x + 16 + kOverreadInBytesPass2 - width);
+ BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ __m128i ma3[3];
+ Prepare3_8<0>(mas, ma3);
+ if (calculate444) { // NOLINT(readability-simplify-boolean-expr)
+ Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444Hi(ma3, bs + 1, 8, ma343, ma444, b343, b444);
+ ma444 += 16;
+ b444 += 16;
+ } else {
+ __m128i ma[2], b[4];
+ ma[0] = Sum343Lo(ma3);
+ ma[1] = Sum343Hi(ma3);
+ StoreAligned32U16(ma343, ma);
+ Sum343W(bs + 0, b + 0);
+ Sum343W(bs + 1, b + 2);
+ StoreAligned64U32(b343, b);
+ }
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma343 += 16;
+ b343 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1, const int width,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[2], uint16_t* ma565, uint32_t* const b343[4],
+ uint32_t* const b444[2], uint32_t* b565) {
+ __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], &b5[0]);
+
+ int x = 0;
+ do {
+ __m128i ma[2], b[4], ma3x[3], ma5x[3];
+ s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sum_width, sq, ma3, b3, ma5, b5);
+
+ Prepare3_8<0>(ma3[0], ma3x);
+ ma[0] = Sum343Lo(ma3x);
+ ma[1] = Sum343Hi(ma3x);
+ StoreAligned32U16(ma343[0] + x, ma);
+ Sum343W(b3[0] + 0, b + 0);
+ Sum343W(b3[0] + 1, b + 2);
+ StoreAligned64U32(b343[0] + x, b);
+ Sum565W(b5 + 0, b + 0);
+ Sum565W(b5 + 1, b + 2);
+ StoreAligned64U32(b565, b);
+ Prepare3_8<0>(ma3[1], ma3x);
+ Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444[0], b343[1], b444[0]);
+ Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444[0], b343[1],
+ b444[0]);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[0] = Sum565Lo(ma5x);
+ ma[1] = Sum565Hi(ma5x);
+ StoreAligned32U16(ma565, ma);
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+template <int shift>
+inline __m128i FilterOutput(const __m128i ma_x_src, const __m128i b) {
+ // ma: 255 * 32 = 8160 (13 bits)
+ // b: 65088 * 32 = 2082816 (21 bits)
+ // v: b - ma * 255 (22 bits)
+ const __m128i v = _mm_sub_epi32(b, ma_x_src);
+ // kSgrProjSgrBits = 8
+ // kSgrProjRestoreBits = 4
+ // shift = 4 or 5
+ // v >> 8 or 9 (13 bits)
+ return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
+ const __m128i b[2]) {
+ const __m128i ma_x_src_lo = VmullLo16(ma, src);
+ const __m128i ma_x_src_hi = VmullHi16(ma, src);
+ const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+ const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+ return _mm_packs_epi32(dst_lo, dst_hi); // 13 bits
+}
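A scalar sketch (not part of this patch, helper name illustrative) of what FilterOutput and CalculateFilteredOutput compute per pixel, using the constants noted in the comments above (kSgrProjSgrBits = 8, kSgrProjRestoreBits = 4):

// Illustrative per-pixel equivalent of FilterOutput<shift>() followed by the
// saturating 16-bit pack in CalculateFilteredOutput<shift>().
inline int SgrFilterOutputScalar(int src, int ma, int b, int shift) {
  const int v = b - ma * src;             // fits in 22 bits per the comments
  const int total_shift = 8 + shift - 4;  // kSgrProjSgrBits + shift - kSgrProjRestoreBits
  // Rounding right shift, matching VrshrS32().
  return (v + (1 << (total_shift - 1))) >> total_shift;
}

As used below, shift is 5 in the Pass1/Pass2 helpers, where two or three rows of ma/b sums are added first, and 4 where a single row's 565 sums are used directly.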
+
+inline __m128i CalculateFilteredOutputPass1(const __m128i src, __m128i ma[2],
+ __m128i b[2][2]) {
+ const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
+ __m128i b_sum[2];
+ b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
+ b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i CalculateFilteredOutputPass2(const __m128i src, __m128i ma[3],
+ __m128i b[3][2]) {
+ const __m128i ma_sum = Sum3_16(ma);
+ __m128i b_sum[2];
+ Sum3_32(b, b_sum);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) {
+ const __m128i v_lo =
+ VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m128i v_hi =
+ VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m128i vv = _mm_packs_epi32(v_lo, v_hi);
+ return _mm_add_epi16(src, vv);
+}
+
+inline __m128i SelfGuidedDoubleMultiplier(const __m128i src,
+ const __m128i filter[2], const int w0,
+ const int w2) {
+ __m128i v[2];
+ const __m128i w0_w2 = _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+ const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]);
+ const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]);
+ v[0] = _mm_madd_epi16(w0_w2, f_lo);
+ v[1] = _mm_madd_epi16(w0_w2, f_hi);
+ return SelfGuidedFinal(src, v);
+}
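The w0/w2 packing above is built for _mm_madd_epi16: the low half of each 32-bit lane holds w0 and the high half holds w2, and the two filter outputs are interleaved the same way, so one madd yields w0 * filter0 + w2 * filter1 per pixel. A hedged scalar reading, assuming kSgrProjPrecisionBits is 7 as elsewhere in libgav1 and ignoring the intermediate 16-bit saturation done by _mm_packs_epi32:

// Illustrative scalar equivalent of SelfGuidedDoubleMultiplier() plus
// SelfGuidedFinal() for one pixel (hypothetical helper, not in this patch).
inline int SelfGuidedDoubleScalar(int src, int filter0, int filter1, int w0,
                                  int w2) {
  const int v = w0 * filter0 + w2 * filter1;  // what _mm_madd_epi16 produces
  const int shift = 4 + 7;  // kSgrProjRestoreBits + kSgrProjPrecisionBits
  return src + ((v + (1 << (shift - 1))) >> shift);
}

SelfGuidedSingleMultiplier below is the same idea with a single weight, using widening multiplies instead of madd.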
+
+inline __m128i SelfGuidedSingleMultiplier(const __m128i src,
+ const __m128i filter, const int w0) {
+ // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+ __m128i v[2];
+ v[0] = VmullNLo8(filter, w0);
+ v[1] = VmullNHi8(filter, w0);
+ return SelfGuidedFinal(src, v);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+ const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+ uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i s[2][2], mas[2], sq[2][4], bs[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ __m128i ma[2], ma3[3], b[2][2], sr[2], p[2];
+ s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+ bs);
+ Prepare3_8<0>(mas, ma3);
+ ma[1] = Sum565Lo(ma3);
+ StoreAligned16(ma565[1] + x, ma[1]);
+ Sum565W(bs, b[1]);
+ StoreAligned32U32(b565[1] + x, b[1]);
+ sr[0] = LoadAligned16(src + x);
+ sr[1] = LoadAligned16(src + stride + x);
+ const __m128i sr0_lo = _mm_unpacklo_epi8(sr[0], _mm_setzero_si128());
+ const __m128i sr1_lo = _mm_unpacklo_epi8(sr[1], _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma565[0] + x);
+ LoadAligned32U32(b565[0] + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+ p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+ const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
+ const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
+
+ ma[1] = Sum565Hi(ma3);
+ StoreAligned16(ma565[1] + x + 8, ma[1]);
+ Sum565W(bs + 1, b[1]);
+ StoreAligned32U32(b565[1] + x + 8, b[1]);
+ const __m128i sr0_hi = _mm_unpackhi_epi8(sr[0], _mm_setzero_si128());
+ const __m128i sr1_hi = _mm_unpackhi_epi8(sr[1], _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma565[0] + x + 8);
+ LoadAligned32U32(b565[0] + x + 8, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b);
+ p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]);
+ const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0);
+ StoreAligned16(dst + x, _mm_packus_epi16(d00, d01));
+ const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0);
+ StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11));
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+ uint32_t* b565, uint8_t* const dst) {
+ __m128i s[2], mas[2], sq[4], bs[3];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ sq[0] = SquareLo8(s[0]);
+ BoxFilterPreProcess5LastRowLo(s[0], scale, sum5, square_sum5, sq, &mas[0],
+ &bs[0]);
+
+ int x = 0;
+ do {
+ __m128i ma[2], ma5[3], b[2][2];
+ s[1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5,
+ sq, mas, bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ Sum565W(bs, b[1]);
+ ma[0] = LoadAligned16(ma565);
+ LoadAligned32U32(b565, b[0]);
+ const __m128i sr = LoadAligned16(src + x);
+ const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
+ __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0);
+
+ ma[1] = Sum565Hi(ma5);
+ Sum565W(bs + 1, b[1]);
+ ma[0] = LoadAligned16(ma565 + 8);
+ LoadAligned32U32(b565 + 8, b[0]);
+ const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
+ p = CalculateFilteredOutputPass1(sr_hi, ma, b);
+ const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+ uint32_t* const b444[2], uint8_t* const dst) {
+ __m128i s[2], mas[2], sq[4], bs[3];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass2 - width);
+ sq[0] = SquareLo8(s[0]);
+ BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ s[1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass2 - width);
+ BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ __m128i ma[3], b[3][2], ma3[3];
+ Prepare3_8<0>(mas, ma3);
+ Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2],
+ b444[1]);
+ const __m128i sr = LoadAligned16(src + x);
+ const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma343[0] + x);
+ ma[1] = LoadAligned16(ma444[0] + x);
+ LoadAligned32U32(b343[0] + x, b[0]);
+ LoadAligned32U32(b444[0] + x, b[1]);
+ const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+
+ Store343_444Hi(ma3, bs + 1, x + 8, &ma[2], b[2], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma343[0] + x + 8);
+ ma[1] = LoadAligned16(ma444[0] + x + 8);
+ LoadAligned32U32(b343[0] + x + 8, b[0]);
+ LoadAligned32U32(b444[0] + x + 8, b[1]);
+ const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b);
+ const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+ uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], &b5[0]);
+
+ int x = 0;
+ do {
+ __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
+ s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sum_width, sq, ma3, b3, ma5, b5);
+ Prepare3_8<0>(ma3[0], ma3x[0]);
+ Prepare3_8<0>(ma3[1], ma3x[1]);
+ Prepare3_8<0>(ma5, ma5x);
+ Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+ ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+ b343[3], b444[2]);
+ ma[0][1] = Sum565Lo(ma5x);
+ StoreAligned16(ma565[1] + x, ma[0][1]);
+ Sum565W(b5, b[0][1]);
+ StoreAligned32U32(b565[1] + x, b[0][1]);
+ const __m128i sr0 = LoadAligned16(src + x);
+ const __m128i sr1 = LoadAligned16(src + stride + x);
+ const __m128i sr0_lo = _mm_unpacklo_epi8(sr0, _mm_setzero_si128());
+ const __m128i sr1_lo = _mm_unpacklo_epi8(sr1, _mm_setzero_si128());
+ ma[0][0] = LoadAligned16(ma565[0] + x);
+ LoadAligned32U32(b565[0] + x, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned16(ma343[0] + x);
+ ma[1][1] = LoadAligned16(ma444[0] + x);
+ LoadAligned32U32(b343[0] + x, b[1][0]);
+ LoadAligned32U32(b444[0] + x, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+ const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+ ma[2][0] = LoadAligned16(ma343[1] + x);
+ LoadAligned32U32(b343[1] + x, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+ const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+ Store343_444Hi(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], b[1][2],
+ b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Hi(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], b[2][2], ma343[3],
+ ma444[2], b343[3], b444[2]);
+ ma[0][1] = Sum565Hi(ma5x);
+ StoreAligned16(ma565[1] + x + 8, ma[0][1]);
+ Sum565W(b5 + 1, b[0][1]);
+ StoreAligned32U32(b565[1] + x + 8, b[0][1]);
+ const __m128i sr0_hi = _mm_unpackhi_epi8(sr0, _mm_setzero_si128());
+ const __m128i sr1_hi = _mm_unpackhi_epi8(sr1, _mm_setzero_si128());
+ ma[0][0] = LoadAligned16(ma565[0] + x + 8);
+ LoadAligned32U32(b565[0] + x + 8, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned16(ma343[0] + x + 8);
+ ma[1][1] = LoadAligned16(ma444[0] + x + 8);
+ LoadAligned32U32(b343[0] + x + 8, b[1][0]);
+ LoadAligned32U32(b444[0] + x + 8, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]);
+ const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+ StoreAligned16(dst + x, _mm_packus_epi16(d00, d01));
+ ma[2][0] = LoadAligned16(ma343[1] + x + 8);
+ LoadAligned32U32(b343[1] + x + 8, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]);
+ const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+ StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11));
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+ const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343[4], uint16_t* const ma444[3],
+ uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
+ uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i s[2], ma3[2], ma5[2], sq[4], b3[3], b5[3], ma[3], b[3][2];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ sq[0] = SquareLo8(s[0]);
+ BoxFilterPreProcessLastRowLo(s[0], scales, sum3, sum5, square_sum3,
+ square_sum5, sq, &ma3[0], &ma5[0], &b3[0],
+ &b5[0]);
+
+ int x = 0;
+ do {
+ __m128i ma3x[3], ma5x[3], p[2];
+ s[1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5,
+ square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+ Prepare3_8<0>(ma3, ma3x);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[1] = Sum565Lo(ma5x);
+ Sum565W(b5, b[1]);
+ ma[2] = Sum343Lo(ma3x);
+ Sum343W(b3, b[2]);
+ const __m128i sr = LoadAligned16(src + x);
+ const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma565[0] + x);
+ LoadAligned32U32(b565[0] + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[0] = LoadAligned16(ma343[0] + x);
+ ma[1] = LoadAligned16(ma444[0] + x);
+ LoadAligned32U32(b343[0] + x, b[0]);
+ LoadAligned32U32(b444[0] + x, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+ ma[1] = Sum565Hi(ma5x);
+ Sum565W(b5 + 1, b[1]);
+ ma[2] = Sum343Hi(ma3x);
+ Sum343W(b3 + 1, b[2]);
+ const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma565[0] + x + 8);
+ LoadAligned32U32(b565[0] + x + 8, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
+ ma[0] = LoadAligned16(ma343[0] + x + 8);
+ ma[1] = LoadAligned16(ma444[0] + x + 8);
+ LoadAligned32U32(b343[0] + x + 8, b[0]);
+ LoadAligned32U32(b444[0] + x + 8, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
+ const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ ma3[0] = ma3[1];
+ ma5[0] = ma5[1];
+ b3[0] = b3[2];
+ b5[0] = b5[2];
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+ const RestorationUnitInfo& restoration_info, const uint8_t* src,
+ const uint8_t* const top_border, const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum(top_border, stride, width, sum_stride, sum_width, sum3[0], sum5[1],
+ square_sum3[0], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+ square_sum5, sum_width, ma343, ma444, ma565[0], b343,
+ b444, b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+ square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+ b444, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxFilterLastRow(src + 3, bottom_border + stride, width, sum_width, scales,
+ w0, w2, sum3, sum5, square_sum3, square_sum5, ma343, ma444,
+ ma565, b343, b444, b565, dst);
+ }
+}
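The row buffers set up above are managed as small ring buffers of row pointers. Circulate4PointersBy2 and Circulate5PointersBy2 come from libgav1's common utilities and, judging from their use here, rotate the pointer array left by two so the two oldest rows are recycled for the next pair of incoming rows. A hypothetical generic version, shown only to illustrate the assumed behavior:

// Rotate an array of N row pointers left by two; the two oldest rows move to
// the end so their storage is reused (sketch of the assumed behavior, not the
// actual libgav1 helper).
template <typename T, int N>
inline void CirculatePointersBy2(T* (&p)[N]) {
  T* const t0 = p[0];
  T* const t1 = p[1];
  for (int i = 0; i < N - 2; ++i) p[i] = p[i + 2];
  p[N - 2] = t0;
  p[N - 1] = t1;
}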
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src,
+ const uint8_t* const top_border,
+ const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height, SgrBuffer* const sgr_buffer,
+ uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<5>(top_border, stride, width, sum_stride, sum_width, sum5[1],
+ square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+ ma565[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+ square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+ sum_width, scale, w0, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxFilterPass1LastRow(src, bottom_border + stride, width, sum_width, scale,
+ w0, sum5, square_sum5, ma565[0], b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src,
+ const uint8_t* const top_border,
+ const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height, SgrBuffer* const sgr_buffer,
+ uint8_t* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<3>(top_border, stride, width, sum_stride, sum_width, sum3[0],
+ square_sum3[0]);
+ BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+ sum_width, ma343[0], nullptr, b343[0],
+ nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const uint8_t* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += stride;
+ }
+ BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+ ma343[1], ma444[0], b343[1], b444[0]);
+
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ int y = std::min(height, 2);
+ src += 2;
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
+
+// If |width| is not a multiple of 16, up to 15 more pixels are written to
+// |dest| at the end of each row, since each loop iteration above processes 16
+// pixels. It is safe to overwrite the output as it will not be part of the
+// visible frame.
+void SelfGuidedFilter_SSE4_1(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const void* const top_border, const void* const bottom_border,
+ const ptrdiff_t stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* top = static_cast<const uint8_t*>(top_border);
+ const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+ auto* const dst = static_cast<uint8_t*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3,
+ stride, width, height, sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2,
+ stride, width, height, sgr_buffer, dst);
+ } else {
+ BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride,
+ width, height, sgr_buffer, dst);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_8BPP_SSE4_1(WienerFilter)
+ dsp->loop_restorations[0] = WienerFilter_SSE4_1;
+#else
+ static_cast<void>(WienerFilter_SSE4_1);
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(SelfGuidedFilter)
+ dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1;
+#else
+ static_cast<void>(SelfGuidedFilter_SSE4_1);
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void LoopRestorationInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/loop_restoration_sse4.h b/src/dsp/x86/loop_restoration_sse4.h
new file mode 100644
index 0000000..65b2b11
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_sse4.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_restorations, see the defines below for specifics.
+// These functions are not thread-safe.
+void LoopRestorationInit_SSE4_1();
+void LoopRestorationInit10bpp_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_WienerFilter
+#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
diff --git a/src/dsp/x86/mask_blend_sse4.cc b/src/dsp/x86/mask_blend_sse4.cc
new file mode 100644
index 0000000..d8036be
--- /dev/null
+++ b/src/dsp/x86/mask_blend_sse4.cc
@@ -0,0 +1,447 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/mask_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Width can only be 4 when it is subsampled from a block of width 8, hence
+// subsampling_x is always 1 when this function is called.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) {
+ if (subsampling_x == 1) {
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask));
+ const __m128i mask_val_1 =
+ _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y)));
+ __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+ if (subsampling_y == 1) {
+ const __m128i next_mask_val_0 =
+ _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride));
+ const __m128i next_mask_val_1 =
+ _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride * 3));
+ subsampled_mask = _mm_add_epi16(
+ subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
+ }
+ return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
+ }
+ const __m128i mask_val_0 = Load4(mask);
+ const __m128i mask_val_1 = Load4(mask + mask_stride);
+ return _mm_cvtepu8_epi16(
+ _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
+}
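When the mask is subsampled, GetMask4x2 (and GetMask8 below) average a 2x1 or 2x2 group of mask bytes with rounding: the hadd adds horizontal pairs, the optional second row is added on top, and RightShiftWithRounding_U16 divides by 2 or 4. A scalar sketch for one output element (hypothetical helper, not in this patch):

// Average one 2x1 (or 2x2 when subsampling_y == 1) group of mask values with
// rounding, as the intrinsics above do for several outputs at a time.
inline int SubsampleMaskValue(const uint8_t* mask, ptrdiff_t stride,
                              int output_x, int subsampling_y) {
  int sum = mask[2 * output_x] + mask[2 * output_x + 1];
  if (subsampling_y == 1) {
    sum += mask[stride + 2 * output_x] + mask[stride + 2 * output_x + 1];
  }
  const int shift = 1 + subsampling_y;         // divide by 2 or 4
  return (sum + (1 << (shift - 1))) >> shift;  // RightShiftWithRounding
}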
+
+// This function returns a 16-bit packed mask to fit in _mm_madd_epi16.
+// 16 bits is also the narrowest width for which a horizontal add (hadd)
+// exists, but without subsampling an extra widening conversion from 8 bits is
+// required.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask8(const uint8_t* mask, ptrdiff_t stride) {
+ if (subsampling_x == 1) {
+ const __m128i row_vals = LoadUnaligned16(mask);
+
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+ const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+ __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+
+ if (subsampling_y == 1) {
+ const __m128i next_row_vals = LoadUnaligned16(mask + stride);
+ const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals);
+ const __m128i next_mask_val_1 =
+ _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8));
+ subsampled_mask = _mm_add_epi16(
+ subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
+ }
+ return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ const __m128i mask_val = LoadLo8(mask);
+ return _mm_cvtepu8_epi16(mask_val);
+}
+
+// This version returns 8-bit packed values to fit in _mm_maddubs_epi16 because,
+// when is_inter_intra is true, the prediction values are brought to 8-bit
+// packing as well.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetInterIntraMask8(const uint8_t* mask, ptrdiff_t stride) {
+ if (subsampling_x == 1) {
+ const __m128i row_vals = LoadUnaligned16(mask);
+
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+ const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+ __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+
+ if (subsampling_y == 1) {
+ const __m128i next_row_vals = LoadUnaligned16(mask + stride);
+ const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals);
+ const __m128i next_mask_val_1 =
+ _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8));
+ subsampled_mask = _mm_add_epi16(
+ subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
+ }
+ const __m128i ret =
+ RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
+ return _mm_packus_epi16(ret, ret);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ // Unfortunately there is no shift operation for 8-bit packing, or else we
+ // could return everything with 8-bit packing.
+ const __m128i mask_val = LoadLo8(mask);
+ return mask_val;
+}
+
+inline void WriteMaskBlendLine4x2(const int16_t* const pred_0,
+ const int16_t* const pred_1,
+ const __m128i pred_mask_0,
+ const __m128i pred_mask_1, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i pred_val_0 = LoadAligned16(pred_0);
+ const __m128i pred_val_1 = LoadAligned16(pred_1);
+ const __m128i mask_lo = _mm_unpacklo_epi16(pred_mask_0, pred_mask_1);
+ const __m128i mask_hi = _mm_unpackhi_epi16(pred_mask_0, pred_mask_1);
+ const __m128i pred_lo = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+ const __m128i pred_hi = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+
+ // int res = (mask_value * prediction_0[x] +
+ // (64 - mask_value) * prediction_1[x]) >> 6;
+ const __m128i compound_pred_lo = _mm_madd_epi16(pred_lo, mask_lo);
+ const __m128i compound_pred_hi = _mm_madd_epi16(pred_hi, mask_hi);
+ const __m128i compound_pred = _mm_packus_epi32(
+ _mm_srli_epi32(compound_pred_lo, 6), _mm_srli_epi32(compound_pred_hi, 6));
+
+ // dst[x] = static_cast<Pixel>(
+ // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+ // (1 << kBitdepth8) - 1));
+ const __m128i result = RightShiftWithRounding_S16(compound_pred, 4);
+ const __m128i res = _mm_packus_epi16(result, result);
+ Store4(dst, res);
+ Store4(dst + dst_stride, _mm_srli_si128(res, 4));
+}
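The interleaving above lets a single _mm_madd_epi16 evaluate the commented formula: each 32-bit lane receives mask * pred_0 + (64 - mask) * pred_1. A per-pixel scalar sketch of the whole line, ignoring the intermediate 16-bit saturation of _mm_packus_epi32 and writing out the final clamp that _mm_packus_epi16 provides (hypothetical helper, assumes <algorithm> and <cstdint>):

// Scalar equivalent of one blended pixel in WriteMaskBlendLine4x2().
inline uint8_t MaskBlendPixel(int pred_0, int pred_1, int mask_value) {
  const int res = (mask_value * pred_0 + (64 - mask_value) * pred_1) >> 6;
  const int rounded = (res + 8) >> 4;  // RightShiftWithRounding(res, 4)
  return static_cast<uint8_t>(std::min(std::max(rounded, 0), 255));
}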
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4x4_SSE4(const int16_t* pred_0, const int16_t* pred_1,
+ const uint8_t* mask,
+ const ptrdiff_t mask_stride, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i mask_inverter = _mm_set1_epi16(64);
+ __m128i pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4xH_SSE4(const int16_t* pred_0, const int16_t* pred_1,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ const uint8_t* mask = mask_ptr;
+ if (height == 4) {
+ MaskBlending4x4_SSE4<subsampling_x, subsampling_y>(
+ pred_0, pred_1, mask, mask_stride, dst, dst_stride);
+ return;
+ }
+ const __m128i mask_inverter = _mm_set1_epi16(64);
+ int y = 0;
+ do {
+ __m128i pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+ y += 8;
+ } while (y < height);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend_SSE4(const void* prediction_0, const void* prediction_1,
+ const ptrdiff_t /*prediction_stride_1*/,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride, const int width,
+ const int height, void* dest,
+ const ptrdiff_t dst_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ const ptrdiff_t pred_stride_0 = width;
+ const ptrdiff_t pred_stride_1 = width;
+ if (width == 4) {
+ MaskBlending4xH_SSE4<subsampling_x, subsampling_y>(
+ pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride);
+ return;
+ }
+ const uint8_t* mask = mask_ptr;
+ const __m128i mask_inverter = _mm_set1_epi16(64);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride);
+ // 64 - mask
+ const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ const __m128i mask_lo = _mm_unpacklo_epi16(pred_mask_0, pred_mask_1);
+ const __m128i mask_hi = _mm_unpackhi_epi16(pred_mask_0, pred_mask_1);
+
+ const __m128i pred_val_0 = LoadAligned16(pred_0 + x);
+ const __m128i pred_val_1 = LoadAligned16(pred_1 + x);
+ const __m128i pred_lo = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+ const __m128i pred_hi = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+ // int res = (mask_value * prediction_0[x] +
+ // (64 - mask_value) * prediction_1[x]) >> 6;
+ const __m128i compound_pred_lo = _mm_madd_epi16(pred_lo, mask_lo);
+ const __m128i compound_pred_hi = _mm_madd_epi16(pred_hi, mask_hi);
+
+ const __m128i res = _mm_packus_epi32(_mm_srli_epi32(compound_pred_lo, 6),
+ _mm_srli_epi32(compound_pred_hi, 6));
+ // dst[x] = static_cast<Pixel>(
+ // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+ // (1 << kBitdepth8) - 1));
+ const __m128i result = RightShiftWithRounding_S16(res, 4);
+ StoreLo8(dst + x, _mm_packus_epi16(result, result));
+
+ x += 8;
+ } while (x < width);
+ dst += dst_stride;
+ pred_0 += pred_stride_0;
+ pred_1 += pred_stride_1;
+ mask += mask_stride << subsampling_y;
+ } while (++y < height);
+}
+
+inline void InterIntraWriteMaskBlendLine8bpp4x2(const uint8_t* const pred_0,
+ uint8_t* const pred_1,
+ const ptrdiff_t pred_stride_1,
+ const __m128i pred_mask_0,
+ const __m128i pred_mask_1) {
+ const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);
+
+ const __m128i pred_val_0 = LoadLo8(pred_0);
+ // TODO(b/150326556): One load.
+ __m128i pred_val_1 = Load4(pred_1);
+ pred_val_1 = _mm_or_si128(_mm_slli_si128(Load4(pred_1 + pred_stride_1), 4),
+ pred_val_1);
+ const __m128i pred = _mm_unpacklo_epi8(pred_val_0, pred_val_1);
+ // int res = (mask_value * prediction_1[x] +
+ // (64 - mask_value) * prediction_0[x]) >> 6;
+ const __m128i compound_pred = _mm_maddubs_epi16(pred, pred_mask);
+ const __m128i result = RightShiftWithRounding_U16(compound_pred, 6);
+ const __m128i res = _mm_packus_epi16(result, result);
+
+ Store4(pred_1, res);
+ Store4(pred_1 + pred_stride_1, _mm_srli_si128(res, 4));
+}
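In the inter-intra path the predictions stay 8-bit, so _mm_maddubs_epi16 performs the weighted sum in one step; as the comment notes, the mask weights prediction_1 and (64 - mask) weights prediction_0, and the result only needs a rounded shift by 6. A scalar reading (hypothetical helper, not in this patch):

// Scalar equivalent of one blended pixel of
// InterIntraWriteMaskBlendLine8bpp4x2().
inline uint8_t InterIntraBlendPixel(uint8_t pred_0, uint8_t pred_1,
                                    int mask_value) {
  const int res = mask_value * pred_1 + (64 - mask_value) * pred_0;
  return static_cast<uint8_t>((res + 32) >> 6);  // RightShiftWithRounding_U16(res, 6)
}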
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4x4_SSE4(const uint8_t* pred_0,
+ uint8_t* pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8_t* mask,
+ const ptrdiff_t mask_stride) {
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const __m128i pred_mask_u16_first =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ mask += mask_stride << (1 + subsampling_y);
+ const __m128i pred_mask_u16_second =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ mask += mask_stride << (1 + subsampling_y);
+ __m128i pred_mask_1 =
+ _mm_packus_epi16(pred_mask_u16_first, pred_mask_u16_second);
+ __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
+ InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1);
+ pred_0 += 4 << 1;
+ pred_1 += pred_stride_1 << 1;
+
+ pred_mask_1 = _mm_srli_si128(pred_mask_1, 8);
+ pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
+ InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4xH_SSE4(const uint8_t* pred_0,
+ uint8_t* pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride,
+ const int height) {
+ const uint8_t* mask = mask_ptr;
+ if (height == 4) {
+ InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride);
+ return;
+ }
+ int y = 0;
+ do {
+ InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride);
+ pred_0 += 4 << 2;
+ pred_1 += pred_stride_1 << 2;
+ mask += mask_stride << (2 + subsampling_y);
+
+ InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride);
+ pred_0 += 4 << 2;
+ pred_1 += pred_stride_1 << 2;
+ mask += mask_stride << (2 + subsampling_y);
+ y += 8;
+ } while (y < height);
+}
+
+template <int subsampling_x, int subsampling_y>
+void InterIntraMaskBlend8bpp_SSE4(const uint8_t* prediction_0,
+ uint8_t* prediction_1,
+ const ptrdiff_t prediction_stride_1,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride, const int width,
+ const int height) {
+ if (width == 4) {
+ InterIntraMaskBlending8bpp4xH_SSE4<subsampling_x, subsampling_y>(
+ prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
+ height);
+ return;
+ }
+ const uint8_t* mask = mask_ptr;
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const __m128i pred_mask_1 =
+ GetInterIntraMask8<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride);
+ // 64 - mask
+ const __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
+ const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);
+
+ const __m128i pred_val_0 = LoadLo8(prediction_0 + x);
+ const __m128i pred_val_1 = LoadLo8(prediction_1 + x);
+ const __m128i pred = _mm_unpacklo_epi8(pred_val_0, pred_val_1);
+ // int res = (mask_value * prediction_1[x] +
+ // (64 - mask_value) * prediction_0[x]) >> 6;
+ const __m128i compound_pred = _mm_maddubs_epi16(pred, pred_mask);
+ const __m128i result = RightShiftWithRounding_U16(compound_pred, 6);
+ const __m128i res = _mm_packus_epi16(result, result);
+
+ StoreLo8(prediction_1 + x, res);
+
+ x += 8;
+ } while (x < width);
+ prediction_0 += width;
+ prediction_1 += prediction_stride_1;
+ mask += mask_stride << subsampling_y;
+ } while (++y < height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend444)
+ dsp->mask_blend[0][0] = MaskBlend_SSE4<0, 0>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend422)
+ dsp->mask_blend[1][0] = MaskBlend_SSE4<1, 0>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend420)
+ dsp->mask_blend[2][0] = MaskBlend_SSE4<1, 1>;
+#endif
+ // The is_inter_intra index of mask_blend[][] is replaced by
+ // inter_intra_mask_blend_8bpp[] in 8-bit.
+#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp444)
+ dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4<0, 0>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp422)
+ dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4<1, 0>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp420)
+ dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4<1, 1>;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void MaskBlendInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void MaskBlendInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/mask_blend_sse4.h b/src/dsp/x86/mask_blend_sse4.h
new file mode 100644
index 0000000..52b0b5c
--- /dev/null
+++ b/src/dsp/x86/mask_blend_sse4.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mask_blend and Dsp::inter_intra_mask_blend_8bpp. This
+// function is not thread-safe.
+void MaskBlendInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend444
+#define LIBGAV1_Dsp8bpp_MaskBlend444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend422
+#define LIBGAV1_Dsp8bpp_MaskBlend422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend420
+#define LIBGAV1_Dsp8bpp_MaskBlend420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_
diff --git a/src/dsp/x86/motion_field_projection_sse4.cc b/src/dsp/x86/motion_field_projection_sse4.cc
new file mode 100644
index 0000000..c506941
--- /dev/null
+++ b/src/dsp/x86/motion_field_projection_sse4.cc
@@ -0,0 +1,397 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline __m128i LoadDivision(const __m128i division_table,
+ const __m128i reference_offset) {
+ const __m128i kOne = _mm_set1_epi16(0x0100);
+ const __m128i t = _mm_add_epi8(reference_offset, reference_offset);
+ const __m128i tt = _mm_unpacklo_epi8(t, t);
+ const __m128i idx = _mm_add_epi8(tt, kOne);
+ return _mm_shuffle_epi8(division_table, idx);
+}
+
+inline __m128i MvProjection(const __m128i mv, const __m128i denominator,
+ const int numerator) {
+ const __m128i m0 = _mm_madd_epi16(mv, denominator);
+ const __m128i m = _mm_mullo_epi32(m0, _mm_set1_epi32(numerator));
+ // Add the sign (0 or -1) to round towards zero.
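+  // For example, m = -8192 becomes (-8192 - 1 + 8192) >> 14 = -1, mirroring
+  // m = 8192 -> (8192 + 0 + 8192) >> 14 = 1, so positive and negative values
+  // round symmetrically.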
+ const __m128i sign = _mm_srai_epi32(m, 31);
+ const __m128i add_sign = _mm_add_epi32(m, sign);
+ const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13));
+ return _mm_srai_epi32(sum, 14);
+}
+
+inline __m128i MvProjectionClip(const __m128i mv, const __m128i denominator,
+ const int numerator) {
+ const __m128i mv0 = _mm_unpacklo_epi16(mv, _mm_setzero_si128());
+ const __m128i mv1 = _mm_unpackhi_epi16(mv, _mm_setzero_si128());
+ const __m128i denorm0 = _mm_unpacklo_epi16(denominator, _mm_setzero_si128());
+ const __m128i denorm1 = _mm_unpackhi_epi16(denominator, _mm_setzero_si128());
+ const __m128i s0 = MvProjection(mv0, denorm0, numerator);
+ const __m128i s1 = MvProjection(mv1, denorm1, numerator);
+ const __m128i projection = _mm_packs_epi32(s0, s1);
+ const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp);
+ const __m128i projection_mv_clamp_negative =
+ _mm_set1_epi16(-kProjectionMvClamp);
+ const __m128i clamp = _mm_min_epi16(projection, projection_mv_clamp);
+ return _mm_max_epi16(clamp, projection_mv_clamp_negative);
+}
+
+inline __m128i Project_SSE4_1(const __m128i delta, const __m128i dst_sign) {
+ // Add 63 to negative delta so that it shifts towards zero.
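+  // delta_sign is 0 or 0xffff in each lane, so the logical shift by 10 gives
+  // 0 or 63. Adding that before the arithmetic shift by 6 makes the division
+  // truncate toward zero, and the xor/subtract with dst_sign below
+  // conditionally negates the result when dst_sign is -1.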
+ const __m128i delta_sign = _mm_srai_epi16(delta, 15);
+ const __m128i delta_sign_63 = _mm_srli_epi16(delta_sign, 10);
+ const __m128i delta_adjust = _mm_add_epi16(delta, delta_sign_63);
+ const __m128i offset0 = _mm_srai_epi16(delta_adjust, 6);
+ const __m128i offset1 = _mm_xor_si128(offset0, dst_sign);
+ return _mm_sub_epi16(offset1, dst_sign);
+}
+
+inline void GetPosition(
+ const __m128i division_table, const MotionVector* const mv,
+ const int numerator, const int x8_start, const int x8_end, const int x8,
+ const __m128i& r_offsets, const __m128i& source_reference_type8,
+ const __m128i& skip_r, const __m128i& y8_floor8, const __m128i& y8_ceiling8,
+ const __m128i& d_sign, const int delta, __m128i* const r,
+ __m128i* const position_xy, int64_t* const skip_64, __m128i mvs[2]) {
+ const auto* const mv_int = reinterpret_cast<const int32_t*>(mv + x8);
+ *r = _mm_shuffle_epi8(r_offsets, source_reference_type8);
+ const __m128i denorm = LoadDivision(division_table, source_reference_type8);
+ __m128i projection_mv[2];
+ mvs[0] = LoadUnaligned16(mv_int + 0);
+ mvs[1] = LoadUnaligned16(mv_int + 4);
+  // Deinterleave the x and y components.
+ const __m128i kShuffle =
+ _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+ const __m128i mv0 = _mm_shuffle_epi8(mvs[0], kShuffle);
+ const __m128i mv1 = _mm_shuffle_epi8(mvs[1], kShuffle);
+ const __m128i mv_y = _mm_unpacklo_epi64(mv0, mv1);
+ const __m128i mv_x = _mm_unpackhi_epi64(mv0, mv1);
+ // numerator could be 0.
+ projection_mv[0] = MvProjectionClip(mv_y, denorm, numerator);
+ projection_mv[1] = MvProjectionClip(mv_x, denorm, numerator);
+ // Do not update the motion vector if the block position is not valid or
+ // if position_x8 is outside the current range of x8_start and x8_end.
+ // Note that position_y8 will always be within the range of y8_start and
+ // y8_end.
+ // After subtracting the base, valid projections are within 8-bit.
+ const __m128i position_y = Project_SSE4_1(projection_mv[0], d_sign);
+ const __m128i position_x = Project_SSE4_1(projection_mv[1], d_sign);
+ const __m128i positions = _mm_packs_epi16(position_x, position_y);
+ const __m128i k01234567 =
+ _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0);
+ *position_xy = _mm_add_epi8(positions, k01234567);
+ const int x8_floor = std::max(
+ x8_start - x8, delta - kProjectionMvMaxHorizontalOffset); // [-8, 8]
+ const int x8_ceiling =
+ std::min(x8_end - x8, delta + 8 + kProjectionMvMaxHorizontalOffset) -
+ 1; // [-1, 15]
+ const __m128i x8_floor8 = _mm_set1_epi8(x8_floor);
+ const __m128i x8_ceiling8 = _mm_set1_epi8(x8_ceiling);
+ const __m128i floor_xy = _mm_unpacklo_epi64(x8_floor8, y8_floor8);
+ const __m128i ceiling_xy = _mm_unpacklo_epi64(x8_ceiling8, y8_ceiling8);
+ const __m128i underflow = _mm_cmplt_epi8(*position_xy, floor_xy);
+ const __m128i overflow = _mm_cmpgt_epi8(*position_xy, ceiling_xy);
+ const __m128i out = _mm_or_si128(underflow, overflow);
+ const __m128i skip_low = _mm_or_si128(skip_r, out);
+ const __m128i skip = _mm_or_si128(skip_low, _mm_srli_si128(out, 8));
+ StoreLo8(skip_64, skip);
+}
+
+template <int idx>
+inline void Store(const __m128i position, const __m128i reference_offset,
+ const __m128i mv, int8_t* dst_reference_offset,
+ MotionVector* dst_mv) {
+ const ptrdiff_t offset =
+ static_cast<int16_t>(_mm_extract_epi16(position, idx));
+ if ((idx & 3) == 0) {
+ dst_mv[offset].mv32 = _mm_cvtsi128_si32(mv);
+ } else {
+ dst_mv[offset].mv32 = _mm_extract_epi32(mv, idx & 3);
+ }
+ dst_reference_offset[offset] = _mm_extract_epi8(reference_offset, idx);
+}
+
+template <int idx>
+inline void CheckStore(const int8_t* skips, const __m128i position,
+ const __m128i reference_offset, const __m128i mv,
+ int8_t* dst_reference_offset, MotionVector* dst_mv) {
+ if (skips[idx] == 0) {
+ Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv);
+ }
+}
+
+// 7.9.2.
+void MotionFieldProjectionKernel_SSE4_1(
+ const ReferenceInfo& reference_info,
+ const int reference_to_current_with_sign, const int dst_sign,
+ const int y8_start, const int y8_end, const int x8_start, const int x8_end,
+ TemporalMotionField* const motion_field) {
+ const ptrdiff_t stride = motion_field->mv.columns();
+ // The column range has to be offset by kProjectionMvMaxHorizontalOffset since
+ // coordinates in that range could end up being position_x8 because of
+ // projection.
+ const int adjusted_x8_start =
+ std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
+ const int adjusted_x8_end = std::min(
+ x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
+ const int adjusted_x8_end8 = adjusted_x8_end & ~7;
+ const int leftover = adjusted_x8_end - adjusted_x8_end8;
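+  // adjusted_x8_end8 is adjusted_x8_end rounded down to a multiple of 8. The
+  // main loop below handles [adjusted_x8_start, adjusted_x8_end8) with full
+  // 8-wide vectors; the remaining |leftover| (0 to 7) columns are handled
+  // after it.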
+ const int8_t* const reference_offsets =
+ reference_info.relative_distance_to.data();
+ const bool* const skip_references = reference_info.skip_references.data();
+ const int16_t* const projection_divisions =
+ reference_info.projection_divisions.data();
+ const ReferenceFrameType* source_reference_types =
+ &reference_info.motion_field_reference_frame[y8_start][0];
+ const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
+ int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
+ MotionVector* dst_mv = motion_field->mv[y8_start];
+ const __m128i d_sign = _mm_set1_epi16(dst_sign);
+
+ static_assert(sizeof(int8_t) == sizeof(bool), "");
+ static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), "");
+ static_assert(sizeof(int32_t) == sizeof(MotionVector), "");
+ assert(dst_sign == 0 || dst_sign == -1);
+ assert(stride == motion_field->reference_offset.columns());
+ assert((y8_start & 7) == 0);
+ assert((adjusted_x8_start & 7) == 0);
+  // The final position calculation is represented with int16_t. A valid
+  // position_y8 is at most 7 rows from its base, and the horizontal offset is
+  // at most |stride - 1|, which gives the assertion below. This means the
+  // optimization works for frame widths up to 32K pixels (each position is an
+  // 8x8 block).
+ assert(8 * stride <= 32768);
+ const __m128i skip_reference = LoadLo8(skip_references);
+ const __m128i r_offsets = LoadLo8(reference_offsets);
+ const __m128i division_table = LoadUnaligned16(projection_divisions);
+
+ int y8 = y8_start;
+ do {
+ const int y8_floor = (y8 & ~7) - y8; // [-7, 0]
+ const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8) - 1; // [0, 7]
+ const __m128i y8_floor8 = _mm_set1_epi8(y8_floor);
+ const __m128i y8_ceiling8 = _mm_set1_epi8(y8_ceiling);
+ int x8;
+
+ for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) {
+ const __m128i source_reference_type8 =
+ LoadLo8(source_reference_types + x8);
+ const __m128i skip_r =
+ _mm_shuffle_epi8(skip_reference, source_reference_type8);
+ int64_t early_skip;
+ StoreLo8(&early_skip, skip_r);
+ // Early termination #1 if all are skips. Chance is typically ~30-40%.
+ if (early_skip == -1) continue;
+ int64_t skip_64;
+ __m128i r, position_xy, mvs[2];
+ GetPosition(division_table, mv, reference_to_current_with_sign, x8_start,
+ x8_end, x8, r_offsets, source_reference_type8, skip_r,
+ y8_floor8, y8_ceiling8, d_sign, 0, &r, &position_xy, &skip_64,
+ mvs);
+ // Early termination #2 if all are skips.
+ // Chance is typically ~15-25% after Early termination #1.
+ if (skip_64 == -1) continue;
+ const __m128i p_y = _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8));
+ const __m128i p_x = _mm_cvtepi8_epi16(position_xy);
+ const __m128i p_y_offset = _mm_mullo_epi16(p_y, _mm_set1_epi16(stride));
+ const __m128i pos = _mm_add_epi16(p_y_offset, p_x);
+ const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8));
+ if (skip_64 == 0) {
+ // Store all. Chance is typically ~70-85% after Early termination #2.
+ Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ } else {
+ // Check and store each.
+ // Chance is typically ~15-30% after Early termination #2.
+ // The compiler is smart enough to not create the local buffer skips[].
+ int8_t skips[8];
+ memcpy(skips, &skip_64, sizeof(skips));
+ CheckStore<0>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ }
+ }
+
+    // The following leftover processing cannot be moved out of the do...while
+    // loop. Doing so may change the order in which results are stored to the
+    // same position.
+ if (leftover > 0) {
+ // Use SIMD only when leftover is at least 4, and there are at least 8
+ // elements in a row.
+ if (leftover >= 4 && adjusted_x8_start < adjusted_x8_end8) {
+ // Process the last 8 elements to avoid loading invalid memory. Some
+ // elements may have been processed in the above loop, which is OK.
+ const int delta = 8 - leftover;
+ x8 = adjusted_x8_end - 8;
+ const __m128i source_reference_type8 =
+ LoadLo8(source_reference_types + x8);
+ const __m128i skip_r =
+ _mm_shuffle_epi8(skip_reference, source_reference_type8);
+ int64_t early_skip;
+ StoreLo8(&early_skip, skip_r);
+ // Early termination #1 if all are skips.
+ if (early_skip != -1) {
+ int64_t skip_64;
+ __m128i r, position_xy, mvs[2];
+ GetPosition(division_table, mv, reference_to_current_with_sign,
+ x8_start, x8_end, x8, r_offsets, source_reference_type8,
+ skip_r, y8_floor8, y8_ceiling8, d_sign, delta, &r,
+ &position_xy, &skip_64, mvs);
+ // Early termination #2 if all are skips.
+ if (skip_64 != -1) {
+ const __m128i p_y =
+ _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8));
+ const __m128i p_x = _mm_cvtepi8_epi16(position_xy);
+ const __m128i p_y_offset =
+ _mm_mullo_epi16(p_y, _mm_set1_epi16(stride));
+ const __m128i pos = _mm_add_epi16(p_y_offset, p_x);
+ const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8));
+ // Store up to 7 elements since leftover is at most 7.
+ if (skip_64 == 0) {
+ // Store all.
+ Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ } else {
+ // Check and store each.
+ // The compiler is smart enough to not create the local buffer
+ // skips[].
+ int8_t skips[8];
+ memcpy(skips, &skip_64, sizeof(skips));
+ CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ }
+ }
+ }
+ } else {
+ for (; x8 < adjusted_x8_end; ++x8) {
+ const int source_reference_type = source_reference_types[x8];
+ if (skip_references[source_reference_type]) continue;
+ MotionVector projection_mv;
+ // reference_to_current_with_sign could be 0.
+ GetMvProjection(mv[x8], reference_to_current_with_sign,
+ projection_divisions[source_reference_type],
+ &projection_mv);
+ // Do not update the motion vector if the block position is not valid
+ // or if position_x8 is outside the current range of x8_start and
+ // x8_end. Note that position_y8 will always be within the range of
+ // y8_start and y8_end.
+ const int position_y8 = Project(0, projection_mv.mv[0], dst_sign);
+ if (position_y8 < y8_floor || position_y8 > y8_ceiling) continue;
+ const int x8_base = x8 & ~7;
+ const int x8_floor =
+ std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset);
+ const int x8_ceiling =
+ std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset);
+ const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign);
+ if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue;
+ dst_mv[position_y8 * stride + position_x8] = mv[x8];
+ dst_reference_offset[position_y8 * stride + position_x8] =
+ reference_offsets[source_reference_type];
+ }
+ }
+ }
+
+ source_reference_types += stride;
+ mv += stride;
+ dst_reference_offset += stride;
+ dst_mv += stride;
+ } while (++y8 < y8_end);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1;
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1;
+}
+#endif
+
+} // namespace
+
+void MotionFieldProjectionInit_SSE4_1() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void MotionFieldProjectionInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/motion_field_projection_sse4.h b/src/dsp/x86/motion_field_projection_sse4.h
new file mode 100644
index 0000000..c05422c
--- /dev/null
+++ b/src/dsp/x86/motion_field_projection_sse4.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::motion_field_projection_kernel. This function is not
+// thread-safe.
+void MotionFieldProjectionInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel
+#define LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
diff --git a/src/dsp/x86/motion_vector_search_sse4.cc b/src/dsp/x86/motion_vector_search_sse4.cc
new file mode 100644
index 0000000..e9cdd4c
--- /dev/null
+++ b/src/dsp/x86/motion_vector_search_sse4.cc
@@ -0,0 +1,262 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
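+// A 32-bit copy of kProjectionMvDivisionLookup, so that entries can be
+// inserted directly into epi32 lanes in MvProjectionSingleClip().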
+constexpr int kProjectionMvDivisionLookup_32bit[kMaxFrameDistance + 1] = {
+ 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638,
+ 1489, 1365, 1260, 1170, 1092, 1024, 963, 910, 862, 819, 780,
+ 744, 712, 682, 655, 630, 606, 585, 564, 546, 528};
+
+inline __m128i MvProjection(const __m128i mv, const __m128i denominator,
+ const __m128i numerator) {
+ const __m128i m0 = _mm_madd_epi16(mv, denominator);
+ const __m128i m = _mm_mullo_epi32(m0, numerator);
+ // Add the sign (0 or -1) to round towards zero.
+ const __m128i sign = _mm_srai_epi32(m, 31);
+ const __m128i add_sign = _mm_add_epi32(m, sign);
+ const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13));
+ return _mm_srai_epi32(sum, 14);
+}
+
+inline __m128i MvProjectionClip(const __m128i mvs[2],
+ const __m128i denominators[2],
+ const __m128i numerator) {
+ const __m128i s0 = MvProjection(mvs[0], denominators[0], numerator);
+ const __m128i s1 = MvProjection(mvs[1], denominators[1], numerator);
+ const __m128i mv = _mm_packs_epi32(s0, s1);
+ const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp);
+ const __m128i projection_mv_clamp_negative =
+ _mm_set1_epi16(-kProjectionMvClamp);
+ const __m128i clamp = _mm_min_epi16(mv, projection_mv_clamp);
+ return _mm_max_epi16(clamp, projection_mv_clamp_negative);
+}
+
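+// Processes two temporal MVs per call. Each temporal MV is projected against
+// both entries of |reference_offsets|, yielding one CompoundMotionVector per
+// temporal MV, i.e. two compound candidates per call.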
+inline __m128i MvProjectionCompoundClip(
+ const MotionVector* const temporal_mvs,
+ const int8_t temporal_reference_offsets[2],
+ const int reference_offsets[2]) {
+ const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
+ const __m128i temporal_mv = LoadLo8(tmvs);
+ const __m128i temporal_mv_0 = _mm_cvtepu16_epi32(temporal_mv);
+ __m128i mvs[2], denominators[2];
+ mvs[0] = _mm_unpacklo_epi64(temporal_mv_0, temporal_mv_0);
+ mvs[1] = _mm_unpackhi_epi64(temporal_mv_0, temporal_mv_0);
+ denominators[0] = _mm_set1_epi32(
+ kProjectionMvDivisionLookup[temporal_reference_offsets[0]]);
+ denominators[1] = _mm_set1_epi32(
+ kProjectionMvDivisionLookup[temporal_reference_offsets[1]]);
+ const __m128i offsets = LoadLo8(reference_offsets);
+ const __m128i numerator = _mm_unpacklo_epi32(offsets, offsets);
+ return MvProjectionClip(mvs, denominators, numerator);
+}
+
+inline __m128i MvProjectionSingleClip(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets,
+ const int reference_offset) {
+ const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs);
+ const __m128i temporal_mv = LoadAligned16(tmvs);
+ __m128i lookup = _mm_cvtsi32_si128(
+ kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[0]]);
+ lookup = _mm_insert_epi32(
+ lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[1]],
+ 1);
+ lookup = _mm_insert_epi32(
+ lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[2]],
+ 2);
+ lookup = _mm_insert_epi32(
+ lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[3]],
+ 3);
+ __m128i mvs[2], denominators[2];
+ mvs[0] = _mm_unpacklo_epi16(temporal_mv, _mm_setzero_si128());
+ mvs[1] = _mm_unpackhi_epi16(temporal_mv, _mm_setzero_si128());
+ denominators[0] = _mm_unpacklo_epi32(lookup, lookup);
+ denominators[1] = _mm_unpackhi_epi32(lookup, lookup);
+ const __m128i numerator = _mm_set1_epi32(reference_offset);
+ return MvProjectionClip(mvs, denominators, numerator);
+}
+
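+// Rounds odd MV components one unit toward zero so that every component
+// becomes even: subtracting the sign (0 or -1) adds 1 to negative values
+// before the low bit is cleared.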
+inline void LowPrecision(const __m128i mv, void* const candidate_mvs) {
+ const __m128i kRoundDownMask = _mm_set1_epi16(~1);
+ const __m128i sign = _mm_srai_epi16(mv, 15);
+ const __m128i sub_sign = _mm_sub_epi16(mv, sign);
+ const __m128i d = _mm_and_si128(sub_sign, kRoundDownMask);
+ StoreAligned16(candidate_mvs, d);
+}
+
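+// Rounds each MV component to a multiple of 8, moving it by at most 4 units
+// and breaking exact ties toward zero. This is equivalent to rounding |mv| + 3
+// down to a multiple of 8 and then restoring the sign.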
+inline void ForceInteger(const __m128i mv, void* const candidate_mvs) {
+ const __m128i kRoundDownMask = _mm_set1_epi16(~7);
+ const __m128i sign = _mm_srai_epi16(mv, 15);
+ const __m128i mv1 = _mm_add_epi16(mv, _mm_set1_epi16(3));
+ const __m128i mv2 = _mm_sub_epi16(mv1, sign);
+ const __m128i mv3 = _mm_and_si128(mv2, kRoundDownMask);
+ StoreAligned16(candidate_mvs, mv3);
+}
+
+void MvProjectionCompoundLowPrecision_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true and is skipped
+  // here. To help the compiler, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One more element could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionCompoundClip(
+ temporal_mvs + i, temporal_reference_offsets + i, offsets);
+ LowPrecision(mv, candidate_mvs + i);
+ i += 2;
+ } while (i < count);
+}
+
+void MvProjectionCompoundForceInteger_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true and is skipped
+  // here. To help the compiler, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One more element could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionCompoundClip(
+ temporal_mvs + i, temporal_reference_offsets + i, offsets);
+ ForceInteger(mv, candidate_mvs + i);
+ i += 2;
+ } while (i < count);
+}
+
+void MvProjectionCompoundHighPrecision_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true and is skipped
+  // here. To help the compiler, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One more element could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionCompoundClip(
+ temporal_mvs + i, temporal_reference_offsets + i, offsets);
+ StoreAligned16(candidate_mvs + i, mv);
+ i += 2;
+ } while (i < count);
+}
+
+void MvProjectionSingleLowPrecision_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ // Up to three more elements could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionSingleClip(
+ temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
+ LowPrecision(mv, candidate_mvs + i);
+ i += 4;
+ } while (i < count);
+}
+
+void MvProjectionSingleForceInteger_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ // Up to three more elements could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionSingleClip(
+ temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
+ ForceInteger(mv, candidate_mvs + i);
+ i += 4;
+ } while (i < count);
+}
+
+void MvProjectionSingleHighPrecision_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ // Up to three more elements could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionSingleClip(
+ temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
+ StoreAligned16(candidate_mvs + i, mv);
+ i += 4;
+ } while (i < count);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1;
+ dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1;
+ dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1;
+ dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1;
+ dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1;
+ dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1;
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1;
+ dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1;
+ dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1;
+ dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1;
+ dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1;
+ dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1;
+}
+#endif
+
+} // namespace
+
+void MotionVectorSearchInit_SSE4_1() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void MotionVectorSearchInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/motion_vector_search_sse4.h b/src/dsp/x86/motion_vector_search_sse4.h
new file mode 100644
index 0000000..d65b392
--- /dev/null
+++ b/src/dsp/x86/motion_vector_search_sse4.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This
+// function is not thread-safe.
+void MotionVectorSearchInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MotionVectorSearch
+#define LIBGAV1_Dsp8bpp_MotionVectorSearch LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_
diff --git a/src/dsp/x86/obmc_sse4.cc b/src/dsp/x86/obmc_sse4.cc
new file mode 100644
index 0000000..3a1d1fd
--- /dev/null
+++ b/src/dsp/x86/obmc_sse4.cc
@@ -0,0 +1,329 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
+inline void OverlapBlendFromLeft2xH_SSE4_1(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+ const __m128i mask_val = _mm_shufflelo_epi16(Load4(kObmcMask), 0);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
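+  // For each pixel, _mm_maddubs_epi16(terms, masks) below computes
+  // pred * mask + obmc_pred * (64 - mask); RightShiftWithRounding_U16 then
+  // adds 32 and shifts right by 6 to complete the blend.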
+ int y = height;
+ do {
+ const __m128i pred_val = Load2x2(pred, pred + prediction_stride);
+ const __m128i obmc_pred_val =
+ Load2x2(obmc_pred, obmc_pred + obmc_prediction_stride);
+
+ const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+ const __m128i packed_result = _mm_packus_epi16(result, result);
+ Store2(pred, packed_result);
+ pred += prediction_stride;
+ const int16_t second_row_result = _mm_extract_epi16(packed_result, 1);
+ memcpy(pred, &second_row_result, sizeof(second_row_result));
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
+ y -= 2;
+ } while (y != 0);
+}
+
+inline void OverlapBlendFromLeft4xH_SSE4_1(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+ const __m128i mask_val = Load4(kObmcMask + 2);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ // Duplicate first half of vector.
+ const __m128i masks =
+ _mm_shuffle_epi32(_mm_unpacklo_epi8(mask_val, obmc_mask_val), 0x44);
+ int y = height;
+ do {
+ const __m128i pred_val0 = Load4(pred);
+ const __m128i obmc_pred_val0 = Load4(obmc_pred);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ // Place the second row of each source in the second four bytes.
+ const __m128i pred_val =
+ _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
+ const __m128i obmc_pred_val = _mm_alignr_epi8(
+ Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
+ const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+ const __m128i packed_result = _mm_packus_epi16(result, result);
+ Store4(pred - prediction_stride, packed_result);
+ const int second_row_result = _mm_extract_epi32(packed_result, 1);
+ memcpy(pred, &second_row_result, sizeof(second_row_result));
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ y -= 2;
+ } while (y != 0);
+}
+
+inline void OverlapBlendFromLeft8xH_SSE4_1(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const __m128i mask_val = LoadLo8(kObmcMask + 6);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ int y = height;
+ do {
+ const __m128i pred_val = LoadLo8(pred);
+ const __m128i obmc_pred_val = LoadLo8(obmc_pred);
+ const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+
+ StoreLo8(pred, _mm_packus_epi16(result, result));
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (--y != 0);
+}
+
+void OverlapBlendFromLeft_SSE4_1(void* const prediction,
+ const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint8_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+
+ if (width == 2) {
+ OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+ if (width == 4) {
+ OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+ if (width == 8) {
+ OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const uint8_t* mask = kObmcMask + width - 2;
+ int x = 0;
+ do {
+ pred = static_cast<uint8_t*>(prediction) + x;
+ obmc_pred = static_cast<const uint8_t*>(obmc_prediction) + x;
+ const __m128i mask_val = LoadUnaligned16(mask + x);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks_lo = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ const __m128i masks_hi = _mm_unpackhi_epi8(mask_val, obmc_mask_val);
+
+ int y = 0;
+ do {
+ const __m128i pred_val = LoadUnaligned16(pred);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+ const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result_lo =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks_lo), 6);
+ const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+ const __m128i result_hi =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks_hi), 6);
+ StoreUnaligned16(pred, _mm_packus_epi16(result_lo, result_hi));
+
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (++y < height);
+ x += 16;
+ } while (x < width);
+}
+
+inline void OverlapBlendFromTop4xH_SSE4_1(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const __m128i mask_inverter = _mm_set1_epi16(64);
+ const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
+ const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);
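+  // mask_preinverter holds the byte pair (1, -1) in every 16-bit lane.
+  // _mm_sign_epi8 below negates every second mask byte, so subtracting the
+  // result from mask_inverter's (64, 0) byte pairs produces interleaved
+  // (64 - mask, mask) values for _mm_maddubs_epi16 against (obmc_pred, pred).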
+
+ const uint8_t* mask = kObmcMask + height - 2;
+ const int compute_height = height - (height >> 2);
+ int y = 0;
+ do {
+ // First mask in the first half, second mask in the second half.
+ const __m128i mask_val = _mm_shuffle_epi8(
+ _mm_cvtsi32_si128(*reinterpret_cast<const uint16_t*>(mask + y)),
+ mask_shuffler);
+ const __m128i masks =
+ _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
+ const __m128i pred_val0 = Load4(pred);
+
+ const __m128i obmc_pred_val0 = Load4(obmc_pred);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ const __m128i pred_val =
+ _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
+ const __m128i obmc_pred_val = _mm_alignr_epi8(
+ Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
+ const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val);
+ const __m128i result =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+
+ const __m128i packed_result = _mm_packus_epi16(result, result);
+ Store4(pred - prediction_stride, packed_result);
+ Store4(pred, _mm_srli_si128(packed_result, 4));
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ y += 2;
+ } while (y < compute_height);
+}
+
+inline void OverlapBlendFromTop8xH_SSE4_1(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const uint8_t* mask = kObmcMask + height - 2;
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const int compute_height = height - (height >> 2);
+ int y = compute_height;
+ do {
+ const __m128i mask_val = _mm_set1_epi8(mask[compute_height - y]);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ const __m128i pred_val = LoadLo8(pred);
+ const __m128i obmc_pred_val = LoadLo8(obmc_pred);
+ const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+
+ StoreLo8(pred, _mm_packus_epi16(result, result));
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (--y != 0);
+}
+
+void OverlapBlendFromTop_SSE4_1(void* const prediction,
+ const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint8_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+
+ if (width <= 4) {
+ OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+ if (width == 8) {
+ OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+
+ // Stop when mask value becomes 64.
+ const int compute_height = height - (height >> 2);
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ int y = 0;
+ const uint8_t* mask = kObmcMask + height - 2;
+ do {
+ const __m128i mask_val = _mm_set1_epi8(mask[y]);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ int x = 0;
+ do {
+ const __m128i pred_val = LoadUnaligned16(pred + x);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x);
+ const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result_lo =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6);
+ const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+ const __m128i result_hi =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6);
+ StoreUnaligned16(pred + x, _mm_packus_epi16(result_lo, result_hi));
+ x += 16;
+ } while (x < width);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (++y < compute_height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(ObmcVertical)
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(ObmcHorizontal)
+ dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_SSE4_1;
+#endif
+}
+
+} // namespace
+
+void ObmcInit_SSE4_1() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void ObmcInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/obmc_sse4.h b/src/dsp/x86/obmc_sse4.h
new file mode 100644
index 0000000..bd8b416
--- /dev/null
+++ b/src/dsp/x86/obmc_sse4.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::obmc_blend[]. This function is not thread-safe.
+void ObmcInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_ObmcVertical
+#define LIBGAV1_Dsp8bpp_ObmcVertical LIBGAV1_CPU_SSE4_1
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ObmcHorizontal
+#define LIBGAV1_Dsp8bpp_ObmcHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_
diff --git a/src/dsp/x86/super_res_sse4.cc b/src/dsp/x86/super_res_sse4.cc
new file mode 100644
index 0000000..b2bdfd2
--- /dev/null
+++ b/src/dsp/x86/super_res_sse4.cc
@@ -0,0 +1,166 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/super_res.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Upscale_Filter as defined in AV1 Section 7.16
+// Negated so that the coefficients fit in 8 bits.
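+// The unnegated center tap is 128, which does not fit in int8_t; the negated
+// taps all lie in [-128, 20], so _mm_maddubs_epi16(src, filter) can be used.
+// SuperRes_SSE4_1 later computes (1 << (kFilterBits - 1)) - sum, undoing the
+// negation and adding the rounding offset in a single step.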
+alignas(16) const int8_t
+ kNegativeUpscaleFilter[kSuperResFilterShifts][kSuperResFilterTaps] = {
+ {0, 0, 0, -128, 0, 0, 0, 0}, {0, 0, 1, -128, -2, 1, 0, 0},
+ {0, -1, 3, -127, -4, 2, -1, 0}, {0, -1, 4, -127, -6, 3, -1, 0},
+ {0, -2, 6, -126, -8, 3, -1, 0}, {0, -2, 7, -125, -11, 4, -1, 0},
+ {1, -2, 8, -125, -13, 5, -2, 0}, {1, -3, 9, -124, -15, 6, -2, 0},
+ {1, -3, 10, -123, -18, 6, -2, 1}, {1, -3, 11, -122, -20, 7, -3, 1},
+ {1, -4, 12, -121, -22, 8, -3, 1}, {1, -4, 13, -120, -25, 9, -3, 1},
+ {1, -4, 14, -118, -28, 9, -3, 1}, {1, -4, 15, -117, -30, 10, -4, 1},
+ {1, -5, 16, -116, -32, 11, -4, 1}, {1, -5, 16, -114, -35, 12, -4, 1},
+ {1, -5, 17, -112, -38, 12, -4, 1}, {1, -5, 18, -111, -40, 13, -5, 1},
+ {1, -5, 18, -109, -43, 14, -5, 1}, {1, -6, 19, -107, -45, 14, -5, 1},
+ {1, -6, 19, -105, -48, 15, -5, 1}, {1, -6, 19, -103, -51, 16, -5, 1},
+ {1, -6, 20, -101, -53, 16, -6, 1}, {1, -6, 20, -99, -56, 17, -6, 1},
+ {1, -6, 20, -97, -58, 17, -6, 1}, {1, -6, 20, -95, -61, 18, -6, 1},
+ {2, -7, 20, -93, -64, 18, -6, 2}, {2, -7, 20, -91, -66, 19, -6, 1},
+ {2, -7, 20, -88, -69, 19, -6, 1}, {2, -7, 20, -86, -71, 19, -6, 1},
+ {2, -7, 20, -84, -74, 20, -7, 2}, {2, -7, 20, -81, -76, 20, -7, 1},
+ {2, -7, 20, -79, -79, 20, -7, 2}, {1, -7, 20, -76, -81, 20, -7, 2},
+ {2, -7, 20, -74, -84, 20, -7, 2}, {1, -6, 19, -71, -86, 20, -7, 2},
+ {1, -6, 19, -69, -88, 20, -7, 2}, {1, -6, 19, -66, -91, 20, -7, 2},
+ {2, -6, 18, -64, -93, 20, -7, 2}, {1, -6, 18, -61, -95, 20, -6, 1},
+ {1, -6, 17, -58, -97, 20, -6, 1}, {1, -6, 17, -56, -99, 20, -6, 1},
+ {1, -6, 16, -53, -101, 20, -6, 1}, {1, -5, 16, -51, -103, 19, -6, 1},
+ {1, -5, 15, -48, -105, 19, -6, 1}, {1, -5, 14, -45, -107, 19, -6, 1},
+ {1, -5, 14, -43, -109, 18, -5, 1}, {1, -5, 13, -40, -111, 18, -5, 1},
+ {1, -4, 12, -38, -112, 17, -5, 1}, {1, -4, 12, -35, -114, 16, -5, 1},
+ {1, -4, 11, -32, -116, 16, -5, 1}, {1, -4, 10, -30, -117, 15, -4, 1},
+ {1, -3, 9, -28, -118, 14, -4, 1}, {1, -3, 9, -25, -120, 13, -4, 1},
+ {1, -3, 8, -22, -121, 12, -4, 1}, {1, -3, 7, -20, -122, 11, -3, 1},
+ {1, -2, 6, -18, -123, 10, -3, 1}, {0, -2, 6, -15, -124, 9, -3, 1},
+ {0, -2, 5, -13, -125, 8, -2, 1}, {0, -1, 4, -11, -125, 7, -2, 0},
+ {0, -1, 3, -8, -126, 6, -2, 0}, {0, -1, 3, -6, -127, 4, -1, 0},
+ {0, -1, 2, -4, -127, 3, -1, 0}, {0, 0, 1, -2, -128, 1, 0, 0},
+};
+
+void SuperResCoefficients_SSE4_1(const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const coefficients) {
+ auto* dst = static_cast<uint8_t*>(coefficients);
+ int subpixel_x = initial_subpixel_x;
+ int x = RightShiftWithCeiling(upscaled_width, 4);
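+  // Each outer iteration emits the filters for 16 output pixels: eight aligned
+  // 16-byte stores, each packing the 8-tap filters of two consecutive pixels,
+  // matching the order in which SuperRes_SSE4_1 reloads them.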
+ do {
+ for (int i = 0; i < 8; ++i, dst += 16) {
+ int remainder = subpixel_x & kSuperResScaleMask;
+ __m128i filter =
+ LoadLo8(kNegativeUpscaleFilter[remainder >> kSuperResExtraBits]);
+ subpixel_x += step;
+ remainder = subpixel_x & kSuperResScaleMask;
+ filter = LoadHi8(filter,
+ kNegativeUpscaleFilter[remainder >> kSuperResExtraBits]);
+ subpixel_x += step;
+ StoreAligned16(dst, filter);
+ }
+ } while (--x != 0);
+}
+
+void SuperRes_SSE4_1(const void* const coefficients, void* const source,
+ const ptrdiff_t stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const dest) {
+ auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = height;
+ do {
+ const auto* filter = static_cast<const uint8_t*>(coefficients);
+ uint8_t* dst_ptr = dst;
+ ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+ int subpixel_x = initial_subpixel_x;
+    // The code below calculates up to 15 extra upscaled pixels which will
+    // over-read up to 15 downscaled pixels at the end of each row.
+    // kSuperResHorizontalBorder accounts for this.
+ int x = RightShiftWithCeiling(upscaled_width, 4);
+ do {
+ __m128i weighted_src[8];
+ for (int i = 0; i < 8; ++i, filter += 16) {
+ __m128i s = LoadLo8(&src[subpixel_x >> kSuperResScaleBits]);
+ subpixel_x += step;
+ s = LoadHi8(s, &src[subpixel_x >> kSuperResScaleBits]);
+ subpixel_x += step;
+ const __m128i f = LoadAligned16(filter);
+ weighted_src[i] = _mm_maddubs_epi16(s, f);
+ }
+
+ __m128i a[4];
+ a[0] = _mm_hadd_epi16(weighted_src[0], weighted_src[1]);
+ a[1] = _mm_hadd_epi16(weighted_src[2], weighted_src[3]);
+ a[2] = _mm_hadd_epi16(weighted_src[4], weighted_src[5]);
+ a[3] = _mm_hadd_epi16(weighted_src[6], weighted_src[7]);
+ Transpose2x16_U16(a, a);
+ a[0] = _mm_adds_epi16(a[0], a[1]);
+ a[1] = _mm_adds_epi16(a[2], a[3]);
+ const __m128i rounding = _mm_set1_epi16(1 << (kFilterBits - 1));
+ a[0] = _mm_subs_epi16(rounding, a[0]);
+ a[1] = _mm_subs_epi16(rounding, a[1]);
+ a[0] = _mm_srai_epi16(a[0], kFilterBits);
+ a[1] = _mm_srai_epi16(a[1], kFilterBits);
+ StoreAligned16(dst_ptr, _mm_packus_epi16(a[0], a[1]));
+ dst_ptr += 16;
+ } while (--x != 0);
+ src += stride;
+ dst += stride;
+ } while (--y != 0);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+ dsp->super_res_coefficients = SuperResCoefficients_SSE4_1;
+ dsp->super_res = SuperRes_SSE4_1;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void SuperResInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void SuperResInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/super_res_sse4.h b/src/dsp/x86/super_res_sse4.h
new file mode 100644
index 0000000..aef5147
--- /dev/null
+++ b/src/dsp/x86/super_res_sse4.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::super_res_coefficients and Dsp::super_res. This function
+// is not thread-safe.
+void SuperResInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_SuperRes
+#define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_
diff --git a/src/dsp/x86/transpose_sse4.h b/src/dsp/x86/transpose_sse4.h
new file mode 100644
index 0000000..208b301
--- /dev/null
+++ b/src/dsp/x86/transpose_sse4.h
@@ -0,0 +1,307 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <emmintrin.h>
+
+namespace libgav1 {
+namespace dsp {
+
+LIBGAV1_ALWAYS_INLINE void Transpose2x16_U16(const __m128i* const in,
+ __m128i* const out) {
+ // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 10 11 20 21 30 31
+  // in[1]: 40 41 50 51 60 61 70 71
+  // in[2]: 80 81 90 91 a0 a1 b0 b1
+  // in[3]: c0 c1 d0 d1 e0 e1 f0 f1
+ // to:
+ // a0: 00 40 01 41 10 50 11 51
+ // a1: 20 60 21 61 30 70 31 71
+ // a2: 80 c0 81 c1 90 d0 91 d1
+ // a3: a0 e0 a1 e1 b0 f0 b1 f1
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a3 = _mm_unpackhi_epi16(in[2], in[3]);
+ // b0: 00 20 40 60 01 21 41 61
+ // b1: 10 30 50 70 11 31 51 71
+ // b2: 80 a0 c0 e0 81 a1 c1 e1
+ // b3: 90 b0 d0 f0 91 b1 d1 f1
+ const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+ const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
+ const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
+ const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 80 90 a0 b0 c0 d0 e0 f0
+ // out[3]: 81 91 a1 b1 c1 d1 e1 f1
+ out[0] = _mm_unpacklo_epi16(b0, b1);
+ out[1] = _mm_unpackhi_epi16(b0, b1);
+ out[2] = _mm_unpacklo_epi16(b2, b3);
+ out[3] = _mm_unpackhi_epi16(b2, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE __m128i Transpose4x4_U8(const __m128i* const in) {
+ // Unpack 8 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+
+  // Unpack 16 bit elements resulting in:
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ return _mm_unpacklo_epi16(a0, a1);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose8x8To4x16_U8(const __m128i* const in,
+ __m128i* out) {
+ // Unpack 8 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
+
+ // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi16(a2, a3);
+ const __m128i b2 = _mm_unpackhi_epi16(a0, a1);
+ const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+
+ // out[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // out[1]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // out[2]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // out[3]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ out[0] = _mm_unpacklo_epi32(b0, b1);
+ out[1] = _mm_unpackhi_epi32(b0, b1);
+ out[2] = _mm_unpacklo_epi32(b2, b3);
+ out[3] = _mm_unpackhi_epi32(b2, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x4_U16(const __m128i* in, __m128i* out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 XX XX XX XX
+ // in[1]: 10 11 12 13 XX XX XX XX
+ // in[2]: 20 21 22 23 XX XX XX XX
+ // in[3]: 30 31 32 33 XX XX XX XX
+ // to:
+ // ba: 00 10 01 11 02 12 03 13
+ // dc: 20 30 21 31 22 32 23 33
+ const __m128i ba = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i dc = _mm_unpacklo_epi16(in[2], in[3]);
+ // Unpack 32 bit elements resulting in:
+ // dcba_lo: 00 10 20 30 01 11 21 31
+ // dcba_hi: 02 12 22 32 03 13 23 33
+ const __m128i dcba_lo = _mm_unpacklo_epi32(ba, dc);
+ const __m128i dcba_hi = _mm_unpackhi_epi32(ba, dc);
+ // Assign or shift right by 8 bytes resulting in:
+ // out[0]: 00 10 20 30 01 11 21 31
+ // out[1]: 01 11 21 31 XX XX XX XX
+ // out[2]: 02 12 22 32 03 13 23 33
+ // out[3]: 03 13 23 33 XX XX XX XX
+ out[0] = dcba_lo;
+ out[1] = _mm_srli_si128(dcba_lo, 8);
+ out[2] = dcba_hi;
+ out[3] = _mm_srli_si128(dcba_hi, 8);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x8To8x4_U16(const __m128i* in,
+ __m128i* out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 XX XX XX XX
+ // in[1]: 10 11 12 13 XX XX XX XX
+ // in[2]: 20 21 22 23 XX XX XX XX
+ // in[3]: 30 31 32 33 XX XX XX XX
+ // in[4]: 40 41 42 43 XX XX XX XX
+ // in[5]: 50 51 52 53 XX XX XX XX
+ // in[6]: 60 61 62 63 XX XX XX XX
+ // in[7]: 70 71 72 73 XX XX XX XX
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 02 12 22 32 03 13 23 33
+ // b3: 42 52 62 72 43 53 63 73
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b2, b3);
+ out[3] = _mm_unpackhi_epi64(b2, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose8x4To4x8_U16(const __m128i* in,
+ __m128i* out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a4: 04 14 05 15 06 16 07 17
+ // a5: 24 34 25 35 26 36 27 37
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b2: 04 14 24 34 05 15 25 35
+ // b4: 02 12 22 32 03 13 23 33
+ // b6: 06 16 26 36 07 17 27 37
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+ const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 XX XX XX XX
+ // out[1]: 01 11 21 31 XX XX XX XX
+ // out[2]: 02 12 22 32 XX XX XX XX
+ // out[3]: 03 13 23 33 XX XX XX XX
+ // out[4]: 04 14 24 34 XX XX XX XX
+ // out[5]: 05 15 25 35 XX XX XX XX
+ // out[6]: 06 16 26 36 XX XX XX XX
+ // out[7]: 07 17 27 37 XX XX XX XX
+ const __m128i zeros = _mm_setzero_si128();
+ out[0] = _mm_unpacklo_epi64(b0, zeros);
+ out[1] = _mm_unpackhi_epi64(b0, zeros);
+ out[2] = _mm_unpacklo_epi64(b4, zeros);
+ out[3] = _mm_unpackhi_epi64(b4, zeros);
+ out[4] = _mm_unpacklo_epi64(b2, zeros);
+ out[5] = _mm_unpackhi_epi64(b2, zeros);
+ out[6] = _mm_unpacklo_epi64(b6, zeros);
+ out[7] = _mm_unpackhi_epi64(b6, zeros);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose8x8_U16(const __m128i* const in,
+ __m128i* const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ // a4: 04 14 05 15 06 16 07 17
+ // a5: 24 34 25 35 26 36 27 37
+ // a6: 44 54 45 55 46 56 47 57
+ // a7: 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 04 14 24 34 05 15 25 35
+ // b3: 44 54 64 74 45 55 65 75
+ // b4: 02 12 22 32 03 13 23 33
+ // b5: 42 52 62 72 43 53 63 73
+ // b6: 06 16 26 36 07 17 27 37
+ // b7: 46 56 66 76 47 57 67 77
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+ const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
+ const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
+ const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+ const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ // out[4]: 04 14 24 34 44 54 64 74
+ // out[5]: 05 15 25 35 45 55 65 75
+ // out[6]: 06 16 26 36 46 56 66 76
+ // out[7]: 07 17 27 37 47 57 67 77
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b4, b5);
+ out[3] = _mm_unpackhi_epi64(b4, b5);
+ out[4] = _mm_unpacklo_epi64(b2, b3);
+ out[5] = _mm_unpackhi_epi64(b2, b3);
+ out[6] = _mm_unpacklo_epi64(b6, b7);
+ out[7] = _mm_unpackhi_epi64(b6, b7);
+}
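+// Illustrative sketch of how these helpers are typically used (not part of
+// the library API; LoadUnaligned16() is the load helper from common_sse4.h):
+//   int16_t block[8][8];
+//   __m128i rows[8];
+//   for (int i = 0; i < 8; ++i) rows[i] = LoadUnaligned16(block[i]);
+//   Transpose8x8_U16(rows, rows);  // rows[i] now holds column i of |block|.
+// In-place calls are safe because every input register is read before any
+// output register is written.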
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+#endif // LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_
diff --git a/src/dsp/x86/warp_sse4.cc b/src/dsp/x86/warp_sse4.cc
new file mode 100644
index 0000000..43279ab
--- /dev/null
+++ b/src/dsp/x86/warp_sse4.cc
@@ -0,0 +1,525 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/warp.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Number of extra bits of precision in warped filtering.
+constexpr int kWarpedDiffPrecisionBits = 10;
+
+// This assumes the two filters contain filter[x] and filter[x+2].
+inline __m128i AccumulateFilter(const __m128i sum, const __m128i filter_0,
+ const __m128i filter_1,
+ const __m128i& src_window) {
+ const __m128i filter_taps = _mm_unpacklo_epi8(filter_0, filter_1);
+ const __m128i src =
+ _mm_unpacklo_epi8(src_window, _mm_srli_si128(src_window, 2));
+ return _mm_add_epi16(sum, _mm_maddubs_epi16(src, filter_taps));
+}
+
+constexpr int kFirstPassOffset = 1 << 14;
+constexpr int kOffsetRemoval =
+ (kFirstPassOffset >> kInterRoundBitsHorizontal) * 128;
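+// kOffsetRemoval compensates for kFirstPassOffset in the vertical pass: after
+// the horizontal rounding shift each intermediate sample carries an offset of
+// kFirstPassOffset >> kInterRoundBitsHorizontal, and the vertical filter taps
+// sum to 1 << kFilterBits (128), hence the multiply by 128.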
+
+// Applies the horizontal filter to one source row and stores the result in
+// |intermediate_result_row|. |intermediate_result_row| is a row in the 15x8
+// |intermediate_result| two-dimensional array.
+inline void HorizontalFilter(const int sx4, const int16_t alpha,
+ const __m128i src_row,
+ int16_t intermediate_result_row[8]) {
+ int sx = sx4 - MultiplyBy4(alpha);
+ __m128i filter[8];
+ for (__m128i& f : filter) {
+ const int offset = RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ f = LoadLo8(kWarpedFilters8[offset]);
+ sx += alpha;
+ }
+ Transpose8x8To4x16_U8(filter, filter);
+ // |filter| now contains two filters per register.
+ // Staggered combinations allow us to take advantage of _mm_maddubs_epi16
+ // without overflowing the sign bit. The sign bit is hit only where two taps
+ // paired in a single madd add up to more than 128. This is only possible with
+ // two adjacent "inner" taps. Therefore, pairing odd with odd and even with
+ // even guarantees safety. |sum| is given a negative offset to allow for large
+ // intermediate values.
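+  // For example, the first call below pairs taps 0 and 2: each 16-bit lane of
+  // the madd holds src[x] * tap0(x) + src[x + 2] * tap2(x), where tapk(x) is
+  // tap k of the filter for output pixel x; the shifted calls then cover taps
+  // 1/3, 4/6 and 5/7.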
+ // k = 0, 2.
+ __m128i src_row_window = src_row;
+ __m128i sum = _mm_set1_epi16(-kFirstPassOffset);
+ sum = AccumulateFilter(sum, filter[0], filter[1], src_row_window);
+
+ // k = 1, 3.
+ src_row_window = _mm_srli_si128(src_row_window, 1);
+ sum = AccumulateFilter(sum, _mm_srli_si128(filter[0], 8),
+ _mm_srli_si128(filter[1], 8), src_row_window);
+ // k = 4, 6.
+ src_row_window = _mm_srli_si128(src_row_window, 3);
+ sum = AccumulateFilter(sum, filter[2], filter[3], src_row_window);
+
+ // k = 5, 7.
+ src_row_window = _mm_srli_si128(src_row_window, 1);
+ sum = AccumulateFilter(sum, _mm_srli_si128(filter[2], 8),
+ _mm_srli_si128(filter[3], 8), src_row_window);
+
+ sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal);
+ StoreUnaligned16(intermediate_result_row, sum);
+}
+
+template <bool is_compound>
+inline void WriteVerticalFilter(const __m128i filter[8],
+ const int16_t intermediate_result[15][8], int y,
+ void* dst_row) {
+ constexpr int kRoundBitsVertical =
+ is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
+ __m128i sum_low = _mm_set1_epi32(kOffsetRemoval);
+ __m128i sum_high = sum_low;
+ for (int k = 0; k < 8; k += 2) {
+ const __m128i filters_low = _mm_unpacklo_epi16(filter[k], filter[k + 1]);
+ const __m128i filters_high = _mm_unpackhi_epi16(filter[k], filter[k + 1]);
+ const __m128i intermediate_0 = LoadUnaligned16(intermediate_result[y + k]);
+ const __m128i intermediate_1 =
+ LoadUnaligned16(intermediate_result[y + k + 1]);
+ const __m128i intermediate_low =
+ _mm_unpacklo_epi16(intermediate_0, intermediate_1);
+ const __m128i intermediate_high =
+ _mm_unpackhi_epi16(intermediate_0, intermediate_1);
+
+ const __m128i product_low = _mm_madd_epi16(filters_low, intermediate_low);
+ const __m128i product_high =
+ _mm_madd_epi16(filters_high, intermediate_high);
+ sum_low = _mm_add_epi32(sum_low, product_low);
+ sum_high = _mm_add_epi32(sum_high, product_high);
+ }
+ sum_low = RightShiftWithRounding_S32(sum_low, kRoundBitsVertical);
+ sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical);
+ if (is_compound) {
+ const __m128i sum = _mm_packs_epi32(sum_low, sum_high);
+ StoreUnaligned16(static_cast<int16_t*>(dst_row), sum);
+ } else {
+ const __m128i sum = _mm_packus_epi32(sum_low, sum_high);
+ StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum));
+ }
+}
+
+template <bool is_compound>
+inline void WriteVerticalFilter(const __m128i filter[8],
+ const int16_t* intermediate_result_column,
+ void* dst_row) {
+ constexpr int kRoundBitsVertical =
+ is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
+ __m128i sum_low = _mm_setzero_si128();
+ __m128i sum_high = _mm_setzero_si128();
+ for (int k = 0; k < 8; k += 2) {
+ const __m128i filters_low = _mm_unpacklo_epi16(filter[k], filter[k + 1]);
+ const __m128i filters_high = _mm_unpackhi_epi16(filter[k], filter[k + 1]);
+ // Equivalent to unpacking two vectors made by duplicating int16_t values.
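+    // Each 32-bit lane thus holds the pair {intermediate_result_column[k],
+    // intermediate_result_column[k + 1]}, so the madds below compute
+    // tap k * row k + tap k+1 * row k+1 for every output column at once.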
+ const __m128i intermediate =
+ _mm_set1_epi32((intermediate_result_column[k + 1] << 16) |
+ intermediate_result_column[k]);
+ const __m128i product_low = _mm_madd_epi16(filters_low, intermediate);
+ const __m128i product_high = _mm_madd_epi16(filters_high, intermediate);
+ sum_low = _mm_add_epi32(sum_low, product_low);
+ sum_high = _mm_add_epi32(sum_high, product_high);
+ }
+ sum_low = RightShiftWithRounding_S32(sum_low, kRoundBitsVertical);
+ sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical);
+ if (is_compound) {
+ const __m128i sum = _mm_packs_epi32(sum_low, sum_high);
+ StoreUnaligned16(static_cast<int16_t*>(dst_row), sum);
+ } else {
+ const __m128i sum = _mm_packus_epi32(sum_low, sum_high);
+ StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum));
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma,
+ int delta, DestType* dest_row,
+ ptrdiff_t dest_stride) {
+ int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+ __m128i filter[8];
+ for (__m128i& f : filter) {
+ const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ f = LoadUnaligned16(kWarpedFilters[offset]);
+ sy += gamma;
+ }
+ Transpose8x8_U16(filter, filter);
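+    // After the transpose, filter[k] holds tap k of the vertical filter for
+    // all 8 output columns (before it, filter[x] held the 8 taps of column x),
+    // so WriteVerticalFilter can madd tap pairs across the whole row.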
+ WriteVerticalFilter<is_compound>(filter, source, y, dest_row);
+ dest_row += dest_stride;
+ sy4 += delta;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void VerticalFilter(const int16_t* source_cols, int y4, int gamma,
+ int delta, DestType* dest_row,
+ ptrdiff_t dest_stride) {
+ int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+ __m128i filter[8];
+ for (__m128i& f : filter) {
+ const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ f = LoadUnaligned16(kWarpedFilters[offset]);
+ sy += gamma;
+ }
+ Transpose8x8_U16(filter, filter);
+ WriteVerticalFilter<is_compound>(filter, &source_cols[y], dest_row);
+ dest_row += dest_stride;
+ sy4 += delta;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion1(const uint8_t* src, ptrdiff_t source_stride,
+ int source_width, int source_height, int ix4, int iy4,
+ DestType* dst_row, ptrdiff_t dest_stride) {
+ // Region 1
+ // Points to the left or right border of the first row of |src|.
+ const uint8_t* first_row_border =
+ (ix4 + 7 <= 0) ? src : src + source_width - 1;
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ // Region 1.
+ // Every sample used to calculate the prediction block has the same
+ // value. So the whole prediction block has the same value.
+ const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const uint8_t row_border_pixel = first_row_border[row * source_stride];
+
+ if (is_compound) {
+ const __m128i sum =
+ _mm_set1_epi16(row_border_pixel << (kInterRoundBitsVertical -
+ kInterRoundBitsCompoundVertical));
+ StoreUnaligned16(dst_row, sum);
+ } else {
+ memset(dst_row, row_border_pixel, 8);
+ }
+ const DestType* const first_dst_row = dst_row;
+ dst_row += dest_stride;
+ for (int y = 1; y < 8; ++y) {
+ memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row));
+ dst_row += dest_stride;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion2(const uint8_t* src, ptrdiff_t source_stride,
+ int source_width, int y4, int ix4, int iy4, int gamma,
+ int delta, int16_t intermediate_result_column[15],
+ DestType* dst_row, ptrdiff_t dest_stride) {
+ // Region 2.
+ // Points to the left or right border of the first row of |src|.
+ const uint8_t* first_row_border =
+ (ix4 + 7 <= 0) ? src : src + source_width - 1;
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+
+ // Region 2.
+ // Horizontal filter.
+ // The input values in this region are generated by extending the border
+ // which makes them identical in the horizontal direction. This
+ // computation could be inlined in the vertical pass but most
+ // implementations will need a transpose of some sort.
+ // It is not necessary to use the offset values here because the
+ // horizontal pass is a simple shift and the vertical pass will always
+ // require using 32 bits.
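+  // Since all 8 taps see the same border sample, the filter reduces to
+  // pixel * (1 << kFilterBits); after the horizontal rounding shift that is
+  // the plain shift by kFilterBits - kInterRoundBitsHorizontal used below.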
+ for (int y = -7; y < 8; ++y) {
+ // We may over-read up to 13 pixels above the top source row, or up
+ // to 13 pixels below the bottom source row. This is proved in
+ // warp.cc.
+ const int row = iy4 + y;
+ int sum = first_row_border[row * source_stride];
+ sum <<= (kFilterBits - kInterRoundBitsHorizontal);
+ intermediate_result_column[y + 7] = sum;
+ }
+ // Region 2 vertical filter.
+ VerticalFilter<is_compound, DestType>(intermediate_result_column, y4, gamma,
+ delta, dst_row, dest_stride);
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion3(const uint8_t* src, ptrdiff_t source_stride,
+ int source_height, int alpha, int beta, int x4, int ix4,
+ int iy4, int16_t intermediate_result[15][8]) {
+ // Region 3
+ // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ // Horizontal filter.
+ const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const uint8_t* const src_row = src + row * source_stride;
+ // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+ // read but is ignored.
+ //
+ // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+ // bytes after src_row[source_width - 1]. We assume the source frame
+ // has left and right borders of at least 13 bytes that extend the
+ // frame boundary pixels. We also assume there is at least one extra
+ // padding byte after the right border of the last source row.
+ const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]);
+ int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
+ sx4 += beta;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion4(const uint8_t* src, ptrdiff_t source_stride, int alpha,
+ int beta, int x4, int ix4, int iy4,
+ int16_t intermediate_result[15][8]) {
+ // Region 4.
+ // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ // Horizontal filter.
+ int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ // We may over-read up to 13 pixels above the top source row, or up
+ // to 13 pixels below the bottom source row. This is proved in
+ // warp.cc.
+ const int row = iy4 + y;
+ const uint8_t* const src_row = src + row * source_stride;
+ // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+ // read but is ignored.
+ //
+ // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+ // bytes after src_row[source_width - 1]. We assume the source frame
+ // has left and right borders of at least 13 bytes that extend the
+ // frame boundary pixels. We also assume there is at least one extra
+ // padding byte after the right border of the last source row.
+ const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]);
+ HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
+ sx4 += beta;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void HandleWarpBlock(const uint8_t* src, ptrdiff_t source_stride,
+ int source_width, int source_height,
+ const int* warp_params, int subsampling_x,
+ int subsampling_y, int src_x, int src_y,
+ int16_t alpha, int16_t beta, int16_t gamma,
+ int16_t delta, DestType* dst_row,
+ ptrdiff_t dest_stride) {
+ union {
+ // Intermediate_result is the output of the horizontal filtering and
+ // rounding. The range is within 13 (= bitdepth + kFilterBits + 1 -
+ // kInterRoundBitsHorizontal) bits (unsigned). We use the signed int16_t
+ // type so that we can start with a negative offset and restore it on the
+ // final filter sum.
+ int16_t intermediate_result[15][8]; // 15 rows, 8 columns.
+ // In the simple special cases where the samples in each row are all the
+ // same, store one sample per row in a column vector.
+ int16_t intermediate_result_column[15];
+ };
+
+ const int dst_x =
+ src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
+ const int dst_y =
+ src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
+ const int x4 = dst_x >> subsampling_x;
+ const int y4 = dst_y >> subsampling_y;
+ const int ix4 = x4 >> kWarpedModelPrecisionBits;
+ const int iy4 = y4 >> kWarpedModelPrecisionBits;
+ // A prediction block may fall outside the frame's boundaries. If a
+ // prediction block is calculated using only samples outside the frame's
+ // boundary, the filtering can be simplified. We can divide the plane
+ // into several regions and handle them differently.
+ //
+ // | |
+ // 1 | 3 | 1
+ // | |
+ // -------+-----------+-------
+ // |***********|
+ // 2 |*****4*****| 2
+ // |***********|
+ // -------+-----------+-------
+ // | |
+ // 1 | 3 | 1
+ // | |
+ //
+ // At the center, region 4 represents the frame and is the general case.
+ //
+ // In regions 1 and 2, the prediction block is outside the frame's
+ // boundary horizontally. Therefore the horizontal filtering can be
+ // simplified. Furthermore, in the region 1 (at the four corners), the
+ // prediction is outside the frame's boundary both horizontally and
+ // vertically, so we get a constant prediction block.
+ //
+ // In region 3, the prediction block is outside the frame's boundary
+ // vertically. Unfortunately because we apply the horizontal filters
+ // first, by the time we apply the vertical filters, they no longer see
+ // simple inputs. So the only simplification is that all the rows are
+ // the same, but we still need to apply all the horizontal and vertical
+ // filters.
+
+ // Check for two simple special cases, where the horizontal filter can
+ // be significantly simplified.
+ //
+ // In general, for each row, the horizontal filter is calculated as
+ // follows:
+ // for (int x = -4; x < 4; ++x) {
+ // const int offset = ...;
+ // int sum = first_pass_offset;
+ // for (int k = 0; k < 8; ++k) {
+ // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
+ // sum += kWarpedFilters[offset][k] * src_row[column];
+ // }
+ // ...
+ // }
+ // The column index before clipping, ix4 + x + k - 3, varies in the range
+ // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
+ // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
+ // border index (source_width - 1 or 0, respectively). Then for each x,
+ // the inner for loop of the horizontal filter is reduced to multiplying
+ // the border pixel by the sum of the filter coefficients.
+ if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
+ if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) {
+ // Outside the frame in both directions. One repeated value.
+ WarpRegion1<is_compound, DestType>(src, source_stride, source_width,
+ source_height, ix4, iy4, dst_row,
+ dest_stride);
+ return;
+ }
+ // Outside the frame horizontally. Rows repeated.
+ WarpRegion2<is_compound, DestType>(
+ src, source_stride, source_width, y4, ix4, iy4, gamma, delta,
+ intermediate_result_column, dst_row, dest_stride);
+ return;
+ }
+
+ if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) {
+ // Outside the frame vertically.
+ WarpRegion3<is_compound, DestType>(src, source_stride, source_height, alpha,
+ beta, x4, ix4, iy4, intermediate_result);
+ } else {
+ // Inside the frame.
+ WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta, x4, ix4,
+ iy4, intermediate_result);
+ }
+ // Region 3 and 4 vertical filter.
+ VerticalFilter<is_compound, DestType>(intermediate_result, y4, gamma, delta,
+ dst_row, dest_stride);
+}
+
+template <bool is_compound>
+void Warp_SSE4_1(const void* source, ptrdiff_t source_stride, int source_width,
+ int source_height, const int* warp_params, int subsampling_x,
+ int subsampling_y, int block_start_x, int block_start_y,
+ int block_width, int block_height, int16_t alpha, int16_t beta,
+ int16_t gamma, int16_t delta, void* dest,
+ ptrdiff_t dest_stride) {
+ const auto* const src = static_cast<const uint8_t*>(source);
+ using DestType =
+ typename std::conditional<is_compound, int16_t, uint8_t>::type;
+ auto* dst = static_cast<DestType*>(dest);
+
+ // Warp process applies for each 8x8 block.
+ assert(block_width >= 8);
+ assert(block_height >= 8);
+ const int block_end_x = block_start_x + block_width;
+ const int block_end_y = block_start_y + block_height;
+
+ const int start_x = block_start_x;
+ const int start_y = block_start_y;
+ int src_x = (start_x + 4) << subsampling_x;
+ int src_y = (start_y + 4) << subsampling_y;
+ const int end_x = (block_end_x + 4) << subsampling_x;
+ const int end_y = (block_end_y + 4) << subsampling_y;
+ do {
+ DestType* dst_row = dst;
+ src_x = (start_x + 4) << subsampling_x;
+ do {
+ HandleWarpBlock<is_compound, DestType>(
+ src, source_stride, source_width, source_height, warp_params,
+ subsampling_x, subsampling_y, src_x, src_y, alpha, beta, gamma, delta,
+ dst_row, dest_stride);
+ src_x += (8 << subsampling_x);
+ dst_row += 8;
+ } while (src_x < end_x);
+ dst += 8 * dest_stride;
+ src_y += (8 << subsampling_y);
+ } while (src_y < end_y);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->warp = Warp_SSE4_1</*is_compound=*/false>;
+ dsp->warp_compound = Warp_SSE4_1</*is_compound=*/true>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void WarpInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void WarpInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/warp_sse4.h b/src/dsp/x86/warp_sse4.h
new file mode 100644
index 0000000..a2dc5ca
--- /dev/null
+++ b/src/dsp/x86/warp_sse4.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::warp and Dsp::warp_compound. This function is not
+// thread-safe.
+void WarpInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_Warp
+#define LIBGAV1_Dsp8bpp_Warp LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WarpCompound
+#define LIBGAV1_Dsp8bpp_WarpCompound LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_
diff --git a/src/dsp/x86/weight_mask_sse4.cc b/src/dsp/x86/weight_mask_sse4.cc
new file mode 100644
index 0000000..dfd5662
--- /dev/null
+++ b/src/dsp/x86/weight_mask_sse4.cc
@@ -0,0 +1,464 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/x86/weight_mask_sse4.h"
+
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+constexpr int kRoundingBits8bpp = 4;
+
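+// Computes 8 mask values from a pair of 16-bit predictions: roughly
+// min(64, 38 + (RightShiftWithRounding(|pred_0 - pred_1|, kRoundingBits8bpp)
+// >> 4)), or 64 minus that value when |mask_is_inverse| is set.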
+template <bool mask_is_inverse>
+inline void WeightMask8_SSE4(const int16_t* prediction_0,
+ const int16_t* prediction_1, uint8_t* mask) {
+ const __m128i pred_0 = LoadAligned16(prediction_0);
+ const __m128i pred_1 = LoadAligned16(prediction_1);
+ const __m128i difference = RightShiftWithRounding_U16(
+ _mm_abs_epi16(_mm_sub_epi16(pred_0, pred_1)), kRoundingBits8bpp);
+ const __m128i scaled_difference = _mm_srli_epi16(difference, 4);
+ const __m128i difference_offset = _mm_set1_epi8(38);
+ const __m128i adjusted_difference =
+ _mm_adds_epu8(_mm_packus_epi16(scaled_difference, scaled_difference),
+ difference_offset);
+ const __m128i mask_ceiling = _mm_set1_epi8(64);
+ const __m128i mask_value = _mm_min_epi8(adjusted_difference, mask_ceiling);
+ if (mask_is_inverse) {
+ const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value);
+ StoreLo8(mask, inverted_mask_value);
+ } else {
+ StoreLo8(mask, mask_value);
+ }
+}
+
+#define WEIGHT8_WITHOUT_STRIDE \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask)
+
+#define WEIGHT8_AND_STRIDE \
+ WEIGHT8_WITHOUT_STRIDE; \
+ pred_0 += 8; \
+ pred_1 += 8; \
+ mask += mask_stride
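+// Each *_AND_STRIDE invocation emits one row of mask values and advances the
+// prediction and mask pointers; the last row of a block uses *_WITHOUT_STRIDE,
+// i.e. without advancing past the block.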
+
+template <bool mask_is_inverse>
+void WeightMask8x8_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y = 0;
+ do {
+ WEIGHT8_AND_STRIDE;
+ } while (++y < 7);
+ WEIGHT8_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x16_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT8_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x32_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_WITHOUT_STRIDE;
+}
+
+#define WEIGHT16_WITHOUT_STRIDE \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8)
+
+#define WEIGHT16_AND_STRIDE \
+ WEIGHT16_WITHOUT_STRIDE; \
+ pred_0 += 16; \
+ pred_1 += 16; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask16x8_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ } while (++y < 7);
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x16_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x32_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x64_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ } while (++y3 < 21);
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+#define WEIGHT32_WITHOUT_STRIDE \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24)
+
+#define WEIGHT32_AND_STRIDE \
+ WEIGHT32_WITHOUT_STRIDE; \
+ pred_0 += 32; \
+ pred_1 += 32; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask32x8_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x16_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x32_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x64_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ } while (++y3 < 21);
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+#define WEIGHT64_WITHOUT_STRIDE \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 32, pred_1 + 32, mask + 32); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 40, pred_1 + 40, mask + 40); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 48, pred_1 + 48, mask + 48); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 56, pred_1 + 56, mask + 56)
+
+#define WEIGHT64_AND_STRIDE \
+ WEIGHT64_WITHOUT_STRIDE; \
+ pred_0 += 64; \
+ pred_1 += 64; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask64x16_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x32_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x64_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y3 < 21);
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x128_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y3 < 42);
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x64_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
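+  // Each 128-wide row is written as two 64-wide halves: after the first half
+  // |mask| advances by 64, and after the second by mask_stride - 64, which
+  // lands at the start of the next mask row.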
+ do {
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+ } while (++y3 < 21);
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x128_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+ do {
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+ } while (++y3 < 42);
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
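+// The last index of Dsp::weight_mask selects the regular mask ([0]) or the
+// inverted mask ([1]), matching the |mask_is_inverse| template argument.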
+#define INIT_WEIGHT_MASK_8BPP(width, height, w_index, h_index) \
+ dsp->weight_mask[w_index][h_index][0] = \
+ WeightMask##width##x##height##_SSE4<0>; \
+ dsp->weight_mask[w_index][h_index][1] = WeightMask##width##x##height##_SSE4<1>
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ INIT_WEIGHT_MASK_8BPP(8, 8, 0, 0);
+ INIT_WEIGHT_MASK_8BPP(8, 16, 0, 1);
+ INIT_WEIGHT_MASK_8BPP(8, 32, 0, 2);
+ INIT_WEIGHT_MASK_8BPP(16, 8, 1, 0);
+ INIT_WEIGHT_MASK_8BPP(16, 16, 1, 1);
+ INIT_WEIGHT_MASK_8BPP(16, 32, 1, 2);
+ INIT_WEIGHT_MASK_8BPP(16, 64, 1, 3);
+ INIT_WEIGHT_MASK_8BPP(32, 8, 2, 0);
+ INIT_WEIGHT_MASK_8BPP(32, 16, 2, 1);
+ INIT_WEIGHT_MASK_8BPP(32, 32, 2, 2);
+ INIT_WEIGHT_MASK_8BPP(32, 64, 2, 3);
+ INIT_WEIGHT_MASK_8BPP(64, 16, 3, 1);
+ INIT_WEIGHT_MASK_8BPP(64, 32, 3, 2);
+ INIT_WEIGHT_MASK_8BPP(64, 64, 3, 3);
+ INIT_WEIGHT_MASK_8BPP(64, 128, 3, 4);
+ INIT_WEIGHT_MASK_8BPP(128, 64, 4, 3);
+ INIT_WEIGHT_MASK_8BPP(128, 128, 4, 4);
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void WeightMaskInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void WeightMaskInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/weight_mask_sse4.h b/src/dsp/x86/weight_mask_sse4.h
new file mode 100644
index 0000000..07636b7
--- /dev/null
+++ b/src/dsp/x86/weight_mask_sse4.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_WEIGHT_MASK_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_WEIGHT_MASK_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::weight_mask. This function is not thread-safe.
+void WeightMaskInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x8
+#define LIBGAV1_Dsp8bpp_WeightMask_8x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x16
+#define LIBGAV1_Dsp8bpp_WeightMask_8x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x32
+#define LIBGAV1_Dsp8bpp_WeightMask_8x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x8
+#define LIBGAV1_Dsp8bpp_WeightMask_16x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x16
+#define LIBGAV1_Dsp8bpp_WeightMask_16x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x32
+#define LIBGAV1_Dsp8bpp_WeightMask_16x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x64
+#define LIBGAV1_Dsp8bpp_WeightMask_16x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x8
+#define LIBGAV1_Dsp8bpp_WeightMask_32x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x16
+#define LIBGAV1_Dsp8bpp_WeightMask_32x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x32
+#define LIBGAV1_Dsp8bpp_WeightMask_32x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x64
+#define LIBGAV1_Dsp8bpp_WeightMask_32x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x16
+#define LIBGAV1_Dsp8bpp_WeightMask_64x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x32
+#define LIBGAV1_Dsp8bpp_WeightMask_64x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x64
+#define LIBGAV1_Dsp8bpp_WeightMask_64x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x128
+#define LIBGAV1_Dsp8bpp_WeightMask_64x128 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x64
+#define LIBGAV1_Dsp8bpp_WeightMask_128x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x128
+#define LIBGAV1_Dsp8bpp_WeightMask_128x128 LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_WEIGHT_MASK_SSE4_H_
diff --git a/src/film_grain.cc b/src/film_grain.cc
new file mode 100644
index 0000000..dac37b5
--- /dev/null
+++ b/src/film_grain.cc
@@ -0,0 +1,817 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/film_grain.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <new>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/threadpool.h"
+
+namespace libgav1 {
+
+namespace {
+
+// The kGaussianSequence array contains random samples from a Gaussian
+// distribution with zero mean and standard deviation of about 512 clipped to
+// the range of [-2048, 2047] (representable by a signed integer using 12 bits
+// of precision) and rounded to the nearest multiple of 4.
+//
+// Note: It is important that every element in the kGaussianSequence array be
+// less than 2040, so that RightShiftWithRounding(kGaussianSequence[i], 4) is
+// less than 128 for bitdepth=8 (GrainType=int8_t).
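+// For example, RightShiftWithRounding(2036, 4) = (2036 + 8) >> 4 = 127, which
+// still fits in int8_t, while a value of 2040 would yield (2040 + 8) >> 4 =
+// 128 and overflow it.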
+constexpr int16_t kGaussianSequence[/*2048*/] = {
+ 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820,
+ 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800,
+ 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588,
+ -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368,
+ 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4,
+ 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396,
+ 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740,
+ 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292,
+ 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532,
+ 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704,
+ 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96,
+ -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244,
+ 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136,
+ 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676,
+ -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400,
+ -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844,
+ -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96,
+ -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356,
+ 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280,
+ 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808,
+ 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228,
+ -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136,
+ -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264,
+ -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388,
+ 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500,
+ 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384,
+ 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220,
+ -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148,
+ 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572,
+ -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516,
+ 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916,
+ -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492,
+ 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560,
+ -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108,
+ -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516,
+ -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88,
+ -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196,
+ -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864,
+ 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920,
+ 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564,
+ -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876,
+ -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244,
+ 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184,
+ 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364,
+ -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72,
+ 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24,
+ 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4,
+ -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120,
+ 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108,
+ -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296,
+ 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336,
+ -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164,
+ -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264,
+ 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536,
+ -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296,
+ -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696,
+ 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204,
+ 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212,
+ -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40,
+ 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384,
+ 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8,
+ 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704,
+ -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348,
+ -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592,
+ -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420,
+ 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220,
+ -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208,
+ -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544,
+ -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288,
+ -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240,
+ -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132,
+ 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16,
+ -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044,
+ -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732,
+ 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460,
+ -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52,
+ -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104,
+ -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460,
+ 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716,
+ -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960,
+ 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476,
+ 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692,
+ 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352,
+ -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144,
+ -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44,
+ 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356,
+ 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452,
+ -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552,
+ -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264,
+ -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448,
+ -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588,
+ 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464,
+ 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216,
+ 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132,
+ 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412,
+ 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48,
+ 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196,
+ 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48,
+ -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292,
+ 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32,
+ -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012,
+ -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120,
+ -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56,
+ 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416,
+ -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404,
+ -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92,
+ 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904,
+ 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728,
+ 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584,
+ 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48,
+ 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180,
+ 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528,
+ 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364,
+ -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260,
+ -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324,
+ -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64,
+ 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120,
+ -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168,
+ -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888,
+ 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588,
+ -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484,
+ 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580,
+ 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392,
+ 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80,
+ -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688,
+ 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4,
+ -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300,
+ 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444,
+ 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192,
+ 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160,
+ 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188,
+ -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404,
+ -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400,
+ 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92,
+ -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824,
+ 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620,
+ 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720,
+ 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620,
+ -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508,
+ -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736,
+ 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836,
+ 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180,
+ 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140,
+ -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32,
+ -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916,
+ 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368,
+ -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380,
+ -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572,
+ -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864,
+ 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908,
+ -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84,
+ 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396,
+ -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360,
+ 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928,
+ -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288,
+ 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196,
+ 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504,
+ 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272,
+ 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344,
+ -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208,
+ -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156,
+ -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240,
+ -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432,
+ 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244,
+ 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584,
+ 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24,
+ 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300,
+ -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416,
+ 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380,
+ -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384,
+ 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88,
+ 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876,
+ -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320,
+ -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88,
+ -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196,
+ -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120,
+ 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664,
+ -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0,
+ -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264,
+ -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288,
+ -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56,
+ 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148,
+ 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156,
+ -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144,
+ -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148,
+ 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944,
+ 428, -484};
+static_assert(sizeof(kGaussianSequence) / sizeof(kGaussianSequence[0]) == 2048,
+ "");
+
+// The number of rows in a contiguous group computed by a single worker thread
+// before checking for the next available group.
+constexpr int kFrameChunkHeight = 8;
+
+// |width| and |height| refer to the plane, not the frame, meaning any
+// subsampling should be applied by the caller.
+template <typename Pixel>
+inline void CopyImagePlane(const uint8_t* source_plane, ptrdiff_t source_stride,
+ int width, int height, uint8_t* dest_plane,
+ ptrdiff_t dest_stride) {
+ // If it's the same buffer there's nothing to do.
+ if (source_plane == dest_plane) return;
+
+ int y = 0;
+ do {
+ memcpy(dest_plane, source_plane, width * sizeof(Pixel));
+ source_plane += source_stride;
+ dest_plane += dest_stride;
+ } while (++y < height);
+}
+
+} // namespace
+
+template <int bitdepth>
+FilmGrain<bitdepth>::FilmGrain(const FilmGrainParams& params,
+ bool is_monochrome,
+ bool color_matrix_is_identity, int subsampling_x,
+ int subsampling_y, int width, int height,
+ ThreadPool* thread_pool)
+ : params_(params),
+ is_monochrome_(is_monochrome),
+ color_matrix_is_identity_(color_matrix_is_identity),
+ subsampling_x_(subsampling_x),
+ subsampling_y_(subsampling_y),
+ width_(width),
+ height_(height),
+ template_uv_width_((subsampling_x != 0) ? kMinChromaWidth
+ : kMaxChromaWidth),
+ template_uv_height_((subsampling_y != 0) ? kMinChromaHeight
+ : kMaxChromaHeight),
+ thread_pool_(thread_pool) {}
+
+template <int bitdepth>
+bool FilmGrain<bitdepth>::Init() {
+ // Section 7.18.3.3. Generate grain process.
+ const dsp::Dsp& dsp = *dsp::GetDspTable(bitdepth);
+ // If params_.num_y_points is 0, luma_grain_ will never be read, so we don't
+ // need to generate it.
+ const bool use_luma = params_.num_y_points > 0;
+ if (use_luma) {
+ GenerateLumaGrain(params_, luma_grain_);
+ // If params_.auto_regression_coeff_lag is 0, the filter is the identity
+ // filter and therefore can be skipped.
+ if (params_.auto_regression_coeff_lag > 0) {
+ dsp.film_grain
+ .luma_auto_regression[params_.auto_regression_coeff_lag - 1](
+ params_, luma_grain_);
+ }
+ } else {
+ // Have AddressSanitizer warn if luma_grain_ is used.
+ ASAN_POISON_MEMORY_REGION(luma_grain_, sizeof(luma_grain_));
+ }
+ if (!is_monochrome_) {
+ GenerateChromaGrains(params_, template_uv_width_, template_uv_height_,
+ u_grain_, v_grain_);
+ if (params_.auto_regression_coeff_lag > 0 || use_luma) {
+ dsp.film_grain.chroma_auto_regression[static_cast<int>(
+ use_luma)][params_.auto_regression_coeff_lag](
+ params_, luma_grain_, subsampling_x_, subsampling_y_, u_grain_,
+ v_grain_);
+ }
+ }
+
+ // Section 7.18.3.4. Scaling lookup initialization process.
+
+ // Initialize scaling_lut_y_. If params_.num_y_points > 0, scaling_lut_y_
+ // is used for the Y plane. If params_.chroma_scaling_from_luma is true,
+ // scaling_lut_u_ and scaling_lut_v_ are the same as scaling_lut_y_ and are
+ // set up as aliases. So we need to initialize scaling_lut_y_ under these
+ // two conditions.
+ //
+ // Note: Although it does not seem to make sense, there are test vectors
+ // with chroma_scaling_from_luma=true and params_.num_y_points=0.
+ if (use_luma || params_.chroma_scaling_from_luma) {
+ dsp.film_grain.initialize_scaling_lut(
+ params_.num_y_points, params_.point_y_value, params_.point_y_scaling,
+ scaling_lut_y_);
+ } else {
+ ASAN_POISON_MEMORY_REGION(scaling_lut_y_, sizeof(scaling_lut_y_));
+ }
+ if (!is_monochrome_) {
+ if (params_.chroma_scaling_from_luma) {
+ scaling_lut_u_ = scaling_lut_y_;
+ scaling_lut_v_ = scaling_lut_y_;
+ } else if (params_.num_u_points > 0 || params_.num_v_points > 0) {
+ const size_t buffer_size =
+ (kScalingLookupTableSize + kScalingLookupTablePadding) *
+ (static_cast<int>(params_.num_u_points > 0) +
+ static_cast<int>(params_.num_v_points > 0));
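+      // For example, with scaling points for both U and V, the buffer below
+      // holds two consecutive tables: scaling_lut_u_ gets the first and
+      // scaling_lut_v_ the second.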
+ scaling_lut_chroma_buffer_.reset(new (std::nothrow) uint8_t[buffer_size]);
+ if (scaling_lut_chroma_buffer_ == nullptr) return false;
+
+ uint8_t* buffer = scaling_lut_chroma_buffer_.get();
+ if (params_.num_u_points > 0) {
+ scaling_lut_u_ = buffer;
+ dsp.film_grain.initialize_scaling_lut(
+ params_.num_u_points, params_.point_u_value,
+ params_.point_u_scaling, scaling_lut_u_);
+ buffer += kScalingLookupTableSize + kScalingLookupTablePadding;
+ }
+ if (params_.num_v_points > 0) {
+ scaling_lut_v_ = buffer;
+ dsp.film_grain.initialize_scaling_lut(
+ params_.num_v_points, params_.point_v_value,
+ params_.point_v_scaling, scaling_lut_v_);
+ }
+ }
+ }
+ return true;
+}
+
+template <int bitdepth>
+void FilmGrain<bitdepth>::GenerateLumaGrain(const FilmGrainParams& params,
+ GrainType* luma_grain) {
+ // If params.num_y_points is equal to 0, Section 7.18.3.3 specifies we set
+ // the luma_grain array to all zeros. But the Note at the end of Section
+ // 7.18.3.3 says luma_grain "will never be read in this case". So we don't
+ // call GenerateLumaGrain if params.num_y_points is equal to 0.
+ assert(params.num_y_points > 0);
+ const int shift = 12 - bitdepth + params.grain_scale_shift;
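+  // For example, at bitdepth 10 with grain_scale_shift 0, shift is 2, so each
+  // Gaussian sample is divided by 4 with rounding.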
+ uint16_t seed = params.grain_seed;
+ GrainType* luma_grain_row = luma_grain;
+ for (int y = 0; y < kLumaHeight; ++y) {
+ for (int x = 0; x < kLumaWidth; ++x) {
+ luma_grain_row[x] = RightShiftWithRounding(
+ kGaussianSequence[GetFilmGrainRandomNumber(11, &seed)], shift);
+ }
+ luma_grain_row += kLumaWidth;
+ }
+}
+
+template <int bitdepth>
+void FilmGrain<bitdepth>::GenerateChromaGrains(const FilmGrainParams& params,
+ int chroma_width,
+ int chroma_height,
+ GrainType* u_grain,
+ GrainType* v_grain) {
+ const int shift = 12 - bitdepth + params.grain_scale_shift;
+ if (params.num_u_points == 0 && !params.chroma_scaling_from_luma) {
+ memset(u_grain, 0, chroma_height * chroma_width * sizeof(*u_grain));
+ } else {
+ uint16_t seed = params.grain_seed ^ 0xb524;
+ GrainType* u_grain_row = u_grain;
+ assert(chroma_width > 0);
+ assert(chroma_height > 0);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ u_grain_row[x] = RightShiftWithRounding(
+ kGaussianSequence[GetFilmGrainRandomNumber(11, &seed)], shift);
+ } while (++x < chroma_width);
+
+ u_grain_row += chroma_width;
+ } while (++y < chroma_height);
+ }
+ if (params.num_v_points == 0 && !params.chroma_scaling_from_luma) {
+ memset(v_grain, 0, chroma_height * chroma_width * sizeof(*v_grain));
+ } else {
+ GrainType* v_grain_row = v_grain;
+ uint16_t seed = params.grain_seed ^ 0x49d8;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ v_grain_row[x] = RightShiftWithRounding(
+ kGaussianSequence[GetFilmGrainRandomNumber(11, &seed)], shift);
+ } while (++x < chroma_width);
+
+ v_grain_row += chroma_width;
+ } while (++y < chroma_height);
+ }
+}
+
+template <int bitdepth>
+bool FilmGrain<bitdepth>::AllocateNoiseStripes() {
+ const int half_height = DivideBy2(height_ + 1);
+ assert(half_height > 0);
+ // ceil(half_height / 16.0)
+ const int max_luma_num = DivideBy16(half_height + 15);
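+  // For example, a 1080-row frame gives half_height = 540 and
+  // max_luma_num = (540 + 15) >> 4 = 34 noise stripes per plane.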
+ constexpr int kNoiseStripeHeight = 34;
+ size_t noise_buffer_size = kNoiseStripePadding;
+ if (params_.num_y_points > 0) {
+ noise_buffer_size += max_luma_num * kNoiseStripeHeight * width_;
+ }
+ if (!is_monochrome_) {
+ noise_buffer_size += 2 * max_luma_num *
+ (kNoiseStripeHeight >> subsampling_y_) *
+ SubsampledValue(width_, subsampling_x_);
+ }
+ noise_buffer_.reset(new (std::nothrow) GrainType[noise_buffer_size]);
+ if (noise_buffer_ == nullptr) return false;
+ GrainType* noise_buffer = noise_buffer_.get();
+ if (params_.num_y_points > 0) {
+ noise_stripes_[kPlaneY].Reset(max_luma_num, kNoiseStripeHeight * width_,
+ noise_buffer);
+ noise_buffer += max_luma_num * kNoiseStripeHeight * width_;
+ }
+ if (!is_monochrome_) {
+ noise_stripes_[kPlaneU].Reset(max_luma_num,
+ (kNoiseStripeHeight >> subsampling_y_) *
+ SubsampledValue(width_, subsampling_x_),
+ noise_buffer);
+ noise_buffer += max_luma_num * (kNoiseStripeHeight >> subsampling_y_) *
+ SubsampledValue(width_, subsampling_x_);
+ noise_stripes_[kPlaneV].Reset(max_luma_num,
+ (kNoiseStripeHeight >> subsampling_y_) *
+ SubsampledValue(width_, subsampling_x_),
+ noise_buffer);
+ }
+ return true;
+}
+
+template <int bitdepth>
+bool FilmGrain<bitdepth>::AllocateNoiseImage() {
+ if (params_.num_y_points > 0 &&
+ !noise_image_[kPlaneY].Reset(height_, width_ + kNoiseImagePadding,
+ /*zero_initialize=*/false)) {
+ return false;
+ }
+ if (!is_monochrome_) {
+ if (!noise_image_[kPlaneU].Reset(
+ (height_ + subsampling_y_) >> subsampling_y_,
+ ((width_ + subsampling_x_) >> subsampling_x_) + kNoiseImagePadding,
+ /*zero_initialize=*/false)) {
+ return false;
+ }
+ if (!noise_image_[kPlaneV].Reset(
+ (height_ + subsampling_y_) >> subsampling_y_,
+ ((width_ + subsampling_x_) >> subsampling_x_) + kNoiseImagePadding,
+ /*zero_initialize=*/false)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+// Uses |stripe_start_offset| (derived from the overlap flag by the caller) to
+// skip rows that are covered by the overlap computation.
+template <int bitdepth>
+void FilmGrain<bitdepth>::ConstructNoiseImage(
+ const Array2DView<GrainType>* noise_stripes, int width, int height,
+ int subsampling_x, int subsampling_y, int stripe_start_offset,
+ Array2D<GrainType>* noise_image) {
+ const int plane_width = (width + subsampling_x) >> subsampling_x;
+ const int plane_height = (height + subsampling_y) >> subsampling_y;
+ const int stripe_height = 32 >> subsampling_y;
+ const int stripe_mask = stripe_height - 1;
+ int y = 0;
+ // |luma_num| = y >> (5 - |subsampling_y|). Hence |luma_num| == 0 for all y
+ // below 32 (or below 16 when |subsampling_y| is 1).
+ const GrainType* first_noise_stripe = (*noise_stripes)[0];
+ do {
+ memcpy((*noise_image)[y], first_noise_stripe + y * plane_width,
+ plane_width * sizeof(first_noise_stripe[0]));
+ } while (++y < std::min(stripe_height, plane_height));
+ // End special iterations for luma_num == 0.
+
+ int luma_num = 1;
+ for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
+ const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+ int i = stripe_start_offset;
+ do {
+ memcpy((*noise_image)[y + i], noise_stripe + i * plane_width,
+ plane_width * sizeof(noise_stripe[0]));
+ } while (++i < stripe_height);
+ }
+
+ // If there is a partial stripe, copy any rows beyond the overlap rows.
+ const int remaining_height = plane_height - y;
+ if (remaining_height > stripe_start_offset) {
+ assert(luma_num < noise_stripes->rows());
+ const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+ int i = stripe_start_offset;
+ do {
+ memcpy((*noise_image)[y + i], noise_stripe + i * plane_width,
+ plane_width * sizeof(noise_stripe[0]));
+ } while (++i < remaining_height);
+ }
+}
+
+template <int bitdepth>
+void FilmGrain<bitdepth>::BlendNoiseChromaWorker(
+ const dsp::Dsp& dsp, const Plane* planes, int num_planes,
+ std::atomic<int>* job_counter, int min_value, int max_chroma,
+ const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
+ const uint8_t* source_plane_u, const uint8_t* source_plane_v,
+ ptrdiff_t source_stride_uv, uint8_t* dest_plane_u, uint8_t* dest_plane_v,
+ ptrdiff_t dest_stride_uv) {
+ assert(num_planes > 0);
+ const int full_jobs_per_plane = height_ / kFrameChunkHeight;
+ const int remainder_job_height = height_ & (kFrameChunkHeight - 1);
+ const int total_full_jobs = full_jobs_per_plane * num_planes;
+ // If the frame height is not a multiple of kFrameChunkHeight, one job with
+ // a smaller number of rows is necessary at the end of each plane.
+ const int total_jobs =
+ total_full_jobs + ((remainder_job_height == 0) ? 0 : num_planes);
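+  // For example, height_ = 100 with both chroma planes active gives 12 full
+  // 8-row jobs per plane plus one 4-row remainder job per plane, so
+  // total_jobs = 26.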
+ int job_index;
+ // Each job corresponds to a slice of kFrameChunkHeight rows in the luma
+ // plane. dsp->blend_noise_chroma handles subsampling.
+ // This loop body handles a slice of one of the active planes, alternating
+ // between them, so that threads working on consecutive jobs keep the same
+ // region of the luma source in working memory.
+ while ((job_index = job_counter->fetch_add(1, std::memory_order_relaxed)) <
+ total_jobs) {
+ const Plane plane = planes[job_index % num_planes];
+ const int slice_index = job_index / num_planes;
+ const int start_height = slice_index * kFrameChunkHeight;
+ const int job_height = std::min(height_ - start_height, kFrameChunkHeight);
+
+ const auto* source_cursor_y = reinterpret_cast<const Pixel*>(
+ source_plane_y + start_height * source_stride_y);
+ const uint8_t* scaling_lut_uv;
+ const uint8_t* source_plane_uv;
+ uint8_t* dest_plane_uv;
+
+ if (plane == kPlaneU) {
+ scaling_lut_uv = scaling_lut_u_;
+ source_plane_uv = source_plane_u;
+ dest_plane_uv = dest_plane_u;
+ } else {
+ assert(plane == kPlaneV);
+ scaling_lut_uv = scaling_lut_v_;
+ source_plane_uv = source_plane_v;
+ dest_plane_uv = dest_plane_v;
+ }
+ const auto* source_cursor_uv = reinterpret_cast<const Pixel*>(
+ source_plane_uv + (start_height >> subsampling_y_) * source_stride_uv);
+ auto* dest_cursor_uv = reinterpret_cast<Pixel*>(
+ dest_plane_uv + (start_height >> subsampling_y_) * dest_stride_uv);
+ dsp.film_grain.blend_noise_chroma[params_.chroma_scaling_from_luma](
+ plane, params_, noise_image_, min_value, max_chroma, width_, job_height,
+ start_height, subsampling_x_, subsampling_y_, scaling_lut_uv,
+ source_cursor_y, source_stride_y, source_cursor_uv, source_stride_uv,
+ dest_cursor_uv, dest_stride_uv);
+ }
+}
+
+template <int bitdepth>
+void FilmGrain<bitdepth>::BlendNoiseLumaWorker(
+ const dsp::Dsp& dsp, std::atomic<int>* job_counter, int min_value,
+ int max_luma, const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
+ uint8_t* dest_plane_y, ptrdiff_t dest_stride_y) {
+ const int total_full_jobs = height_ / kFrameChunkHeight;
+ const int remainder_job_height = height_ & (kFrameChunkHeight - 1);
+ const int total_jobs =
+ total_full_jobs + static_cast<int>(remainder_job_height > 0);
+ int job_index;
+ // Each job is some number of rows in a plane.
+ while ((job_index = job_counter->fetch_add(1, std::memory_order_relaxed)) <
+ total_jobs) {
+ const int start_height = job_index * kFrameChunkHeight;
+ const int job_height = std::min(height_ - start_height, kFrameChunkHeight);
+
+ const auto* source_cursor_y = reinterpret_cast<const Pixel*>(
+ source_plane_y + start_height * source_stride_y);
+ auto* dest_cursor_y =
+ reinterpret_cast<Pixel*>(dest_plane_y + start_height * dest_stride_y);
+ dsp.film_grain.blend_noise_luma(
+ noise_image_, min_value, max_luma, params_.chroma_scaling, width_,
+ job_height, start_height, scaling_lut_y_, source_cursor_y,
+ source_stride_y, dest_cursor_y, dest_stride_y);
+ }
+}
+
+template <int bitdepth>
+bool FilmGrain<bitdepth>::AddNoise(
+ const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
+ const uint8_t* source_plane_u, const uint8_t* source_plane_v,
+ ptrdiff_t source_stride_uv, uint8_t* dest_plane_y, ptrdiff_t dest_stride_y,
+ uint8_t* dest_plane_u, uint8_t* dest_plane_v, ptrdiff_t dest_stride_uv) {
+ if (!Init()) {
+ LIBGAV1_DLOG(ERROR, "Init() failed.");
+ return false;
+ }
+ if (!AllocateNoiseStripes()) {
+ LIBGAV1_DLOG(ERROR, "AllocateNoiseStripes() failed.");
+ return false;
+ }
+
+ const dsp::Dsp& dsp = *dsp::GetDspTable(bitdepth);
+ const bool use_luma = params_.num_y_points > 0;
+
+ // Construct noise stripes.
+ if (use_luma) {
+ // The luma plane is never subsampled.
+ dsp.film_grain
+ .construct_noise_stripes[static_cast<int>(params_.overlap_flag)](
+ luma_grain_, params_.grain_seed, width_, height_,
+ /*subsampling_x=*/0, /*subsampling_y=*/0, &noise_stripes_[kPlaneY]);
+ }
+ if (!is_monochrome_) {
+ dsp.film_grain
+ .construct_noise_stripes[static_cast<int>(params_.overlap_flag)](
+ u_grain_, params_.grain_seed, width_, height_, subsampling_x_,
+ subsampling_y_, &noise_stripes_[kPlaneU]);
+ dsp.film_grain
+ .construct_noise_stripes[static_cast<int>(params_.overlap_flag)](
+ v_grain_, params_.grain_seed, width_, height_, subsampling_x_,
+ subsampling_y_, &noise_stripes_[kPlaneV]);
+ }
+
+ if (!AllocateNoiseImage()) {
+ LIBGAV1_DLOG(ERROR, "AllocateNoiseImage() failed.");
+ return false;
+ }
+
+ // Construct noise image.
+ if (use_luma) {
+ ConstructNoiseImage(
+ &noise_stripes_[kPlaneY], width_, height_, /*subsampling_x=*/0,
+ /*subsampling_y=*/0, static_cast<int>(params_.overlap_flag) << 1,
+ &noise_image_[kPlaneY]);
+ if (params_.overlap_flag) {
+ dsp.film_grain.construct_noise_image_overlap(
+ &noise_stripes_[kPlaneY], width_, height_, /*subsampling_x=*/0,
+ /*subsampling_y=*/0, &noise_image_[kPlaneY]);
+ }
+ }
+ if (!is_monochrome_) {
+ ConstructNoiseImage(&noise_stripes_[kPlaneU], width_, height_,
+ subsampling_x_, subsampling_y_,
+ static_cast<int>(params_.overlap_flag)
+ << (1 - subsampling_y_),
+ &noise_image_[kPlaneU]);
+ ConstructNoiseImage(&noise_stripes_[kPlaneV], width_, height_,
+ subsampling_x_, subsampling_y_,
+ static_cast<int>(params_.overlap_flag)
+ << (1 - subsampling_y_),
+ &noise_image_[kPlaneV]);
+ if (params_.overlap_flag) {
+ dsp.film_grain.construct_noise_image_overlap(
+ &noise_stripes_[kPlaneU], width_, height_, subsampling_x_,
+ subsampling_y_, &noise_image_[kPlaneU]);
+ dsp.film_grain.construct_noise_image_overlap(
+ &noise_stripes_[kPlaneV], width_, height_, subsampling_x_,
+ subsampling_y_, &noise_image_[kPlaneV]);
+ }
+ }
+
+ // Blend noise image.
+ int min_value;
+ int max_luma;
+ int max_chroma;
+ if (params_.clip_to_restricted_range) {
+ min_value = 16 << (bitdepth - 8);
+ max_luma = 235 << (bitdepth - 8);
+ if (color_matrix_is_identity_) {
+ max_chroma = max_luma;
+ } else {
+ max_chroma = 240 << (bitdepth - 8);
+ }
+ } else {
+ min_value = 0;
+ max_luma = (256 << (bitdepth - 8)) - 1;
+ max_chroma = max_luma;
+ }
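+  // For example, at bitdepth 10 with clip_to_restricted_range set,
+  // min_value = 64, max_luma = 940, and max_chroma = 960 (or 940 when the
+  // color matrix is the identity); without clipping, max_luma = max_chroma =
+  // 1023.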
+
+ // Handle all chroma planes first because luma source may be altered in place.
+ if (!is_monochrome_) {
+ // A plain array is used here because a Vector cannot be captured by copy in
+ // the lambda that is scheduled on the thread pool.
+ Plane planes_to_blend[2];
+ int num_planes = 0;
+ if (params_.chroma_scaling_from_luma) {
+ // Both noise planes are computed from the luma scaling lookup table.
+ planes_to_blend[num_planes++] = kPlaneU;
+ planes_to_blend[num_planes++] = kPlaneV;
+ } else {
+ const int height_uv = SubsampledValue(height_, subsampling_y_);
+ const int width_uv = SubsampledValue(width_, subsampling_x_);
+
+ // Noise is applied according to a lookup table defined by piecewise
+ // linear "points." If the lookup table is empty, that corresponds to
+ // outputting zero noise.
+ if (params_.num_u_points == 0) {
+ CopyImagePlane<Pixel>(source_plane_u, source_stride_uv, width_uv,
+ height_uv, dest_plane_u, dest_stride_uv);
+ } else {
+ planes_to_blend[num_planes++] = kPlaneU;
+ }
+ if (params_.num_v_points == 0) {
+ CopyImagePlane<Pixel>(source_plane_v, source_stride_uv, width_uv,
+ height_uv, dest_plane_v, dest_stride_uv);
+ } else {
+ planes_to_blend[num_planes++] = kPlaneV;
+ }
+ }
+ if (thread_pool_ != nullptr && num_planes > 0) {
+ const int num_workers = thread_pool_->num_threads();
+ BlockingCounter pending_workers(num_workers);
+ std::atomic<int> job_counter(0);
+ for (int i = 0; i < num_workers; ++i) {
+ thread_pool_->Schedule([this, dsp, &pending_workers, &planes_to_blend,
+ num_planes, &job_counter, min_value, max_chroma,
+ source_plane_y, source_stride_y, source_plane_u,
+ source_plane_v, source_stride_uv, dest_plane_u,
+ dest_plane_v, dest_stride_uv]() {
+ BlendNoiseChromaWorker(dsp, planes_to_blend, num_planes, &job_counter,
+ min_value, max_chroma, source_plane_y,
+ source_stride_y, source_plane_u,
+ source_plane_v, source_stride_uv, dest_plane_u,
+ dest_plane_v, dest_stride_uv);
+ pending_workers.Decrement();
+ });
+ }
+ BlendNoiseChromaWorker(
+ dsp, planes_to_blend, num_planes, &job_counter, min_value, max_chroma,
+ source_plane_y, source_stride_y, source_plane_u, source_plane_v,
+ source_stride_uv, dest_plane_u, dest_plane_v, dest_stride_uv);
+
+ pending_workers.Wait();
+ } else {
+ // Single threaded.
+ if (params_.num_u_points > 0 || params_.chroma_scaling_from_luma) {
+ dsp.film_grain.blend_noise_chroma[params_.chroma_scaling_from_luma](
+ kPlaneU, params_, noise_image_, min_value, max_chroma, width_,
+ height_, /*start_height=*/0, subsampling_x_, subsampling_y_,
+ scaling_lut_u_, source_plane_y, source_stride_y, source_plane_u,
+ source_stride_uv, dest_plane_u, dest_stride_uv);
+ }
+ if (params_.num_v_points > 0 || params_.chroma_scaling_from_luma) {
+ dsp.film_grain.blend_noise_chroma[params_.chroma_scaling_from_luma](
+ kPlaneV, params_, noise_image_, min_value, max_chroma, width_,
+ height_, /*start_height=*/0, subsampling_x_, subsampling_y_,
+ scaling_lut_v_, source_plane_y, source_stride_y, source_plane_v,
+ source_stride_uv, dest_plane_v, dest_stride_uv);
+ }
+ }
+ }
+ if (use_luma) {
+ if (thread_pool_ != nullptr) {
+ const int num_workers = thread_pool_->num_threads();
+ BlockingCounter pending_workers(num_workers);
+ std::atomic<int> job_counter(0);
+ for (int i = 0; i < num_workers; ++i) {
+ thread_pool_->Schedule(
+ [this, dsp, &pending_workers, &job_counter, min_value, max_luma,
+ source_plane_y, source_stride_y, dest_plane_y, dest_stride_y]() {
+ BlendNoiseLumaWorker(dsp, &job_counter, min_value, max_luma,
+ source_plane_y, source_stride_y,
+ dest_plane_y, dest_stride_y);
+ pending_workers.Decrement();
+ });
+ }
+
+ BlendNoiseLumaWorker(dsp, &job_counter, min_value, max_luma,
+ source_plane_y, source_stride_y, dest_plane_y,
+ dest_stride_y);
+ pending_workers.Wait();
+ } else {
+ dsp.film_grain.blend_noise_luma(
+ noise_image_, min_value, max_luma, params_.chroma_scaling, width_,
+ height_, /*start_height=*/0, scaling_lut_y_, source_plane_y,
+ source_stride_y, dest_plane_y, dest_stride_y);
+ }
+ } else {
+ CopyImagePlane<Pixel>(source_plane_y, source_stride_y, width_, height_,
+ dest_plane_y, dest_stride_y);
+ }
+
+ return true;
+}
+
+// Explicit instantiations.
+template class FilmGrain<8>;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template class FilmGrain<10>;
+#endif
+
+} // namespace libgav1
diff --git a/src/film_grain.h b/src/film_grain.h
new file mode 100644
index 0000000..b588f6d
--- /dev/null
+++ b/src/film_grain.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_FILM_GRAIN_H_
+#define LIBGAV1_SRC_FILM_GRAIN_H_
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/threadpool.h"
+#include "src/utils/types.h"
+#include "src/utils/vector.h"
+
+namespace libgav1 {
+
+// Film grain synthesis function signature. Section 7.18.3.
+// This function generates film grain noise and blends the noise with the
+// decoded frame.
+// |source_plane_y|, |source_plane_u|, and |source_plane_v| are the plane
+// buffers of the decoded frame. They are blended with the film grain noise and
+// written to |dest_plane_y|, |dest_plane_u|, and |dest_plane_v| as final
+// output for display. |source_plane_p| and |dest_plane_p| (where p is y, u, or
+// v) may point to the same buffer, in which case the film grain noise is added
+// in place.
+// |film_grain_params| are parameters read from frame header.
+// If |is_monochrome| is true, only the Y plane needs to be processed.
+// |color_matrix_is_identity| is true if the matrix_coefficients field in the
+// sequence header's color config is MC_IDENTITY.
+// |width| is the upscaled width of the frame.
+// |height| is the frame height.
+// |subsampling_x| and |subsampling_y| are subsamplings for UV planes, not used
+// if |is_monochrome| is true.
+// Returns true on success, or false on failure (e.g., out of memory).
+using FilmGrainSynthesisFunc = bool (*)(
+ const void* source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_u, ptrdiff_t source_stride_u,
+ const void* source_plane_v, ptrdiff_t source_stride_v,
+ const FilmGrainParams& film_grain_params, bool is_monochrome,
+ bool color_matrix_is_identity, int width, int height, int subsampling_x,
+ int subsampling_y, void* dest_plane_y, ptrdiff_t dest_stride_y,
+ void* dest_plane_u, ptrdiff_t dest_stride_u, void* dest_plane_v,
+ ptrdiff_t dest_stride_v);
+
+// Section 7.18.3.5. Add noise synthesis process.
+template <int bitdepth>
+class FilmGrain {
+ public:
+ using GrainType =
+ typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+
+ FilmGrain(const FilmGrainParams& params, bool is_monochrome,
+ bool color_matrix_is_identity, int subsampling_x, int subsampling_y,
+ int width, int height, ThreadPool* thread_pool);
+
+ // Note: These static methods are declared public so that the unit tests can
+ // call them.
+
+ static void GenerateLumaGrain(const FilmGrainParams& params,
+ GrainType* luma_grain);
+
+ // Generates white noise arrays u_grain and v_grain chroma_width samples wide
+ // and chroma_height samples high.
+ static void GenerateChromaGrains(const FilmGrainParams& params,
+ int chroma_width, int chroma_height,
+ GrainType* u_grain, GrainType* v_grain);
+
+ // Copies rows from |noise_stripes| to |noise_image|, skipping rows that are
+ // subject to overlap.
+ static void ConstructNoiseImage(const Array2DView<GrainType>* noise_stripes,
+ int width, int height, int subsampling_x,
+ int subsampling_y, int stripe_start_offset,
+ Array2D<GrainType>* noise_image);
+
+ // Combines the film grain with the image data.
+ bool AddNoise(const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
+ const uint8_t* source_plane_u, const uint8_t* source_plane_v,
+ ptrdiff_t source_stride_uv, uint8_t* dest_plane_y,
+ ptrdiff_t dest_stride_y, uint8_t* dest_plane_u,
+ uint8_t* dest_plane_v, ptrdiff_t dest_stride_uv);
+
+ private:
+ using Pixel =
+ typename std::conditional<bitdepth == 8, uint8_t, uint16_t>::type;
+
+ bool Init();
+
+ // Allocates noise_stripes_.
+ bool AllocateNoiseStripes();
+
+ bool AllocateNoiseImage();
+
+ void BlendNoiseChromaWorker(const dsp::Dsp& dsp, const Plane* planes,
+ int num_planes, std::atomic<int>* job_counter,
+ int min_value, int max_chroma,
+ const uint8_t* source_plane_y,
+ ptrdiff_t source_stride_y,
+ const uint8_t* source_plane_u,
+ const uint8_t* source_plane_v,
+ ptrdiff_t source_stride_uv, uint8_t* dest_plane_u,
+ uint8_t* dest_plane_v, ptrdiff_t dest_stride_uv);
+
+ void BlendNoiseLumaWorker(const dsp::Dsp& dsp, std::atomic<int>* job_counter,
+ int min_value, int max_luma,
+ const uint8_t* source_plane_y,
+ ptrdiff_t source_stride_y, uint8_t* dest_plane_y,
+ ptrdiff_t dest_stride_y);
+
+ const FilmGrainParams& params_;
+ const bool is_monochrome_;
+ const bool color_matrix_is_identity_;
+ const int subsampling_x_;
+ const int subsampling_y_;
+ // Frame width and height.
+ const int width_;
+ const int height_;
+ // Section 7.18.3.3, Dimensions of the noise templates for chroma, which are
+ // known as CbGrain and CrGrain.
+ // These templates are used to construct the noise image for each plane by
+ // copying 32x32 blocks with pseudorandom offsets, into "noise stripes."
+ // The luma noise template, known as the LumaGrain array, is an 82x73 block.
+ // Under subsampling, the chroma templates shrink to 44 samples wide and 38
+ // samples high.
+ // For more details see:
+ // A. Norkin and N. Birkbeck, "Film Grain Synthesis for AV1 Video Codec," 2018
+ // Data Compression Conference, Snowbird, UT, 2018, pp. 3-12.
+ const int template_uv_width_;
+ const int template_uv_height_;
+ // LumaGrain. The luma_grain array contains white noise generated for luma.
+ // The array size is fixed but subject to further optimization for SIMD.
+ GrainType luma_grain_[kLumaHeight * kLumaWidth];
+ // CbGrain and CrGrain. The maximum size of the u_grain and v_grain arrays is
+ // kMaxChromaHeight * kMaxChromaWidth. The actual size is
+ // template_uv_height_ * template_uv_width_.
+ GrainType u_grain_[kMaxChromaHeight * kMaxChromaWidth];
+ GrainType v_grain_[kMaxChromaHeight * kMaxChromaWidth];
+ // Scaling lookup tables.
+ uint8_t scaling_lut_y_[kScalingLookupTableSize + kScalingLookupTablePadding];
+ uint8_t* scaling_lut_u_ = nullptr;
+ uint8_t* scaling_lut_v_ = nullptr;
+ // If allocated, this buffer holds one scaling lookup table of
+ // kScalingLookupTableSize + kScalingLookupTablePadding bytes for each chroma
+ // plane that has scaling points, and scaling_lut_u_ and scaling_lut_v_ point
+ // into it. When chroma scaling is derived from luma, this buffer is not
+ // allocated and scaling_lut_u_ and scaling_lut_v_ alias scaling_lut_y_.
+ std::unique_ptr<uint8_t[]> scaling_lut_chroma_buffer_;
+
+ // A two-dimensional array of noise data for each plane. Generated for each 32
+ // luma sample high stripe of the image. The first dimension is called
+ // luma_num. The second dimension is the size of one noise stripe.
+ //
+ // Each row of the Array2DView noise_stripes_[plane] is a conceptually
+ // two-dimensional array of |GrainType|s. The two-dimensional array of
+ // |GrainType|s is flattened into a one-dimensional buffer in this
+ // implementation.
+ //
+ // noise_stripes_[kPlaneY][luma_num] is an array that has 34 rows and
+ // |width_| columns and contains noise for the luma component.
+ //
+ // noise_stripes_[kPlaneU][luma_num] or noise_stripes_[kPlaneV][luma_num]
+ // is an array that has (34 >> subsampling_y_) rows and
+ // SubsampledValue(width_, subsampling_x_) columns and contains noise for the
+ // chroma components.
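+ // For example, for a 1920-wide luma plane, each noise_stripes_[kPlaneY][n]
+ // holds 34 * 1920 GrainType values.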
+ Array2DView<GrainType> noise_stripes_[kMaxPlanes];
+ // Owns the memory that the elements of noise_stripes_ point to.
+ std::unique_ptr<GrainType[]> noise_buffer_;
+
+ Array2D<GrainType> noise_image_[kMaxPlanes];
+ ThreadPool* const thread_pool_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_FILM_GRAIN_H_
diff --git a/src/frame_buffer.cc b/src/frame_buffer.cc
new file mode 100644
index 0000000..50c7756
--- /dev/null
+++ b/src/frame_buffer.cc
@@ -0,0 +1,151 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/frame_buffer.h"
+
+#include <cstdint>
+
+#include "src/frame_buffer_utils.h"
+#include "src/utils/common.h"
+
+extern "C" {
+
+Libgav1StatusCode Libgav1ComputeFrameBufferInfo(
+ int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+ int left_border, int right_border, int top_border, int bottom_border,
+ int stride_alignment, Libgav1FrameBufferInfo* info) {
+ switch (bitdepth) {
+ case 8:
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ case 10:
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ case 12:
+#endif
+ break;
+ default:
+ return kLibgav1StatusInvalidArgument;
+ }
+ switch (image_format) {
+ case kLibgav1ImageFormatYuv420:
+ case kLibgav1ImageFormatYuv422:
+ case kLibgav1ImageFormatYuv444:
+ case kLibgav1ImageFormatMonochrome400:
+ break;
+ default:
+ return kLibgav1StatusInvalidArgument;
+ }
+ // All int arguments must be nonnegative. Borders must be a multiple of 2.
+ // |stride_alignment| must be a power of 2.
+ if ((width | height | left_border | right_border | top_border |
+ bottom_border | stride_alignment) < 0 ||
+ ((left_border | right_border | top_border | bottom_border) & 1) != 0 ||
+ (stride_alignment & (stride_alignment - 1)) != 0 || info == nullptr) {
+ return kLibgav1StatusInvalidArgument;
+ }
+
+ bool is_monochrome;
+ int8_t subsampling_x;
+ int8_t subsampling_y;
+ libgav1::DecomposeImageFormat(image_format, &is_monochrome, &subsampling_x,
+ &subsampling_y);
+
+ // Calculate y_stride (in bytes). It is padded to a multiple of
+ // |stride_alignment| bytes.
+ int y_stride = width + left_border + right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) y_stride *= sizeof(uint16_t);
+#endif
+ y_stride = libgav1::Align(y_stride, stride_alignment);
+ // Size of the Y buffer in bytes.
+ const uint64_t y_buffer_size =
+ (height + top_border + bottom_border) * static_cast<uint64_t>(y_stride) +
+ (stride_alignment - 1);
+
+ const int uv_width =
+ is_monochrome ? 0 : libgav1::SubsampledValue(width, subsampling_x);
+ const int uv_height =
+ is_monochrome ? 0 : libgav1::SubsampledValue(height, subsampling_y);
+ const int uv_left_border = is_monochrome ? 0 : left_border >> subsampling_x;
+ const int uv_right_border = is_monochrome ? 0 : right_border >> subsampling_x;
+ const int uv_top_border = is_monochrome ? 0 : top_border >> subsampling_y;
+ const int uv_bottom_border =
+ is_monochrome ? 0 : bottom_border >> subsampling_y;
+
+ // Calculate uv_stride (in bytes). It is padded to a multiple of
+ // |stride_alignment| bytes.
+ int uv_stride = uv_width + uv_left_border + uv_right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) uv_stride *= sizeof(uint16_t);
+#endif
+ uv_stride = libgav1::Align(uv_stride, stride_alignment);
+ // Size of the U or V buffer in bytes.
+ const uint64_t uv_buffer_size =
+ is_monochrome ? 0
+ : (uv_height + uv_top_border + uv_bottom_border) *
+ static_cast<uint64_t>(uv_stride) +
+ (stride_alignment - 1);
+
+ // Check if it is safe to cast y_buffer_size and uv_buffer_size to size_t.
+ if (y_buffer_size > SIZE_MAX || uv_buffer_size > SIZE_MAX) {
+ return kLibgav1StatusInvalidArgument;
+ }
+
+ int left_border_bytes = left_border;
+ int uv_left_border_bytes = uv_left_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) {
+ left_border_bytes *= sizeof(uint16_t);
+ uv_left_border_bytes *= sizeof(uint16_t);
+ }
+#endif
+
+ info->y_stride = y_stride;
+ info->uv_stride = uv_stride;
+ info->y_buffer_size = static_cast<size_t>(y_buffer_size);
+ info->uv_buffer_size = static_cast<size_t>(uv_buffer_size);
+ info->y_plane_offset = top_border * y_stride + left_border_bytes;
+ info->uv_plane_offset = uv_top_border * uv_stride + uv_left_border_bytes;
+ info->stride_alignment = stride_alignment;
+ return kLibgav1StatusOk;
+}
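+
+// Illustrative example (not part of the upstream source): for an 8-bit
+// 1280x720 YUV 4:2:0 frame with 8-pixel borders on every side and
+// stride_alignment 16, the function above yields y_stride = 1296,
+// uv_stride = Align(640 + 4 + 4, 16) = 656, and
+// y_buffer_size = (720 + 16) * 1296 + 15 bytes.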
+
+Libgav1StatusCode Libgav1SetFrameBuffer(const Libgav1FrameBufferInfo* info,
+ uint8_t* y_buffer, uint8_t* u_buffer,
+ uint8_t* v_buffer,
+ void* buffer_private_data,
+ Libgav1FrameBuffer* frame_buffer) {
+ if (info == nullptr ||
+ (info->uv_buffer_size == 0 &&
+ (u_buffer != nullptr || v_buffer != nullptr)) ||
+ frame_buffer == nullptr) {
+ return kLibgav1StatusInvalidArgument;
+ }
+ if (y_buffer == nullptr || (info->uv_buffer_size != 0 &&
+ (u_buffer == nullptr || v_buffer == nullptr))) {
+ return kLibgav1StatusOutOfMemory;
+ }
+ frame_buffer->plane[0] = libgav1::AlignAddr(y_buffer + info->y_plane_offset,
+ info->stride_alignment);
+ frame_buffer->plane[1] = libgav1::AlignAddr(u_buffer + info->uv_plane_offset,
+ info->stride_alignment);
+ frame_buffer->plane[2] = libgav1::AlignAddr(v_buffer + info->uv_plane_offset,
+ info->stride_alignment);
+ frame_buffer->stride[0] = info->y_stride;
+ frame_buffer->stride[1] = frame_buffer->stride[2] = info->uv_stride;
+ frame_buffer->private_data = buffer_private_data;
+ return kLibgav1StatusOk;
+}
+
+} // extern "C"
diff --git a/src/frame_buffer_utils.h b/src/frame_buffer_utils.h
new file mode 100644
index 0000000..d41437e
--- /dev/null
+++ b/src/frame_buffer_utils.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_FRAME_BUFFER_UTILS_H_
+#define LIBGAV1_SRC_FRAME_BUFFER_UTILS_H_
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/gav1/decoder_buffer.h"
+
+namespace libgav1 {
+
+// The following table is from Section 6.4.2 of the spec.
+//
+// subsampling_x subsampling_y mono_chrome Description
+// -----------------------------------------------------------
+// 0 0 0 YUV 4:4:4
+// 1 0 0 YUV 4:2:2
+// 1 1 0 YUV 4:2:0
+// 1 1 1 Monochrome 4:0:0
+
+inline Libgav1ImageFormat ComposeImageFormat(bool is_monochrome,
+ int8_t subsampling_x,
+ int8_t subsampling_y) {
+ Libgav1ImageFormat image_format;
+ if (subsampling_x == 0) {
+ assert(subsampling_y == 0 && !is_monochrome);
+ image_format = kLibgav1ImageFormatYuv444;
+ } else if (subsampling_y == 0) {
+ assert(!is_monochrome);
+ image_format = kLibgav1ImageFormatYuv422;
+ } else if (!is_monochrome) {
+ image_format = kLibgav1ImageFormatYuv420;
+ } else {
+ image_format = kLibgav1ImageFormatMonochrome400;
+ }
+ return image_format;
+}
+
+inline void DecomposeImageFormat(Libgav1ImageFormat image_format,
+ bool* is_monochrome, int8_t* subsampling_x,
+ int8_t* subsampling_y) {
+ *is_monochrome = false;
+ *subsampling_x = 1;
+ *subsampling_y = 1;
+ switch (image_format) {
+ case kLibgav1ImageFormatYuv420:
+ break;
+ case kLibgav1ImageFormatYuv422:
+ *subsampling_y = 0;
+ break;
+ case kLibgav1ImageFormatYuv444:
+ *subsampling_x = *subsampling_y = 0;
+ break;
+ default:
+ assert(image_format == kLibgav1ImageFormatMonochrome400);
+ *is_monochrome = true;
+ break;
+ }
+}
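+
+// For example, kLibgav1ImageFormatYuv420 decomposes to is_monochrome = false,
+// subsampling_x = 1, subsampling_y = 1, and ComposeImageFormat() maps that
+// triple back to kLibgav1ImageFormatYuv420.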
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_FRAME_BUFFER_UTILS_H_
diff --git a/src/frame_scratch_buffer.h b/src/frame_scratch_buffer.h
new file mode 100644
index 0000000..90c3bb8
--- /dev/null
+++ b/src/frame_scratch_buffer.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_
+#define LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_
+
+#include <condition_variable> // NOLINT (unapproved c++11 header)
+#include <cstdint>
+#include <memory>
+#include <mutex> // NOLINT (unapproved c++11 header)
+
+#include "src/loop_restoration_info.h"
+#include "src/residual_buffer_pool.h"
+#include "src/symbol_decoder_context.h"
+#include "src/threading_strategy.h"
+#include "src/tile_scratch_buffer.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/memory.h"
+#include "src/utils/stack.h"
+#include "src/utils/types.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+// Buffer used to store the unfiltered pixels that are necessary for decoding
+// the next superblock row (for the intra prediction process).
+using IntraPredictionBuffer =
+ std::array<AlignedDynamicBuffer<uint8_t, kMaxAlignment>, kMaxPlanes>;
+
+// Buffer to facilitate decoding a frame. This struct is used only within
+// DecoderImpl::DecodeTiles().
+struct FrameScratchBuffer {
+ LoopRestorationInfo loop_restoration_info;
+ Array2D<int16_t> cdef_index;
+ Array2D<TransformSize> inter_transform_sizes;
+ BlockParametersHolder block_parameters_holder;
+ TemporalMotionField motion_field;
+ SymbolDecoderContext symbol_decoder_context;
+ std::unique_ptr<ResidualBufferPool> residual_buffer_pool;
+ // Buffer used to store the cdef borders. This buffer will store 4 rows for
+ // every 64x64 block (4 rows for every 32x32 for chroma with subsampling). The
+ // indices of the rows that are stored are specified in |kCdefBorderRows|.
+ YuvBuffer cdef_border;
+ AlignedDynamicBuffer<uint8_t, 16> superres_coefficients[kNumPlaneTypes];
+ // Buffer used to temporarily store the input row for applying SuperRes.
+ YuvBuffer superres_line_buffer;
+ // Buffer used to store the loop restoration borders. This buffer will store 4
+ // rows for every 64x64 block (4 rows for every 32x32 for chroma with
+ // subsampling). The indices of the rows that are stored are specified in
+ // |kLoopRestorationBorderRows|.
+ YuvBuffer loop_restoration_border;
+ // The size of this dynamic buffer is |tile_rows|.
+ DynamicBuffer<IntraPredictionBuffer> intra_prediction_buffers;
+ TileScratchBufferPool tile_scratch_buffer_pool;
+ ThreadingStrategy threading_strategy;
+ std::mutex superblock_row_mutex;
+ // The size of this buffer is the number of superblock rows.
+ // |superblock_row_progress[i]| is incremented whenever a tile finishes
+ // decoding the superblock row at index i. If the count reaches tile_columns,
+ // then |superblock_row_progress_condvar[i]| is notified.
+ DynamicBuffer<int> superblock_row_progress
+ LIBGAV1_GUARDED_BY(superblock_row_mutex);
+ // The size of this buffer is the number of superblock rows. Used to wait for
+ // |superblock_row_progress[i]| to reach tile_columns.
+ DynamicBuffer<std::condition_variable> superblock_row_progress_condvar;
+ // Used to signal tile decoding failure in the combined multithreading mode.
+ bool tile_decoding_failed LIBGAV1_GUARDED_BY(superblock_row_mutex);
+};
+
+class FrameScratchBufferPool {
+ public:
+ std::unique_ptr<FrameScratchBuffer> Get() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ if (!buffers_.Empty()) {
+ return buffers_.Pop();
+ }
+ lock.unlock();
+ std::unique_ptr<FrameScratchBuffer> scratch_buffer(new (std::nothrow)
+ FrameScratchBuffer);
+ return scratch_buffer;
+ }
+
+ void Release(std::unique_ptr<FrameScratchBuffer> scratch_buffer) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ buffers_.Push(std::move(scratch_buffer));
+ }
+
+ private:
+ std::mutex mutex_;
+ Stack<std::unique_ptr<FrameScratchBuffer>, kMaxThreads> buffers_
+ LIBGAV1_GUARDED_BY(mutex_);
+};
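+
+// Typical usage (illustrative sketch, not from the upstream source), where
+// |pool| is a FrameScratchBufferPool:
+//   std::unique_ptr<FrameScratchBuffer> buffer = pool.Get();
+//   if (buffer == nullptr) return kStatusOutOfMemory;
+//   ... decode one frame using |buffer| ...
+//   pool.Release(std::move(buffer));
+// so that the allocation is reused for subsequent frames.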
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_
diff --git a/src/gav1/decoder.h b/src/gav1/decoder.h
new file mode 100644
index 0000000..da08da9
--- /dev/null
+++ b/src/gav1/decoder.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_DECODER_H_
+#define LIBGAV1_SRC_GAV1_DECODER_H_
+
+#if defined(__cplusplus)
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#else
+#include <stddef.h>
+#include <stdint.h>
+#endif // defined(__cplusplus)
+
+// IWYU pragma: begin_exports
+#include "gav1/decoder_buffer.h"
+#include "gav1/decoder_settings.h"
+#include "gav1/frame_buffer.h"
+#include "gav1/status_code.h"
+#include "gav1/symbol_visibility.h"
+#include "gav1/version.h"
+// IWYU pragma: end_exports
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+struct Libgav1Decoder;
+typedef struct Libgav1Decoder Libgav1Decoder;
+
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1DecoderCreate(
+ const Libgav1DecoderSettings* settings, Libgav1Decoder** decoder_out);
+
+LIBGAV1_PUBLIC void Libgav1DecoderDestroy(Libgav1Decoder* decoder);
+
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1DecoderEnqueueFrame(
+ Libgav1Decoder* decoder, const uint8_t* data, size_t size,
+ int64_t user_private_data, void* buffer_private_data);
+
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1DecoderDequeueFrame(
+ Libgav1Decoder* decoder, const Libgav1DecoderBuffer** out_ptr);
+
+LIBGAV1_PUBLIC Libgav1StatusCode
+Libgav1DecoderSignalEOS(Libgav1Decoder* decoder);
+
+LIBGAV1_PUBLIC int Libgav1DecoderGetMaxBitdepth(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+
+namespace libgav1 {
+
+// Forward declaration.
+class DecoderImpl;
+
+class LIBGAV1_PUBLIC Decoder {
+ public:
+ Decoder();
+ ~Decoder();
+
+ // Init must be called exactly once per instance. Subsequent calls will do
+ // nothing. If |settings| is nullptr, the decoder will be initialized with
+ // default settings. Returns kStatusOk on success, an error status otherwise.
+ StatusCode Init(const DecoderSettings* settings);
+
+ // Enqueues a compressed frame to be decoded.
+ //
+ // This function returns:
+ // * kStatusOk on success
+ // * kStatusTryAgain if the decoder queue is full
+ // * an error status otherwise.
+ //
+ // |user_private_data| may be used to associate application specific private
+ // data with the compressed frame. It will be copied to the user_private_data
+ // field of the DecoderBuffer returned by the corresponding |DequeueFrame()|
+ // call.
+ //
+ // NOTE: |EnqueueFrame()| does not copy the data. Therefore, after a
+ // successful |EnqueueFrame()| call, the caller must keep the |data| buffer
+ // alive until:
+ // 1) If |settings_.release_input_buffer| is not nullptr, then |data| buffer
+ // must be kept alive until release_input_buffer is called with the
+ // |buffer_private_data| passed into this EnqueueFrame call.
+ // 2) If |settings_.release_input_buffer| is nullptr, then |data| buffer must
+ // be kept alive until the corresponding DequeueFrame() call is completed.
+ //
+ // If the call to |EnqueueFrame()| is not successful, then libgav1 will not
+ // hold any references to the |data| buffer. |settings_.release_input_buffer|
+ // callback will not be called in that case.
+ StatusCode EnqueueFrame(const uint8_t* data, size_t size,
+ int64_t user_private_data, void* buffer_private_data);
+
+ // Dequeues a decompressed frame. If there are enqueued compressed frames,
+ // decodes one and sets |*out_ptr| to the last displayable frame in the
+ // compressed frame. If there are no displayable frames available, sets
+ // |*out_ptr| to nullptr.
+ //
+ // Returns kStatusOk on success. Returns kStatusNothingToDequeue if there are
+ // no enqueued frames (in this case out_ptr will always be set to nullptr).
+ // Returns one of the other error statuses if there is an error.
+ //
+ // If |settings_.blocking_dequeue| is false and the decoder is operating in
+ // frame parallel mode (|settings_.frame_parallel| is true and the video
+ // stream passes the decoder's heuristics for enabling frame parallel mode),
+ // then this call will return kStatusTryAgain if an enqueued frame is not yet
+ // decoded (it is a non blocking call in this case). In all other cases, this
+ // call will block until an enqueued frame has been decoded.
+ StatusCode DequeueFrame(const DecoderBuffer** out_ptr);
+
+ // Signals the end of stream.
+ //
+ // In non-frame-parallel mode, this function will release all the frames held
+ // by the decoder. If the frame buffers were allocated by libgav1, then the
+ // pointer obtained by the prior DequeueFrame call will no longer be valid. If
+ // the frame buffers were allocated by the application, then any references
+ // that libgav1 is holding on to will be released.
+ //
+ // Once this function returns successfully, the decoder state will be reset
+ // and the decoder is ready to start decoding a new coded video sequence.
+ StatusCode SignalEOS();
+
+ // Returns the maximum bitdepth that is supported by this decoder.
+ static int GetMaxBitdepth();
+
+ private:
+ DecoderSettings settings_;
+ // The object is initialized if and only if impl_ != nullptr.
+ std::unique_ptr<DecoderImpl> impl_;
+};
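+
+// A minimal decode-loop sketch (illustrative only, not part of the upstream
+// header). HaveInput(), GetNextCompressedFrame(), and ConsumeFrame() stand in
+// for application code, and the compressed |data| is assumed to stay alive
+// until the corresponding DequeueFrame() call returns:
+//
+//   libgav1::Decoder decoder;
+//   libgav1::DecoderSettings settings;  // default settings
+//   if (decoder.Init(&settings) != libgav1::kStatusOk) return;
+//   while (HaveInput()) {
+//     const uint8_t* data;
+//     size_t size;
+//     GetNextCompressedFrame(&data, &size);
+//     if (decoder.EnqueueFrame(data, size, /*user_private_data=*/0,
+//                              /*buffer_private_data=*/nullptr) !=
+//         libgav1::kStatusOk) {
+//       break;
+//     }
+//     const libgav1::DecoderBuffer* frame;
+//     if (decoder.DequeueFrame(&frame) == libgav1::kStatusOk &&
+//         frame != nullptr) {
+//       ConsumeFrame(*frame);
+//     }
+//   }
+//   decoder.SignalEOS();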
+
+} // namespace libgav1
+#endif // defined(__cplusplus)
+
+#endif // LIBGAV1_SRC_GAV1_DECODER_H_
diff --git a/src/gav1/decoder_buffer.h b/src/gav1/decoder_buffer.h
new file mode 100644
index 0000000..37bcb29
--- /dev/null
+++ b/src/gav1/decoder_buffer.h
@@ -0,0 +1,279 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_DECODER_BUFFER_H_
+#define LIBGAV1_SRC_GAV1_DECODER_BUFFER_H_
+
+#if defined(__cplusplus)
+#include <cstdint>
+#else
+#include <stdint.h>
+#endif // defined(__cplusplus)
+
+#include "gav1/symbol_visibility.h"
+
+// All the declarations in this file are part of the public ABI.
+
+// The documentation for the enum values in this file can be found in Section
+// 6.4.2 of the AV1 spec.
+
+typedef enum Libgav1ChromaSamplePosition {
+ kLibgav1ChromaSamplePositionUnknown,
+ kLibgav1ChromaSamplePositionVertical,
+ kLibgav1ChromaSamplePositionColocated,
+ kLibgav1ChromaSamplePositionReserved
+} Libgav1ChromaSamplePosition;
+
+typedef enum Libgav1ImageFormat {
+ kLibgav1ImageFormatYuv420,
+ kLibgav1ImageFormatYuv422,
+ kLibgav1ImageFormatYuv444,
+ kLibgav1ImageFormatMonochrome400
+} Libgav1ImageFormat;
+
+typedef enum Libgav1ColorPrimary {
+ // 0 is reserved.
+ kLibgav1ColorPrimaryBt709 = 1,
+ kLibgav1ColorPrimaryUnspecified,
+ // 3 is reserved.
+ kLibgav1ColorPrimaryBt470M = 4,
+ kLibgav1ColorPrimaryBt470Bg,
+ kLibgav1ColorPrimaryBt601,
+ kLibgav1ColorPrimarySmpte240,
+ kLibgav1ColorPrimaryGenericFilm,
+ kLibgav1ColorPrimaryBt2020,
+ kLibgav1ColorPrimaryXyz,
+ kLibgav1ColorPrimarySmpte431,
+ kLibgav1ColorPrimarySmpte432,
+ // 13-21 are reserved.
+ kLibgav1ColorPrimaryEbu3213 = 22,
+ // 23-254 are reserved.
+ kLibgav1MaxColorPrimaries = 255
+} Libgav1ColorPrimary;
+
+typedef enum Libgav1TransferCharacteristics {
+ // 0 is reserved.
+ kLibgav1TransferCharacteristicsBt709 = 1,
+ kLibgav1TransferCharacteristicsUnspecified,
+ // 3 is reserved.
+ kLibgav1TransferCharacteristicsBt470M = 4,
+ kLibgav1TransferCharacteristicsBt470Bg,
+ kLibgav1TransferCharacteristicsBt601,
+ kLibgav1TransferCharacteristicsSmpte240,
+ kLibgav1TransferCharacteristicsLinear,
+ kLibgav1TransferCharacteristicsLog100,
+ kLibgav1TransferCharacteristicsLog100Sqrt10,
+ kLibgav1TransferCharacteristicsIec61966,
+ kLibgav1TransferCharacteristicsBt1361,
+ kLibgav1TransferCharacteristicsSrgb,
+ kLibgav1TransferCharacteristicsBt2020TenBit,
+ kLibgav1TransferCharacteristicsBt2020TwelveBit,
+ kLibgav1TransferCharacteristicsSmpte2084,
+ kLibgav1TransferCharacteristicsSmpte428,
+ kLibgav1TransferCharacteristicsHlg,
+ // 19-254 are reserved.
+ kLibgav1MaxTransferCharacteristics = 255
+} Libgav1TransferCharacteristics;
+
+typedef enum Libgav1MatrixCoefficients {
+ kLibgav1MatrixCoefficientsIdentity,
+ kLibgav1MatrixCoefficientsBt709,
+ kLibgav1MatrixCoefficientsUnspecified,
+ // 3 is reserved.
+ kLibgav1MatrixCoefficientsFcc = 4,
+ kLibgav1MatrixCoefficientsBt470BG,
+ kLibgav1MatrixCoefficientsBt601,
+ kLibgav1MatrixCoefficientsSmpte240,
+ kLibgav1MatrixCoefficientsSmpteYcgco,
+ kLibgav1MatrixCoefficientsBt2020Ncl,
+ kLibgav1MatrixCoefficientsBt2020Cl,
+ kLibgav1MatrixCoefficientsSmpte2085,
+ kLibgav1MatrixCoefficientsChromatNcl,
+ kLibgav1MatrixCoefficientsChromatCl,
+ kLibgav1MatrixCoefficientsIctcp,
+ // 15-254 are reserved.
+ kLibgav1MaxMatrixCoefficients = 255
+} Libgav1MatrixCoefficients;
+
+typedef enum Libgav1ColorRange {
+  // The color ranges are scaled by value << (bitdepth - 8) for 10-bit and
+  // 12-bit streams.
+ kLibgav1ColorRangeStudio, // Y [16..235], UV [16..240]
+ kLibgav1ColorRangeFull // YUV/RGB [0..255]
+} Libgav1ColorRange;
+
+typedef struct Libgav1DecoderBuffer {
+#if defined(__cplusplus)
+ LIBGAV1_PUBLIC int NumPlanes() const {
+ return (image_format == kLibgav1ImageFormatMonochrome400) ? 1 : 3;
+ }
+#endif // defined(__cplusplus)
+
+ Libgav1ChromaSamplePosition chroma_sample_position;
+ Libgav1ImageFormat image_format;
+ Libgav1ColorRange color_range;
+ Libgav1ColorPrimary color_primary;
+ Libgav1TransferCharacteristics transfer_characteristics;
+ Libgav1MatrixCoefficients matrix_coefficients;
+
+ // Image storage dimensions.
+ // NOTE: These fields are named w and h in vpx_image_t and aom_image_t.
+ // uint32_t width; // Stored image width.
+ // uint32_t height; // Stored image height.
+ int bitdepth; // Stored image bitdepth.
+
+ // Image display dimensions.
+ // NOTES:
+ // 1. These fields are named d_w and d_h in vpx_image_t and aom_image_t.
+ // 2. libvpx and libaom clients use d_w and d_h much more often than w and h.
+ // 3. These fields can just be stored for the Y plane and the clients can
+ // calculate the values for the U and V planes if the image format or
+ // subsampling is exposed.
+ int displayed_width[3]; // Displayed image width.
+ int displayed_height[3]; // Displayed image height.
+
+ int stride[3];
+ uint8_t* plane[3];
+
+ // Spatial id of this frame.
+ int spatial_id;
+ // Temporal id of this frame.
+ int temporal_id;
+
+ // The |user_private_data| argument passed to Decoder::EnqueueFrame().
+ int64_t user_private_data;
+ // The |private_data| field of FrameBuffer. Set by the get frame buffer
+ // callback when it allocates a frame buffer.
+ void* buffer_private_data;
+} Libgav1DecoderBuffer;
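+
+// A minimal sketch of how a consumer might walk the planes of a decoded
+// buffer. ConsumeRow() is a hypothetical application function; note that
+// |stride| is in bytes while |displayed_width| is in pixels, so the consumer
+// must account for 2-byte pixels when |bitdepth| is greater than 8:
+//
+//   void ConsumePlanes(const Libgav1DecoderBuffer& buffer) {
+//     for (int p = 0; p < buffer.NumPlanes(); ++p) {
+//       const uint8_t* row = buffer.plane[p];
+//       for (int y = 0; y < buffer.displayed_height[p]; ++y) {
+//         ConsumeRow(row, buffer.displayed_width[p], buffer.bitdepth);
+//         row += buffer.stride[p];
+//       }
+//     }
+//   }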
+
+#if defined(__cplusplus)
+namespace libgav1 {
+
+using ChromaSamplePosition = Libgav1ChromaSamplePosition;
+constexpr ChromaSamplePosition kChromaSamplePositionUnknown =
+ kLibgav1ChromaSamplePositionUnknown;
+constexpr ChromaSamplePosition kChromaSamplePositionVertical =
+ kLibgav1ChromaSamplePositionVertical;
+constexpr ChromaSamplePosition kChromaSamplePositionColocated =
+ kLibgav1ChromaSamplePositionColocated;
+constexpr ChromaSamplePosition kChromaSamplePositionReserved =
+ kLibgav1ChromaSamplePositionReserved;
+
+using ImageFormat = Libgav1ImageFormat;
+constexpr ImageFormat kImageFormatYuv420 = kLibgav1ImageFormatYuv420;
+constexpr ImageFormat kImageFormatYuv422 = kLibgav1ImageFormatYuv422;
+constexpr ImageFormat kImageFormatYuv444 = kLibgav1ImageFormatYuv444;
+constexpr ImageFormat kImageFormatMonochrome400 =
+ kLibgav1ImageFormatMonochrome400;
+
+using ColorPrimary = Libgav1ColorPrimary;
+constexpr ColorPrimary kColorPrimaryBt709 = kLibgav1ColorPrimaryBt709;
+constexpr ColorPrimary kColorPrimaryUnspecified =
+ kLibgav1ColorPrimaryUnspecified;
+constexpr ColorPrimary kColorPrimaryBt470M = kLibgav1ColorPrimaryBt470M;
+constexpr ColorPrimary kColorPrimaryBt470Bg = kLibgav1ColorPrimaryBt470Bg;
+constexpr ColorPrimary kColorPrimaryBt601 = kLibgav1ColorPrimaryBt601;
+constexpr ColorPrimary kColorPrimarySmpte240 = kLibgav1ColorPrimarySmpte240;
+constexpr ColorPrimary kColorPrimaryGenericFilm =
+ kLibgav1ColorPrimaryGenericFilm;
+constexpr ColorPrimary kColorPrimaryBt2020 = kLibgav1ColorPrimaryBt2020;
+constexpr ColorPrimary kColorPrimaryXyz = kLibgav1ColorPrimaryXyz;
+constexpr ColorPrimary kColorPrimarySmpte431 = kLibgav1ColorPrimarySmpte431;
+constexpr ColorPrimary kColorPrimarySmpte432 = kLibgav1ColorPrimarySmpte432;
+constexpr ColorPrimary kColorPrimaryEbu3213 = kLibgav1ColorPrimaryEbu3213;
+constexpr ColorPrimary kMaxColorPrimaries = kLibgav1MaxColorPrimaries;
+
+using TransferCharacteristics = Libgav1TransferCharacteristics;
+constexpr TransferCharacteristics kTransferCharacteristicsBt709 =
+ kLibgav1TransferCharacteristicsBt709;
+constexpr TransferCharacteristics kTransferCharacteristicsUnspecified =
+ kLibgav1TransferCharacteristicsUnspecified;
+constexpr TransferCharacteristics kTransferCharacteristicsBt470M =
+ kLibgav1TransferCharacteristicsBt470M;
+constexpr TransferCharacteristics kTransferCharacteristicsBt470Bg =
+ kLibgav1TransferCharacteristicsBt470Bg;
+constexpr TransferCharacteristics kTransferCharacteristicsBt601 =
+ kLibgav1TransferCharacteristicsBt601;
+constexpr TransferCharacteristics kTransferCharacteristicsSmpte240 =
+ kLibgav1TransferCharacteristicsSmpte240;
+constexpr TransferCharacteristics kTransferCharacteristicsLinear =
+ kLibgav1TransferCharacteristicsLinear;
+constexpr TransferCharacteristics kTransferCharacteristicsLog100 =
+ kLibgav1TransferCharacteristicsLog100;
+constexpr TransferCharacteristics kTransferCharacteristicsLog100Sqrt10 =
+ kLibgav1TransferCharacteristicsLog100Sqrt10;
+constexpr TransferCharacteristics kTransferCharacteristicsIec61966 =
+ kLibgav1TransferCharacteristicsIec61966;
+constexpr TransferCharacteristics kTransferCharacteristicsBt1361 =
+ kLibgav1TransferCharacteristicsBt1361;
+constexpr TransferCharacteristics kTransferCharacteristicsSrgb =
+ kLibgav1TransferCharacteristicsSrgb;
+constexpr TransferCharacteristics kTransferCharacteristicsBt2020TenBit =
+ kLibgav1TransferCharacteristicsBt2020TenBit;
+constexpr TransferCharacteristics kTransferCharacteristicsBt2020TwelveBit =
+ kLibgav1TransferCharacteristicsBt2020TwelveBit;
+constexpr TransferCharacteristics kTransferCharacteristicsSmpte2084 =
+ kLibgav1TransferCharacteristicsSmpte2084;
+constexpr TransferCharacteristics kTransferCharacteristicsSmpte428 =
+ kLibgav1TransferCharacteristicsSmpte428;
+constexpr TransferCharacteristics kTransferCharacteristicsHlg =
+ kLibgav1TransferCharacteristicsHlg;
+constexpr TransferCharacteristics kMaxTransferCharacteristics =
+ kLibgav1MaxTransferCharacteristics;
+
+using MatrixCoefficients = Libgav1MatrixCoefficients;
+constexpr MatrixCoefficients kMatrixCoefficientsIdentity =
+ kLibgav1MatrixCoefficientsIdentity;
+constexpr MatrixCoefficients kMatrixCoefficientsBt709 =
+ kLibgav1MatrixCoefficientsBt709;
+constexpr MatrixCoefficients kMatrixCoefficientsUnspecified =
+ kLibgav1MatrixCoefficientsUnspecified;
+constexpr MatrixCoefficients kMatrixCoefficientsFcc =
+ kLibgav1MatrixCoefficientsFcc;
+constexpr MatrixCoefficients kMatrixCoefficientsBt470BG =
+ kLibgav1MatrixCoefficientsBt470BG;
+constexpr MatrixCoefficients kMatrixCoefficientsBt601 =
+ kLibgav1MatrixCoefficientsBt601;
+constexpr MatrixCoefficients kMatrixCoefficientsSmpte240 =
+ kLibgav1MatrixCoefficientsSmpte240;
+constexpr MatrixCoefficients kMatrixCoefficientsSmpteYcgco =
+ kLibgav1MatrixCoefficientsSmpteYcgco;
+constexpr MatrixCoefficients kMatrixCoefficientsBt2020Ncl =
+ kLibgav1MatrixCoefficientsBt2020Ncl;
+constexpr MatrixCoefficients kMatrixCoefficientsBt2020Cl =
+ kLibgav1MatrixCoefficientsBt2020Cl;
+constexpr MatrixCoefficients kMatrixCoefficientsSmpte2085 =
+ kLibgav1MatrixCoefficientsSmpte2085;
+constexpr MatrixCoefficients kMatrixCoefficientsChromatNcl =
+ kLibgav1MatrixCoefficientsChromatNcl;
+constexpr MatrixCoefficients kMatrixCoefficientsChromatCl =
+ kLibgav1MatrixCoefficientsChromatCl;
+constexpr MatrixCoefficients kMatrixCoefficientsIctcp =
+ kLibgav1MatrixCoefficientsIctcp;
+constexpr MatrixCoefficients kMaxMatrixCoefficients =
+ kLibgav1MaxMatrixCoefficients;
+
+using ColorRange = Libgav1ColorRange;
+constexpr ColorRange kColorRangeStudio = kLibgav1ColorRangeStudio;
+constexpr ColorRange kColorRangeFull = kLibgav1ColorRangeFull;
+
+using DecoderBuffer = Libgav1DecoderBuffer;
+
+} // namespace libgav1
+#endif // defined(__cplusplus)
+
+#endif // LIBGAV1_SRC_GAV1_DECODER_BUFFER_H_
diff --git a/src/gav1/decoder_settings.h b/src/gav1/decoder_settings.h
new file mode 100644
index 0000000..ab22a4d
--- /dev/null
+++ b/src/gav1/decoder_settings.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_DECODER_SETTINGS_H_
+#define LIBGAV1_SRC_GAV1_DECODER_SETTINGS_H_
+
+#if defined(__cplusplus)
+#include <cstdint>
+#else
+#include <stdint.h>
+#endif // defined(__cplusplus)
+
+#include "gav1/frame_buffer.h"
+#include "gav1/symbol_visibility.h"
+
+// All the declarations in this file are part of the public ABI.
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// This callback is invoked by the decoder when it is done using an input frame
+// buffer. When frame_parallel is set to true, this callback must not be
+// nullptr. Otherwise, this callback is optional.
+//
+// |buffer_private_data| is the value passed in the EnqueueFrame() call.
+typedef void (*Libgav1ReleaseInputBufferCallback)(void* callback_private_data,
+ void* buffer_private_data);
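+
+// A minimal sketch of such a callback. It assumes the application passed a
+// heap-allocated compressed input buffer as |buffer_private_data| when it
+// enqueued the frame:
+//
+//   extern "C" void ReleaseInputBuffer(void* /*callback_private_data*/,
+//                                      void* buffer_private_data) {
+//     delete[] static_cast<uint8_t*>(buffer_private_data);
+//   }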
+
+typedef struct Libgav1DecoderSettings {
+ // Number of threads to use when decoding. Must be greater than 0. The library
+ // will create at most |threads| new threads. Defaults to 1 (no new threads
+ // will be created).
+ int threads;
+  // A boolean. Indicates to the decoder that frame parallel decoding is
+  // allowed. Note that this is just a request; the decoder decides how many
+  // frames to decode in parallel based on the video stream being decoded.
+ int frame_parallel;
+  // A boolean. In frame parallel mode, indicates whether
+  // Libgav1DecoderDequeueFrame should wait until an enqueued frame is
+  // available for dequeueing.
+ //
+ // If frame_parallel is 0, this setting is ignored.
+ int blocking_dequeue;
+  // Called when the first sequence header is received, or when a sequence
+  // header with different frame characteristics (bitdepth, monochrome,
+  // subsampling_x, subsampling_y, maximum frame width, or maximum frame
+  // height) is received.
+ Libgav1FrameBufferSizeChangedCallback on_frame_buffer_size_changed;
+ // Get frame buffer callback.
+ Libgav1GetFrameBufferCallback get_frame_buffer;
+ // Release frame buffer callback.
+ Libgav1ReleaseFrameBufferCallback release_frame_buffer;
+ // Release input frame buffer callback.
+ Libgav1ReleaseInputBufferCallback release_input_buffer;
+ // Passed as the private_data argument to the callbacks.
+ void* callback_private_data;
+ // A boolean. If set to 1, the decoder will output all the spatial and
+ // temporal layers.
+ int output_all_layers;
+ // Index of the operating point to decode.
+ int operating_point;
+ // Mask indicating the post processing filters that need to be applied to the
+ // reconstructed frame. Note this is an advanced setting and does not
+ // typically need to be changed.
+ // From LSB:
+ // Bit 0: Loop filter (deblocking filter).
+ // Bit 1: Cdef.
+ // Bit 2: SuperRes.
+ // Bit 3: Loop restoration.
+ // Bit 4: Film grain synthesis.
+ // All the bits other than the last 5 are ignored.
+ uint8_t post_filter_mask;
+} Libgav1DecoderSettings;
+
+LIBGAV1_PUBLIC void Libgav1DecoderSettingsInitDefault(
+ Libgav1DecoderSettings* settings);
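+
+// A minimal usage sketch: initialize the defaults, then override individual
+// fields before the decoder is created (error handling and decoder creation
+// are omitted here):
+//
+//   Libgav1DecoderSettings settings;
+//   Libgav1DecoderSettingsInitDefault(&settings);
+//   settings.threads = 4;  // Allow up to 4 threads.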
+
+#if defined(__cplusplus)
+} // extern "C"
+
+namespace libgav1 {
+
+using ReleaseInputBufferCallback = Libgav1ReleaseInputBufferCallback;
+
+// Applications must populate this structure before creating a decoder instance.
+struct DecoderSettings {
+ // Number of threads to use when decoding. Must be greater than 0. The library
+ // will create at most |threads| new threads. Defaults to 1 (no new threads
+ // will be created).
+ int threads = 1;
+  // Indicates to the decoder that frame parallel decoding is allowed. Note
+  // that this is just a request; the decoder decides how many frames to decode
+  // in parallel based on the video stream being decoded.
+ bool frame_parallel = false;
+  // In frame parallel mode, indicates whether DequeueFrame should wait until
+  // an enqueued frame is available for dequeueing.
+ //
+ // If frame_parallel is false, this setting is ignored.
+ bool blocking_dequeue = false;
+  // Called when the first sequence header is received, or when a sequence
+  // header with different frame characteristics (bitdepth, monochrome,
+  // subsampling_x, subsampling_y, maximum frame width, or maximum frame
+  // height) is received.
+ FrameBufferSizeChangedCallback on_frame_buffer_size_changed = nullptr;
+ // Get frame buffer callback.
+ GetFrameBufferCallback get_frame_buffer = nullptr;
+ // Release frame buffer callback.
+ ReleaseFrameBufferCallback release_frame_buffer = nullptr;
+ // Release input frame buffer callback.
+ ReleaseInputBufferCallback release_input_buffer = nullptr;
+ // Passed as the private_data argument to the callbacks.
+ void* callback_private_data = nullptr;
+ // If set to true, the decoder will output all the spatial and temporal
+ // layers.
+ bool output_all_layers = false;
+ // Index of the operating point to decode.
+ int operating_point = 0;
+ // Mask indicating the post processing filters that need to be applied to the
+ // reconstructed frame. Note this is an advanced setting and does not
+ // typically need to be changed.
+ // From LSB:
+ // Bit 0: Loop filter (deblocking filter).
+ // Bit 1: Cdef.
+ // Bit 2: SuperRes.
+ // Bit 3: Loop restoration.
+ // Bit 4: Film grain synthesis.
+ // All the bits other than the last 5 are ignored.
+ uint8_t post_filter_mask = 0x1f;
+};
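+
+// For example, a mask that applies every post processing filter except film
+// grain synthesis (bit 4 in the layout above) can be built as follows:
+//
+//   libgav1::DecoderSettings settings;
+//   settings.post_filter_mask = 0x1f & ~(1 << 4);  // 0x0f.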
+
+} // namespace libgav1
+#endif // defined(__cplusplus)
+#endif // LIBGAV1_SRC_GAV1_DECODER_SETTINGS_H_
diff --git a/src/gav1/frame_buffer.h b/src/gav1/frame_buffer.h
new file mode 100644
index 0000000..8132b61
--- /dev/null
+++ b/src/gav1/frame_buffer.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_FRAME_BUFFER_H_
+#define LIBGAV1_SRC_GAV1_FRAME_BUFFER_H_
+
+// All the declarations in this file are part of the public ABI. This file may
+// be included by both C and C++ files.
+
+#if defined(__cplusplus)
+#include <cstddef>
+#include <cstdint>
+#else
+#include <stddef.h>
+#include <stdint.h>
+#endif // defined(__cplusplus)
+
+#include "gav1/decoder_buffer.h"
+#include "gav1/status_code.h"
+#include "gav1/symbol_visibility.h"
+
+// The callback functions use the C linkage conventions.
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// This structure represents an allocated frame buffer.
+typedef struct Libgav1FrameBuffer {
+ // In the |plane| and |stride| arrays, the elements at indexes 0, 1, and 2
+ // are for the Y, U, and V planes, respectively.
+ uint8_t* plane[3]; // Pointers to the frame (excluding the borders) in the
+ // data buffers.
+ int stride[3]; // Row strides in bytes.
+ void* private_data; // Frame buffer's private data. Available for use by the
+ // release frame buffer callback. Also copied to the
+ // |buffer_private_data| field of DecoderBuffer for use
+ // by the consumer of a DecoderBuffer.
+} Libgav1FrameBuffer;
+
+// This callback is invoked by the decoder to provide information on the
+// subsequent frames in the video, until the next invocation of this callback
+// or the end of the video.
+//
+// |width| and |height| are the maximum frame width and height in pixels.
+// |left_border|, |right_border|, |top_border|, and |bottom_border| are the
+// maximum left, right, top, and bottom border sizes in pixels.
+// |stride_alignment| specifies the alignment of the row stride in bytes.
+//
+// Returns kLibgav1StatusOk on success, an error status on failure.
+//
+// NOTE: This callback may be omitted if the information is not useful to the
+// application.
+typedef Libgav1StatusCode (*Libgav1FrameBufferSizeChangedCallback)(
+ void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+ int width, int height, int left_border, int right_border, int top_border,
+ int bottom_border, int stride_alignment);
+
+// This callback is invoked by the decoder to allocate a frame buffer, which
+// consists of three data buffers, for the Y, U, and V planes, respectively.
+//
+// The callback must set |frame_buffer->plane[i]| to point to the data buffers
+// of the planes, and set |frame_buffer->stride[i]| to the row strides of the
+// planes. If |image_format| is kLibgav1ImageFormatMonochrome400, the callback
+// should set |frame_buffer->plane[1]| and |frame_buffer->plane[2]| to a null
+// pointer and set |frame_buffer->stride[1]| and |frame_buffer->stride[2]| to
+// 0. The callback may set |frame_buffer->private_data| to a value that will
+// be useful to the release frame buffer callback and the consumer of a
+// DecoderBuffer.
+//
+// |width| and |height| are the frame width and height in pixels.
+// |left_border|, |right_border|, |top_border|, and |bottom_border| are the
+// left, right, top, and bottom border sizes in pixels. |stride_alignment|
+// specifies the alignment of the row stride in bytes.
+//
+// Returns kLibgav1StatusOk on success, an error status on failure.
+typedef Libgav1StatusCode (*Libgav1GetFrameBufferCallback)(
+ void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+ int width, int height, int left_border, int right_border, int top_border,
+ int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer);
+
+// After a frame buffer is allocated, the decoder starts to write decoded video
+// to the frame buffer. When the frame buffer is ready for consumption, it is
+// made available to the application in a Decoder::DequeueFrame() call.
+// Afterwards, the decoder may continue to use the frame buffer in read-only
+// mode. When the decoder is finished using the frame buffer, it notifies the
+// application by calling the Libgav1ReleaseFrameBufferCallback.
+
+// This callback is invoked by the decoder to release a frame buffer.
+typedef void (*Libgav1ReleaseFrameBufferCallback)(void* callback_private_data,
+ void* buffer_private_data);
+
+// Libgav1ComputeFrameBufferInfo() and Libgav1SetFrameBuffer() are intended to
+// help clients implement frame buffer callbacks using memory buffers. First,
+// call Libgav1ComputeFrameBufferInfo(). If it succeeds, allocate y_buffer of
+// size info.y_buffer_size and allocate u_buffer and v_buffer, both of size
+// info.uv_buffer_size. Finally, pass y_buffer, u_buffer, v_buffer, and
+// buffer_private_data to Libgav1SetFrameBuffer().
+
+// This structure contains information useful for allocating memory for a frame
+// buffer.
+typedef struct Libgav1FrameBufferInfo {
+ size_t y_buffer_size; // Size in bytes of the Y buffer.
+ size_t uv_buffer_size; // Size in bytes of the U or V buffer.
+
+ // The following fields are consumed by Libgav1SetFrameBuffer(). Do not use
+ // them directly.
+ int y_stride; // Row stride in bytes of the Y buffer.
+ int uv_stride; // Row stride in bytes of the U or V buffer.
+ size_t y_plane_offset; // Offset in bytes of the frame (excluding the
+ // borders) in the Y buffer.
+ size_t uv_plane_offset; // Offset in bytes of the frame (excluding the
+ // borders) in the U or V buffer.
+ int stride_alignment; // The stride_alignment argument passed to
+ // Libgav1ComputeFrameBufferInfo().
+} Libgav1FrameBufferInfo;
+
+// Computes the information useful for allocating memory for a frame buffer.
+// On success, stores the output in |info|.
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1ComputeFrameBufferInfo(
+ int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+ int left_border, int right_border, int top_border, int bottom_border,
+ int stride_alignment, Libgav1FrameBufferInfo* info);
+
+// Sets the |frame_buffer| struct.
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1SetFrameBuffer(
+ const Libgav1FrameBufferInfo* info, uint8_t* y_buffer, uint8_t* u_buffer,
+ uint8_t* v_buffer, void* buffer_private_data,
+ Libgav1FrameBuffer* frame_buffer);
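+
+// A minimal sketch of frame buffer callbacks built on these helpers, following
+// the steps described above. It allocates with plain new[] and stores the
+// pointers in a small heap struct passed back through |buffer_private_data|;
+// allocation-failure handling and buffer pooling are omitted:
+//
+//   struct Buffers {
+//     uint8_t* y;
+//     uint8_t* u;
+//     uint8_t* v;
+//   };
+//
+//   extern "C" Libgav1StatusCode MyGetFrameBuffer(
+//       void* /*callback_private_data*/, int bitdepth,
+//       Libgav1ImageFormat image_format, int width, int height,
+//       int left_border, int right_border, int top_border, int bottom_border,
+//       int stride_alignment, Libgav1FrameBuffer* frame_buffer) {
+//     Libgav1FrameBufferInfo info;
+//     const Libgav1StatusCode status = Libgav1ComputeFrameBufferInfo(
+//         bitdepth, image_format, width, height, left_border, right_border,
+//         top_border, bottom_border, stride_alignment, &info);
+//     if (status != kLibgav1StatusOk) return status;
+//     Buffers* const buffers = new Buffers;
+//     buffers->y = new uint8_t[info.y_buffer_size];
+//     buffers->u = new uint8_t[info.uv_buffer_size];
+//     buffers->v = new uint8_t[info.uv_buffer_size];
+//     return Libgav1SetFrameBuffer(&info, buffers->y, buffers->u, buffers->v,
+//                                  buffers, frame_buffer);
+//   }
+//
+//   extern "C" void MyReleaseFrameBuffer(void* /*callback_private_data*/,
+//                                        void* buffer_private_data) {
+//     Buffers* const buffers = static_cast<Buffers*>(buffer_private_data);
+//     delete[] buffers->y;
+//     delete[] buffers->u;
+//     delete[] buffers->v;
+//     delete buffers;
+//   }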
+
+#if defined(__cplusplus)
+} // extern "C"
+
+// Declare type aliases for C++.
+namespace libgav1 {
+
+using FrameBuffer = Libgav1FrameBuffer;
+using FrameBufferSizeChangedCallback = Libgav1FrameBufferSizeChangedCallback;
+using GetFrameBufferCallback = Libgav1GetFrameBufferCallback;
+using ReleaseFrameBufferCallback = Libgav1ReleaseFrameBufferCallback;
+using FrameBufferInfo = Libgav1FrameBufferInfo;
+
+inline StatusCode ComputeFrameBufferInfo(int bitdepth, ImageFormat image_format,
+ int width, int height, int left_border,
+ int right_border, int top_border,
+ int bottom_border,
+ int stride_alignment,
+ FrameBufferInfo* info) {
+ return Libgav1ComputeFrameBufferInfo(bitdepth, image_format, width, height,
+ left_border, right_border, top_border,
+ bottom_border, stride_alignment, info);
+}
+
+inline StatusCode SetFrameBuffer(const FrameBufferInfo* info, uint8_t* y_buffer,
+ uint8_t* u_buffer, uint8_t* v_buffer,
+ void* buffer_private_data,
+ FrameBuffer* frame_buffer) {
+ return Libgav1SetFrameBuffer(info, y_buffer, u_buffer, v_buffer,
+ buffer_private_data, frame_buffer);
+}
+
+} // namespace libgav1
+#endif // defined(__cplusplus)
+
+#endif // LIBGAV1_SRC_GAV1_FRAME_BUFFER_H_
diff --git a/src/gav1/status_code.h b/src/gav1/status_code.h
new file mode 100644
index 0000000..d7476ca
--- /dev/null
+++ b/src/gav1/status_code.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_STATUS_CODE_H_
+#define LIBGAV1_SRC_GAV1_STATUS_CODE_H_
+
+#include "gav1/symbol_visibility.h"
+
+// All the declarations in this file are part of the public ABI. This file may
+// be included by both C and C++ files.
+
+// The Libgav1StatusCode enum type: A libgav1 function may return
+// Libgav1StatusCode to indicate success or the reason for failure.
+typedef enum {
+ // Success.
+ kLibgav1StatusOk = 0,
+
+ // An unknown error. Used as the default error status if error detail is not
+ // available.
+ kLibgav1StatusUnknownError = -1,
+
+ // An invalid function argument.
+ kLibgav1StatusInvalidArgument = -2,
+
+ // Memory allocation failure.
+ kLibgav1StatusOutOfMemory = -3,
+
+ // Ran out of a resource (other than memory).
+ kLibgav1StatusResourceExhausted = -4,
+
+ // The object is not initialized.
+ kLibgav1StatusNotInitialized = -5,
+
+ // An operation that can only be performed once has already been performed.
+ kLibgav1StatusAlready = -6,
+
+ // Not implemented, or not supported.
+ kLibgav1StatusUnimplemented = -7,
+
+ // An internal error in libgav1. Usually this indicates a programming error.
+ kLibgav1StatusInternalError = -8,
+
+ // The bitstream is not encoded correctly or violates a bitstream conformance
+ // requirement.
+ kLibgav1StatusBitstreamError = -9,
+
+ // The operation is not allowed at the moment. This is not a fatal error. Try
+ // again later.
+ kLibgav1StatusTryAgain = -10,
+
+ // Used only by DequeueFrame(). There are no enqueued frames, so there is
+ // nothing to dequeue. This is not a fatal error. Try enqueuing a frame before
+ // trying to dequeue again.
+ kLibgav1StatusNothingToDequeue = -11,
+
+ // An extra enumerator to prevent people from writing code that fails to
+ // compile when a new status code is added.
+ //
+ // Do not reference this enumerator. In particular, if you write code that
+ // switches on Libgav1StatusCode, add a default: case instead of a case that
+ // mentions this enumerator.
+ //
+ // Do not depend on the value (currently -1000) listed here. It may change in
+ // the future.
+ kLibgav1StatusReservedForFutureExpansionUseDefaultInSwitchInstead_ = -1000
+} Libgav1StatusCode;
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// Returns a human readable error string in en-US for the status code |status|.
+// Always returns a valid (non-NULL) string.
+LIBGAV1_PUBLIC const char* Libgav1GetErrorString(Libgav1StatusCode status);
+
+#if defined(__cplusplus)
+} // extern "C"
+
+namespace libgav1 {
+
+// Declare type aliases for C++.
+using StatusCode = Libgav1StatusCode;
+constexpr StatusCode kStatusOk = kLibgav1StatusOk;
+constexpr StatusCode kStatusUnknownError = kLibgav1StatusUnknownError;
+constexpr StatusCode kStatusInvalidArgument = kLibgav1StatusInvalidArgument;
+constexpr StatusCode kStatusOutOfMemory = kLibgav1StatusOutOfMemory;
+constexpr StatusCode kStatusResourceExhausted = kLibgav1StatusResourceExhausted;
+constexpr StatusCode kStatusNotInitialized = kLibgav1StatusNotInitialized;
+constexpr StatusCode kStatusAlready = kLibgav1StatusAlready;
+constexpr StatusCode kStatusUnimplemented = kLibgav1StatusUnimplemented;
+constexpr StatusCode kStatusInternalError = kLibgav1StatusInternalError;
+constexpr StatusCode kStatusBitstreamError = kLibgav1StatusBitstreamError;
+constexpr StatusCode kStatusTryAgain = kLibgav1StatusTryAgain;
+constexpr StatusCode kStatusNothingToDequeue = kLibgav1StatusNothingToDequeue;
+
+// Returns a human readable error string in en-US for the status code |status|.
+// Always returns a valid (non-NULL) string.
+inline const char* GetErrorString(StatusCode status) {
+ return Libgav1GetErrorString(status);
+}
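+
+// A minimal usage sketch, where SomeLibgav1Call() stands in for any libgav1
+// call that returns a StatusCode:
+//
+//   const libgav1::StatusCode status = SomeLibgav1Call();
+//   if (status != libgav1::kStatusOk) {
+//     fprintf(stderr, "libgav1 error: %s\n", libgav1::GetErrorString(status));
+//   }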
+
+} // namespace libgav1
+#endif // defined(__cplusplus)
+
+#endif // LIBGAV1_SRC_GAV1_STATUS_CODE_H_
diff --git a/src/gav1/symbol_visibility.h b/src/gav1/symbol_visibility.h
new file mode 100644
index 0000000..ad7498c
--- /dev/null
+++ b/src/gav1/symbol_visibility.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_SYMBOL_VISIBILITY_H_
+#define LIBGAV1_SRC_GAV1_SYMBOL_VISIBILITY_H_
+
+// This module defines the LIBGAV1_PUBLIC macro. LIBGAV1_PUBLIC, when combined
+// with the flags -fvisibility=hidden and -fvisibility-inlines-hidden, restricts
+// symbol availability when users use the shared object form of libgav1. The
+// intent is to prevent exposure of libgav1 internals to users of the library,
+// and to avoid ABI compatibility problems that changes to libgav1 internals
+// would cause for users of the libgav1 shared object.
+//
+// Examples:
+//
+// This form makes a class and all of its members part of the public API:
+//
+// class LIBGAV1_PUBLIC A {
+// public:
+// A();
+// ~A();
+// void Foo();
+// int Bar();
+// };
+//
+// A::A(), A::~A(), A::Foo(), and A::Bar() are all available to code linking to
+// the shared object when this form is used.
+//
+// This form exposes a single class method as part of the public API:
+//
+// class B {
+// public:
+// B();
+// ~B();
+// LIBGAV1_PUBLIC int Foo();
+// };
+//
+// In this example, only B::Foo() is available to users of the shared object.
+//
+// Non-class member functions can also be exposed individually:
+//
+// LIBGAV1_PUBLIC void Bar();
+//
+// In this example, Bar() would be available to users of the shared object.
+//
+// Much of the above information and more can be found at
+// https://gcc.gnu.org/wiki/Visibility
+
+#if !defined(LIBGAV1_PUBLIC)
+#if defined(_WIN32)
+#if defined(LIBGAV1_BUILDING_DLL) && LIBGAV1_BUILDING_DLL
+#if defined(__GNUC__)
+#define LIBGAV1_PUBLIC __attribute__((dllexport))
+#else
+#define LIBGAV1_PUBLIC __declspec(dllexport)
+#endif // defined(__GNUC__)
+#elif defined(LIBGAV1_BUILDING_DLL)
+#ifdef __GNUC__
+#define LIBGAV1_PUBLIC __attribute__((dllimport))
+#else
+#define LIBGAV1_PUBLIC __declspec(dllimport)
+#endif // defined(__GNUC__)
+#else
+#define LIBGAV1_PUBLIC
+#endif // defined(LIBGAV1_BUILDING_DLL) && LIBGAV1_BUILDING_DLL
+#else
+#if defined(__GNUC__) && __GNUC__ >= 4
+#define LIBGAV1_PUBLIC __attribute__((visibility("default")))
+#else
+#define LIBGAV1_PUBLIC
+#endif
+#endif // defined(_WIN32)
+#endif // defined(LIBGAV1_PUBLIC)
+
+#endif // LIBGAV1_SRC_GAV1_SYMBOL_VISIBILITY_H_
diff --git a/src/gav1/version.h b/src/gav1/version.h
new file mode 100644
index 0000000..78a573e
--- /dev/null
+++ b/src/gav1/version.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_VERSION_H_
+#define LIBGAV1_SRC_GAV1_VERSION_H_
+
+#include "gav1/symbol_visibility.h"
+
+// This library follows the principles described by Semantic Versioning
+// (https://semver.org).
+
+#define LIBGAV1_MAJOR_VERSION 0
+#define LIBGAV1_MINOR_VERSION 16
+#define LIBGAV1_PATCH_VERSION 1
+
+#define LIBGAV1_VERSION \
+ ((LIBGAV1_MAJOR_VERSION << 16) | (LIBGAV1_MINOR_VERSION << 8) | \
+ LIBGAV1_PATCH_VERSION)
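+
+// For example, version 0.16.1 is packed as 0x001001, so a compile time check
+// for a minimum required library version can be written as:
+//
+//   #if LIBGAV1_VERSION < ((0 << 16) | (16 << 8) | 0)
+//   #error libgav1 0.16.0 or later is required.
+//   #endif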
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// Returns the library's version number, packed in an int using 8 bits for
+// each of major/minor/patch. e.g., 1.2.3 is 0x010203.
+LIBGAV1_PUBLIC int Libgav1GetVersion(void);
+
+// Returns the library's version number as a string in the format
+// 'MAJOR.MINOR.PATCH'. Always returns a valid (non-NULL) string.
+LIBGAV1_PUBLIC const char* Libgav1GetVersionString(void);
+
+// Returns the build configuration used to produce the library. Always returns
+// a valid (non-NULL) string.
+LIBGAV1_PUBLIC const char* Libgav1GetBuildConfiguration(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+
+namespace libgav1 {
+
+// Returns the library's version number, packed in an int using 8 bits for
+// each of major/minor/patch. e.g., 1.2.3 is 0x010203.
+inline int GetVersion() { return Libgav1GetVersion(); }
+
+// Returns the library's version number as a string in the format
+// 'MAJOR.MINOR.PATCH'. Always returns a valid (non-NULL) string.
+inline const char* GetVersionString() { return Libgav1GetVersionString(); }
+
+// Returns the build configuration used to produce the library. Always returns
+// a valid (non-NULL) string.
+inline const char* GetBuildConfiguration() {
+ return Libgav1GetBuildConfiguration();
+}
+
+} // namespace libgav1
+#endif // defined(__cplusplus)
+
+#endif // LIBGAV1_SRC_GAV1_VERSION_H_
diff --git a/src/inter_intra_masks.inc b/src/inter_intra_masks.inc
new file mode 100644
index 0000000..2c15f9c
--- /dev/null
+++ b/src/inter_intra_masks.inc
@@ -0,0 +1,581 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file is just a convenience to separate out all the inter intra masks
+// from the code that uses them.
+
+// The tables in this file are computed based on Section 7.11.3.13 of the AV1
+// spec.
+
+constexpr uint8_t kInterIntraMaskDc[] = {
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
+
+constexpr uint8_t kInterIntraMaskVertical4x4[] = {
+ 60, 60, 60, 60, 19, 19, 19, 19, 6, 6, 6, 6, 2, 2, 2, 2};
+constexpr uint8_t kInterIntraMaskVertical4x8[] = {
+ 60, 60, 60, 60, 34, 34, 34, 34, 19, 19, 19, 19, 11, 11, 11, 11,
+ 6, 6, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 1, 1, 1, 1};
+constexpr uint8_t kInterIntraMaskVertical8x4[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34, 34,
+ 19, 19, 19, 19, 19, 19, 19, 19, 11, 11, 11, 11, 11, 11, 11, 11};
+constexpr uint8_t kInterIntraMaskVertical8x8[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34, 34,
+ 19, 19, 19, 19, 19, 19, 19, 19, 11, 11, 11, 11, 11, 11, 11, 11,
+ 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4,
+ 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1};
+constexpr uint8_t kInterIntraMaskVertical8x16[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45, 45, 45, 45, 45, 45, 34, 34, 34,
+ 34, 34, 34, 34, 34, 26, 26, 26, 26, 26, 26, 26, 26, 19, 19, 19, 19, 19, 19,
+ 19, 19, 15, 15, 15, 15, 15, 15, 15, 15, 11, 11, 11, 11, 11, 11, 11, 11, 8,
+ 8, 8, 8, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5,
+ 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3,
+ 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+constexpr uint8_t kInterIntraMaskVertical16x8[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
+constexpr uint8_t kInterIntraMaskVertical16x16[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1};
+constexpr uint8_t kInterIntraMaskVertical16x32[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+constexpr uint8_t kInterIntraMaskVertical32x16[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
+constexpr uint8_t kInterIntraMaskVertical32x32[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+constexpr uint8_t kInterIntraMaskHorizontal4x4[] = {60, 19, 6, 2, 60, 19, 6, 2,
+ 60, 19, 6, 2, 60, 19, 6, 2};
+constexpr uint8_t kInterIntraMaskHorizontal4x8[] = {
+ 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11,
+ 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11};
+constexpr uint8_t kInterIntraMaskHorizontal8x4[] = {
+ 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1,
+ 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1};
+constexpr uint8_t kInterIntraMaskHorizontal8x8[] = {
+ 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1,
+ 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1,
+ 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1,
+ 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1};
+constexpr uint8_t kInterIntraMaskHorizontal8x16[] = {
+ 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34,
+ 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15,
+ 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60,
+ 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26,
+ 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11,
+ 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45,
+ 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8};
+constexpr uint8_t kInterIntraMaskHorizontal16x8[] = {
+ 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34,
+ 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15,
+ 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6,
+ 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3,
+ 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1,
+ 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45,
+ 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1};
+constexpr uint8_t kInterIntraMaskHorizontal16x16[] = {
+ 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34,
+ 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15,
+ 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6,
+ 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3,
+ 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1,
+ 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45,
+ 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19,
+ 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8,
+ 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4,
+ 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2,
+ 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60,
+ 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26,
+ 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11,
+ 8, 6, 5, 4, 3, 2, 2, 1, 1};
+constexpr uint8_t kInterIntraMaskHorizontal16x32[] = {
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30,
+ 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+ 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+ 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52,
+ 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+ 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60,
+ 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39,
+ 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26,
+ 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+ 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11,
+ 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30,
+ 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+ 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+ 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52,
+ 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+ 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7};
+constexpr uint8_t kInterIntraMaskHorizontal32x16[] = {
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5,
+ 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30,
+ 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2,
+ 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+ 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1,
+ 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6,
+ 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2,
+ 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1,
+ 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6,
+ 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39,
+ 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3,
+ 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+ 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1,
+ 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
+ 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3,
+ 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1,
+ 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+ 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52,
+ 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4,
+ 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2,
+ 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+ 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1};
+constexpr uint8_t kInterIntraMaskHorizontal32x32[] = {
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5,
+ 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30,
+ 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2,
+ 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+ 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1,
+ 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6,
+ 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2,
+ 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1,
+ 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6,
+ 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39,
+ 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3,
+ 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+ 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1,
+ 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
+ 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3,
+ 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1,
+ 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+ 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52,
+ 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4,
+ 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2,
+ 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+ 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60,
+ 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4,
+ 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26,
+ 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2,
+ 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11,
+ 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1,
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5,
+ 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30,
+ 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2,
+ 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+ 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1,
+ 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6,
+ 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2,
+ 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1,
+ 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6,
+ 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39,
+ 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3,
+ 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+ 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1,
+ 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
+ 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3,
+ 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1,
+ 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+ 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1};
+
+constexpr uint8_t kInterIntraMaskSmooth4x4[] = {60, 60, 60, 60, 60, 19, 19, 19,
+ 60, 19, 6, 6, 60, 19, 6, 2};
+constexpr uint8_t kInterIntraMaskSmooth4x8[] = {
+ 60, 60, 60, 60, 60, 34, 34, 34, 60, 34, 19, 19, 60, 34, 19, 11,
+ 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11};
+constexpr uint8_t kInterIntraMaskSmooth8x4[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34,
+ 60, 34, 19, 19, 19, 19, 19, 19, 60, 34, 19, 11, 11, 11, 11, 11};
+constexpr uint8_t kInterIntraMaskSmooth8x8[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34,
+ 60, 34, 19, 19, 19, 19, 19, 19, 60, 34, 19, 11, 11, 11, 11, 11,
+ 60, 34, 19, 11, 6, 6, 6, 6, 60, 34, 19, 11, 6, 4, 4, 4,
+ 60, 34, 19, 11, 6, 4, 2, 2, 60, 34, 19, 11, 6, 4, 2, 1};
+constexpr uint8_t kInterIntraMaskSmooth8x16[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45, 45, 45, 45, 45, 60, 45, 34,
+ 34, 34, 34, 34, 34, 60, 45, 34, 26, 26, 26, 26, 26, 60, 45, 34, 26, 19, 19,
+ 19, 19, 60, 45, 34, 26, 19, 15, 15, 15, 60, 45, 34, 26, 19, 15, 11, 11, 60,
+ 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26,
+ 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11,
+ 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45,
+ 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8};
+constexpr uint8_t kInterIntraMaskSmooth16x8[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 60, 45, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 60, 45, 34, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 60, 45, 34, 26, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 60, 45, 34, 26, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 60, 45, 34, 26, 19, 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 45,
+ 34, 26, 19, 15, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8};
+constexpr uint8_t kInterIntraMaskSmooth16x16[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 60, 45, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 60, 45, 34, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 60, 45, 34, 26, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 60, 45, 34, 26, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 60, 45, 34, 26, 19, 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 45,
+ 34, 26, 19, 15, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 60, 45, 34, 26, 19,
+ 15, 11, 8, 6, 6, 6, 6, 6, 6, 6, 6, 60, 45, 34, 26, 19, 15, 11, 8,
+ 6, 5, 5, 5, 5, 5, 5, 5, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4,
+ 4, 4, 4, 4, 4, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 3, 3,
+ 3, 3, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 2, 2, 60,
+ 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 2, 2, 60, 45, 34, 26,
+ 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11,
+ 8, 6, 5, 4, 3, 2, 2, 1, 1};
+constexpr uint8_t kInterIntraMaskSmooth16x32[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 60, 52, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 60, 52, 45, 39, 39, 39, 39, 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 60, 52, 45, 39, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 60, 52, 45, 39, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 60, 52, 45, 39, 34, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 60, 52,
+ 45, 39, 34, 30, 26, 22, 22, 22, 22, 22, 22, 22, 22, 22, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 19, 19, 19, 19, 19, 19, 19, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 17, 17, 17, 17, 17, 17, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 15, 15, 15, 15, 15, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 13, 13,
+ 13, 13, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 11, 11, 11, 60,
+ 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 10, 10, 60, 52, 45, 39,
+ 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 8, 60, 52, 45, 39, 34, 30, 26,
+ 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+ 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11,
+ 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30,
+ 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+ 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+ 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52,
+ 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+ 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7};
+constexpr uint8_t kInterIntraMaskSmooth32x16[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 60, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 60, 52, 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 60, 52, 45, 39, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 60, 52, 45, 39, 34, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 60, 52, 45, 39, 34, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 60, 52, 45, 39,
+ 34, 30, 26, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 60, 52, 45, 39, 34, 30, 26, 22, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 52,
+ 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 15, 13, 11, 10, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+ 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
+constexpr uint8_t kInterIntraMaskSmooth32x32[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 60, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 60, 52, 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 60, 52, 45, 39, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 60, 52, 45, 39, 34, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 60, 52, 45, 39, 34, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 60, 52, 45, 39,
+ 34, 30, 26, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 60, 52, 45, 39, 34, 30, 26, 22, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 52,
+ 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 15, 13, 11, 10, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+ 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 60,
+ 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 60, 52, 45, 39, 34, 30, 26,
+ 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11,
+ 10, 8, 7, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 60, 52, 45, 39, 34, 30,
+ 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+ 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6,
+ 5, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6,
+ 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 60, 52, 45, 39,
+ 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+ 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1,
+ 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
+ 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3,
+ 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1,
+ 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+ 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1};
+
+// For each 2D array within this array, the indices are mapped as follows: 0,
+// 1, 2 and 3 in each dimension map to prediction dimensions 4, 8, 16 and 32
+// respectively. For example, the entry in [1][2] corresponds to a prediction
+// size of 8x16 (width == 8 and height == 16).
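+// A prediction dimension of 4, 8, 16 or 32 therefore maps to index 0, 1, 2 or
+// 3 (index == log2(dimension) - 2); e.g. a smooth-mode 8x16 prediction uses
+// kInterIntraMasks[kInterIntraModeSmooth][1][2], which is
+// kInterIntraMaskSmooth8x16.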
+const uint8_t* kInterIntraMasks[kNumInterIntraModes][4][4] = {
+ // kInterIntraModeDc. This is a special case where all the non-nullptr
+ // entries point to kInterIntraMaskDc (all entries of the array are 32). The
+ // width can be set according to the prediction size to achieve the desired
+ // result.
+ {{kInterIntraMaskDc, kInterIntraMaskDc, nullptr, nullptr},
+ {kInterIntraMaskDc, kInterIntraMaskDc, kInterIntraMaskDc, nullptr},
+ {nullptr, kInterIntraMaskDc, kInterIntraMaskDc, kInterIntraMaskDc},
+ {nullptr, nullptr, kInterIntraMaskDc, kInterIntraMaskDc}},
+ // kInterIntraModeVertical
+ {{kInterIntraMaskVertical4x4, kInterIntraMaskVertical4x8, nullptr, nullptr},
+ {kInterIntraMaskVertical8x4, kInterIntraMaskVertical8x8,
+ kInterIntraMaskVertical8x16, nullptr},
+ {nullptr, kInterIntraMaskVertical16x8, kInterIntraMaskVertical16x16,
+ kInterIntraMaskVertical16x32},
+ {nullptr, nullptr, kInterIntraMaskVertical32x16,
+ kInterIntraMaskVertical32x32}},
+ // kInterIntraModeHorizontal
+ {{kInterIntraMaskHorizontal4x4, kInterIntraMaskHorizontal4x8, nullptr,
+ nullptr},
+ {kInterIntraMaskHorizontal8x4, kInterIntraMaskHorizontal8x8,
+ kInterIntraMaskHorizontal8x16, nullptr},
+ {nullptr, kInterIntraMaskHorizontal16x8, kInterIntraMaskHorizontal16x16,
+ kInterIntraMaskHorizontal16x32},
+ {nullptr, nullptr, kInterIntraMaskHorizontal32x16,
+ kInterIntraMaskHorizontal32x32}},
+ // kInterIntraModeSmooth
+ {{kInterIntraMaskSmooth4x4, kInterIntraMaskSmooth4x8, nullptr, nullptr},
+ {kInterIntraMaskSmooth8x4, kInterIntraMaskSmooth8x8,
+ kInterIntraMaskSmooth8x16, nullptr},
+ {nullptr, kInterIntraMaskSmooth16x8, kInterIntraMaskSmooth16x16,
+ kInterIntraMaskSmooth16x32},
+ {nullptr, nullptr, kInterIntraMaskSmooth32x16,
+ kInterIntraMaskSmooth32x32}}};
diff --git a/src/internal_frame_buffer_list.cc b/src/internal_frame_buffer_list.cc
new file mode 100644
index 0000000..e2d2273
--- /dev/null
+++ b/src/internal_frame_buffer_list.cc
@@ -0,0 +1,122 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/internal_frame_buffer_list.h"
+
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "src/utils/common.h"
+
+namespace libgav1 {
+extern "C" {
+
+Libgav1StatusCode OnInternalFrameBufferSizeChanged(
+ void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+ int width, int height, int left_border, int right_border, int top_border,
+ int bottom_border, int stride_alignment) {
+ auto* buffer_list =
+ static_cast<InternalFrameBufferList*>(callback_private_data);
+ return buffer_list->OnFrameBufferSizeChanged(
+ bitdepth, image_format, width, height, left_border, right_border,
+ top_border, bottom_border, stride_alignment);
+}
+
+Libgav1StatusCode GetInternalFrameBuffer(
+ void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+ int width, int height, int left_border, int right_border, int top_border,
+ int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer) {
+ auto* buffer_list =
+ static_cast<InternalFrameBufferList*>(callback_private_data);
+ return buffer_list->GetFrameBuffer(
+ bitdepth, image_format, width, height, left_border, right_border,
+ top_border, bottom_border, stride_alignment, frame_buffer);
+}
+
+void ReleaseInternalFrameBuffer(void* callback_private_data,
+ void* buffer_private_data) {
+ auto* buffer_list =
+ static_cast<InternalFrameBufferList*>(callback_private_data);
+ buffer_list->ReleaseFrameBuffer(buffer_private_data);
+}
+
+} // extern "C"
+
+StatusCode InternalFrameBufferList::OnFrameBufferSizeChanged(
+ int /*bitdepth*/, Libgav1ImageFormat /*image_format*/, int /*width*/,
+ int /*height*/, int /*left_border*/, int /*right_border*/,
+ int /*top_border*/, int /*bottom_border*/, int /*stride_alignment*/) {
+ return kStatusOk;
+}
+
+StatusCode InternalFrameBufferList::GetFrameBuffer(
+ int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+ int left_border, int right_border, int top_border, int bottom_border,
+ int stride_alignment, Libgav1FrameBuffer* frame_buffer) {
+ FrameBufferInfo info;
+ StatusCode status = ComputeFrameBufferInfo(
+ bitdepth, image_format, width, height, left_border, right_border,
+ top_border, bottom_border, stride_alignment, &info);
+ if (status != kStatusOk) return status;
+
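+ // Reject sizes whose total (y_buffer_size + 2 * uv_buffer_size) would
+ // overflow size_t.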
+ if (info.uv_buffer_size > SIZE_MAX / 2 ||
+ info.y_buffer_size > SIZE_MAX - 2 * info.uv_buffer_size) {
+ return kStatusInvalidArgument;
+ }
+ const size_t min_size = info.y_buffer_size + 2 * info.uv_buffer_size;
+
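+ // Reuse a buffer that is not in use if one exists; otherwise append a new
+ // Buffer to |buffers_|.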
+ Buffer* buffer = nullptr;
+ for (auto& buffer_ptr : buffers_) {
+ if (!buffer_ptr->in_use) {
+ buffer = buffer_ptr.get();
+ break;
+ }
+ }
+ if (buffer == nullptr) {
+ std::unique_ptr<Buffer> new_buffer(new (std::nothrow) Buffer);
+ if (new_buffer == nullptr || !buffers_.push_back(std::move(new_buffer))) {
+ return kStatusOutOfMemory;
+ }
+ buffer = buffers_.back().get();
+ }
+
+ if (buffer->size < min_size) {
+ std::unique_ptr<uint8_t[], MallocDeleter> new_data(
+ static_cast<uint8_t*>(malloc(min_size)));
+ if (new_data == nullptr) return kStatusOutOfMemory;
+ buffer->data = std::move(new_data);
+ buffer->size = min_size;
+ }
+
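+ // Carve the single allocation into the Y plane followed by the U and V
+ // planes; U and V are null when |info.uv_buffer_size| is 0.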
+ uint8_t* const y_buffer = buffer->data.get();
+ uint8_t* const u_buffer =
+ (info.uv_buffer_size == 0) ? nullptr : y_buffer + info.y_buffer_size;
+ uint8_t* const v_buffer =
+ (info.uv_buffer_size == 0) ? nullptr : u_buffer + info.uv_buffer_size;
+ status = Libgav1SetFrameBuffer(&info, y_buffer, u_buffer, v_buffer, buffer,
+ frame_buffer);
+ if (status != kStatusOk) return status;
+ buffer->in_use = true;
+ return kStatusOk;
+}
+
+void InternalFrameBufferList::ReleaseFrameBuffer(void* buffer_private_data) {
+ auto* const buffer = static_cast<Buffer*>(buffer_private_data);
+ buffer->in_use = false;
+}
+
+} // namespace libgav1
diff --git a/src/internal_frame_buffer_list.h b/src/internal_frame_buffer_list.h
new file mode 100644
index 0000000..1c50b48
--- /dev/null
+++ b/src/internal_frame_buffer_list.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_INTERNAL_FRAME_BUFFER_LIST_H_
+#define LIBGAV1_SRC_INTERNAL_FRAME_BUFFER_LIST_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+#include "src/gav1/frame_buffer.h"
+#include "src/utils/memory.h"
+#include "src/utils/vector.h"
+
+namespace libgav1 {
+
+extern "C" Libgav1StatusCode OnInternalFrameBufferSizeChanged(
+ void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+ int width, int height, int left_border, int right_border, int top_border,
+ int bottom_border, int stride_alignment);
+
+extern "C" Libgav1StatusCode GetInternalFrameBuffer(
+ void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+ int width, int height, int left_border, int right_border, int top_border,
+ int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer);
+
+extern "C" void ReleaseInternalFrameBuffer(void* callback_private_data,
+ void* buffer_private_data);
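+
+// Usage sketch (illustrative): the decoder uses these callbacks, with a
+// pointer to an InternalFrameBufferList as |callback_private_data|, when the
+// application does not supply its own frame buffer callbacks. For example:
+//
+//   InternalFrameBufferList buffer_list;
+//   Libgav1FrameBuffer frame_buffer;
+//   if (GetInternalFrameBuffer(&buffer_list, /*bitdepth=*/8,
+//                              kLibgav1ImageFormatYuv420, /*width=*/1920,
+//                              /*height=*/1080, /*left_border=*/8,
+//                              /*right_border=*/8, /*top_border=*/8,
+//                              /*bottom_border=*/8, /*stride_alignment=*/16,
+//                              &frame_buffer) == kStatusOk) {
+//     // ... decode into |frame_buffer| ...
+//     ReleaseInternalFrameBuffer(&buffer_list, frame_buffer.private_data);
+//   }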
+
+class InternalFrameBufferList : public Allocable {
+ public:
+ InternalFrameBufferList() = default;
+
+ // Not copyable or movable.
+ InternalFrameBufferList(const InternalFrameBufferList&) = delete;
+ InternalFrameBufferList& operator=(const InternalFrameBufferList&) = delete;
+
+ ~InternalFrameBufferList() = default;
+
+ Libgav1StatusCode OnFrameBufferSizeChanged(int bitdepth,
+ Libgav1ImageFormat image_format,
+ int width, int height,
+ int left_border, int right_border,
+ int top_border, int bottom_border,
+ int stride_alignment);
+
+ Libgav1StatusCode GetFrameBuffer(int bitdepth,
+ Libgav1ImageFormat image_format, int width,
+ int height, int left_border,
+ int right_border, int top_border,
+ int bottom_border, int stride_alignment,
+ Libgav1FrameBuffer* frame_buffer);
+
+ void ReleaseFrameBuffer(void* buffer_private_data);
+
+ private:
+ struct Buffer : public Allocable {
+ std::unique_ptr<uint8_t[], MallocDeleter> data;
+ size_t size = 0;
+ bool in_use = false;
+ };
+
+ Vector<std::unique_ptr<Buffer>> buffers_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_INTERNAL_FRAME_BUFFER_LIST_H_
diff --git a/src/libgav1_decoder.cmake b/src/libgav1_decoder.cmake
new file mode 100644
index 0000000..b97d09d
--- /dev/null
+++ b/src/libgav1_decoder.cmake
@@ -0,0 +1,157 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_SRC_LIBGAV1_DECODER_CMAKE_)
+ return()
+endif() # LIBGAV1_SRC_LIBGAV1_DECODER_CMAKE_
+set(LIBGAV1_SRC_LIBGAV1_DECODER_CMAKE_ 1)
+
+list(APPEND libgav1_decoder_sources
+ "${libgav1_source}/buffer_pool.cc"
+ "${libgav1_source}/buffer_pool.h"
+ "${libgav1_source}/decoder_impl.cc"
+ "${libgav1_source}/decoder_impl.h"
+ "${libgav1_source}/decoder_state.h"
+ "${libgav1_source}/tile_scratch_buffer.cc"
+ "${libgav1_source}/tile_scratch_buffer.h"
+ "${libgav1_source}/film_grain.cc"
+ "${libgav1_source}/film_grain.h"
+ "${libgav1_source}/frame_buffer.cc"
+ "${libgav1_source}/frame_buffer_utils.h"
+ "${libgav1_source}/frame_scratch_buffer.h"
+ "${libgav1_source}/inter_intra_masks.inc"
+ "${libgav1_source}/internal_frame_buffer_list.cc"
+ "${libgav1_source}/internal_frame_buffer_list.h"
+ "${libgav1_source}/loop_restoration_info.cc"
+ "${libgav1_source}/loop_restoration_info.h"
+ "${libgav1_source}/motion_vector.cc"
+ "${libgav1_source}/motion_vector.h"
+ "${libgav1_source}/obu_parser.cc"
+ "${libgav1_source}/obu_parser.h"
+ "${libgav1_source}/post_filter/cdef.cc"
+ "${libgav1_source}/post_filter/deblock.cc"
+ "${libgav1_source}/post_filter/deblock_thresholds.inc"
+ "${libgav1_source}/post_filter/loop_restoration.cc"
+ "${libgav1_source}/post_filter/post_filter.cc"
+ "${libgav1_source}/post_filter/super_res.cc"
+ "${libgav1_source}/post_filter.h"
+ "${libgav1_source}/prediction_mask.cc"
+ "${libgav1_source}/prediction_mask.h"
+ "${libgav1_source}/quantizer.cc"
+ "${libgav1_source}/quantizer.h"
+ "${libgav1_source}/quantizer_tables.inc"
+ "${libgav1_source}/reconstruction.cc"
+ "${libgav1_source}/reconstruction.h"
+ "${libgav1_source}/residual_buffer_pool.cc"
+ "${libgav1_source}/residual_buffer_pool.h"
+ "${libgav1_source}/scan_tables.inc"
+ "${libgav1_source}/symbol_decoder_context.cc"
+ "${libgav1_source}/symbol_decoder_context.h"
+ "${libgav1_source}/symbol_decoder_context_cdfs.inc"
+ "${libgav1_source}/threading_strategy.cc"
+ "${libgav1_source}/threading_strategy.h"
+ "${libgav1_source}/tile.h"
+ "${libgav1_source}/tile/bitstream/mode_info.cc"
+ "${libgav1_source}/tile/bitstream/palette.cc"
+ "${libgav1_source}/tile/bitstream/partition.cc"
+ "${libgav1_source}/tile/bitstream/transform_size.cc"
+ "${libgav1_source}/tile/prediction.cc"
+ "${libgav1_source}/tile/tile.cc"
+ "${libgav1_source}/warp_prediction.cc"
+ "${libgav1_source}/warp_prediction.h"
+ "${libgav1_source}/yuv_buffer.cc"
+ "${libgav1_source}/yuv_buffer.h")
+
+list(APPEND libgav1_api_includes "${libgav1_source}/gav1/decoder.h"
+ "${libgav1_source}/gav1/decoder_buffer.h"
+ "${libgav1_source}/gav1/decoder_settings.h"
+ "${libgav1_source}/gav1/frame_buffer.h"
+ "${libgav1_source}/gav1/status_code.h"
+ "${libgav1_source}/gav1/symbol_visibility.h"
+ "${libgav1_source}/gav1/version.h")
+
+list(APPEND libgav1_api_sources "${libgav1_source}/decoder.cc"
+ "${libgav1_source}/decoder_settings.cc"
+ "${libgav1_source}/status_code.cc"
+ "${libgav1_source}/version.cc"
+ ${libgav1_api_includes})
+
+macro(libgav1_add_decoder_targets)
+ if(BUILD_SHARED_LIBS)
+ if(MSVC OR WIN32)
+ # To produce a DLL and an import library, the Windows tools require that the
+ # exported symbols are part of the DLL target. The unfortunate side effect is
+ # that a single configuration cannot output both the static library and the
+ # DLL: Windows users of the libgav1 build can have a DLL and an import
+ # library, or they can have a static library, but not both from a single
+ # configuration of the build (see the example after this block).
+ list(APPEND libgav1_shared_lib_sources ${libgav1_api_sources})
+ list(APPEND libgav1_static_lib_sources ${libgav1_api_includes})
+ else()
+ list(APPEND libgav1_shared_lib_sources ${libgav1_api_includes})
+ list(APPEND libgav1_static_lib_sources ${libgav1_api_sources})
+ endif()
+ else()
+ list(APPEND libgav1_static_lib_sources ${libgav1_api_sources})
+ endif()
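+ # For example (illustrative): passing -DBUILD_SHARED_LIBS=ON to cmake enables
+ # the libgav1_shared target defined below; without it only the static library
+ # is built.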
+
+ if(NOT ANDROID)
+ list(APPEND libgav1_absl_deps absl::base absl::synchronization)
+ endif()
+
+ libgav1_add_library(NAME libgav1_decoder TYPE OBJECT SOURCES
+ ${libgav1_decoder_sources} DEFINES ${libgav1_defines}
+ INCLUDES ${libgav1_include_paths})
+
+ libgav1_add_library(NAME
+ libgav1_static
+ OUTPUT_NAME
+ libgav1
+ TYPE
+ STATIC
+ SOURCES
+ ${libgav1_static_lib_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_include_paths}
+ LIB_DEPS
+ ${libgav1_absl_deps}
+ OBJLIB_DEPS
+ libgav1_dsp
+ libgav1_decoder
+ libgav1_utils
+ PUBLIC_INCLUDES
+ ${libgav1_source})
+
+ if(BUILD_SHARED_LIBS)
+ libgav1_add_library(NAME
+ libgav1_shared
+ OUTPUT_NAME
+ libgav1
+ TYPE
+ SHARED
+ SOURCES
+ ${libgav1_shared_lib_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_include_paths}
+ LIB_DEPS
+ libgav1_static
+ PUBLIC_INCLUDES
+ ${libgav1_source})
+ endif()
+endmacro()
diff --git a/src/loop_restoration_info.cc b/src/loop_restoration_info.cc
new file mode 100644
index 0000000..2dba57d
--- /dev/null
+++ b/src/loop_restoration_info.cc
@@ -0,0 +1,240 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/loop_restoration_info.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <new>
+
+#include "src/utils/common.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+// Controls how self-guided deltas are read.
+constexpr int kSgrProjReadControl = 4;
+// Maps the restoration type encoded in the compressed headers (restoration_type
+// element in the spec) of the bitstream to LoopRestorationType. This is used
+// only when the restoration type in the frame header is
+// LoopRestorationTypeSwitchable.
+constexpr LoopRestorationType kBitstreamRestorationTypeMap[] = {
+ kLoopRestorationTypeNone, kLoopRestorationTypeWiener,
+ kLoopRestorationTypeSgrProj};
+
+inline int CountLeadingZeroCoefficients(const int16_t* const filter) {
+ int number_zero_coefficients = 0;
+ if (filter[0] == 0) {
+ number_zero_coefficients++;
+ if (filter[1] == 0) {
+ number_zero_coefficients++;
+ if (filter[2] == 0) {
+ number_zero_coefficients++;
+ }
+ }
+ }
+ return number_zero_coefficients;
+}
+
+} // namespace
+
+bool LoopRestorationInfo::Reset(const LoopRestoration* const loop_restoration,
+ uint32_t width, uint32_t height,
+ int8_t subsampling_x, int8_t subsampling_y,
+ bool is_monochrome) {
+ loop_restoration_ = loop_restoration;
+ subsampling_x_ = subsampling_x;
+ subsampling_y_ = subsampling_y;
+
+ const int num_planes = is_monochrome ? kMaxPlanesMonochrome : kMaxPlanes;
+ int total_num_units = 0;
+ for (int plane = kPlaneY; plane < num_planes; ++plane) {
+ if (loop_restoration_->type[plane] == kLoopRestorationTypeNone) {
+ plane_needs_filtering_[plane] = false;
+ continue;
+ }
+ plane_needs_filtering_[plane] = true;
+ const int plane_width =
+ (plane == kPlaneY) ? width : SubsampledValue(width, subsampling_x_);
+ const int plane_height =
+ (plane == kPlaneY) ? height : SubsampledValue(height, subsampling_y_);
+ num_horizontal_units_[plane] =
+ std::max(1, RightShiftWithRounding(
+ plane_width, loop_restoration_->unit_size_log2[plane]));
+ num_vertical_units_[plane] = std::max(
+ 1, RightShiftWithRounding(plane_height,
+ loop_restoration_->unit_size_log2[plane]));
+ num_units_[plane] =
+ num_horizontal_units_[plane] * num_vertical_units_[plane];
+ total_num_units += num_units_[plane];
+ }
+ // Allocate the RestorationUnitInfo arrays for all planes in a single heap
+ // allocation and divide up the buffer into arrays of the right sizes.
+ if (!loop_restoration_info_buffer_.Resize(total_num_units)) {
+ return false;
+ }
+ RestorationUnitInfo* loop_restoration_info =
+ loop_restoration_info_buffer_.get();
+ for (int plane = kPlaneY; plane < num_planes; ++plane) {
+ if (loop_restoration_->type[plane] == kLoopRestorationTypeNone) {
+ continue;
+ }
+ loop_restoration_info_[plane] = loop_restoration_info;
+ loop_restoration_info += num_units_[plane];
+ }
+ return true;
+}
+
+bool LoopRestorationInfo::PopulateUnitInfoForSuperBlock(
+ Plane plane, BlockSize block_size, bool is_superres_scaled,
+ uint8_t superres_scale_denominator, int row4x4, int column4x4,
+ LoopRestorationUnitInfo* const unit_info) const {
+ assert(unit_info != nullptr);
+ if (!plane_needs_filtering_[plane]) return false;
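+ // Loop restoration is applied to the upscaled frame, so when superres is
+ // enabled the horizontal pixel positions are scaled by
+ // |superres_scale_denominator| / 8 before being converted to unit indices.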
+ const int numerator_column =
+ is_superres_scaled ? superres_scale_denominator : 1;
+ const int pixel_column_start =
+ RowOrColumn4x4ToPixel(column4x4, plane, subsampling_x_);
+ const int pixel_column_end = RowOrColumn4x4ToPixel(
+ column4x4 + kNum4x4BlocksWide[block_size], plane, subsampling_x_);
+ const int unit_row_log2 = loop_restoration_->unit_size_log2[plane];
+ const int denominator_column_log2 =
+ unit_row_log2 + (is_superres_scaled ? 3 : 0);
+ const int pixel_row_start =
+ RowOrColumn4x4ToPixel(row4x4, plane, subsampling_y_);
+ const int pixel_row_end = RowOrColumn4x4ToPixel(
+ row4x4 + kNum4x4BlocksHigh[block_size], plane, subsampling_y_);
+ unit_info->column_start = RightShiftWithCeiling(
+ pixel_column_start * numerator_column, denominator_column_log2);
+ unit_info->column_end = RightShiftWithCeiling(
+ pixel_column_end * numerator_column, denominator_column_log2);
+ unit_info->row_start = RightShiftWithCeiling(pixel_row_start, unit_row_log2);
+ unit_info->row_end = RightShiftWithCeiling(pixel_row_end, unit_row_log2);
+ unit_info->column_end =
+ std::min(unit_info->column_end, num_horizontal_units_[plane]);
+ unit_info->row_end = std::min(unit_info->row_end, num_vertical_units_[plane]);
+ return true;
+}
+
+void LoopRestorationInfo::ReadUnitCoefficients(
+ DaalaBitReader* const reader,
+ SymbolDecoderContext* const symbol_decoder_context, Plane plane,
+ int unit_id,
+ std::array<RestorationUnitInfo, kMaxPlanes>* const reference_unit_info) {
+ LoopRestorationType unit_restoration_type = kLoopRestorationTypeNone;
+ if (loop_restoration_->type[plane] == kLoopRestorationTypeSwitchable) {
+ unit_restoration_type = kBitstreamRestorationTypeMap
+ [reader->ReadSymbol<kRestorationTypeSymbolCount>(
+ symbol_decoder_context->restoration_type_cdf)];
+ } else if (loop_restoration_->type[plane] == kLoopRestorationTypeWiener) {
+ const bool use_wiener =
+ reader->ReadSymbol(symbol_decoder_context->use_wiener_cdf);
+ if (use_wiener) unit_restoration_type = kLoopRestorationTypeWiener;
+ } else if (loop_restoration_->type[plane] == kLoopRestorationTypeSgrProj) {
+ const bool use_sgrproj =
+ reader->ReadSymbol(symbol_decoder_context->use_sgrproj_cdf);
+ if (use_sgrproj) unit_restoration_type = kLoopRestorationTypeSgrProj;
+ }
+ loop_restoration_info_[plane][unit_id].type = unit_restoration_type;
+
+ if (unit_restoration_type == kLoopRestorationTypeWiener) {
+ ReadWienerInfo(reader, plane, unit_id, reference_unit_info);
+ } else if (unit_restoration_type == kLoopRestorationTypeSgrProj) {
+ ReadSgrProjInfo(reader, plane, unit_id, reference_unit_info);
+ }
+}
+
+void LoopRestorationInfo::ReadWienerInfo(
+ DaalaBitReader* const reader, Plane plane, int unit_id,
+ std::array<RestorationUnitInfo, kMaxPlanes>* const reference_unit_info) {
+ for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) {
+ if (plane != kPlaneY) {
+ loop_restoration_info_[plane][unit_id].wiener_info.filter[i][0] = 0;
+ }
+ int sum = 0;
+ for (int j = static_cast<int>(plane != kPlaneY); j < kNumWienerCoefficients;
+ ++j) {
+ const int8_t wiener_min = kWienerTapsMin[j];
+ const int8_t wiener_max = kWienerTapsMax[j];
+ const int control = j + 1;
+ int value;
+ if (!reader->DecodeSignedSubexpWithReference(
+ wiener_min, wiener_max + 1,
+ (*reference_unit_info)[plane].wiener_info.filter[i][j], control,
+ &value)) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Error decoding Wiener filter coefficients: plane %d, unit_id %d",
+ static_cast<int>(plane), unit_id);
+ return;
+ }
+ loop_restoration_info_[plane][unit_id].wiener_info.filter[i][j] = value;
+ (*reference_unit_info)[plane].wiener_info.filter[i][j] = value;
+ sum += value;
+ }
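+ // The Wiener kernel is symmetric and its taps sum to 128. The three side
+ // taps each appear twice, so the center tap is 128 - 2 * sum.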
+ loop_restoration_info_[plane][unit_id].wiener_info.filter[i][3] =
+ 128 - 2 * sum;
+ loop_restoration_info_[plane][unit_id]
+ .wiener_info.number_leading_zero_coefficients[i] =
+ CountLeadingZeroCoefficients(
+ loop_restoration_info_[plane][unit_id].wiener_info.filter[i]);
+ }
+}
+
+void LoopRestorationInfo::ReadSgrProjInfo(
+ DaalaBitReader* const reader, Plane plane, int unit_id,
+ std::array<RestorationUnitInfo, kMaxPlanes>* const reference_unit_info) {
+ const int sgr_proj_index =
+ static_cast<int>(reader->ReadLiteral(kSgrProjParamsBits));
+ loop_restoration_info_[plane][unit_id].sgr_proj_info.index = sgr_proj_index;
+ for (int i = 0; i < 2; ++i) {
+ const uint8_t radius = kSgrProjParams[sgr_proj_index][i * 2];
+ const int8_t multiplier_min = kSgrProjMultiplierMin[i];
+ const int8_t multiplier_max = kSgrProjMultiplierMax[i];
+ int multiplier;
+ if (radius != 0) {
+ if (!reader->DecodeSignedSubexpWithReference(
+ multiplier_min, multiplier_max + 1,
+ (*reference_unit_info)[plane].sgr_proj_info.multiplier[i],
+ kSgrProjReadControl, &multiplier)) {
+ LIBGAV1_DLOG(ERROR,
+ "Error decoding Self-guided filter coefficients: plane "
+ "%d, unit_id %d",
+ static_cast<int>(plane), unit_id);
+ return;
+ }
+ } else {
+ // The range of (*reference_unit_info)[plane].sgr_proj_info.multiplier[0]
+ // from DecodeSignedSubexpWithReference() is [-96, 31] and the default is
+ // -32, making Clip3(128 - 31, -32, 95) unnecessary.
+ static constexpr int kMultiplier[2] = {0, 95};
+ multiplier = kMultiplier[i];
+ assert(
+ i == 0 ||
+ Clip3((1 << kSgrProjPrecisionBits) -
+ (*reference_unit_info)[plane].sgr_proj_info.multiplier[0],
+ multiplier_min, multiplier_max) == kMultiplier[1]);
+ }
+ loop_restoration_info_[plane][unit_id].sgr_proj_info.multiplier[i] =
+ multiplier;
+ (*reference_unit_info)[plane].sgr_proj_info.multiplier[i] = multiplier;
+ }
+}
+
+} // namespace libgav1
diff --git a/src/loop_restoration_info.h b/src/loop_restoration_info.h
new file mode 100644
index 0000000..f174b89
--- /dev/null
+++ b/src/loop_restoration_info.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_LOOP_RESTORATION_INFO_H_
+#define LIBGAV1_SRC_LOOP_RESTORATION_INFO_H_
+
+#include <array>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "src/dsp/common.h"
+#include "src/symbol_decoder_context.h"
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+struct LoopRestorationUnitInfo {
+ int row_start;
+ int row_end;
+ int column_start;
+ int column_end;
+};
+
+class LoopRestorationInfo {
+ public:
+ LoopRestorationInfo() = default;
+
+ // Non copyable/movable.
+ LoopRestorationInfo(const LoopRestorationInfo&) = delete;
+ LoopRestorationInfo& operator=(const LoopRestorationInfo&) = delete;
+ LoopRestorationInfo(LoopRestorationInfo&&) = delete;
+ LoopRestorationInfo& operator=(LoopRestorationInfo&&) = delete;
+
+ bool Reset(const LoopRestoration* loop_restoration, uint32_t width,
+ uint32_t height, int8_t subsampling_x, int8_t subsampling_y,
+ bool is_monochrome);
+ // Populates the |unit_info| for the super block at |row4x4|, |column4x4|.
+ // Returns true on success, false otherwise.
+ bool PopulateUnitInfoForSuperBlock(Plane plane, BlockSize block_size,
+ bool is_superres_scaled,
+ uint8_t superres_scale_denominator,
+ int row4x4, int column4x4,
+ LoopRestorationUnitInfo* unit_info) const;
+ void ReadUnitCoefficients(DaalaBitReader* reader,
+ SymbolDecoderContext* symbol_decoder_context,
+ Plane plane, int unit_id,
+ std::array<RestorationUnitInfo, kMaxPlanes>*
+ reference_unit_info); // 5.11.58.
+ void ReadWienerInfo(
+ DaalaBitReader* reader, Plane plane, int unit_id,
+ std::array<RestorationUnitInfo, kMaxPlanes>* reference_unit_info);
+ void ReadSgrProjInfo(
+ DaalaBitReader* reader, Plane plane, int unit_id,
+ std::array<RestorationUnitInfo, kMaxPlanes>* reference_unit_info);
+
+ // Getters.
+ const RestorationUnitInfo* loop_restoration_info(Plane plane,
+ int unit_id) const {
+ return &loop_restoration_info_[plane][unit_id];
+ }
+
+ int num_horizontal_units(Plane plane) const {
+ return num_horizontal_units_[plane];
+ }
+ int num_vertical_units(Plane plane) const {
+ return num_vertical_units_[plane];
+ }
+ int num_units(Plane plane) const { return num_units_[plane]; }
+
+ private:
+ // If plane_needs_filtering_[plane] is true, loop_restoration_info_[plane]
+ // points to an array of num_units_[plane] elements.
+ RestorationUnitInfo* loop_restoration_info_[kMaxPlanes];
+ // Owns the memory that loop_restoration_info_[plane] points to.
+ DynamicBuffer<RestorationUnitInfo> loop_restoration_info_buffer_;
+ bool plane_needs_filtering_[kMaxPlanes];
+ const LoopRestoration* loop_restoration_;
+ int8_t subsampling_x_;
+ int8_t subsampling_y_;
+ int num_horizontal_units_[kMaxPlanes];
+ int num_vertical_units_[kMaxPlanes];
+ int num_units_[kMaxPlanes];
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_LOOP_RESTORATION_INFO_H_
diff --git a/src/motion_vector.cc b/src/motion_vector.cc
new file mode 100644
index 0000000..fdb1875
--- /dev/null
+++ b/src/motion_vector.cc
@@ -0,0 +1,1001 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/motion_vector.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+// Entry at index i is computed as:
+// Clip3(std::max(kBlockWidthPixels[i], kBlockHeightPixels[i]), 16, 112).
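+// For example, kBlock16x64 yields Clip3(std::max(16, 64), 16, 112) = 64 and
+// kBlock128x128 yields Clip3(128, 16, 112) = 112.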
+constexpr int kWarpValidThreshold[kMaxBlockSizes] = {
+ 16, 16, 16, 16, 16, 16, 32, 16, 16, 16, 32,
+ 64, 32, 32, 32, 64, 64, 64, 64, 112, 112, 112};
+
+// 7.10.2.10.
+void LowerMvPrecision(const ObuFrameHeader& frame_header,
+ MotionVector* const mvs) {
+ if (frame_header.allow_high_precision_mv) return;
+ if (frame_header.force_integer_mv != 0) {
+ for (auto& mv : mvs->mv) {
+ // The next line is equivalent to:
+ // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7;
+ // const int sign = mv >> 15;
+ // mv = ApplySign(value, sign);
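+ // e.g. mv = -5 becomes (-5 + 3 + 1) & ~7 = -8.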
+ mv = (mv + 3 - (mv >> 15)) & ~7;
+ }
+ } else {
+ for (auto& mv : mvs->mv) {
+ // The next line is equivalent to:
+ // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;
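+ // e.g. mv = -3 becomes (-3 + 1) & ~1 = -2.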
+ mv = (mv - (mv >> 15)) & ~1;
+ }
+ }
+}
+
+// 7.10.2.1.
+void SetupGlobalMv(const Tile::Block& block, int index,
+ MotionVector* const mv) {
+ const BlockParameters& bp = *block.bp;
+ const ObuFrameHeader& frame_header = block.tile.frame_header();
+ ReferenceFrameType reference_type = bp.reference_frame[index];
+ const auto& gm = frame_header.global_motion[reference_type];
+ if (reference_type == kReferenceFrameIntra ||
+ gm.type == kGlobalMotionTransformationTypeIdentity) {
+ mv->mv32 = 0;
+ return;
+ }
+ if (gm.type == kGlobalMotionTransformationTypeTranslation) {
+ for (int i = 0; i < 2; ++i) {
+ mv->mv[i] = gm.params[i] >> (kWarpedModelPrecisionBits - 3);
+ }
+ LowerMvPrecision(frame_header, mv);
+ return;
+ }
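+ // For rotation/zoom and affine global motion, |xc| and |yc| below are the
+ // displacement of the block's center pixel under the warp model, expressed
+ // in warped-model precision.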
+ const int x = MultiplyBy4(block.column4x4) + DivideBy2(block.width) - 1;
+ const int y = MultiplyBy4(block.row4x4) + DivideBy2(block.height) - 1;
+ const int xc = (gm.params[2] - (1 << kWarpedModelPrecisionBits)) * x +
+ gm.params[3] * y + gm.params[0];
+ const int yc = gm.params[4] * x +
+ (gm.params[5] - (1 << kWarpedModelPrecisionBits)) * y +
+ gm.params[1];
+ if (frame_header.allow_high_precision_mv) {
+ mv->mv[MotionVector::kRow] =
+ RightShiftWithRoundingSigned(yc, kWarpedModelPrecisionBits - 3);
+ mv->mv[MotionVector::kColumn] =
+ RightShiftWithRoundingSigned(xc, kWarpedModelPrecisionBits - 3);
+ } else {
+ mv->mv[MotionVector::kRow] = MultiplyBy2(
+ RightShiftWithRoundingSigned(yc, kWarpedModelPrecisionBits - 2));
+ mv->mv[MotionVector::kColumn] = MultiplyBy2(
+ RightShiftWithRoundingSigned(xc, kWarpedModelPrecisionBits - 2));
+ LowerMvPrecision(frame_header, mv);
+ }
+}
+
+constexpr BitMaskSet kPredictionModeNewMvMask(kPredictionModeNewMv,
+ kPredictionModeNewNewMv,
+ kPredictionModeNearNewMv,
+ kPredictionModeNewNearMv,
+ kPredictionModeNearestNewMv,
+ kPredictionModeNewNearestMv);
+
+// 7.10.2.8.
+void SearchStack(const Tile::Block& block, const BlockParameters& mv_bp,
+ int index, int weight, bool* const found_new_mv,
+ bool* const found_match, int* const num_mv_found) {
+ const BlockParameters& bp = *block.bp;
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motion =
+ block.tile.frame_header().global_motion;
+ PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+ MotionVector candidate_mv;
+ // LowerMvPrecision() is not necessary, since the values in
+ // |prediction_parameters.global_mv| and |mv_bp.mv| were generated by it.
+ const auto global_motion_type = global_motion[bp.reference_frame[0]].type;
+ if (IsGlobalMvBlock(mv_bp.is_global_mv_block, global_motion_type)) {
+ candidate_mv = prediction_parameters.global_mv[0];
+ } else {
+ candidate_mv = mv_bp.mv.mv[index];
+ }
+ *found_new_mv |= kPredictionModeNewMvMask.Contains(mv_bp.y_mode);
+ *found_match = true;
+ MotionVector* const ref_mv_stack = prediction_parameters.ref_mv_stack;
+ const int num_found = *num_mv_found;
+ const auto result = std::find_if(ref_mv_stack, ref_mv_stack + num_found,
+ [&candidate_mv](const MotionVector& ref_mv) {
+ return ref_mv == candidate_mv;
+ });
+ if (result != ref_mv_stack + num_found) {
+ prediction_parameters.IncreaseWeight(std::distance(ref_mv_stack, result),
+ weight);
+ return;
+ }
+ if (num_found >= kMaxRefMvStackSize) return;
+ ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters.SetWeightIndexStackEntry(num_found, weight);
+ ++*num_mv_found;
+}
+
+// 7.10.2.9.
+void CompoundSearchStack(const Tile::Block& block, const BlockParameters& mv_bp,
+ int weight, bool* const found_new_mv,
+ bool* const found_match, int* const num_mv_found) {
+ const BlockParameters& bp = *block.bp;
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motion =
+ block.tile.frame_header().global_motion;
+ PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+ // LowerMvPrecision() is not necessary, since the values in
+ // |prediction_parameters.global_mv| and |mv_bp.mv| were generated by it.
+ CompoundMotionVector candidate_mv = mv_bp.mv;
+ for (int i = 0; i < 2; ++i) {
+ const auto global_motion_type = global_motion[bp.reference_frame[i]].type;
+ if (IsGlobalMvBlock(mv_bp.is_global_mv_block, global_motion_type)) {
+ candidate_mv.mv[i] = prediction_parameters.global_mv[i];
+ }
+ }
+ *found_new_mv |= kPredictionModeNewMvMask.Contains(mv_bp.y_mode);
+ *found_match = true;
+ CompoundMotionVector* const compound_ref_mv_stack =
+ prediction_parameters.compound_ref_mv_stack;
+ const int num_found = *num_mv_found;
+ const auto result =
+ std::find_if(compound_ref_mv_stack, compound_ref_mv_stack + num_found,
+ [&candidate_mv](const CompoundMotionVector& ref_mv) {
+ return ref_mv == candidate_mv;
+ });
+ if (result != compound_ref_mv_stack + num_found) {
+ prediction_parameters.IncreaseWeight(
+ std::distance(compound_ref_mv_stack, result), weight);
+ return;
+ }
+ if (num_found >= kMaxRefMvStackSize) return;
+ compound_ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters.SetWeightIndexStackEntry(num_found, weight);
+ ++*num_mv_found;
+}
+
+// 7.10.2.7.
+void AddReferenceMvCandidate(const Tile::Block& block,
+ const BlockParameters& mv_bp, bool is_compound,
+ int weight, bool* const found_new_mv,
+ bool* const found_match, int* const num_mv_found) {
+ if (!mv_bp.is_inter) return;
+ const BlockParameters& bp = *block.bp;
+ if (is_compound) {
+ if (mv_bp.reference_frame[0] == bp.reference_frame[0] &&
+ mv_bp.reference_frame[1] == bp.reference_frame[1]) {
+ CompoundSearchStack(block, mv_bp, weight, found_new_mv, found_match,
+ num_mv_found);
+ }
+ return;
+ }
+ for (int i = 0; i < 2; ++i) {
+ if (mv_bp.reference_frame[i] == bp.reference_frame[0]) {
+ SearchStack(block, mv_bp, i, weight, found_new_mv, found_match,
+ num_mv_found);
+ }
+ }
+}
+
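+// Lower bound (in 4x4 block units) on the step used by ScanRow()/ScanColumn()
+// when walking neighboring blocks for reference MV candidates.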
+int GetMinimumStep(int block_width_or_height4x4, int delta_row_or_column) {
+ assert(delta_row_or_column < 0);
+ if (block_width_or_height4x4 >= 16) return 4;
+ if (delta_row_or_column < -1) return 2;
+ return 0;
+}
+
+// 7.10.2.2.
+void ScanRow(const Tile::Block& block, int mv_column, int delta_row,
+ bool is_compound, bool* const found_new_mv,
+ bool* const found_match, int* const num_mv_found) {
+ const int mv_row = block.row4x4 + delta_row;
+ const Tile& tile = block.tile;
+ if (!tile.IsTopInside(mv_row + 1)) return;
+ const int width4x4 = block.width4x4;
+ const int min_step = GetMinimumStep(width4x4, delta_row);
+ BlockParameters** bps = tile.BlockParametersAddress(mv_row, mv_column);
+ BlockParameters** const end_bps =
+ bps + std::min({static_cast<int>(width4x4),
+ tile.frame_header().columns4x4 - block.column4x4, 16});
+ do {
+ const BlockParameters& mv_bp = **bps;
+ const int step = std::max(
+ std::min(width4x4, static_cast<int>(kNum4x4BlocksWide[mv_bp.size])),
+ min_step);
+ AddReferenceMvCandidate(block, mv_bp, is_compound, MultiplyBy2(step),
+ found_new_mv, found_match, num_mv_found);
+ bps += step;
+ } while (bps < end_bps);
+}
+
+// 7.10.2.3.
+void ScanColumn(const Tile::Block& block, int mv_row, int delta_column,
+ bool is_compound, bool* const found_new_mv,
+ bool* const found_match, int* const num_mv_found) {
+ const int mv_column = block.column4x4 + delta_column;
+ const Tile& tile = block.tile;
+ if (!tile.IsLeftInside(mv_column + 1)) return;
+ const int height4x4 = block.height4x4;
+ const int min_step = GetMinimumStep(height4x4, delta_column);
+ const ptrdiff_t stride = tile.BlockParametersStride();
+ BlockParameters** bps = tile.BlockParametersAddress(mv_row, mv_column);
+ BlockParameters** const end_bps =
+ bps + stride * std::min({static_cast<int>(height4x4),
+ tile.frame_header().rows4x4 - block.row4x4, 16});
+ do {
+ const BlockParameters& mv_bp = **bps;
+ const int step = std::max(
+ std::min(height4x4, static_cast<int>(kNum4x4BlocksHigh[mv_bp.size])),
+ min_step);
+ AddReferenceMvCandidate(block, mv_bp, is_compound, MultiplyBy2(step),
+ found_new_mv, found_match, num_mv_found);
+ bps += step * stride;
+ } while (bps < end_bps);
+}
+
+// 7.10.2.4.
+void ScanPoint(const Tile::Block& block, int delta_row, int delta_column,
+ bool is_compound, bool* const found_new_mv,
+ bool* const found_match, int* const num_mv_found) {
+ const int mv_row = block.row4x4 + delta_row;
+ const int mv_column = block.column4x4 + delta_column;
+ const Tile& tile = block.tile;
+ if (!tile.IsInside(mv_row, mv_column) ||
+ !tile.HasParameters(mv_row, mv_column)) {
+ return;
+ }
+ const BlockParameters& mv_bp = tile.Parameters(mv_row, mv_column);
+ if (mv_bp.reference_frame[0] == kReferenceFrameNone) return;
+ AddReferenceMvCandidate(block, mv_bp, is_compound, 4, found_new_mv,
+ found_match, num_mv_found);
+}
+
+// 7.10.2.6.
+void AddTemporalReferenceMvCandidate(
+ const ObuFrameHeader& frame_header, const int reference_offsets[2],
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets, int count, bool is_compound,
+ int* const zero_mv_context, int* const num_mv_found,
+ PredictionParameters* const prediction_parameters) {
+ const int mv_projection_function_index =
+ frame_header.allow_high_precision_mv ? 2 : frame_header.force_integer_mv;
+ const MotionVector* const global_mv = prediction_parameters->global_mv;
+ if (is_compound) {
+ CompoundMotionVector candidate_mvs[kMaxTemporalMvCandidatesWithPadding];
+ const dsp::Dsp& dsp = *dsp::GetDspTable(8);
+ dsp.mv_projection_compound[mv_projection_function_index](
+ temporal_mvs, temporal_reference_offsets, reference_offsets, count,
+ candidate_mvs);
+ if (*zero_mv_context == -1) {
+ int max_difference =
+ std::max(std::abs(candidate_mvs[0].mv[0].mv[0] - global_mv[0].mv[0]),
+ std::abs(candidate_mvs[0].mv[0].mv[1] - global_mv[0].mv[1]));
+ max_difference =
+ std::max(max_difference,
+ std::abs(candidate_mvs[0].mv[1].mv[0] - global_mv[1].mv[0]));
+ max_difference =
+ std::max(max_difference,
+ std::abs(candidate_mvs[0].mv[1].mv[1] - global_mv[1].mv[1]));
+ *zero_mv_context = static_cast<int>(max_difference >= 16);
+ }
+ CompoundMotionVector* const compound_ref_mv_stack =
+ prediction_parameters->compound_ref_mv_stack;
+ int num_found = *num_mv_found;
+ int index = 0;
+ do {
+ const CompoundMotionVector& candidate_mv = candidate_mvs[index];
+ const auto result =
+ std::find_if(compound_ref_mv_stack, compound_ref_mv_stack + num_found,
+ [&candidate_mv](const CompoundMotionVector& ref_mv) {
+ return ref_mv == candidate_mv;
+ });
+ if (result != compound_ref_mv_stack + num_found) {
+ prediction_parameters->IncreaseWeight(
+ std::distance(compound_ref_mv_stack, result), 2);
+ continue;
+ }
+ if (num_found >= kMaxRefMvStackSize) continue;
+ compound_ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters->SetWeightIndexStackEntry(num_found, 2);
+ ++num_found;
+ } while (++index < count);
+ *num_mv_found = num_found;
+ return;
+ }
+ MotionVector* const ref_mv_stack = prediction_parameters->ref_mv_stack;
+ if (reference_offsets[0] == 0) {
+ if (*zero_mv_context == -1) {
+ const int max_difference =
+ std::max(std::abs(global_mv[0].mv[0]), std::abs(global_mv[0].mv[1]));
+ *zero_mv_context = static_cast<int>(max_difference >= 16);
+ }
+ const MotionVector candidate_mv = {};
+ const int num_found = *num_mv_found;
+ const auto result =
+ std::find_if(ref_mv_stack, ref_mv_stack + num_found,
+ [&candidate_mv](const MotionVector& ref_mv) {
+ return ref_mv == candidate_mv;
+ });
+ if (result != ref_mv_stack + num_found) {
+ prediction_parameters->IncreaseWeight(std::distance(ref_mv_stack, result),
+ 2 * count);
+ return;
+ }
+ if (num_found >= kMaxRefMvStackSize) return;
+ ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters->SetWeightIndexStackEntry(num_found, 2 * count);
+ ++*num_mv_found;
+ return;
+ }
+ alignas(kMaxAlignment)
+ MotionVector candidate_mvs[kMaxTemporalMvCandidatesWithPadding];
+ const dsp::Dsp& dsp = *dsp::GetDspTable(8);
+ dsp.mv_projection_single[mv_projection_function_index](
+ temporal_mvs, temporal_reference_offsets, reference_offsets[0], count,
+ candidate_mvs);
+ if (*zero_mv_context == -1) {
+ const int max_difference =
+ std::max(std::abs(candidate_mvs[0].mv[0] - global_mv[0].mv[0]),
+ std::abs(candidate_mvs[0].mv[1] - global_mv[0].mv[1]));
+ *zero_mv_context = static_cast<int>(max_difference >= 16);
+ }
+ int num_found = *num_mv_found;
+ int index = 0;
+ do {
+ const MotionVector& candidate_mv = candidate_mvs[index];
+ const auto result =
+ std::find_if(ref_mv_stack, ref_mv_stack + num_found,
+ [&candidate_mv](const MotionVector& ref_mv) {
+ return ref_mv == candidate_mv;
+ });
+ if (result != ref_mv_stack + num_found) {
+ prediction_parameters->IncreaseWeight(std::distance(ref_mv_stack, result),
+ 2);
+ continue;
+ }
+ if (num_found >= kMaxRefMvStackSize) continue;
+ ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters->SetWeightIndexStackEntry(num_found, 2);
+ ++num_found;
+ } while (++index < count);
+ *num_mv_found = num_found;
+}
+
+// Part of 7.10.2.5.
+bool IsWithinTheSame64x64Block(const Tile::Block& block, int delta_row,
+ int delta_column) {
+ const int row = (block.row4x4 & 15) + delta_row;
+ const int column = (block.column4x4 & 15) + delta_column;
+ // |block.height4x4| is at least 2 for all elements in |kTemporalScanMask|.
+ // So |row| is always non-negative.

+ assert(row >= 0);
+ return row < 16 && column >= 0 && column < 16;
+}
+
+constexpr BitMaskSet kTemporalScanMask(kBlock8x8, kBlock8x16, kBlock8x32,
+ kBlock16x8, kBlock16x16, kBlock16x32,
+ kBlock32x8, kBlock32x16, kBlock32x32);
+
+// 7.10.2.5.
+void TemporalScan(const Tile::Block& block, bool is_compound,
+ int* const zero_mv_context, int* const num_mv_found) {
+ const int step_w = (block.width4x4 >= 16) ? 4 : 2;
+ const int step_h = (block.height4x4 >= 16) ? 4 : 2;
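+ // The temporal motion field is stored at 8x8 granularity, so candidates are
+ // sampled at odd 4x4 positions and indexed with (mv_row >> 1, mv_column >> 1).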
+ const int row_start = block.row4x4 | 1;
+ const int column_start = block.column4x4 | 1;
+ const int row_end =
+ row_start + std::min(static_cast<int>(block.height4x4), 16);
+ const int column_end =
+ column_start + std::min(static_cast<int>(block.width4x4), 16);
+ const Tile& tile = block.tile;
+ const TemporalMotionField& motion_field = tile.motion_field();
+ const int stride = motion_field.mv.columns();
+ const MotionVector* motion_field_mv = motion_field.mv[0];
+ const int8_t* motion_field_reference_offset =
+ motion_field.reference_offset[0];
+ alignas(kMaxAlignment)
+ MotionVector temporal_mvs[kMaxTemporalMvCandidatesWithPadding];
+ int8_t temporal_reference_offsets[kMaxTemporalMvCandidatesWithPadding];
+ int count = 0;
+ int offset = stride * (row_start >> 1);
+ int mv_row = row_start;
+ do {
+ int mv_column = column_start;
+ do {
+ // Both horizontal and vertical offsets are positive. Only bottom and
+ // right boundaries need to be checked.
+ if (tile.IsBottomRightInside(mv_row, mv_column)) {
+ const int x8 = mv_column >> 1;
+ const MotionVector temporal_mv = motion_field_mv[offset + x8];
+ if (temporal_mv.mv[0] == kInvalidMvValue) {
+ if (mv_row == row_start && mv_column == column_start) {
+ *zero_mv_context = 1;
+ }
+ } else {
+ temporal_mvs[count] = temporal_mv;
+ temporal_reference_offsets[count++] =
+ motion_field_reference_offset[offset + x8];
+ }
+ }
+ mv_column += step_w;
+ } while (mv_column < column_end);
+ offset += stride * step_h >> 1;
+ mv_row += step_h;
+ } while (mv_row < row_end);
+ if (kTemporalScanMask.Contains(block.size)) {
+ const int temporal_sample_positions[3][2] = {
+ {block.height4x4, -2},
+ {block.height4x4, block.width4x4},
+ {block.height4x4 - 2, block.width4x4}};
+ // Getting the address of an element in Array2D is slow. Precalculate the
+ // offsets.
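+    // Each offset equals stride * (mv_row >> 1) + (mv_column >> 1) for the
+    // corresponding sample position, with mv_row = row_start + position[0]
+    // and mv_column = column_start + position[1].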
+ int temporal_sample_offsets[3];
+ temporal_sample_offsets[0] = stride * ((row_start + block.height4x4) >> 1) +
+ ((column_start - 2) >> 1);
+ temporal_sample_offsets[1] =
+ temporal_sample_offsets[0] + ((block.width4x4 + 2) >> 1);
+ temporal_sample_offsets[2] = temporal_sample_offsets[1] - stride;
+ for (int i = 0; i < 3; i++) {
+ const int row = temporal_sample_positions[i][0];
+ const int column = temporal_sample_positions[i][1];
+ if (!IsWithinTheSame64x64Block(block, row, column)) continue;
+ const int mv_row = row_start + row;
+ const int mv_column = column_start + column;
+      // IsWithinTheSame64x64Block() guarantees the reference block is within
+      // the top and left boundaries.
+ if (!tile.IsBottomRightInside(mv_row, mv_column)) continue;
+ const MotionVector temporal_mv =
+ motion_field_mv[temporal_sample_offsets[i]];
+ if (temporal_mv.mv[0] != kInvalidMvValue) {
+ temporal_mvs[count] = temporal_mv;
+ temporal_reference_offsets[count++] =
+ motion_field_reference_offset[temporal_sample_offsets[i]];
+ }
+ }
+ }
+ if (count != 0) {
+ BlockParameters* const bp = block.bp;
+ int reference_offsets[2];
+ const int offset_0 = tile.current_frame()
+ .reference_info()
+ ->relative_distance_to[bp->reference_frame[0]];
+ reference_offsets[0] =
+ Clip3(offset_0, -kMaxFrameDistance, kMaxFrameDistance);
+ if (is_compound) {
+ const int offset_1 = tile.current_frame()
+ .reference_info()
+ ->relative_distance_to[bp->reference_frame[1]];
+ reference_offsets[1] =
+ Clip3(offset_1, -kMaxFrameDistance, kMaxFrameDistance);
+ // Pad so that SIMD implementations won't read uninitialized memory.
+ if ((count & 1) != 0) {
+ temporal_mvs[count].mv32 = 0;
+ temporal_reference_offsets[count] = 0;
+ }
+ } else {
+ // Pad so that SIMD implementations won't read uninitialized memory.
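+      // ((count + 3) & ~3) rounds |count| up to the next multiple of 4.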
+ for (int i = count; i < ((count + 3) & ~3); ++i) {
+ temporal_mvs[i].mv32 = 0;
+ temporal_reference_offsets[i] = 0;
+ }
+ }
+ AddTemporalReferenceMvCandidate(
+ tile.frame_header(), reference_offsets, temporal_mvs,
+ temporal_reference_offsets, count, is_compound, zero_mv_context,
+ num_mv_found, &(*bp->prediction_parameters));
+ }
+}
+
+// Part of 7.10.2.13.
+void AddExtraCompoundMvCandidate(const Tile::Block& block, int mv_row,
+ int mv_column, int* const ref_id_count,
+ MotionVector ref_id[2][2],
+ int* const ref_diff_count,
+ MotionVector ref_diff[2][2]) {
+ const auto& bp = block.tile.Parameters(mv_row, mv_column);
+ const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias =
+ block.tile.reference_frame_sign_bias();
+ for (int i = 0; i < 2; ++i) {
+ const ReferenceFrameType candidate_reference_frame = bp.reference_frame[i];
+ if (candidate_reference_frame <= kReferenceFrameIntra) continue;
+ for (int j = 0; j < 2; ++j) {
+ MotionVector candidate_mv = bp.mv.mv[i];
+ const ReferenceFrameType block_reference_frame =
+ block.bp->reference_frame[j];
+ if (candidate_reference_frame == block_reference_frame &&
+ ref_id_count[j] < 2) {
+ ref_id[j][ref_id_count[j]] = candidate_mv;
+ ++ref_id_count[j];
+ } else if (ref_diff_count[j] < 2) {
+ if (reference_frame_sign_bias[candidate_reference_frame] !=
+ reference_frame_sign_bias[block_reference_frame]) {
+ candidate_mv.mv[0] *= -1;
+ candidate_mv.mv[1] *= -1;
+ }
+ ref_diff[j][ref_diff_count[j]] = candidate_mv;
+ ++ref_diff_count[j];
+ }
+ }
+ }
+}
+
+// Part of 7.10.2.13.
+void AddExtraSingleMvCandidate(const Tile::Block& block, int mv_row,
+ int mv_column, int* const num_mv_found) {
+ const auto& bp = block.tile.Parameters(mv_row, mv_column);
+ const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias =
+ block.tile.reference_frame_sign_bias();
+ const ReferenceFrameType block_reference_frame = block.bp->reference_frame[0];
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ MotionVector* const ref_mv_stack = prediction_parameters.ref_mv_stack;
+ int num_found = *num_mv_found;
+ for (int i = 0; i < 2; ++i) {
+ const ReferenceFrameType candidate_reference_frame = bp.reference_frame[i];
+ if (candidate_reference_frame <= kReferenceFrameIntra) continue;
+ MotionVector candidate_mv = bp.mv.mv[i];
+ if (reference_frame_sign_bias[candidate_reference_frame] !=
+ reference_frame_sign_bias[block_reference_frame]) {
+ candidate_mv.mv[0] *= -1;
+ candidate_mv.mv[1] *= -1;
+ }
+ assert(num_found <= 2);
+ if ((num_found != 0 && ref_mv_stack[0] == candidate_mv) ||
+ (num_found == 2 && ref_mv_stack[1] == candidate_mv)) {
+ continue;
+ }
+ ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters.SetWeightIndexStackEntry(num_found, 0);
+ ++num_found;
+ }
+ *num_mv_found = num_found;
+}
+
+// 7.10.2.12.
+void ExtraSearch(const Tile::Block& block, bool is_compound,
+ int* const num_mv_found) {
+ const Tile& tile = block.tile;
+ const int num4x4 = std::min({static_cast<int>(block.width4x4),
+ tile.frame_header().columns4x4 - block.column4x4,
+ static_cast<int>(block.height4x4),
+ tile.frame_header().rows4x4 - block.row4x4, 16});
+ int ref_id_count[2] = {};
+ MotionVector ref_id[2][2] = {};
+ int ref_diff_count[2] = {};
+ MotionVector ref_diff[2][2] = {};
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
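+  // Pass 0 scans the row of 4x4 blocks above the current block; pass 1 scans
+  // the column to its left.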
+ for (int pass = 0; pass < 2 && *num_mv_found < 2; ++pass) {
+ for (int i = 0; i < num4x4;) {
+ const int mv_row = block.row4x4 + ((pass == 0) ? -1 : i);
+ const int mv_column = block.column4x4 + ((pass == 0) ? i : -1);
+ if (!tile.IsTopLeftInside(mv_row + 1, mv_column + 1)) break;
+ if (is_compound) {
+ AddExtraCompoundMvCandidate(block, mv_row, mv_column, ref_id_count,
+ ref_id, ref_diff_count, ref_diff);
+ } else {
+ AddExtraSingleMvCandidate(block, mv_row, mv_column, num_mv_found);
+ if (*num_mv_found >= 2) break;
+ }
+ const auto& bp = tile.Parameters(mv_row, mv_column);
+ i +=
+ (pass == 0) ? kNum4x4BlocksWide[bp.size] : kNum4x4BlocksHigh[bp.size];
+ }
+ }
+ if (is_compound) {
+ // Merge compound mode extra search into mv stack.
+ CompoundMotionVector* const compound_ref_mv_stack =
+ prediction_parameters.compound_ref_mv_stack;
+ CompoundMotionVector combined_mvs[2] = {};
+ for (int i = 0; i < 2; ++i) {
+ int count = 0;
+ assert(ref_id_count[i] <= 2);
+ for (int j = 0; j < ref_id_count[i]; ++j, ++count) {
+ combined_mvs[count].mv[i] = ref_id[i][j];
+ }
+ for (int j = 0; j < ref_diff_count[i] && count < 2; ++j, ++count) {
+ combined_mvs[count].mv[i] = ref_diff[i][j];
+ }
+ for (; count < 2; ++count) {
+ combined_mvs[count].mv[i] = prediction_parameters.global_mv[i];
+ }
+ }
+ if (*num_mv_found == 1) {
+ if (combined_mvs[0] == compound_ref_mv_stack[0]) {
+ compound_ref_mv_stack[1] = combined_mvs[1];
+ } else {
+ compound_ref_mv_stack[1] = combined_mvs[0];
+ }
+ prediction_parameters.SetWeightIndexStackEntry(1, 0);
+ } else {
+ assert(*num_mv_found == 0);
+ for (int i = 0; i < 2; ++i) {
+ compound_ref_mv_stack[i] = combined_mvs[i];
+ prediction_parameters.SetWeightIndexStackEntry(i, 0);
+ }
+ }
+ *num_mv_found = 2;
+ } else {
+    // Single prediction mode.
+ MotionVector* const ref_mv_stack = prediction_parameters.ref_mv_stack;
+ for (int i = *num_mv_found; i < 2; ++i) {
+ ref_mv_stack[i] = prediction_parameters.global_mv[0];
+ prediction_parameters.SetWeightIndexStackEntry(i, 0);
+ }
+ }
+}
+
+void DescendingOrderTwo(int* const a, int* const b) {
+ if (*a < *b) {
+ std::swap(*a, *b);
+ }
+}
+
+// Comparator used for sorting candidate motion vectors in descending order of
+// their weights (as specified in 7.10.2.11).
+bool CompareCandidateMotionVectors(const int16_t& lhs, const int16_t& rhs) {
+ return lhs > rhs;
+}
+
+void SortWeightIndexStack(const int size, const int sort_to_n,
+ int16_t* const weight_index_stack) {
+ if (size <= 1) return;
+ if (size <= 3) {
+ // Specialize small sort sizes to speed up.
+ int weight_index_0 = weight_index_stack[0];
+ int weight_index_1 = weight_index_stack[1];
+ DescendingOrderTwo(&weight_index_0, &weight_index_1);
+ if (size == 3) {
+ int weight_index_2 = weight_index_stack[2];
+ DescendingOrderTwo(&weight_index_1, &weight_index_2);
+ DescendingOrderTwo(&weight_index_0, &weight_index_1);
+ weight_index_stack[2] = weight_index_2;
+ }
+ weight_index_stack[0] = weight_index_0;
+ weight_index_stack[1] = weight_index_1;
+ return;
+ }
+ if (sort_to_n == 1) {
+ // std::max_element() is not efficient. Find the max element in a loop.
+ int16_t max_element = weight_index_stack[0];
+ int i = 1;
+ do {
+ max_element = std::max(max_element, weight_index_stack[i]);
+ } while (++i < size);
+ weight_index_stack[0] = max_element;
+ return;
+ }
+ std::partial_sort(&weight_index_stack[0], &weight_index_stack[sort_to_n],
+ &weight_index_stack[size], CompareCandidateMotionVectors);
+}
+
+// 7.10.2.14 (part 2).
+void ComputeContexts(bool found_new_mv, int nearest_matches, int total_matches,
+ int* new_mv_context, int* reference_mv_context) {
+ switch (nearest_matches) {
+ case 0:
+ *new_mv_context = std::min(total_matches, 1);
+ *reference_mv_context = total_matches;
+ break;
+ case 1:
+ *new_mv_context = 3 - static_cast<int>(found_new_mv);
+ *reference_mv_context = 2 + total_matches;
+ break;
+ default:
+ *new_mv_context = 5 - static_cast<int>(found_new_mv);
+ *reference_mv_context = 5;
+ break;
+ }
+}
+
+// 7.10.4.2.
+void AddSample(const Tile::Block& block, int delta_row, int delta_column,
+ int* const num_warp_samples, int* const num_samples_scanned,
+ int candidates[kMaxLeastSquaresSamples][4]) {
+ if (*num_samples_scanned >= kMaxLeastSquaresSamples) return;
+ const int mv_row = block.row4x4 + delta_row;
+ const int mv_column = block.column4x4 + delta_column;
+ const Tile& tile = block.tile;
+ if (!tile.IsInside(mv_row, mv_column) ||
+ !tile.HasParameters(mv_row, mv_column)) {
+ return;
+ }
+ const BlockParameters& bp = *block.bp;
+ const BlockParameters& mv_bp = tile.Parameters(mv_row, mv_column);
+ if (mv_bp.reference_frame[0] != bp.reference_frame[0] ||
+ mv_bp.reference_frame[1] != kReferenceFrameNone) {
+ return;
+ }
+ ++*num_samples_scanned;
+ const int candidate_height4x4 = kNum4x4BlocksHigh[mv_bp.size];
+ const int candidate_row = mv_row & ~(candidate_height4x4 - 1);
+ const int candidate_width4x4 = kNum4x4BlocksWide[mv_bp.size];
+ const int candidate_column = mv_column & ~(candidate_width4x4 - 1);
+ const BlockParameters& candidate_bp =
+ tile.Parameters(candidate_row, candidate_column);
+ const int mv_diff_row =
+ std::abs(candidate_bp.mv.mv[0].mv[0] - bp.mv.mv[0].mv[0]);
+ const int mv_diff_column =
+ std::abs(candidate_bp.mv.mv[0].mv[1] - bp.mv.mv[0].mv[1]);
+ const bool is_valid =
+ mv_diff_row + mv_diff_column <= kWarpValidThreshold[block.size];
+ if (!is_valid && *num_samples_scanned > 1) {
+ return;
+ }
+ const int mid_y =
+ MultiplyBy4(candidate_row) + MultiplyBy2(candidate_height4x4) - 1;
+ const int mid_x =
+ MultiplyBy4(candidate_column) + MultiplyBy2(candidate_width4x4) - 1;
+ candidates[*num_warp_samples][0] = MultiplyBy8(mid_y);
+ candidates[*num_warp_samples][1] = MultiplyBy8(mid_x);
+ candidates[*num_warp_samples][2] =
+ MultiplyBy8(mid_y) + candidate_bp.mv.mv[0].mv[0];
+ candidates[*num_warp_samples][3] =
+ MultiplyBy8(mid_x) + candidate_bp.mv.mv[0].mv[1];
+ if (is_valid) ++*num_warp_samples;
+}
+
+// 7.9.2.
+// In the spec, |dst_sign| is either 1 or -1. Here we set |dst_sign| to either 0
+// or -1 so that it can be XORed and subtracted directly in ApplySign() and
+// corresponding SIMD implementations.
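+// With a sign of 0, (v ^ sign) - sign leaves v unchanged; with a sign of -1
+// (all bits set), it yields -v in two's complement arithmetic.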
+bool MotionFieldProjection(
+ const ObuFrameHeader& frame_header,
+ const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+ reference_frames,
+ ReferenceFrameType source, int reference_to_current_with_sign, int dst_sign,
+ int y8_start, int y8_end, int x8_start, int x8_end,
+ TemporalMotionField* const motion_field) {
+ const int source_index =
+ frame_header.reference_frame_index[source - kReferenceFrameLast];
+ auto* const source_frame = reference_frames[source_index].get();
+ assert(source_frame != nullptr);
+ assert(dst_sign == 0 || dst_sign == -1);
+ if (source_frame->rows4x4() != frame_header.rows4x4 ||
+ source_frame->columns4x4() != frame_header.columns4x4 ||
+ IsIntraFrame(source_frame->frame_type())) {
+ return false;
+ }
+ assert(reference_to_current_with_sign >= -kMaxFrameDistance);
+ if (reference_to_current_with_sign > kMaxFrameDistance) return true;
+ const ReferenceInfo& reference_info = *source_frame->reference_info();
+ const dsp::Dsp& dsp = *dsp::GetDspTable(8);
+ dsp.motion_field_projection_kernel(
+ reference_info, reference_to_current_with_sign, dst_sign, y8_start,
+ y8_end, x8_start, x8_end, motion_field);
+ return true;
+}
+
+} // namespace
+
+void FindMvStack(const Tile::Block& block, bool is_compound,
+ MvContexts* const contexts) {
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ SetupGlobalMv(block, 0, &prediction_parameters.global_mv[0]);
+ if (is_compound) SetupGlobalMv(block, 1, &prediction_parameters.global_mv[1]);
+ bool found_new_mv = false;
+ bool found_row_match = false;
+ int num_mv_found = 0;
+ ScanRow(block, block.column4x4, -1, is_compound, &found_new_mv,
+ &found_row_match, &num_mv_found);
+ bool found_column_match = false;
+ ScanColumn(block, block.row4x4, -1, is_compound, &found_new_mv,
+ &found_column_match, &num_mv_found);
+ if (std::max(block.width4x4, block.height4x4) <= 16) {
+ ScanPoint(block, -1, block.width4x4, is_compound, &found_new_mv,
+ &found_row_match, &num_mv_found);
+ }
+ const int nearest_matches =
+ static_cast<int>(found_row_match) + static_cast<int>(found_column_match);
+ prediction_parameters.nearest_mv_count = num_mv_found;
+ if (block.tile.frame_header().use_ref_frame_mvs) {
+    // Initialize to an invalid value; it will be set by the temporal scan.
+ contexts->zero_mv = -1;
+ TemporalScan(block, is_compound, &contexts->zero_mv, &num_mv_found);
+ } else {
+ contexts->zero_mv = 0;
+ }
+ bool dummy_bool = false;
+ ScanPoint(block, -1, -1, is_compound, &dummy_bool, &found_row_match,
+ &num_mv_found);
+ static constexpr int deltas[2] = {-3, -5};
+ for (int i = 0; i < 2; ++i) {
+ if (i == 0 || block.height4x4 > 1) {
+ ScanRow(block, block.column4x4 | 1, deltas[i] + (block.row4x4 & 1),
+ is_compound, &dummy_bool, &found_row_match, &num_mv_found);
+ }
+ if (i == 0 || block.width4x4 > 1) {
+ ScanColumn(block, block.row4x4 | 1, deltas[i] + (block.column4x4 & 1),
+ is_compound, &dummy_bool, &found_column_match, &num_mv_found);
+ }
+ }
+ if (num_mv_found < 2) {
+ ExtraSearch(block, is_compound, &num_mv_found);
+ } else {
+    // The sort of |weight_index_stack| could be moved to Tile::AssignIntraMv()
+    // and Tile::AssignInterMv(), where only a partial sort up to the maximum
+    // index needed would be required. However, the speed gain is trivial.
+    // For the intra case, only the first 1 or 2 mvs in the stack will be used.
+    // For the inter case, |prediction_parameters.ref_mv_index| is at most 3,
+    // so the partial sort only needs to cover the first 4 mvs.
+ SortWeightIndexStack(prediction_parameters.nearest_mv_count, 4,
+ prediction_parameters.weight_index_stack);
+ // When there are 4 or more nearest mvs, the other mvs will not be used.
+ if (prediction_parameters.nearest_mv_count < 4) {
+ SortWeightIndexStack(
+ num_mv_found - prediction_parameters.nearest_mv_count,
+ 4 - prediction_parameters.nearest_mv_count,
+ prediction_parameters.weight_index_stack +
+ prediction_parameters.nearest_mv_count);
+ }
+ }
+ prediction_parameters.ref_mv_count = num_mv_found;
+ const int total_matches =
+ static_cast<int>(found_row_match) + static_cast<int>(found_column_match);
+ ComputeContexts(found_new_mv, nearest_matches, total_matches,
+ &contexts->new_mv, &contexts->reference_mv);
+ // The mv stack clamping process is in Tile::AssignIntraMv() and
+ // Tile::AssignInterMv(), and only up to two mvs are clamped.
+}
+
+void FindWarpSamples(const Tile::Block& block, int* const num_warp_samples,
+ int* const num_samples_scanned,
+ int candidates[kMaxLeastSquaresSamples][4]) {
+ const Tile& tile = block.tile;
+ bool top_left = true;
+ bool top_right = true;
+ int step = 1;
+ if (block.top_available[kPlaneY]) {
+ BlockSize source_size =
+ tile.Parameters(block.row4x4 - 1, block.column4x4).size;
+ const int source_width4x4 = kNum4x4BlocksWide[source_size];
+ if (block.width4x4 <= source_width4x4) {
+ // The & here is equivalent to % since source_width4x4 is a power of two.
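+      // (For non-negative x, x & (n - 1) == x % n when n is a power of two.)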
+ const int column_offset = -(block.column4x4 & (source_width4x4 - 1));
+ if (column_offset < 0) top_left = false;
+ if (column_offset + source_width4x4 > block.width4x4) top_right = false;
+ AddSample(block, -1, 0, num_warp_samples, num_samples_scanned,
+ candidates);
+ } else {
+ for (int i = 0;
+ i < std::min(static_cast<int>(block.width4x4),
+ tile.frame_header().columns4x4 - block.column4x4);
+ i += step) {
+ source_size =
+ tile.Parameters(block.row4x4 - 1, block.column4x4 + i).size;
+ step = std::min(static_cast<int>(block.width4x4),
+ static_cast<int>(kNum4x4BlocksWide[source_size]));
+ AddSample(block, -1, i, num_warp_samples, num_samples_scanned,
+ candidates);
+ }
+ }
+ }
+ if (block.left_available[kPlaneY]) {
+ BlockSize source_size =
+ tile.Parameters(block.row4x4, block.column4x4 - 1).size;
+ const int source_height4x4 = kNum4x4BlocksHigh[source_size];
+ if (block.height4x4 <= source_height4x4) {
+ const int row_offset = -(block.row4x4 & (source_height4x4 - 1));
+ if (row_offset < 0) top_left = false;
+ AddSample(block, 0, -1, num_warp_samples, num_samples_scanned,
+ candidates);
+ } else {
+ for (int i = 0; i < std::min(static_cast<int>(block.height4x4),
+ tile.frame_header().rows4x4 - block.row4x4);
+ i += step) {
+ source_size =
+ tile.Parameters(block.row4x4 + i, block.column4x4 - 1).size;
+ step = std::min(static_cast<int>(block.height4x4),
+ static_cast<int>(kNum4x4BlocksHigh[source_size]));
+ AddSample(block, i, -1, num_warp_samples, num_samples_scanned,
+ candidates);
+ }
+ }
+ }
+ if (top_left) {
+ AddSample(block, -1, -1, num_warp_samples, num_samples_scanned, candidates);
+ }
+ if (top_right && block.size <= kBlock64x64) {
+ AddSample(block, -1, block.width4x4, num_warp_samples, num_samples_scanned,
+ candidates);
+ }
+ if (*num_warp_samples == 0 && *num_samples_scanned > 0) *num_warp_samples = 1;
+}
+
+void SetupMotionField(
+ const ObuFrameHeader& frame_header, const RefCountedBuffer& current_frame,
+ const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+ reference_frames,
+ int row4x4_start, int row4x4_end, int column4x4_start, int column4x4_end,
+ TemporalMotionField* const motion_field) {
+ assert(frame_header.use_ref_frame_mvs);
+ const int y8_start = DivideBy2(row4x4_start);
+ const int y8_end = DivideBy2(std::min(row4x4_end, frame_header.rows4x4));
+ const int x8_start = DivideBy2(column4x4_start);
+ const int x8_end =
+ DivideBy2(std::min(column4x4_end, frame_header.columns4x4));
+ const int last_index = frame_header.reference_frame_index[0];
+ const ReferenceInfo& reference_info = *current_frame.reference_info();
+ if (!IsIntraFrame(reference_frames[last_index]->frame_type())) {
+ const int last_alternate_order_hint =
+ reference_frames[last_index]
+ ->reference_info()
+ ->order_hint[kReferenceFrameAlternate];
+ const int current_gold_order_hint =
+ reference_info.order_hint[kReferenceFrameGolden];
+ if (last_alternate_order_hint != current_gold_order_hint) {
+ const int reference_offset_last =
+ -reference_info.relative_distance_from[kReferenceFrameLast];
+ if (std::abs(reference_offset_last) <= kMaxFrameDistance) {
+ MotionFieldProjection(frame_header, reference_frames,
+ kReferenceFrameLast, reference_offset_last, -1,
+ y8_start, y8_end, x8_start, x8_end, motion_field);
+ }
+ }
+ }
+ int ref_stamp = 1;
+ const int reference_offset_backward =
+ reference_info.relative_distance_from[kReferenceFrameBackward];
+ if (reference_offset_backward > 0 &&
+ MotionFieldProjection(frame_header, reference_frames,
+ kReferenceFrameBackward, reference_offset_backward,
+ 0, y8_start, y8_end, x8_start, x8_end,
+ motion_field)) {
+ --ref_stamp;
+ }
+ const int reference_offset_alternate2 =
+ reference_info.relative_distance_from[kReferenceFrameAlternate2];
+ if (reference_offset_alternate2 > 0 &&
+ MotionFieldProjection(frame_header, reference_frames,
+ kReferenceFrameAlternate2,
+ reference_offset_alternate2, 0, y8_start, y8_end,
+ x8_start, x8_end, motion_field)) {
+ --ref_stamp;
+ }
+ if (ref_stamp >= 0) {
+ const int reference_offset_alternate =
+ reference_info.relative_distance_from[kReferenceFrameAlternate];
+ if (reference_offset_alternate > 0 &&
+ MotionFieldProjection(frame_header, reference_frames,
+ kReferenceFrameAlternate,
+ reference_offset_alternate, 0, y8_start, y8_end,
+ x8_start, x8_end, motion_field)) {
+ --ref_stamp;
+ }
+ }
+ if (ref_stamp >= 0) {
+ const int reference_offset_last2 =
+ -reference_info.relative_distance_from[kReferenceFrameLast2];
+ if (std::abs(reference_offset_last2) <= kMaxFrameDistance) {
+ MotionFieldProjection(frame_header, reference_frames,
+ kReferenceFrameLast2, reference_offset_last2, -1,
+ y8_start, y8_end, x8_start, x8_end, motion_field);
+ }
+ }
+}
+
+} // namespace libgav1
diff --git a/src/motion_vector.h b/src/motion_vector.h
new file mode 100644
index 0000000..d739e80
--- /dev/null
+++ b/src/motion_vector.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_MOTION_VECTOR_H_
+#define LIBGAV1_SRC_MOTION_VECTOR_H_
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+
+#include "src/buffer_pool.h"
+#include "src/obu_parser.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+constexpr bool IsGlobalMvBlock(bool is_global_mv_block,
+ GlobalMotionTransformationType type) {
+ return is_global_mv_block &&
+ type > kGlobalMotionTransformationTypeTranslation;
+}
+
+// The |contexts| output parameter may be null. If the caller does not need
+// the |contexts| output, pass nullptr as the argument.
+void FindMvStack(const Tile::Block& block, bool is_compound,
+ MvContexts* contexts); // 7.10.2
+
+void FindWarpSamples(const Tile::Block& block, int* num_warp_samples,
+ int* num_samples_scanned,
+ int candidates[kMaxLeastSquaresSamples][4]); // 7.10.4.
+
+// Section 7.9.1 in the spec. But this is done per tile instead of for the whole
+// frame.
+void SetupMotionField(
+ const ObuFrameHeader& frame_header, const RefCountedBuffer& current_frame,
+ const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+ reference_frames,
+ int row4x4_start, int row4x4_end, int column4x4_start, int column4x4_end,
+ TemporalMotionField* motion_field);
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_MOTION_VECTOR_H_
diff --git a/src/obu_parser.cc b/src/obu_parser.cc
new file mode 100644
index 0000000..bbf00ed
--- /dev/null
+++ b/src/obu_parser.cc
@@ -0,0 +1,2885 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/obu_parser.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/buffer_pool.h"
+#include "src/decoder_impl.h"
+#include "src/motion_vector.h"
+#include "src/utils/common.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+// 5.9.16.
+// Find the smallest value of k such that block_size << k is greater than or
+// equal to target.
+//
+// NOTE: TileLog2(block_size, target) is equal to
+// CeilLog2(ceil((double)target / block_size))
+// where the division is a floating-point number division. (This equality holds
+// even when |target| is equal to 0.) In the special case of block_size == 1,
+// TileLog2(1, target) is equal to CeilLog2(target).
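+// For example, TileLog2(64, 200) is 2 (64 << 1 = 128 < 200 but
+// 64 << 2 = 256 >= 200), and TileLog2(1, 200) is CeilLog2(200) = 8.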
+int TileLog2(int block_size, int target) {
+ int k = 0;
+ for (; (block_size << k) < target; ++k) {
+ }
+ return k;
+}
+
+void ParseBitStreamLevel(BitStreamLevel* const level, uint8_t level_bits) {
+ level->major = kMinimumMajorBitstreamLevel + (level_bits >> 2);
+ level->minor = level_bits & 3;
+}
+
+// This function assumes loop_filter is zero-initialized, so it only needs to
+// set the nonzero default values.
+void SetDefaultRefDeltas(LoopFilter* const loop_filter) {
+ loop_filter->ref_deltas[kReferenceFrameIntra] = 1;
+ loop_filter->ref_deltas[kReferenceFrameGolden] = -1;
+ loop_filter->ref_deltas[kReferenceFrameAlternate] = -1;
+ loop_filter->ref_deltas[kReferenceFrameAlternate2] = -1;
+}
+
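+// |operating_point_idc| is a 12-bit mask in which bits 0..7 select temporal
+// layers and bits 8..11 select spatial layers, which is why InSpatialLayer()
+// shifts by (spatial_id + 8).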
+bool InTemporalLayer(int operating_point_idc, int temporal_id) {
+ return ((operating_point_idc >> temporal_id) & 1) != 0;
+}
+
+bool InSpatialLayer(int operating_point_idc, int spatial_id) {
+ return ((operating_point_idc >> (spatial_id + 8)) & 1) != 0;
+}
+
+// Returns the index of the last nonzero byte in the |data| buffer of |size|
+// bytes. If there is no nonzero byte in the |data| buffer, returns -1.
+int GetLastNonzeroByteIndex(const uint8_t* data, size_t size) {
+ // Scan backward for a nonzero byte.
+ if (size > INT_MAX) return -1;
+ int i = static_cast<int>(size) - 1;
+ while (i >= 0 && data[i] == 0) {
+ --i;
+ }
+ return i;
+}
+
+// A cleanup helper class that releases the frame buffer reference held in
+// |frame| in the destructor.
+class RefCountedBufferPtrCleanup {
+ public:
+ explicit RefCountedBufferPtrCleanup(RefCountedBufferPtr* frame)
+ : frame_(*frame) {}
+
+ // Not copyable or movable.
+ RefCountedBufferPtrCleanup(const RefCountedBufferPtrCleanup&) = delete;
+ RefCountedBufferPtrCleanup& operator=(const RefCountedBufferPtrCleanup&) =
+ delete;
+
+ ~RefCountedBufferPtrCleanup() { frame_ = nullptr; }
+
+ private:
+ RefCountedBufferPtr& frame_;
+};
+
+} // namespace
+
+bool ObuSequenceHeader::ParametersChanged(const ObuSequenceHeader& old) const {
+ // Note that the operating_parameters field is not compared per Section 7.5:
+ // Within a particular coded video sequence, the contents of
+ // sequence_header_obu must be bit-identical each time the sequence header
+ // appears except for the contents of operating_parameters_info.
+ return memcmp(this, &old,
+ offsetof(ObuSequenceHeader, operating_parameters)) != 0;
+}
+
+// Macros to avoid repeated error checks in the parser code.
+#define OBU_LOG_AND_RETURN_FALSE \
+ do { \
+ LIBGAV1_DLOG(ERROR, "%s:%d (%s): Not enough bits.", __FILE__, __LINE__, \
+ __func__); \
+ return false; \
+ } while (false)
+#define OBU_PARSER_FAIL \
+ do { \
+ if (scratch == -1) { \
+ OBU_LOG_AND_RETURN_FALSE; \
+ } \
+ } while (false)
+#define OBU_READ_BIT_OR_FAIL \
+ scratch = bit_reader_->ReadBit(); \
+ OBU_PARSER_FAIL
+#define OBU_READ_LITERAL_OR_FAIL(n) \
+ scratch = bit_reader_->ReadLiteral(n); \
+ OBU_PARSER_FAIL
+#define OBU_READ_UVLC_OR_FAIL(x) \
+ do { \
+ if (!bit_reader_->ReadUvlc(&(x))) { \
+ OBU_LOG_AND_RETURN_FALSE; \
+ } \
+ } while (false)
+
+bool ObuParser::ParseColorConfig(ObuSequenceHeader* sequence_header) {
+ int64_t scratch;
+ ColorConfig* const color_config = &sequence_header->color_config;
+ OBU_READ_BIT_OR_FAIL;
+ const auto high_bitdepth = static_cast<bool>(scratch);
+ if (sequence_header->profile == kProfile2 && high_bitdepth) {
+ OBU_READ_BIT_OR_FAIL;
+ const auto is_twelve_bit = static_cast<bool>(scratch);
+ color_config->bitdepth = is_twelve_bit ? 12 : 10;
+ } else {
+ color_config->bitdepth = high_bitdepth ? 10 : 8;
+ }
+ if (sequence_header->profile == kProfile1) {
+ color_config->is_monochrome = false;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ color_config->is_monochrome = static_cast<bool>(scratch);
+ }
+ OBU_READ_BIT_OR_FAIL;
+ const auto color_description_present_flag = static_cast<bool>(scratch);
+ if (color_description_present_flag) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ color_config->color_primary = static_cast<ColorPrimary>(scratch);
+ OBU_READ_LITERAL_OR_FAIL(8);
+ color_config->transfer_characteristics =
+ static_cast<TransferCharacteristics>(scratch);
+ OBU_READ_LITERAL_OR_FAIL(8);
+ color_config->matrix_coefficients =
+ static_cast<MatrixCoefficients>(scratch);
+ } else {
+ color_config->color_primary = kColorPrimaryUnspecified;
+ color_config->transfer_characteristics =
+ kTransferCharacteristicsUnspecified;
+ color_config->matrix_coefficients = kMatrixCoefficientsUnspecified;
+ }
+ if (color_config->is_monochrome) {
+ OBU_READ_BIT_OR_FAIL;
+ color_config->color_range = static_cast<ColorRange>(scratch);
+    // Set subsampling_x and subsampling_y to 1 for monochrome. This allows
+    // monochrome to be supported in profile 0, which requires subsampling_x
+    // and subsampling_y to be 1.
+ color_config->subsampling_x = 1;
+ color_config->subsampling_y = 1;
+ color_config->chroma_sample_position = kChromaSamplePositionUnknown;
+ } else {
+ if (color_config->color_primary == kColorPrimaryBt709 &&
+ color_config->transfer_characteristics ==
+ kTransferCharacteristicsSrgb &&
+ color_config->matrix_coefficients == kMatrixCoefficientsIdentity) {
+ color_config->color_range = kColorRangeFull;
+ color_config->subsampling_x = 0;
+ color_config->subsampling_y = 0;
+ // YUV 4:4:4 is only allowed in profile 1, or profile 2 with bit depth 12.
+ // See the table at the beginning of Section 6.4.1.
+ if (sequence_header->profile != kProfile1 &&
+ (sequence_header->profile != kProfile2 ||
+ color_config->bitdepth != 12)) {
+ LIBGAV1_DLOG(ERROR,
+ "YUV 4:4:4 is not allowed in profile %d for bitdepth %d.",
+ sequence_header->profile, color_config->bitdepth);
+ return false;
+ }
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ color_config->color_range = static_cast<ColorRange>(scratch);
+ if (sequence_header->profile == kProfile0) {
+ color_config->subsampling_x = 1;
+ color_config->subsampling_y = 1;
+ } else if (sequence_header->profile == kProfile1) {
+ color_config->subsampling_x = 0;
+ color_config->subsampling_y = 0;
+ } else {
+ if (color_config->bitdepth == 12) {
+ OBU_READ_BIT_OR_FAIL;
+ color_config->subsampling_x = scratch;
+ if (color_config->subsampling_x == 1) {
+ OBU_READ_BIT_OR_FAIL;
+ color_config->subsampling_y = scratch;
+ } else {
+ color_config->subsampling_y = 0;
+ }
+ } else {
+ color_config->subsampling_x = 1;
+ color_config->subsampling_y = 0;
+ }
+ }
+ if (color_config->subsampling_x == 1 &&
+ color_config->subsampling_y == 1) {
+ OBU_READ_LITERAL_OR_FAIL(2);
+ color_config->chroma_sample_position =
+ static_cast<ChromaSamplePosition>(scratch);
+ }
+ }
+ OBU_READ_BIT_OR_FAIL;
+ color_config->separate_uv_delta_q = static_cast<bool>(scratch);
+ }
+ if (color_config->matrix_coefficients == kMatrixCoefficientsIdentity &&
+ (color_config->subsampling_x != 0 || color_config->subsampling_y != 0)) {
+ LIBGAV1_DLOG(ERROR,
+ "matrix_coefficients is MC_IDENTITY, but subsampling_x (%d) "
+ "and subsampling_y (%d) are not both 0.",
+ color_config->subsampling_x, color_config->subsampling_y);
+ return false;
+ }
+ return true;
+}
+
+bool ObuParser::ParseTimingInfo(ObuSequenceHeader* sequence_header) {
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header->timing_info_present_flag = static_cast<bool>(scratch);
+ if (!sequence_header->timing_info_present_flag) return true;
+ TimingInfo* const info = &sequence_header->timing_info;
+ OBU_READ_LITERAL_OR_FAIL(32);
+ info->num_units_in_tick = static_cast<uint32_t>(scratch);
+ if (info->num_units_in_tick == 0) {
+ LIBGAV1_DLOG(ERROR, "num_units_in_tick is 0.");
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(32);
+ info->time_scale = static_cast<uint32_t>(scratch);
+ if (info->time_scale == 0) {
+ LIBGAV1_DLOG(ERROR, "time_scale is 0.");
+ return false;
+ }
+ OBU_READ_BIT_OR_FAIL;
+ info->equal_picture_interval = static_cast<bool>(scratch);
+ if (info->equal_picture_interval) {
+ OBU_READ_UVLC_OR_FAIL(info->num_ticks_per_picture);
+ ++info->num_ticks_per_picture;
+ }
+ return true;
+}
+
+bool ObuParser::ParseDecoderModelInfo(ObuSequenceHeader* sequence_header) {
+ if (!sequence_header->timing_info_present_flag) return true;
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header->decoder_model_info_present_flag = static_cast<bool>(scratch);
+ if (!sequence_header->decoder_model_info_present_flag) return true;
+ DecoderModelInfo* const info = &sequence_header->decoder_model_info;
+ OBU_READ_LITERAL_OR_FAIL(5);
+ info->encoder_decoder_buffer_delay_length = 1 + scratch;
+ OBU_READ_LITERAL_OR_FAIL(32);
+ info->num_units_in_decoding_tick = static_cast<uint32_t>(scratch);
+ OBU_READ_LITERAL_OR_FAIL(5);
+ info->buffer_removal_time_length = 1 + scratch;
+ OBU_READ_LITERAL_OR_FAIL(5);
+ info->frame_presentation_time_length = 1 + scratch;
+ return true;
+}
+
+bool ObuParser::ParseOperatingParameters(ObuSequenceHeader* sequence_header,
+ int index) {
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header->decoder_model_present_for_operating_point[index] =
+ static_cast<bool>(scratch);
+ if (!sequence_header->decoder_model_present_for_operating_point[index]) {
+ return true;
+ }
+ OperatingParameters* const params = &sequence_header->operating_parameters;
+ OBU_READ_LITERAL_OR_FAIL(
+ sequence_header->decoder_model_info.encoder_decoder_buffer_delay_length);
+ params->decoder_buffer_delay[index] = static_cast<uint32_t>(scratch);
+ OBU_READ_LITERAL_OR_FAIL(
+ sequence_header->decoder_model_info.encoder_decoder_buffer_delay_length);
+ params->encoder_buffer_delay[index] = static_cast<uint32_t>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ params->low_delay_mode_flag[index] = static_cast<bool>(scratch);
+ return true;
+}
+
+bool ObuParser::ParseSequenceHeader(bool seen_frame_header) {
+ ObuSequenceHeader sequence_header = {};
+ int64_t scratch;
+ OBU_READ_LITERAL_OR_FAIL(3);
+ if (scratch >= kMaxProfiles) {
+ LIBGAV1_DLOG(ERROR, "Invalid profile: %d.", static_cast<int>(scratch));
+ return false;
+ }
+ sequence_header.profile = static_cast<BitstreamProfile>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.still_picture = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.reduced_still_picture_header = static_cast<bool>(scratch);
+ if (sequence_header.reduced_still_picture_header) {
+ if (!sequence_header.still_picture) {
+ LIBGAV1_DLOG(
+ ERROR, "reduced_still_picture_header is 1, but still_picture is 0.");
+ return false;
+ }
+ sequence_header.operating_points = 1;
+ sequence_header.operating_point_idc[0] = 0;
+ OBU_READ_LITERAL_OR_FAIL(5);
+ ParseBitStreamLevel(&sequence_header.level[0], scratch);
+ } else {
+ if (!ParseTimingInfo(&sequence_header) ||
+ !ParseDecoderModelInfo(&sequence_header)) {
+ return false;
+ }
+ OBU_READ_BIT_OR_FAIL;
+ const auto initial_display_delay_present_flag = static_cast<bool>(scratch);
+ OBU_READ_LITERAL_OR_FAIL(5);
+ sequence_header.operating_points = static_cast<int>(1 + scratch);
+ if (operating_point_ >= sequence_header.operating_points) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Invalid operating point: %d (valid range is [0,%d] inclusive).",
+ operating_point_, sequence_header.operating_points - 1);
+ return false;
+ }
+ for (int i = 0; i < sequence_header.operating_points; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(12);
+ sequence_header.operating_point_idc[i] = static_cast<int>(scratch);
+ for (int j = 0; j < i; ++j) {
+ if (sequence_header.operating_point_idc[i] ==
+ sequence_header.operating_point_idc[j]) {
+ LIBGAV1_DLOG(ERROR,
+ "operating_point_idc[%d] (%d) is equal to "
+ "operating_point_idc[%d] (%d).",
+ i, sequence_header.operating_point_idc[i], j,
+ sequence_header.operating_point_idc[j]);
+ return false;
+ }
+ }
+ OBU_READ_LITERAL_OR_FAIL(5);
+ ParseBitStreamLevel(&sequence_header.level[i], scratch);
+ if (sequence_header.level[i].major > 3) {
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.tier[i] = scratch;
+ }
+ if (sequence_header.decoder_model_info_present_flag &&
+ !ParseOperatingParameters(&sequence_header, i)) {
+ return false;
+ }
+ if (initial_display_delay_present_flag) {
+ OBU_READ_BIT_OR_FAIL;
+ if (static_cast<bool>(scratch)) {
+ OBU_READ_LITERAL_OR_FAIL(4);
+ sequence_header.initial_display_delay[i] = 1 + scratch;
+ }
+ }
+ }
+ }
+ OBU_READ_LITERAL_OR_FAIL(4);
+ sequence_header.frame_width_bits = 1 + scratch;
+ OBU_READ_LITERAL_OR_FAIL(4);
+ sequence_header.frame_height_bits = 1 + scratch;
+ OBU_READ_LITERAL_OR_FAIL(sequence_header.frame_width_bits);
+ sequence_header.max_frame_width = static_cast<int32_t>(1 + scratch);
+ OBU_READ_LITERAL_OR_FAIL(sequence_header.frame_height_bits);
+ sequence_header.max_frame_height = static_cast<int32_t>(1 + scratch);
+ if (!sequence_header.reduced_still_picture_header) {
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.frame_id_numbers_present = static_cast<bool>(scratch);
+ }
+ if (sequence_header.frame_id_numbers_present) {
+ OBU_READ_LITERAL_OR_FAIL(4);
+ sequence_header.delta_frame_id_length_bits = 2 + scratch;
+ OBU_READ_LITERAL_OR_FAIL(3);
+ sequence_header.frame_id_length_bits =
+ sequence_header.delta_frame_id_length_bits + 1 + scratch;
+ // Section 6.8.2: It is a requirement of bitstream conformance that the
+ // number of bits needed to read display_frame_id does not exceed 16. This
+ // is equivalent to the constraint that idLen <= 16.
+ if (sequence_header.frame_id_length_bits > 16) {
+ LIBGAV1_DLOG(ERROR, "Invalid frame_id_length_bits: %d.",
+ sequence_header.frame_id_length_bits);
+ return false;
+ }
+ }
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.use_128x128_superblock = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_filter_intra = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_intra_edge_filter = static_cast<bool>(scratch);
+ if (sequence_header.reduced_still_picture_header) {
+ sequence_header.force_screen_content_tools = kSelectScreenContentTools;
+ sequence_header.force_integer_mv = kSelectIntegerMv;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_interintra_compound = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_masked_compound = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_warped_motion = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_dual_filter = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_order_hint = static_cast<bool>(scratch);
+ if (sequence_header.enable_order_hint) {
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_jnt_comp = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_ref_frame_mvs = static_cast<bool>(scratch);
+ }
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.choose_screen_content_tools = static_cast<bool>(scratch);
+ if (sequence_header.choose_screen_content_tools) {
+ sequence_header.force_screen_content_tools = kSelectScreenContentTools;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.force_screen_content_tools = scratch;
+ }
+ if (sequence_header.force_screen_content_tools > 0) {
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.choose_integer_mv = static_cast<bool>(scratch);
+ if (sequence_header.choose_integer_mv) {
+ sequence_header.force_integer_mv = kSelectIntegerMv;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.force_integer_mv = scratch;
+ }
+ } else {
+ sequence_header.force_integer_mv = kSelectIntegerMv;
+ }
+ if (sequence_header.enable_order_hint) {
+ OBU_READ_LITERAL_OR_FAIL(3);
+ sequence_header.order_hint_bits = 1 + scratch;
+ sequence_header.order_hint_shift_bits =
+ Mod32(32 - sequence_header.order_hint_bits);
+ }
+ }
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_superres = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_cdef = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_restoration = static_cast<bool>(scratch);
+ if (!ParseColorConfig(&sequence_header)) return false;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.film_grain_params_present = static_cast<bool>(scratch);
+ // Compare new sequence header with old sequence header.
+ if (has_sequence_header_ &&
+ sequence_header.ParametersChanged(sequence_header_)) {
+ // Between the frame header OBU and the last tile group OBU of the frame,
+ // do not allow the sequence header to change.
+ if (seen_frame_header) {
+ LIBGAV1_DLOG(ERROR, "Sequence header changed in the middle of a frame.");
+ return false;
+ }
+ decoder_state_.ClearReferenceFrames();
+ }
+ sequence_header_ = sequence_header;
+ has_sequence_header_ = true;
+ // Section 6.4.1: It is a requirement of bitstream conformance that if
+ // OperatingPointIdc is equal to 0, then obu_extension_flag is equal to 0 for
+ // all OBUs that follow this sequence header until the next sequence header.
+ extension_disallowed_ =
+ (sequence_header_.operating_point_idc[operating_point_] == 0);
+ return true;
+}
+
+// Marks reference frames as invalid when they are too far in the past to be
+// referenced by the frame id mechanism.
+void ObuParser::MarkInvalidReferenceFrames() {
+ // The current lower bound of the frame ids for reference frames.
+ int lower_bound = decoder_state_.current_frame_id -
+ (1 << sequence_header_.delta_frame_id_length_bits);
+ // True if lower_bound is smaller than current_frame_id. False if lower_bound
+ // wraps around (in modular arithmetic) to the other side of current_frame_id.
+ bool lower_bound_is_smaller = true;
+ if (lower_bound <= 0) {
+ lower_bound += 1 << sequence_header_.frame_id_length_bits;
+ lower_bound_is_smaller = false;
+ }
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ const uint16_t reference_frame_id = decoder_state_.reference_frame_id[i];
+ if (lower_bound_is_smaller) {
+ if (reference_frame_id > decoder_state_.current_frame_id ||
+ reference_frame_id < lower_bound) {
+ decoder_state_.reference_valid[i] = false;
+ }
+ } else {
+ if (reference_frame_id > decoder_state_.current_frame_id &&
+ reference_frame_id < lower_bound) {
+ decoder_state_.reference_valid[i] = false;
+ }
+ }
+ }
+}
+
+bool ObuParser::ParseFrameSizeAndRenderSize() {
+ int64_t scratch;
+ // Frame Size.
+ if (frame_header_.frame_size_override_flag) {
+ OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_width_bits);
+ frame_header_.width = static_cast<int32_t>(1 + scratch);
+ OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_height_bits);
+ frame_header_.height = static_cast<int32_t>(1 + scratch);
+ if (frame_header_.width > sequence_header_.max_frame_width ||
+ frame_header_.height > sequence_header_.max_frame_height) {
+ LIBGAV1_DLOG(ERROR,
+ "Frame dimensions are larger than the maximum values");
+ return false;
+ }
+ } else {
+ frame_header_.width = sequence_header_.max_frame_width;
+ frame_header_.height = sequence_header_.max_frame_height;
+ }
+ if (!ParseSuperResParametersAndComputeImageSize()) return false;
+
+ // Render Size.
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.render_and_frame_size_different = static_cast<bool>(scratch);
+ if (frame_header_.render_and_frame_size_different) {
+ OBU_READ_LITERAL_OR_FAIL(16);
+ frame_header_.render_width = static_cast<int32_t>(1 + scratch);
+ OBU_READ_LITERAL_OR_FAIL(16);
+ frame_header_.render_height = static_cast<int32_t>(1 + scratch);
+ } else {
+ frame_header_.render_width = frame_header_.upscaled_width;
+ frame_header_.render_height = frame_header_.height;
+ }
+
+ return true;
+}
+
+bool ObuParser::ParseSuperResParametersAndComputeImageSize() {
+ int64_t scratch;
+ // SuperRes.
+ frame_header_.upscaled_width = frame_header_.width;
+ frame_header_.use_superres = false;
+ if (sequence_header_.enable_superres) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.use_superres = static_cast<bool>(scratch);
+ }
+ if (frame_header_.use_superres) {
+ OBU_READ_LITERAL_OR_FAIL(3);
+ // 9 is the smallest value for the denominator.
+ frame_header_.superres_scale_denominator = scratch + 9;
+ frame_header_.width =
+ (frame_header_.upscaled_width * kSuperResScaleNumerator +
+ (frame_header_.superres_scale_denominator / 2)) /
+ frame_header_.superres_scale_denominator;
+ } else {
+ frame_header_.superres_scale_denominator = kSuperResScaleNumerator;
+ }
+ assert(frame_header_.width != 0);
+ assert(frame_header_.height != 0);
+ // Check if multiplying upscaled_width by height would overflow.
+ assert(frame_header_.upscaled_width >= frame_header_.width);
+ if (frame_header_.upscaled_width > INT32_MAX / frame_header_.height) {
+ LIBGAV1_DLOG(ERROR, "Frame dimensions too big: width=%d height=%d.",
+ frame_header_.width, frame_header_.height);
+ return false;
+ }
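+  // The frame dimensions in 4x4 block units, with the width and height
+  // rounded up to a multiple of 8 pixels.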
+ frame_header_.columns4x4 = ((frame_header_.width + 7) >> 3) << 1;
+ frame_header_.rows4x4 = ((frame_header_.height + 7) >> 3) << 1;
+ return true;
+}
+
+bool ObuParser::ValidateInterFrameSize() const {
+ for (int index : frame_header_.reference_frame_index) {
+ const RefCountedBuffer* reference_frame =
+ decoder_state_.reference_frame[index].get();
+ if (2 * frame_header_.width < reference_frame->upscaled_width() ||
+ 2 * frame_header_.height < reference_frame->frame_height() ||
+ frame_header_.width > 16 * reference_frame->upscaled_width() ||
+ frame_header_.height > 16 * reference_frame->frame_height()) {
+ LIBGAV1_DLOG(ERROR,
+ "Invalid inter frame size: width=%d, height=%d. Reference "
+ "frame: index=%d, upscaled width=%d, height=%d.",
+ frame_header_.width, frame_header_.height, index,
+ reference_frame->upscaled_width(),
+ reference_frame->frame_height());
+ return false;
+ }
+ }
+ return true;
+}
+
+bool ObuParser::ParseReferenceOrderHint() {
+ if (!frame_header_.error_resilient_mode ||
+ !sequence_header_.enable_order_hint) {
+ return true;
+ }
+ int64_t scratch;
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(sequence_header_.order_hint_bits);
+ frame_header_.reference_order_hint[i] = scratch;
+ if (frame_header_.reference_order_hint[i] !=
+ decoder_state_.reference_order_hint[i]) {
+ decoder_state_.reference_valid[i] = false;
+ }
+ }
+ return true;
+}
+
+// static
+int ObuParser::FindLatestBackwardReference(
+ const int current_frame_hint,
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+ const std::array<bool, kNumReferenceFrameTypes>& used_frame) {
+ int ref = -1;
+ int latest_order_hint = INT_MIN;
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ const int hint = shifted_order_hints[i];
+ if (!used_frame[i] && hint >= current_frame_hint &&
+ hint >= latest_order_hint) {
+ ref = i;
+ latest_order_hint = hint;
+ }
+ }
+ return ref;
+}
+
+// static
+int ObuParser::FindEarliestBackwardReference(
+ const int current_frame_hint,
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+ const std::array<bool, kNumReferenceFrameTypes>& used_frame) {
+ int ref = -1;
+ int earliest_order_hint = INT_MAX;
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ const int hint = shifted_order_hints[i];
+ if (!used_frame[i] && hint >= current_frame_hint &&
+ hint < earliest_order_hint) {
+ ref = i;
+ earliest_order_hint = hint;
+ }
+ }
+ return ref;
+}
+
+// static
+int ObuParser::FindLatestForwardReference(
+ const int current_frame_hint,
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+ const std::array<bool, kNumReferenceFrameTypes>& used_frame) {
+ int ref = -1;
+ int latest_order_hint = INT_MIN;
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ const int hint = shifted_order_hints[i];
+ if (!used_frame[i] && hint < current_frame_hint &&
+ hint >= latest_order_hint) {
+ ref = i;
+ latest_order_hint = hint;
+ }
+ }
+ return ref;
+}
+
+// static
+int ObuParser::FindReferenceWithSmallestOutputOrder(
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints) {
+ int ref = -1;
+ int earliest_order_hint = INT_MAX;
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ const int hint = shifted_order_hints[i];
+ if (hint < earliest_order_hint) {
+ ref = i;
+ earliest_order_hint = hint;
+ }
+ }
+ return ref;
+}
+
+// Computes the elements in the frame_header_.reference_frame_index array
+// based on:
+// * the syntax elements last_frame_idx and gold_frame_idx, and
+// * the values stored within the decoder_state_.reference_order_hint array
+// (these values represent the least significant bits of the expected output
+// order of the frames).
+//
+// Frame type: {
+// libgav1_name spec_name int
+// kReferenceFrameLast, LAST_FRAME 1
+// kReferenceFrameLast2, LAST2_FRAME 2
+// kReferenceFrameLast3, LAST3_FRAME 3
+// kReferenceFrameGolden, GOLDEN_FRAME 4
+// kReferenceFrameBackward, BWDREF_FRAME 5
+// kReferenceFrameAlternate2, ALTREF2_FRAME 6
+// kReferenceFrameAlternate, ALTREF_FRAME 7
+// }
+//
+// A typical case of a group of pictures (frames) in display order:
+// (However, bitstream conformance may allow more complex cases.)
+//
+// | | | | | | | |
+// | | | | | | | |
+// | | | | | | | |
+// | | | | | | | |
+//
+// 4 3 2 1 current_frame 5 6 7
+//
+bool ObuParser::SetFrameReferences(const int8_t last_frame_idx,
+ const int8_t gold_frame_idx) {
+ // Set the ref_frame_idx entries for kReferenceFrameLast and
+ // kReferenceFrameGolden to last_frame_idx and gold_frame_idx. Initialize
+ // the other entries to -1.
+ for (int8_t& reference_frame_index : frame_header_.reference_frame_index) {
+ reference_frame_index = -1;
+ }
+ frame_header_
+ .reference_frame_index[kReferenceFrameLast - kReferenceFrameLast] =
+ last_frame_idx;
+ frame_header_
+ .reference_frame_index[kReferenceFrameGolden - kReferenceFrameLast] =
+ gold_frame_idx;
+
+ // used_frame records which reference frames have been used.
+ std::array<bool, kNumReferenceFrameTypes> used_frame;
+ used_frame.fill(false);
+ used_frame[last_frame_idx] = true;
+ used_frame[gold_frame_idx] = true;
+
+ assert(sequence_header_.order_hint_bits >= 1);
+ const int current_frame_hint = 1 << (sequence_header_.order_hint_bits - 1);
+ // shifted_order_hints contains the expected output order shifted such that
+ // the current frame has hint equal to current_frame_hint.
+ std::array<int, kNumReferenceFrameTypes> shifted_order_hints;
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ const int relative_distance = GetRelativeDistance(
+ decoder_state_.reference_order_hint[i], frame_header_.order_hint,
+ sequence_header_.order_hint_shift_bits);
+ shifted_order_hints[i] = current_frame_hint + relative_distance;
+ }
+
+ // The expected output orders for kReferenceFrameLast and
+ // kReferenceFrameGolden.
+ const int last_order_hint = shifted_order_hints[last_frame_idx];
+ const int gold_order_hint = shifted_order_hints[gold_frame_idx];
+
+ // Section 7.8: It is a requirement of bitstream conformance that
+ // lastOrderHint and goldOrderHint are strictly less than curFrameHint.
+ if (last_order_hint >= current_frame_hint ||
+ gold_order_hint >= current_frame_hint) {
+ return false;
+ }
+
+ // Find a backward reference to the frame with highest output order. If
+ // found, set the kReferenceFrameAlternate reference to that backward
+ // reference.
+ int ref = FindLatestBackwardReference(current_frame_hint, shifted_order_hints,
+ used_frame);
+ if (ref >= 0) {
+ frame_header_
+ .reference_frame_index[kReferenceFrameAlternate - kReferenceFrameLast] =
+ ref;
+ used_frame[ref] = true;
+ }
+
+ // Find a backward reference to the closest frame. If found, set the
+ // kReferenceFrameBackward reference to that backward reference.
+ ref = FindEarliestBackwardReference(current_frame_hint, shifted_order_hints,
+ used_frame);
+ if (ref >= 0) {
+ frame_header_
+ .reference_frame_index[kReferenceFrameBackward - kReferenceFrameLast] =
+ ref;
+ used_frame[ref] = true;
+ }
+
+ // Set the kReferenceFrameAlternate2 reference to the next closest backward
+ // reference.
+ ref = FindEarliestBackwardReference(current_frame_hint, shifted_order_hints,
+ used_frame);
+ if (ref >= 0) {
+ frame_header_.reference_frame_index[kReferenceFrameAlternate2 -
+ kReferenceFrameLast] = ref;
+ used_frame[ref] = true;
+ }
+
+ // The remaining references are set to be forward references in
+ // reverse chronological order.
+ static constexpr ReferenceFrameType
+ kRefFrameList[kNumInterReferenceFrameTypes - 2] = {
+ kReferenceFrameLast2, kReferenceFrameLast3, kReferenceFrameBackward,
+ kReferenceFrameAlternate2, kReferenceFrameAlternate};
+ for (const ReferenceFrameType ref_frame : kRefFrameList) {
+ if (frame_header_.reference_frame_index[ref_frame - kReferenceFrameLast] <
+ 0) {
+ ref = FindLatestForwardReference(current_frame_hint, shifted_order_hints,
+ used_frame);
+ if (ref >= 0) {
+ frame_header_.reference_frame_index[ref_frame - kReferenceFrameLast] =
+ ref;
+ used_frame[ref] = true;
+ }
+ }
+ }
+
+ // Finally, any remaining references are set to the reference frame with
+ // smallest output order.
+ ref = FindReferenceWithSmallestOutputOrder(shifted_order_hints);
+ assert(ref >= 0);
+ for (int8_t& reference_frame_index : frame_header_.reference_frame_index) {
+ if (reference_frame_index < 0) {
+ reference_frame_index = ref;
+ }
+ }
+
+ return true;
+}
+
+bool ObuParser::ParseLoopFilterParameters() {
+ LoopFilter* const loop_filter = &frame_header_.loop_filter;
+ if (frame_header_.coded_lossless || frame_header_.allow_intrabc) {
+ SetDefaultRefDeltas(loop_filter);
+ return true;
+ }
+ // IsIntraFrame implies kPrimaryReferenceNone.
+ assert(!IsIntraFrame(frame_header_.frame_type) ||
+ frame_header_.primary_reference_frame == kPrimaryReferenceNone);
+ if (frame_header_.primary_reference_frame == kPrimaryReferenceNone) {
+ // Part of the setup_past_independence() function in the spec. It is not
+ // necessary to set loop_filter->delta_enabled to true. See
+ // https://crbug.com/aomedia/2305.
+ SetDefaultRefDeltas(loop_filter);
+ } else {
+ // Part of the load_previous() function in the spec.
+ const int prev_frame_index =
+ frame_header_
+ .reference_frame_index[frame_header_.primary_reference_frame];
+ const RefCountedBuffer* prev_frame =
+ decoder_state_.reference_frame[prev_frame_index].get();
+ loop_filter->ref_deltas = prev_frame->loop_filter_ref_deltas();
+ loop_filter->mode_deltas = prev_frame->loop_filter_mode_deltas();
+ }
+ int64_t scratch;
+ for (int i = 0; i < 2; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(6);
+ loop_filter->level[i] = scratch;
+ }
+ if (!sequence_header_.color_config.is_monochrome &&
+ (loop_filter->level[0] != 0 || loop_filter->level[1] != 0)) {
+ for (int i = 2; i < 4; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(6);
+ loop_filter->level[i] = scratch;
+ }
+ }
+ OBU_READ_LITERAL_OR_FAIL(3);
+ loop_filter->sharpness = scratch;
+ OBU_READ_BIT_OR_FAIL;
+ loop_filter->delta_enabled = static_cast<bool>(scratch);
+ if (loop_filter->delta_enabled) {
+ OBU_READ_BIT_OR_FAIL;
+ loop_filter->delta_update = static_cast<bool>(scratch);
+ if (loop_filter->delta_update) {
+ for (auto& ref_delta : loop_filter->ref_deltas) {
+ OBU_READ_BIT_OR_FAIL;
+ const auto update_ref_delta = static_cast<bool>(scratch);
+ if (update_ref_delta) {
+ int scratch_int;
+ if (!bit_reader_->ReadInverseSignedLiteral(6, &scratch_int)) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits.");
+ return false;
+ }
+ ref_delta = scratch_int;
+ }
+ }
+ for (auto& mode_delta : loop_filter->mode_deltas) {
+ OBU_READ_BIT_OR_FAIL;
+ const auto update_mode_delta = static_cast<bool>(scratch);
+ if (update_mode_delta) {
+ int scratch_int;
+ if (!bit_reader_->ReadInverseSignedLiteral(6, &scratch_int)) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits.");
+ return false;
+ }
+ mode_delta = scratch_int;
+ }
+ }
+ }
+ } else {
+ loop_filter->delta_update = false;
+ }
+ return true;
+}
+
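+// This method implements the read_delta_q() function in the spec: a presence
+// bit followed, if set, by a 6-bit inverse signed literal.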
+bool ObuParser::ParseDeltaQuantizer(int8_t* const delta) {
+ int64_t scratch;
+ *delta = 0;
+ OBU_READ_BIT_OR_FAIL;
+ const auto delta_coded = static_cast<bool>(scratch);
+ if (delta_coded) {
+ int scratch_int;
+ if (!bit_reader_->ReadInverseSignedLiteral(6, &scratch_int)) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits.");
+ return false;
+ }
+ *delta = scratch_int;
+ }
+ return true;
+}
+
+bool ObuParser::ParseQuantizerParameters() {
+ int64_t scratch;
+ QuantizerParameters* const quantizer = &frame_header_.quantizer;
+ OBU_READ_LITERAL_OR_FAIL(8);
+ quantizer->base_index = scratch;
+ if (!ParseDeltaQuantizer(&quantizer->delta_dc[kPlaneY])) return false;
+ if (!sequence_header_.color_config.is_monochrome) {
+ bool diff_uv_delta = false;
+ if (sequence_header_.color_config.separate_uv_delta_q) {
+ OBU_READ_BIT_OR_FAIL;
+ diff_uv_delta = static_cast<bool>(scratch);
+ }
+ if (!ParseDeltaQuantizer(&quantizer->delta_dc[kPlaneU]) ||
+ !ParseDeltaQuantizer(&quantizer->delta_ac[kPlaneU])) {
+ return false;
+ }
+ if (diff_uv_delta) {
+ if (!ParseDeltaQuantizer(&quantizer->delta_dc[kPlaneV]) ||
+ !ParseDeltaQuantizer(&quantizer->delta_ac[kPlaneV])) {
+ return false;
+ }
+ } else {
+ quantizer->delta_dc[kPlaneV] = quantizer->delta_dc[kPlaneU];
+ quantizer->delta_ac[kPlaneV] = quantizer->delta_ac[kPlaneU];
+ }
+ }
+ OBU_READ_BIT_OR_FAIL;
+ quantizer->use_matrix = static_cast<bool>(scratch);
+ if (quantizer->use_matrix) {
+ OBU_READ_LITERAL_OR_FAIL(4);
+ quantizer->matrix_level[kPlaneY] = scratch;
+ OBU_READ_LITERAL_OR_FAIL(4);
+ quantizer->matrix_level[kPlaneU] = scratch;
+ if (sequence_header_.color_config.separate_uv_delta_q) {
+ OBU_READ_LITERAL_OR_FAIL(4);
+ quantizer->matrix_level[kPlaneV] = scratch;
+ } else {
+ quantizer->matrix_level[kPlaneV] = quantizer->matrix_level[kPlaneU];
+ }
+ }
+ return true;
+}
+
+// This method implements the following functions in the spec:
+// - segmentation_params()
+// - part of setup_past_independence(): Set the FeatureData and FeatureEnabled
+// arrays to all 0.
+// - part of load_previous(): Call load_segmentation_params().
+//
+// A careful analysis of the spec shows that the part of
+// setup_past_independence() can be optimized away and that the part of
+// load_previous() only needs to be invoked under a specific condition.
+// Although the logic looks different from the spec, it is equivalent and more
+// efficient.
+bool ObuParser::ParseSegmentationParameters() {
+ int64_t scratch;
+ Segmentation* const segmentation = &frame_header_.segmentation;
+ OBU_READ_BIT_OR_FAIL;
+ segmentation->enabled = static_cast<bool>(scratch);
+ if (!segmentation->enabled) return true;
+ if (frame_header_.primary_reference_frame == kPrimaryReferenceNone) {
+ segmentation->update_map = true;
+ segmentation->update_data = true;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ segmentation->update_map = static_cast<bool>(scratch);
+ if (segmentation->update_map) {
+ OBU_READ_BIT_OR_FAIL;
+ segmentation->temporal_update = static_cast<bool>(scratch);
+ }
+ OBU_READ_BIT_OR_FAIL;
+ segmentation->update_data = static_cast<bool>(scratch);
+ if (!segmentation->update_data) {
+ // Part of the load_previous() function in the spec.
+ const int prev_frame_index =
+ frame_header_
+ .reference_frame_index[frame_header_.primary_reference_frame];
+ decoder_state_.reference_frame[prev_frame_index]
+ ->GetSegmentationParameters(segmentation);
+ return true;
+ }
+ }
+ for (int8_t i = 0; i < kMaxSegments; ++i) {
+ for (int8_t j = 0; j < kSegmentFeatureMax; ++j) {
+ OBU_READ_BIT_OR_FAIL;
+ segmentation->feature_enabled[i][j] = static_cast<bool>(scratch);
+ if (segmentation->feature_enabled[i][j]) {
+ if (Segmentation::FeatureSigned(static_cast<SegmentFeature>(j))) {
+ int scratch_int;
+ if (!bit_reader_->ReadInverseSignedLiteral(
+ kSegmentationFeatureBits[j], &scratch_int)) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits.");
+ return false;
+ }
+ segmentation->feature_data[i][j] =
+ Clip3(scratch_int, -kSegmentationFeatureMaxValues[j],
+ kSegmentationFeatureMaxValues[j]);
+ } else {
+ if (kSegmentationFeatureBits[j] > 0) {
+ OBU_READ_LITERAL_OR_FAIL(kSegmentationFeatureBits[j]);
+ segmentation->feature_data[i][j] = Clip3(
+ static_cast<int>(scratch), 0, kSegmentationFeatureMaxValues[j]);
+ } else {
+ segmentation->feature_data[i][j] = 0;
+ }
+ }
+ segmentation->last_active_segment_id = i;
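+        // Any enabled feature at or above kSegmentFeatureReferenceFrame
+        // requires the segment id to be coded before the skip syntax element.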
+ if (j >= kSegmentFeatureReferenceFrame) {
+ segmentation->segment_id_pre_skip = true;
+ }
+ }
+ }
+ }
+ return true;
+}
+
+bool ObuParser::ParseQuantizerIndexDeltaParameters() {
+ int64_t scratch;
+ if (frame_header_.quantizer.base_index > 0) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.delta_q.present = static_cast<bool>(scratch);
+ if (frame_header_.delta_q.present) {
+ OBU_READ_LITERAL_OR_FAIL(2);
+ frame_header_.delta_q.scale = scratch;
+ }
+ }
+ return true;
+}
+
+bool ObuParser::ParseLoopFilterDeltaParameters() {
+ int64_t scratch;
+ if (frame_header_.delta_q.present) {
+ if (!frame_header_.allow_intrabc) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.delta_lf.present = static_cast<bool>(scratch);
+ }
+ if (frame_header_.delta_lf.present) {
+ OBU_READ_LITERAL_OR_FAIL(2);
+ frame_header_.delta_lf.scale = scratch;
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.delta_lf.multi = static_cast<bool>(scratch);
+ }
+ }
+ return true;
+}
+
+void ObuParser::ComputeSegmentLosslessAndQIndex() {
+ frame_header_.coded_lossless = true;
+ Segmentation* const segmentation = &frame_header_.segmentation;
+ const QuantizerParameters* const quantizer = &frame_header_.quantizer;
+ for (int i = 0; i < kMaxSegments; ++i) {
+ segmentation->qindex[i] =
+ GetQIndex(*segmentation, i, quantizer->base_index);
+ segmentation->lossless[i] =
+ segmentation->qindex[i] == 0 && quantizer->delta_dc[kPlaneY] == 0 &&
+ quantizer->delta_dc[kPlaneU] == 0 &&
+ quantizer->delta_ac[kPlaneU] == 0 &&
+ quantizer->delta_dc[kPlaneV] == 0 && quantizer->delta_ac[kPlaneV] == 0;
+ if (!segmentation->lossless[i]) frame_header_.coded_lossless = false;
+ // The spec calls for setting up a two-dimensional SegQMLevel array here.
+ // We avoid the SegQMLevel array by using segmentation->lossless[i] and
+ // quantizer->matrix_level[plane] directly in the reconstruct process of
+ // Section 7.12.3.
+ }
+ frame_header_.upscaled_lossless =
+ frame_header_.coded_lossless &&
+ frame_header_.width == frame_header_.upscaled_width;
+}
+
+bool ObuParser::ParseCdefParameters() {
+ const int coeff_shift = sequence_header_.color_config.bitdepth - 8;
+ if (frame_header_.coded_lossless || frame_header_.allow_intrabc ||
+ !sequence_header_.enable_cdef) {
+ frame_header_.cdef.damping = 3 + coeff_shift;
+ return true;
+ }
+ Cdef* const cdef = &frame_header_.cdef;
+ int64_t scratch;
+ OBU_READ_LITERAL_OR_FAIL(2);
+ cdef->damping = scratch + 3 + coeff_shift;
+ OBU_READ_LITERAL_OR_FAIL(2);
+ cdef->bits = scratch;
+ for (int i = 0; i < (1 << cdef->bits); ++i) {
+ OBU_READ_LITERAL_OR_FAIL(4);
+ cdef->y_primary_strength[i] = scratch << coeff_shift;
+ OBU_READ_LITERAL_OR_FAIL(2);
+ cdef->y_secondary_strength[i] = scratch;
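+    // A coded secondary strength of 3 is mapped to 4; a strength of 3 itself
+    // is never used.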
+ if (cdef->y_secondary_strength[i] == 3) ++cdef->y_secondary_strength[i];
+ cdef->y_secondary_strength[i] <<= coeff_shift;
+ if (sequence_header_.color_config.is_monochrome) continue;
+ OBU_READ_LITERAL_OR_FAIL(4);
+ cdef->uv_primary_strength[i] = scratch << coeff_shift;
+ OBU_READ_LITERAL_OR_FAIL(2);
+ cdef->uv_secondary_strength[i] = scratch;
+ if (cdef->uv_secondary_strength[i] == 3) ++cdef->uv_secondary_strength[i];
+ cdef->uv_secondary_strength[i] <<= coeff_shift;
+ }
+ return true;
+}
+
+bool ObuParser::ParseLoopRestorationParameters() {
+ if (frame_header_.upscaled_lossless || frame_header_.allow_intrabc ||
+ !sequence_header_.enable_restoration) {
+ return true;
+ }
+ int64_t scratch;
+ bool uses_loop_restoration = false;
+ bool uses_chroma_loop_restoration = false;
+ LoopRestoration* const loop_restoration = &frame_header_.loop_restoration;
+ const int num_planes = sequence_header_.color_config.is_monochrome
+ ? kMaxPlanesMonochrome
+ : kMaxPlanes;
+ for (int i = 0; i < num_planes; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(2);
+ loop_restoration->type[i] = static_cast<LoopRestorationType>(scratch);
+ if (loop_restoration->type[i] != kLoopRestorationTypeNone) {
+ uses_loop_restoration = true;
+ if (i > 0) uses_chroma_loop_restoration = true;
+ }
+ }
+ if (uses_loop_restoration) {
+ uint8_t unit_shift;
+ if (sequence_header_.use_128x128_superblock) {
+ OBU_READ_BIT_OR_FAIL;
+ unit_shift = scratch + 1;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ unit_shift = scratch;
+ if (unit_shift != 0) {
+ OBU_READ_BIT_OR_FAIL;
+ const uint8_t unit_extra_shift = scratch;
+ unit_shift += unit_extra_shift;
+ }
+ }
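+    // unit_size_log2 is the log2 of the restoration unit size in pixels, so
+    // the luma unit size is 64, 128 or 256.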
+ loop_restoration->unit_size_log2[kPlaneY] = 6 + unit_shift;
+ uint8_t uv_shift = 0;
+ if (sequence_header_.color_config.subsampling_x != 0 &&
+ sequence_header_.color_config.subsampling_y != 0 &&
+ uses_chroma_loop_restoration) {
+ OBU_READ_BIT_OR_FAIL;
+ uv_shift = scratch;
+ }
+ loop_restoration->unit_size_log2[kPlaneU] =
+ loop_restoration->unit_size_log2[kPlaneV] =
+ loop_restoration->unit_size_log2[0] - uv_shift;
+ }
+ return true;
+}
+
+bool ObuParser::ParseTxModeSyntax() {
+ if (frame_header_.coded_lossless) {
+ frame_header_.tx_mode = kTxModeOnly4x4;
+ return true;
+ }
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.tx_mode = (scratch == 1) ? kTxModeSelect : kTxModeLargest;
+ return true;
+}
+
+bool ObuParser::ParseFrameReferenceModeSyntax() {
+ int64_t scratch;
+ if (!IsIntraFrame(frame_header_.frame_type)) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.reference_mode_select = static_cast<bool>(scratch);
+ }
+ return true;
+}
+
+bool ObuParser::IsSkipModeAllowed() {
+ if (IsIntraFrame(frame_header_.frame_type) ||
+ !frame_header_.reference_mode_select ||
+ !sequence_header_.enable_order_hint) {
+ return false;
+ }
+ // Identify the nearest forward and backward references.
+ int forward_index = -1;
+ int backward_index = -1;
+ int forward_hint = -1;
+ int backward_hint = -1;
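+  // A negative relative distance means the reference precedes the current
+  // frame in display order (a forward reference); a positive distance means
+  // it follows (a backward reference).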
+ for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
+ const unsigned int reference_hint =
+ decoder_state_
+ .reference_order_hint[frame_header_.reference_frame_index[i]];
+ // TODO(linfengz): |relative_distance| equals
+ // current_frame_->reference_info()->
+ // relative_distance_from[i + kReferenceFrameLast];
+ // However, the unit test ObuParserTest.SkipModeParameters() would fail.
+ // Will figure out how to initialize |current_frame_.reference_info_| in the
+ // RefCountedBuffer later.
+ const int relative_distance =
+ GetRelativeDistance(reference_hint, frame_header_.order_hint,
+ sequence_header_.order_hint_shift_bits);
+ if (relative_distance < 0) {
+ if (forward_index < 0 ||
+ GetRelativeDistance(reference_hint, forward_hint,
+ sequence_header_.order_hint_shift_bits) > 0) {
+ forward_index = i;
+ forward_hint = reference_hint;
+ }
+ } else if (relative_distance > 0) {
+ if (backward_index < 0 ||
+ GetRelativeDistance(reference_hint, backward_hint,
+ sequence_header_.order_hint_shift_bits) < 0) {
+ backward_index = i;
+ backward_hint = reference_hint;
+ }
+ }
+ }
+ if (forward_index < 0) return false;
+ if (backward_index >= 0) {
+ // Bidirectional prediction.
+ frame_header_.skip_mode_frame[0] = static_cast<ReferenceFrameType>(
+ kReferenceFrameLast + std::min(forward_index, backward_index));
+ frame_header_.skip_mode_frame[1] = static_cast<ReferenceFrameType>(
+ kReferenceFrameLast + std::max(forward_index, backward_index));
+ return true;
+ }
+ // Forward prediction only. Identify the second nearest forward reference.
+ int second_forward_index = -1;
+ int second_forward_hint = -1;
+ for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
+ const unsigned int reference_hint =
+ decoder_state_
+ .reference_order_hint[frame_header_.reference_frame_index[i]];
+ if (GetRelativeDistance(reference_hint, forward_hint,
+ sequence_header_.order_hint_shift_bits) < 0) {
+ if (second_forward_index < 0 ||
+ GetRelativeDistance(reference_hint, second_forward_hint,
+ sequence_header_.order_hint_shift_bits) > 0) {
+ second_forward_index = i;
+ second_forward_hint = reference_hint;
+ }
+ }
+ }
+ if (second_forward_index < 0) return false;
+ frame_header_.skip_mode_frame[0] = static_cast<ReferenceFrameType>(
+ kReferenceFrameLast + std::min(forward_index, second_forward_index));
+ frame_header_.skip_mode_frame[1] = static_cast<ReferenceFrameType>(
+ kReferenceFrameLast + std::max(forward_index, second_forward_index));
+ return true;
+}
+
+bool ObuParser::ParseSkipModeParameters() {
+ if (!IsSkipModeAllowed()) return true;
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.skip_mode_present = static_cast<bool>(scratch);
+ return true;
+}
+
+// Sets frame_header_.global_motion[ref].params[index].
+bool ObuParser::ParseGlobalParamSyntax(
+ int ref, int index,
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>&
+ prev_global_motions) {
+ GlobalMotion* const global_motion = &frame_header_.global_motion[ref];
+ const GlobalMotion* const prev_global_motion = &prev_global_motions[ref];
+ int abs_bits = kGlobalMotionAlphaBits;
+ int precision_bits = kGlobalMotionAlphaPrecisionBits;
+ if (index < 2) {
+ if (global_motion->type == kGlobalMotionTransformationTypeTranslation) {
+ const auto high_precision_mv_factor =
+ static_cast<int>(!frame_header_.allow_high_precision_mv);
+ abs_bits = kGlobalMotionTranslationOnlyBits - high_precision_mv_factor;
+ precision_bits =
+ kGlobalMotionTranslationOnlyPrecisionBits - high_precision_mv_factor;
+ } else {
+ abs_bits = kGlobalMotionTranslationBits;
+ precision_bits = kGlobalMotionTranslationPrecisionBits;
+ }
+ }
+ const int precision_diff = kWarpedModelPrecisionBits - precision_bits;
+ const int round = (index % 3 == 2) ? 1 << kWarpedModelPrecisionBits : 0;
+ const int sub = (index % 3 == 2) ? 1 << precision_bits : 0;
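+  // Indices 2 and 5 (index % 3 == 2) are the diagonal terms of the warp
+  // matrix and are coded relative to 1.0 (1 << kWarpedModelPrecisionBits);
+  // |round| and |sub| re-center those terms around that value.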
+ const int mx = 1 << abs_bits;
+ const int reference =
+ (prev_global_motion->params[index] >> precision_diff) - sub;
+ int scratch;
+ if (!bit_reader_->DecodeSignedSubexpWithReference(
+ -mx, mx + 1, reference, kGlobalMotionReadControl, &scratch)) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits.");
+ return false;
+ }
+ global_motion->params[index] = LeftShift(scratch, precision_diff) + round;
+ return true;
+}
+
+bool ObuParser::ParseGlobalMotionParameters() {
+ for (int ref = kReferenceFrameLast; ref <= kReferenceFrameAlternate; ++ref) {
+ frame_header_.global_motion[ref].type =
+ kGlobalMotionTransformationTypeIdentity;
+ for (int i = 0; i < 6; ++i) {
+ frame_header_.global_motion[ref].params[i] =
+ (i % 3 == 2) ? 1 << kWarpedModelPrecisionBits : 0;
+ }
+ }
+ if (IsIntraFrame(frame_header_.frame_type)) return true;
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>* prev_global_motions =
+ nullptr;
+ if (frame_header_.primary_reference_frame == kPrimaryReferenceNone) {
+ // Part of the setup_past_independence() function in the spec. The value
+ // that the spec says PrevGmParams[ref][i] should be set to is exactly
+ // the value frame_header_.global_motion[ref].params[i] is set to by the
+ // for loop above. Therefore prev_global_motions can simply point to
+ // frame_header_.global_motion.
+ prev_global_motions = &frame_header_.global_motion;
+ } else {
+ // Part of the load_previous() function in the spec.
+ const int prev_frame_index =
+ frame_header_
+ .reference_frame_index[frame_header_.primary_reference_frame];
+ prev_global_motions =
+ &decoder_state_.reference_frame[prev_frame_index]->GlobalMotions();
+ }
+ for (int ref = kReferenceFrameLast; ref <= kReferenceFrameAlternate; ++ref) {
+ GlobalMotion* const global_motion = &frame_header_.global_motion[ref];
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ const auto is_global = static_cast<bool>(scratch);
+ if (is_global) {
+ OBU_READ_BIT_OR_FAIL;
+ const auto is_rot_zoom = static_cast<bool>(scratch);
+ if (is_rot_zoom) {
+ global_motion->type = kGlobalMotionTransformationTypeRotZoom;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ const auto is_translation = static_cast<bool>(scratch);
+ global_motion->type = is_translation
+ ? kGlobalMotionTransformationTypeTranslation
+ : kGlobalMotionTransformationTypeAffine;
+ }
+ } else {
+ global_motion->type = kGlobalMotionTransformationTypeIdentity;
+ }
+ if (global_motion->type >= kGlobalMotionTransformationTypeRotZoom) {
+ if (!ParseGlobalParamSyntax(ref, 2, *prev_global_motions) ||
+ !ParseGlobalParamSyntax(ref, 3, *prev_global_motions)) {
+ return false;
+ }
+ if (global_motion->type == kGlobalMotionTransformationTypeAffine) {
+ if (!ParseGlobalParamSyntax(ref, 4, *prev_global_motions) ||
+ !ParseGlobalParamSyntax(ref, 5, *prev_global_motions)) {
+ return false;
+ }
+ } else {
+ global_motion->params[4] = -global_motion->params[3];
+ global_motion->params[5] = global_motion->params[2];
+ }
+ }
+ if (global_motion->type >= kGlobalMotionTransformationTypeTranslation) {
+ if (!ParseGlobalParamSyntax(ref, 0, *prev_global_motions) ||
+ !ParseGlobalParamSyntax(ref, 1, *prev_global_motions)) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+bool ObuParser::ParseFilmGrainParameters() {
+ if (!sequence_header_.film_grain_params_present ||
+ (!frame_header_.show_frame && !frame_header_.showable_frame)) {
+ // frame_header_.film_grain_params is already zero-initialized.
+ return true;
+ }
+
+ FilmGrainParams& film_grain_params = frame_header_.film_grain_params;
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ film_grain_params.apply_grain = static_cast<bool>(scratch);
+ if (!film_grain_params.apply_grain) {
+ // film_grain_params is already zero-initialized.
+ return true;
+ }
+
+ OBU_READ_LITERAL_OR_FAIL(16);
+ film_grain_params.grain_seed = static_cast<int>(scratch);
+ film_grain_params.update_grain = true;
+ if (frame_header_.frame_type == kFrameInter) {
+ OBU_READ_BIT_OR_FAIL;
+ film_grain_params.update_grain = static_cast<bool>(scratch);
+ }
+ if (!film_grain_params.update_grain) {
+ OBU_READ_LITERAL_OR_FAIL(3);
+ film_grain_params.reference_index = static_cast<int>(scratch);
+ bool found = false;
+ for (const auto index : frame_header_.reference_frame_index) {
+ if (film_grain_params.reference_index == index) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ static_assert(sizeof(frame_header_.reference_frame_index) /
+ sizeof(frame_header_.reference_frame_index[0]) ==
+ 7,
+ "");
+ LIBGAV1_DLOG(ERROR,
+ "Invalid value for film_grain_params_ref_idx (%d). "
+ "ref_frame_idx = {%d, %d, %d, %d, %d, %d, %d}",
+ film_grain_params.reference_index,
+ frame_header_.reference_frame_index[0],
+ frame_header_.reference_frame_index[1],
+ frame_header_.reference_frame_index[2],
+ frame_header_.reference_frame_index[3],
+ frame_header_.reference_frame_index[4],
+ frame_header_.reference_frame_index[5],
+ frame_header_.reference_frame_index[6]);
+ return false;
+ }
+ const RefCountedBuffer* grain_params_reference_frame =
+ decoder_state_.reference_frame[film_grain_params.reference_index].get();
+ if (grain_params_reference_frame == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Buffer %d does not contain a decoded frame",
+ film_grain_params.reference_index);
+ return false;
+ }
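+    // load_grain_params(): inherit the film grain parameters of the reference
+    // frame, but keep the grain_seed, update_grain and reference_index values
+    // parsed above.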
+ const int temp_grain_seed = film_grain_params.grain_seed;
+ const bool temp_update_grain = film_grain_params.update_grain;
+ const int temp_reference_index = film_grain_params.reference_index;
+ film_grain_params = grain_params_reference_frame->film_grain_params();
+ film_grain_params.grain_seed = temp_grain_seed;
+ film_grain_params.update_grain = temp_update_grain;
+ film_grain_params.reference_index = temp_reference_index;
+ return true;
+ }
+
+ OBU_READ_LITERAL_OR_FAIL(4);
+ film_grain_params.num_y_points = scratch;
+ if (film_grain_params.num_y_points > 14) {
+ LIBGAV1_DLOG(ERROR, "Invalid value for num_y_points (%d).",
+ film_grain_params.num_y_points);
+ return false;
+ }
+ for (int i = 0; i < film_grain_params.num_y_points; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.point_y_value[i] = scratch;
+ if (i != 0 && film_grain_params.point_y_value[i - 1] >=
+ film_grain_params.point_y_value[i]) {
+ LIBGAV1_DLOG(ERROR, "point_y_value[%d] (%d) >= point_y_value[%d] (%d).",
+ i - 1, film_grain_params.point_y_value[i - 1], i,
+ film_grain_params.point_y_value[i]);
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.point_y_scaling[i] = scratch;
+ }
+ if (sequence_header_.color_config.is_monochrome) {
+ film_grain_params.chroma_scaling_from_luma = false;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ film_grain_params.chroma_scaling_from_luma = static_cast<bool>(scratch);
+ }
+ if (sequence_header_.color_config.is_monochrome ||
+ film_grain_params.chroma_scaling_from_luma ||
+ (sequence_header_.color_config.subsampling_x == 1 &&
+ sequence_header_.color_config.subsampling_y == 1 &&
+ film_grain_params.num_y_points == 0)) {
+ film_grain_params.num_u_points = 0;
+ film_grain_params.num_v_points = 0;
+ } else {
+ OBU_READ_LITERAL_OR_FAIL(4);
+ film_grain_params.num_u_points = scratch;
+ if (film_grain_params.num_u_points > 10) {
+ LIBGAV1_DLOG(ERROR, "Invalid value for num_u_points (%d).",
+ film_grain_params.num_u_points);
+ return false;
+ }
+ for (int i = 0; i < film_grain_params.num_u_points; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.point_u_value[i] = scratch;
+ if (i != 0 && film_grain_params.point_u_value[i - 1] >=
+ film_grain_params.point_u_value[i]) {
+ LIBGAV1_DLOG(ERROR, "point_u_value[%d] (%d) >= point_u_value[%d] (%d).",
+ i - 1, film_grain_params.point_u_value[i - 1], i,
+ film_grain_params.point_u_value[i]);
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.point_u_scaling[i] = scratch;
+ }
+ OBU_READ_LITERAL_OR_FAIL(4);
+ film_grain_params.num_v_points = scratch;
+ if (film_grain_params.num_v_points > 10) {
+ LIBGAV1_DLOG(ERROR, "Invalid value for num_v_points (%d).",
+ film_grain_params.num_v_points);
+ return false;
+ }
+ if (sequence_header_.color_config.subsampling_x == 1 &&
+ sequence_header_.color_config.subsampling_y == 1 &&
+ (film_grain_params.num_u_points == 0) !=
+ (film_grain_params.num_v_points == 0)) {
+ LIBGAV1_DLOG(ERROR,
+ "Invalid values for num_u_points (%d) and num_v_points (%d) "
+ "for 4:2:0 chroma subsampling.",
+ film_grain_params.num_u_points,
+ film_grain_params.num_v_points);
+ return false;
+ }
+ for (int i = 0; i < film_grain_params.num_v_points; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.point_v_value[i] = scratch;
+ if (i != 0 && film_grain_params.point_v_value[i - 1] >=
+ film_grain_params.point_v_value[i]) {
+ LIBGAV1_DLOG(ERROR, "point_v_value[%d] (%d) >= point_v_value[%d] (%d).",
+ i - 1, film_grain_params.point_v_value[i - 1], i,
+ film_grain_params.point_v_value[i]);
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.point_v_scaling[i] = scratch;
+ }
+ }
+ OBU_READ_LITERAL_OR_FAIL(2);
+ film_grain_params.chroma_scaling = scratch + 8;
+ OBU_READ_LITERAL_OR_FAIL(2);
+ film_grain_params.auto_regression_coeff_lag = scratch;
+
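+  // The number of auto-regression coefficients is 2 * lag * (lag + 1) for
+  // luma. Chroma uses one extra coefficient (predicting from luma) when luma
+  // scaling points are present.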
+ const int num_pos_y =
+ MultiplyBy2(film_grain_params.auto_regression_coeff_lag) *
+ (film_grain_params.auto_regression_coeff_lag + 1);
+ int num_pos_uv = num_pos_y;
+ if (film_grain_params.num_y_points > 0) {
+ ++num_pos_uv;
+ for (int i = 0; i < num_pos_y; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.auto_regression_coeff_y[i] =
+ static_cast<int8_t>(scratch - 128);
+ }
+ }
+ if (film_grain_params.chroma_scaling_from_luma ||
+ film_grain_params.num_u_points > 0) {
+ for (int i = 0; i < num_pos_uv; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.auto_regression_coeff_u[i] =
+ static_cast<int8_t>(scratch - 128);
+ }
+ }
+ if (film_grain_params.chroma_scaling_from_luma ||
+ film_grain_params.num_v_points > 0) {
+ for (int i = 0; i < num_pos_uv; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.auto_regression_coeff_v[i] =
+ static_cast<int8_t>(scratch - 128);
+ }
+ }
+ OBU_READ_LITERAL_OR_FAIL(2);
+ film_grain_params.auto_regression_shift = static_cast<uint8_t>(scratch + 6);
+ OBU_READ_LITERAL_OR_FAIL(2);
+ film_grain_params.grain_scale_shift = static_cast<int>(scratch);
+ if (film_grain_params.num_u_points > 0) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.u_multiplier = static_cast<int8_t>(scratch - 128);
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.u_luma_multiplier = static_cast<int8_t>(scratch - 128);
+ OBU_READ_LITERAL_OR_FAIL(9);
+ film_grain_params.u_offset = static_cast<int16_t>(scratch - 256);
+ }
+ if (film_grain_params.num_v_points > 0) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.v_multiplier = static_cast<int8_t>(scratch - 128);
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.v_luma_multiplier = static_cast<int8_t>(scratch - 128);
+ OBU_READ_LITERAL_OR_FAIL(9);
+ film_grain_params.v_offset = static_cast<int16_t>(scratch - 256);
+ }
+ OBU_READ_BIT_OR_FAIL;
+ film_grain_params.overlap_flag = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ film_grain_params.clip_to_restricted_range = static_cast<bool>(scratch);
+ return true;
+}
+
+bool ObuParser::ParseTileInfoSyntax() {
+ TileInfo* const tile_info = &frame_header_.tile_info;
+ const int sb_columns = sequence_header_.use_128x128_superblock
+ ? ((frame_header_.columns4x4 + 31) >> 5)
+ : ((frame_header_.columns4x4 + 15) >> 4);
+ const int sb_rows = sequence_header_.use_128x128_superblock
+ ? ((frame_header_.rows4x4 + 31) >> 5)
+ : ((frame_header_.rows4x4 + 15) >> 4);
+ tile_info->sb_columns = sb_columns;
+ tile_info->sb_rows = sb_rows;
+ const int sb_shift = sequence_header_.use_128x128_superblock ? 5 : 4;
+ const int sb_size = 2 + sb_shift;
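+  // sb_shift is the log2 of the superblock size in 4x4 units and sb_size is
+  // the log2 of the superblock size in pixels (6 or 7). kMaxTileWidth and
+  // kMaxTileArea are in pixels and are converted to superblock units below.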
+ const int sb_max_tile_width = kMaxTileWidth >> sb_size;
+ const int sb_max_tile_area = kMaxTileArea >> MultiplyBy2(sb_size);
+ const int minlog2_tile_columns = TileLog2(sb_max_tile_width, sb_columns);
+ const int maxlog2_tile_columns =
+ CeilLog2(std::min(sb_columns, static_cast<int>(kMaxTileColumns)));
+ const int maxlog2_tile_rows =
+ CeilLog2(std::min(sb_rows, static_cast<int>(kMaxTileRows)));
+ const int min_log2_tiles = std::max(
+ minlog2_tile_columns, TileLog2(sb_max_tile_area, sb_rows * sb_columns));
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ tile_info->uniform_spacing = static_cast<bool>(scratch);
+ if (tile_info->uniform_spacing) {
+ // Read tile columns.
+ tile_info->tile_columns_log2 = minlog2_tile_columns;
+ while (tile_info->tile_columns_log2 < maxlog2_tile_columns) {
+ OBU_READ_BIT_OR_FAIL;
+ if (scratch == 0) break;
+ ++tile_info->tile_columns_log2;
+ }
+
+ // Compute tile column starts.
+ const int sb_tile_width =
+ (sb_columns + (1 << tile_info->tile_columns_log2) - 1) >>
+ tile_info->tile_columns_log2;
+ if (sb_tile_width <= 0) return false;
+ int i = 0;
+ for (int sb_start = 0; sb_start < sb_columns; sb_start += sb_tile_width) {
+ if (i >= kMaxTileColumns) {
+ LIBGAV1_DLOG(ERROR,
+ "tile_columns would be greater than kMaxTileColumns.");
+ return false;
+ }
+ tile_info->tile_column_start[i++] = sb_start << sb_shift;
+ }
+ tile_info->tile_column_start[i] = frame_header_.columns4x4;
+ tile_info->tile_columns = i;
+
+ // Read tile rows.
+ const int minlog2_tile_rows =
+ std::max(min_log2_tiles - tile_info->tile_columns_log2, 0);
+ tile_info->tile_rows_log2 = minlog2_tile_rows;
+ while (tile_info->tile_rows_log2 < maxlog2_tile_rows) {
+ OBU_READ_BIT_OR_FAIL;
+ if (scratch == 0) break;
+ ++tile_info->tile_rows_log2;
+ }
+
+ // Compute tile row starts.
+ const int sb_tile_height =
+ (sb_rows + (1 << tile_info->tile_rows_log2) - 1) >>
+ tile_info->tile_rows_log2;
+ if (sb_tile_height <= 0) return false;
+ i = 0;
+ for (int sb_start = 0; sb_start < sb_rows; sb_start += sb_tile_height) {
+ if (i >= kMaxTileRows) {
+ LIBGAV1_DLOG(ERROR, "tile_rows would be greater than kMaxTileRows.");
+ return false;
+ }
+ tile_info->tile_row_start[i++] = sb_start << sb_shift;
+ }
+ tile_info->tile_row_start[i] = frame_header_.rows4x4;
+ tile_info->tile_rows = i;
+ } else {
+ int widest_tile_sb = 1;
+ int i = 0;
+ for (int sb_start = 0; sb_start < sb_columns; ++i) {
+ if (i >= kMaxTileColumns) {
+ LIBGAV1_DLOG(ERROR,
+ "tile_columns would be greater than kMaxTileColumns.");
+ return false;
+ }
+ tile_info->tile_column_start[i] = sb_start << sb_shift;
+ const int max_width =
+ std::min(sb_columns - sb_start, static_cast<int>(sb_max_tile_width));
+ if (!bit_reader_->DecodeUniform(
+ max_width, &tile_info->tile_column_width_in_superblocks[i])) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits.");
+ return false;
+ }
+ ++tile_info->tile_column_width_in_superblocks[i];
+ widest_tile_sb = std::max(tile_info->tile_column_width_in_superblocks[i],
+ widest_tile_sb);
+ sb_start += tile_info->tile_column_width_in_superblocks[i];
+ }
+ tile_info->tile_column_start[i] = frame_header_.columns4x4;
+ tile_info->tile_columns = i;
+ tile_info->tile_columns_log2 = CeilLog2(tile_info->tile_columns);
+
+ int max_tile_area_sb = sb_rows * sb_columns;
+ if (min_log2_tiles > 0) max_tile_area_sb >>= min_log2_tiles + 1;
+ const int max_tile_height_sb =
+ std::max(max_tile_area_sb / widest_tile_sb, 1);
+
+ i = 0;
+ for (int sb_start = 0; sb_start < sb_rows; ++i) {
+ if (i >= kMaxTileRows) {
+ LIBGAV1_DLOG(ERROR, "tile_rows would be greater than kMaxTileRows.");
+ return false;
+ }
+ tile_info->tile_row_start[i] = sb_start << sb_shift;
+ const int max_height = std::min(sb_rows - sb_start, max_tile_height_sb);
+ if (!bit_reader_->DecodeUniform(
+ max_height, &tile_info->tile_row_height_in_superblocks[i])) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits.");
+ return false;
+ }
+ ++tile_info->tile_row_height_in_superblocks[i];
+ sb_start += tile_info->tile_row_height_in_superblocks[i];
+ }
+ tile_info->tile_row_start[i] = frame_header_.rows4x4;
+ tile_info->tile_rows = i;
+ tile_info->tile_rows_log2 = CeilLog2(tile_info->tile_rows);
+ }
+ tile_info->tile_count = tile_info->tile_rows * tile_info->tile_columns;
+ if (!tile_buffers_.reserve(tile_info->tile_count)) {
+ LIBGAV1_DLOG(ERROR, "Unable to allocate memory for tile_buffers_.");
+ return false;
+ }
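+  // context_update_id corresponds to context_update_tile_id in the spec: the
+  // tile whose final CDF values are used when the frame CDFs are updated.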
+ tile_info->context_update_id = 0;
+ const int tile_bits =
+ tile_info->tile_columns_log2 + tile_info->tile_rows_log2;
+ if (tile_bits != 0) {
+ OBU_READ_LITERAL_OR_FAIL(tile_bits);
+ tile_info->context_update_id = static_cast<int16_t>(scratch);
+ if (tile_info->context_update_id >= tile_info->tile_count) {
+ LIBGAV1_DLOG(ERROR, "Invalid context_update_tile_id (%d) >= %d.",
+ tile_info->context_update_id, tile_info->tile_count);
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(2);
+ tile_info->tile_size_bytes = 1 + scratch;
+ }
+ return true;
+}
+
+bool ObuParser::ReadAllowWarpedMotion() {
+ if (IsIntraFrame(frame_header_.frame_type) ||
+ frame_header_.error_resilient_mode ||
+ !sequence_header_.enable_warped_motion) {
+ return true;
+ }
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.allow_warped_motion = static_cast<bool>(scratch);
+ return true;
+}
+
+bool ObuParser::ParseFrameParameters() {
+ int64_t scratch;
+ if (sequence_header_.reduced_still_picture_header) {
+ frame_header_.show_frame = true;
+ current_frame_ = buffer_pool_->GetFreeBuffer();
+ if (current_frame_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool.");
+ return false;
+ }
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.show_existing_frame = static_cast<bool>(scratch);
+ if (frame_header_.show_existing_frame) {
+ OBU_READ_LITERAL_OR_FAIL(3);
+ frame_header_.frame_to_show = scratch;
+ if (sequence_header_.decoder_model_info_present_flag &&
+ !sequence_header_.timing_info.equal_picture_interval) {
+ OBU_READ_LITERAL_OR_FAIL(
+ sequence_header_.decoder_model_info.frame_presentation_time_length);
+ frame_header_.frame_presentation_time = static_cast<uint32_t>(scratch);
+ }
+ if (sequence_header_.frame_id_numbers_present) {
+ OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_id_length_bits);
+ frame_header_.display_frame_id = static_cast<uint16_t>(scratch);
+ // Section 6.8.2: It is a requirement of bitstream conformance that
+ // whenever display_frame_id is read, the value matches
+ // RefFrameId[ frame_to_show_map_idx ] ..., and that
+ // RefValid[ frame_to_show_map_idx ] is equal to 1.
+ if (frame_header_.display_frame_id !=
+ decoder_state_
+ .reference_frame_id[frame_header_.frame_to_show] ||
+ !decoder_state_.reference_valid[frame_header_.frame_to_show]) {
+ LIBGAV1_DLOG(ERROR,
+ "Reference buffer %d has a frame id number mismatch.",
+ frame_header_.frame_to_show);
+ return false;
+ }
+ }
+ // Section 7.18.2. Note: This is also needed for Section 7.21 if
+ // frame_type is kFrameKey.
+ current_frame_ =
+ decoder_state_.reference_frame[frame_header_.frame_to_show];
+ if (current_frame_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Buffer %d does not contain a decoded frame",
+ frame_header_.frame_to_show);
+ return false;
+ }
+ // Section 6.8.2: It is a requirement of bitstream conformance that
+ // when show_existing_frame is used to show a previous frame, that the
+ // value of showable_frame for the previous frame was equal to 1.
+ if (!current_frame_->showable_frame()) {
+ LIBGAV1_DLOG(ERROR, "Buffer %d does not contain a showable frame",
+ frame_header_.frame_to_show);
+ return false;
+ }
+ if (current_frame_->frame_type() == kFrameKey) {
+ frame_header_.refresh_frame_flags = 0xff;
+ // Section 6.8.2: It is a requirement of bitstream conformance that
+ // when show_existing_frame is used to show a previous frame with
+ // RefFrameType[ frame_to_show_map_idx ] equal to KEY_FRAME, that
+ // the frame is output via the show_existing_frame mechanism at most
+ // once.
+ current_frame_->set_showable_frame(false);
+
+ // Section 7.21. Note: decoder_state_.current_frame_id must be set
+ // only when frame_type is kFrameKey per the spec. Among all the
+ // variables set in Section 7.21, current_frame_id is the only one
+ // whose value lives across frames. (PrevFrameID is set equal to the
+ // current_frame_id value for the previous frame.)
+ decoder_state_.current_frame_id =
+ decoder_state_.reference_frame_id[frame_header_.frame_to_show];
+ decoder_state_.order_hint =
+ decoder_state_.reference_order_hint[frame_header_.frame_to_show];
+ }
+ return true;
+ }
+ current_frame_ = buffer_pool_->GetFreeBuffer();
+ if (current_frame_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool.");
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(2);
+ frame_header_.frame_type = static_cast<FrameType>(scratch);
+ current_frame_->set_frame_type(frame_header_.frame_type);
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.show_frame = static_cast<bool>(scratch);
+ if (frame_header_.show_frame &&
+ sequence_header_.decoder_model_info_present_flag &&
+ !sequence_header_.timing_info.equal_picture_interval) {
+ OBU_READ_LITERAL_OR_FAIL(
+ sequence_header_.decoder_model_info.frame_presentation_time_length);
+ frame_header_.frame_presentation_time = static_cast<uint32_t>(scratch);
+ }
+ if (frame_header_.show_frame) {
+ frame_header_.showable_frame = (frame_header_.frame_type != kFrameKey);
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.showable_frame = static_cast<bool>(scratch);
+ }
+ current_frame_->set_showable_frame(frame_header_.showable_frame);
+ if (frame_header_.frame_type == kFrameSwitch ||
+ (frame_header_.frame_type == kFrameKey && frame_header_.show_frame)) {
+ frame_header_.error_resilient_mode = true;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.error_resilient_mode = static_cast<bool>(scratch);
+ }
+ }
+ if (frame_header_.frame_type == kFrameKey && frame_header_.show_frame) {
+ decoder_state_.reference_valid.fill(false);
+ decoder_state_.reference_order_hint.fill(0);
+ decoder_state_.reference_frame.fill(nullptr);
+ }
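+  // The coded syntax element is disable_cdf_update; enable_cdf_update stores
+  // its negation.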
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.enable_cdf_update = !static_cast<bool>(scratch);
+ if (sequence_header_.force_screen_content_tools ==
+ kSelectScreenContentTools) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.allow_screen_content_tools = static_cast<bool>(scratch);
+ } else {
+ frame_header_.allow_screen_content_tools =
+ static_cast<bool>(sequence_header_.force_screen_content_tools);
+ }
+ if (frame_header_.allow_screen_content_tools) {
+ if (sequence_header_.force_integer_mv == kSelectIntegerMv) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.force_integer_mv = scratch;
+ } else {
+ frame_header_.force_integer_mv = sequence_header_.force_integer_mv;
+ }
+ } else {
+ frame_header_.force_integer_mv = 0;
+ }
+ if (IsIntraFrame(frame_header_.frame_type)) {
+ frame_header_.force_integer_mv = 1;
+ }
+ if (sequence_header_.frame_id_numbers_present) {
+ OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_id_length_bits);
+ frame_header_.current_frame_id = static_cast<uint16_t>(scratch);
+ const int previous_frame_id = decoder_state_.current_frame_id;
+ decoder_state_.current_frame_id = frame_header_.current_frame_id;
+ if (frame_header_.frame_type != kFrameKey || !frame_header_.show_frame) {
+ if (previous_frame_id >= 0) {
+ // Section 6.8.2: ..., it is a requirement of bitstream conformance
+ // that all of the following conditions are true:
+ // * current_frame_id is not equal to PrevFrameID,
+ // * DiffFrameID is less than 1 << ( idLen - 1 )
+ int diff_frame_id = decoder_state_.current_frame_id - previous_frame_id;
+ const int id_length_max_value =
+ 1 << sequence_header_.frame_id_length_bits;
+ if (diff_frame_id <= 0) {
+ diff_frame_id += id_length_max_value;
+ }
+ if (diff_frame_id >= DivideBy2(id_length_max_value)) {
+ LIBGAV1_DLOG(ERROR,
+ "current_frame_id (%d) equals or differs too much from "
+ "previous_frame_id (%d).",
+ decoder_state_.current_frame_id, previous_frame_id);
+ return false;
+ }
+ }
+ MarkInvalidReferenceFrames();
+ }
+ } else {
+ frame_header_.current_frame_id = 0;
+ decoder_state_.current_frame_id = frame_header_.current_frame_id;
+ }
+ if (frame_header_.frame_type == kFrameSwitch) {
+ frame_header_.frame_size_override_flag = true;
+ } else if (!sequence_header_.reduced_still_picture_header) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.frame_size_override_flag = static_cast<bool>(scratch);
+ }
+ if (sequence_header_.order_hint_bits > 0) {
+ OBU_READ_LITERAL_OR_FAIL(sequence_header_.order_hint_bits);
+ frame_header_.order_hint = scratch;
+ }
+ decoder_state_.order_hint = frame_header_.order_hint;
+ if (IsIntraFrame(frame_header_.frame_type) ||
+ frame_header_.error_resilient_mode) {
+ frame_header_.primary_reference_frame = kPrimaryReferenceNone;
+ } else {
+ OBU_READ_LITERAL_OR_FAIL(3);
+ frame_header_.primary_reference_frame = scratch;
+ }
+ if (sequence_header_.decoder_model_info_present_flag) {
+ OBU_READ_BIT_OR_FAIL;
+ const auto buffer_removal_time_present = static_cast<bool>(scratch);
+ if (buffer_removal_time_present) {
+ for (int i = 0; i < sequence_header_.operating_points; ++i) {
+ if (!sequence_header_.decoder_model_present_for_operating_point[i]) {
+ continue;
+ }
+ const int index = sequence_header_.operating_point_idc[i];
+ if (index == 0 ||
+ (InTemporalLayer(index, obu_headers_.back().temporal_id) &&
+ InSpatialLayer(index, obu_headers_.back().spatial_id))) {
+ OBU_READ_LITERAL_OR_FAIL(
+ sequence_header_.decoder_model_info.buffer_removal_time_length);
+ frame_header_.buffer_removal_time[i] = static_cast<uint32_t>(scratch);
+ }
+ }
+ }
+ }
+ if (frame_header_.frame_type == kFrameSwitch ||
+ (frame_header_.frame_type == kFrameKey && frame_header_.show_frame)) {
+ frame_header_.refresh_frame_flags = 0xff;
+ } else {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ frame_header_.refresh_frame_flags = scratch;
+ // Section 6.8.2: If frame_type is equal to INTRA_ONLY_FRAME, it is a
+ // requirement of bitstream conformance that refresh_frame_flags is not
+ // equal to 0xff.
+ if (frame_header_.frame_type == kFrameIntraOnly &&
+ frame_header_.refresh_frame_flags == 0xff) {
+ LIBGAV1_DLOG(ERROR, "Intra only frames cannot have refresh flags 0xFF.");
+ return false;
+ }
+ }
+ if ((!IsIntraFrame(frame_header_.frame_type) ||
+ frame_header_.refresh_frame_flags != 0xff) &&
+ !ParseReferenceOrderHint()) {
+ return false;
+ }
+ if (IsIntraFrame(frame_header_.frame_type)) {
+ if (!ParseFrameSizeAndRenderSize()) return false;
+ if (frame_header_.allow_screen_content_tools &&
+ frame_header_.width == frame_header_.upscaled_width) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.allow_intrabc = static_cast<bool>(scratch);
+ }
+ } else {
+ if (!sequence_header_.enable_order_hint) {
+ frame_header_.frame_refs_short_signaling = false;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.frame_refs_short_signaling = static_cast<bool>(scratch);
+ if (frame_header_.frame_refs_short_signaling) {
+ OBU_READ_LITERAL_OR_FAIL(3);
+ const int8_t last_frame_idx = scratch;
+ OBU_READ_LITERAL_OR_FAIL(3);
+ const int8_t gold_frame_idx = scratch;
+ if (!SetFrameReferences(last_frame_idx, gold_frame_idx)) {
+ return false;
+ }
+ }
+ }
+ for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
+ if (!frame_header_.frame_refs_short_signaling) {
+ OBU_READ_LITERAL_OR_FAIL(3);
+ frame_header_.reference_frame_index[i] = scratch;
+ }
+ const int reference_frame_index = frame_header_.reference_frame_index[i];
+ assert(reference_frame_index >= 0);
+ // Section 6.8.2: It is a requirement of bitstream conformance that
+ // RefValid[ ref_frame_idx[ i ] ] is equal to 1 ...
+ // The remainder of the statement is handled by ParseSequenceHeader().
+      // Note: if support for Annex C (Error resilience behavior) is added,
+      // this check should be omitted per C.5, Decoder consequences of
+      // processable frames.
+ if (!decoder_state_.reference_valid[reference_frame_index]) {
+ LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not valid.", i,
+ reference_frame_index);
+ return false;
+ }
+ // Check if the inter frame requests a nonexistent reference, whether or
+ // not frame_refs_short_signaling is used.
+ if (decoder_state_.reference_frame[reference_frame_index] == nullptr) {
+ LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not a decoded frame.", i,
+ reference_frame_index);
+ return false;
+ }
+ if (sequence_header_.frame_id_numbers_present) {
+ OBU_READ_LITERAL_OR_FAIL(sequence_header_.delta_frame_id_length_bits);
+ const int delta_frame_id = static_cast<int>(1 + scratch);
+ const int id_length_max_value =
+ 1 << sequence_header_.frame_id_length_bits;
+ frame_header_.expected_frame_id[i] =
+ (frame_header_.current_frame_id + id_length_max_value -
+ delta_frame_id) %
+ id_length_max_value;
+ // Section 6.8.2: It is a requirement of bitstream conformance that
+ // whenever expectedFrameId[ i ] is calculated, the value matches
+ // RefFrameId[ ref_frame_idx[ i ] ] ...
+ //
+ // Section 6.8.2: It is a requirement of bitstream conformance that
+ // RefValid[ ref_frame_idx[ i ] ] is equal to 1, ...
+ if (frame_header_.expected_frame_id[i] !=
+ decoder_state_.reference_frame_id[reference_frame_index] ||
+ !decoder_state_.reference_valid[reference_frame_index]) {
+ LIBGAV1_DLOG(ERROR,
+ "Reference buffer %d has a frame id number mismatch.",
+ reference_frame_index);
+ return false;
+ }
+ }
+ }
+ if (frame_header_.frame_size_override_flag &&
+ !frame_header_.error_resilient_mode) {
+ // Section 5.9.7.
+ for (int index : frame_header_.reference_frame_index) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.found_reference = static_cast<bool>(scratch);
+ if (frame_header_.found_reference) {
+ const RefCountedBuffer* reference_frame =
+ decoder_state_.reference_frame[index].get();
+ // frame_header_.upscaled_width will be set in the
+ // ParseSuperResParametersAndComputeImageSize() call below.
+ frame_header_.width = reference_frame->upscaled_width();
+ frame_header_.height = reference_frame->frame_height();
+ frame_header_.render_width = reference_frame->render_width();
+ frame_header_.render_height = reference_frame->render_height();
+ if (!ParseSuperResParametersAndComputeImageSize()) return false;
+ break;
+ }
+ }
+ if (!frame_header_.found_reference && !ParseFrameSizeAndRenderSize()) {
+ return false;
+ }
+ } else {
+ if (!ParseFrameSizeAndRenderSize()) return false;
+ }
+ if (!ValidateInterFrameSize()) return false;
+ if (frame_header_.force_integer_mv != 0) {
+ frame_header_.allow_high_precision_mv = false;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.allow_high_precision_mv = static_cast<bool>(scratch);
+ }
+ OBU_READ_BIT_OR_FAIL;
+ const auto is_filter_switchable = static_cast<bool>(scratch);
+ if (is_filter_switchable) {
+ frame_header_.interpolation_filter = kInterpolationFilterSwitchable;
+ } else {
+ OBU_READ_LITERAL_OR_FAIL(2);
+ frame_header_.interpolation_filter =
+ static_cast<InterpolationFilter>(scratch);
+ }
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.is_motion_mode_switchable = static_cast<bool>(scratch);
+ if (frame_header_.error_resilient_mode ||
+ !sequence_header_.enable_ref_frame_mvs) {
+ frame_header_.use_ref_frame_mvs = false;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.use_ref_frame_mvs = static_cast<bool>(scratch);
+ }
+ }
+ // At this point, we have parsed the frame and render sizes and computed
+ // the image size, whether it's an intra or inter frame. So we can save
+ // the sizes in the current frame now.
+ if (!current_frame_->SetFrameDimensions(frame_header_)) {
+ LIBGAV1_DLOG(ERROR, "Setting current frame dimensions failed.");
+ return false;
+ }
+ if (!IsIntraFrame(frame_header_.frame_type)) {
+ // Initialize the kReferenceFrameIntra type reference frame information to
+ // simplify the frame type validation in motion field projection.
+ // Set the kReferenceFrameIntra type |order_hint_| to
+    // |frame_header_.order_hint|. This guarantees that, in SIMD
+    // implementations, the other reference frame information of the
+    // kReferenceFrameIntra type can be correctly initialized by the following
+    // loop, with |frame_header_.order_hint| serving as the |hint|.
+ ReferenceInfo* const reference_info = current_frame_->reference_info();
+ reference_info->order_hint[kReferenceFrameIntra] = frame_header_.order_hint;
+ reference_info->relative_distance_from[kReferenceFrameIntra] = 0;
+ reference_info->relative_distance_to[kReferenceFrameIntra] = 0;
+ reference_info->skip_references[kReferenceFrameIntra] = true;
+ reference_info->projection_divisions[kReferenceFrameIntra] = 0;
+
+ for (int i = kReferenceFrameLast; i <= kNumInterReferenceFrameTypes; ++i) {
+ const auto reference_frame = static_cast<ReferenceFrameType>(i);
+ const uint8_t hint =
+ decoder_state_.reference_order_hint
+ [frame_header_.reference_frame_index[i - kReferenceFrameLast]];
+ reference_info->order_hint[reference_frame] = hint;
+ const int relative_distance_from =
+ GetRelativeDistance(hint, frame_header_.order_hint,
+ sequence_header_.order_hint_shift_bits);
+ const int relative_distance_to =
+ GetRelativeDistance(frame_header_.order_hint, hint,
+ sequence_header_.order_hint_shift_bits);
+ reference_info->relative_distance_from[reference_frame] =
+ relative_distance_from;
+ reference_info->relative_distance_to[reference_frame] =
+ relative_distance_to;
+ reference_info->skip_references[reference_frame] =
+ relative_distance_to > kMaxFrameDistance || relative_distance_to <= 0;
+ reference_info->projection_divisions[reference_frame] =
+ reference_info->skip_references[reference_frame]
+ ? 0
+ : kProjectionMvDivisionLookup[relative_distance_to];
+ decoder_state_.reference_frame_sign_bias[reference_frame] =
+ relative_distance_from > 0;
+ }
+ }
+ if (frame_header_.enable_cdf_update &&
+ !sequence_header_.reduced_still_picture_header) {
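+    // The coded syntax element is disable_frame_end_update_cdf; store its
+    // negation.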
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.enable_frame_end_update_cdf = !static_cast<bool>(scratch);
+ } else {
+ frame_header_.enable_frame_end_update_cdf = false;
+ }
+ return true;
+}
+
+bool ObuParser::ParseFrameHeader() {
+ // Section 6.8.1: It is a requirement of bitstream conformance that a
+ // sequence header OBU has been received before a frame header OBU.
+ if (!has_sequence_header_) return false;
+ if (!ParseFrameParameters()) return false;
+ if (frame_header_.show_existing_frame) return true;
+ assert(!obu_headers_.empty());
+ current_frame_->set_spatial_id(obu_headers_.back().spatial_id);
+ current_frame_->set_temporal_id(obu_headers_.back().temporal_id);
+ bool status = ParseTileInfoSyntax() && ParseQuantizerParameters() &&
+ ParseSegmentationParameters();
+ if (!status) return false;
+ current_frame_->SetSegmentationParameters(frame_header_.segmentation);
+ status =
+ ParseQuantizerIndexDeltaParameters() && ParseLoopFilterDeltaParameters();
+ if (!status) return false;
+ ComputeSegmentLosslessAndQIndex();
+ // Section 6.8.2: It is a requirement of bitstream conformance that
+ // delta_q_present is equal to 0 when CodedLossless is equal to 1.
+ if (frame_header_.coded_lossless && frame_header_.delta_q.present) {
+ return false;
+ }
+ status = ParseLoopFilterParameters();
+ if (!status) return false;
+ current_frame_->SetLoopFilterDeltas(frame_header_.loop_filter);
+ status = ParseCdefParameters() && ParseLoopRestorationParameters() &&
+ ParseTxModeSyntax() && ParseFrameReferenceModeSyntax() &&
+ ParseSkipModeParameters() && ReadAllowWarpedMotion();
+ if (!status) return false;
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.reduced_tx_set = static_cast<bool>(scratch);
+ status = ParseGlobalMotionParameters();
+ if (!status) return false;
+ current_frame_->SetGlobalMotions(frame_header_.global_motion);
+ status = ParseFilmGrainParameters();
+ if (!status) return false;
+ if (sequence_header_.film_grain_params_present) {
+ current_frame_->set_film_grain_params(frame_header_.film_grain_params);
+ }
+ return true;
+}
+
+bool ObuParser::ParsePadding(const uint8_t* data, size_t size) {
+ // The spec allows a padding OBU to be header-only (i.e., |size| = 0). So
+ // check trailing bits only if |size| > 0.
+ if (size == 0) return true;
+ // The payload of a padding OBU is byte aligned. Therefore the first
+ // trailing byte should be 0x80. See https://crbug.com/aomedia/2393.
+ const int i = GetLastNonzeroByteIndex(data, size);
+ if (i < 0) {
+ LIBGAV1_DLOG(ERROR, "Trailing bit is missing.");
+ return false;
+ }
+ if (data[i] != 0x80) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "The last nonzero byte of the payload data is 0x%x, should be 0x80.",
+ data[i]);
+ return false;
+ }
+ // Skip all bits before the trailing bit.
+ bit_reader_->SkipBytes(i);
+ return true;
+}
+
+bool ObuParser::ParseMetadataScalability() {
+ int64_t scratch;
+ // scalability_mode_idc
+ OBU_READ_LITERAL_OR_FAIL(8);
+ const auto scalability_mode_idc = static_cast<int>(scratch);
+ if (scalability_mode_idc == kScalabilitySS) {
+ // Parse scalability_structure().
+ // spatial_layers_cnt_minus_1
+ OBU_READ_LITERAL_OR_FAIL(2);
+ const auto spatial_layers_count = static_cast<int>(scratch) + 1;
+ // spatial_layer_dimensions_present_flag
+ OBU_READ_BIT_OR_FAIL;
+ const auto spatial_layer_dimensions_present_flag =
+ static_cast<bool>(scratch);
+ // spatial_layer_description_present_flag
+ OBU_READ_BIT_OR_FAIL;
+ const auto spatial_layer_description_present_flag =
+ static_cast<bool>(scratch);
+ // temporal_group_description_present_flag
+ OBU_READ_BIT_OR_FAIL;
+ const auto temporal_group_description_present_flag =
+ static_cast<bool>(scratch);
+ // scalability_structure_reserved_3bits
+ OBU_READ_LITERAL_OR_FAIL(3);
+ if (scratch != 0) {
+ LIBGAV1_DLOG(WARNING,
+ "scalability_structure_reserved_3bits is not zero.");
+ }
+ if (spatial_layer_dimensions_present_flag) {
+ for (int i = 0; i < spatial_layers_count; ++i) {
+ // spatial_layer_max_width[i]
+ OBU_READ_LITERAL_OR_FAIL(16);
+ // spatial_layer_max_height[i]
+ OBU_READ_LITERAL_OR_FAIL(16);
+ }
+ }
+ if (spatial_layer_description_present_flag) {
+ for (int i = 0; i < spatial_layers_count; ++i) {
+ // spatial_layer_ref_id[i]
+ OBU_READ_LITERAL_OR_FAIL(8);
+ }
+ }
+ if (temporal_group_description_present_flag) {
+ // temporal_group_size
+ OBU_READ_LITERAL_OR_FAIL(8);
+ const auto temporal_group_size = static_cast<int>(scratch);
+ for (int i = 0; i < temporal_group_size; ++i) {
+ // temporal_group_temporal_id[i]
+ OBU_READ_LITERAL_OR_FAIL(3);
+ // temporal_group_temporal_switching_up_point_flag[i]
+ OBU_READ_BIT_OR_FAIL;
+ // temporal_group_spatial_switching_up_point_flag[i]
+ OBU_READ_BIT_OR_FAIL;
+ // temporal_group_ref_cnt[i]
+ OBU_READ_LITERAL_OR_FAIL(3);
+ const auto temporal_group_ref_count = static_cast<int>(scratch);
+ for (int j = 0; j < temporal_group_ref_count; ++j) {
+ // temporal_group_ref_pic_diff[i][j]
+ OBU_READ_LITERAL_OR_FAIL(8);
+ }
+ }
+ }
+ }
+ return true;
+}
+
+bool ObuParser::ParseMetadataTimecode() {
+ int64_t scratch;
+ // counting_type: should be the same for all pictures in the coded video
+ // sequence. 7..31 are reserved.
+ OBU_READ_LITERAL_OR_FAIL(5);
+ // full_timestamp_flag
+ OBU_READ_BIT_OR_FAIL;
+ const auto full_timestamp_flag = static_cast<bool>(scratch);
+ // discontinuity_flag
+ OBU_READ_BIT_OR_FAIL;
+ // cnt_dropped_flag
+ OBU_READ_BIT_OR_FAIL;
+ // n_frames
+ OBU_READ_LITERAL_OR_FAIL(9);
+ if (full_timestamp_flag) {
+ // seconds_value
+ OBU_READ_LITERAL_OR_FAIL(6);
+ const auto seconds_value = static_cast<int>(scratch);
+ if (seconds_value > 59) {
+ LIBGAV1_DLOG(ERROR, "Invalid seconds_value %d.", seconds_value);
+ return false;
+ }
+ // minutes_value
+ OBU_READ_LITERAL_OR_FAIL(6);
+ const auto minutes_value = static_cast<int>(scratch);
+ if (minutes_value > 59) {
+ LIBGAV1_DLOG(ERROR, "Invalid minutes_value %d.", minutes_value);
+ return false;
+ }
+ // hours_value
+ OBU_READ_LITERAL_OR_FAIL(5);
+ const auto hours_value = static_cast<int>(scratch);
+ if (hours_value > 23) {
+ LIBGAV1_DLOG(ERROR, "Invalid hours_value %d.", hours_value);
+ return false;
+ }
+ } else {
+ // seconds_flag
+ OBU_READ_BIT_OR_FAIL;
+ const auto seconds_flag = static_cast<bool>(scratch);
+ if (seconds_flag) {
+ // seconds_value
+ OBU_READ_LITERAL_OR_FAIL(6);
+ const auto seconds_value = static_cast<int>(scratch);
+ if (seconds_value > 59) {
+ LIBGAV1_DLOG(ERROR, "Invalid seconds_value %d.", seconds_value);
+ return false;
+ }
+ // minutes_flag
+ OBU_READ_BIT_OR_FAIL;
+ const auto minutes_flag = static_cast<bool>(scratch);
+ if (minutes_flag) {
+ // minutes_value
+ OBU_READ_LITERAL_OR_FAIL(6);
+ const auto minutes_value = static_cast<int>(scratch);
+ if (minutes_value > 59) {
+ LIBGAV1_DLOG(ERROR, "Invalid minutes_value %d.", minutes_value);
+ return false;
+ }
+ // hours_flag
+ OBU_READ_BIT_OR_FAIL;
+ const auto hours_flag = static_cast<bool>(scratch);
+ if (hours_flag) {
+ // hours_value
+ OBU_READ_LITERAL_OR_FAIL(5);
+ const auto hours_value = static_cast<int>(scratch);
+ if (hours_value > 23) {
+ LIBGAV1_DLOG(ERROR, "Invalid hours_value %d.", hours_value);
+ return false;
+ }
+ }
+ }
+ }
+ }
+ // time_offset_length: should be the same for all pictures in the coded
+ // video sequence.
+ OBU_READ_LITERAL_OR_FAIL(5);
+ const auto time_offset_length = static_cast<int>(scratch);
+ if (time_offset_length > 0) {
+ // time_offset_value
+ OBU_READ_LITERAL_OR_FAIL(time_offset_length);
+ }
+ // Compute clockTimestamp. Section 6.7.7:
+ // When timing_info_present_flag is equal to 1 and discontinuity_flag is
+ // equal to 0, the value of clockTimestamp shall be greater than or equal
+ // to the value of clockTimestamp for the previous set of clock timestamp
+ // syntax elements in output order.
+ return true;
+}
+
+bool ObuParser::ParseMetadata(const uint8_t* data, size_t size) {
+ const size_t start_offset = bit_reader_->byte_offset();
+ size_t metadata_type;
+ if (!bit_reader_->ReadUnsignedLeb128(&metadata_type)) {
+ LIBGAV1_DLOG(ERROR, "Could not read metadata_type.");
+ return false;
+ }
+ const size_t metadata_type_size = bit_reader_->byte_offset() - start_offset;
+ if (size < metadata_type_size) {
+    LIBGAV1_DLOG(
+        ERROR,
+        "metadata_type is longer than the metadata OBU payload (%zu vs %zu).",
+        metadata_type_size, size);
+ return false;
+ }
+ data += metadata_type_size;
+ size -= metadata_type_size;
+ int64_t scratch;
+ switch (metadata_type) {
+ case kMetadataTypeHdrContentLightLevel:
+ OBU_READ_LITERAL_OR_FAIL(16);
+ metadata_.max_cll = scratch;
+ OBU_READ_LITERAL_OR_FAIL(16);
+ metadata_.max_fall = scratch;
+ break;
+ case kMetadataTypeHdrMasteringDisplayColorVolume:
+ for (int i = 0; i < 3; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(16);
+ metadata_.primary_chromaticity_x[i] = scratch;
+ OBU_READ_LITERAL_OR_FAIL(16);
+ metadata_.primary_chromaticity_y[i] = scratch;
+ }
+ OBU_READ_LITERAL_OR_FAIL(16);
+ metadata_.white_point_chromaticity_x = scratch;
+ OBU_READ_LITERAL_OR_FAIL(16);
+ metadata_.white_point_chromaticity_y = scratch;
+ OBU_READ_LITERAL_OR_FAIL(32);
+ metadata_.luminance_max = static_cast<uint32_t>(scratch);
+ OBU_READ_LITERAL_OR_FAIL(32);
+ metadata_.luminance_min = static_cast<uint32_t>(scratch);
+ break;
+ case kMetadataTypeScalability:
+ if (!ParseMetadataScalability()) return false;
+ break;
+ case kMetadataTypeItutT35: {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ metadata_.itu_t_t35_country_code = static_cast<uint8_t>(scratch);
+ ++data;
+ --size;
+ if (metadata_.itu_t_t35_country_code == 0xFF) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ metadata_.itu_t_t35_country_code_extension_byte =
+ static_cast<uint8_t>(scratch);
+ ++data;
+ --size;
+ }
+ // Read itu_t_t35_payload_bytes. Section 6.7.2 of the spec says:
+ // itu_t_t35_payload_bytes shall be bytes containing data registered as
+ // specified in Recommendation ITU-T T.35.
+ // Therefore itu_t_t35_payload_bytes is byte aligned and the first
+ // trailing byte should be 0x80. Since the exact syntax of
+ // itu_t_t35_payload_bytes is not defined in the AV1 spec, identify the
+ // end of itu_t_t35_payload_bytes by searching for the trailing bit.
+ const int i = GetLastNonzeroByteIndex(data, size);
+ if (i < 0) {
+ LIBGAV1_DLOG(ERROR, "Trailing bit is missing.");
+ return false;
+ }
+ if (data[i] != 0x80) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "itu_t_t35_payload_bytes is not byte aligned. The last nonzero "
+ "byte of the payload data is 0x%x, should be 0x80.",
+ data[i]);
+ return false;
+ }
+ if (i != 0) {
+ // data[0]..data[i - 1] are itu_t_t35_payload_bytes.
+ metadata_.itu_t_t35_payload_bytes.reset(new (std::nothrow) uint8_t[i]);
+ if (metadata_.itu_t_t35_payload_bytes == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Allocation of itu_t_t35_payload_bytes failed.");
+ return false;
+ }
+ memcpy(metadata_.itu_t_t35_payload_bytes.get(), data, i);
+ metadata_.itu_t_t35_payload_size = i;
+ }
+ // Skip all bits before the trailing bit.
+ bit_reader_->SkipBytes(i);
+ break;
+ }
+ case kMetadataTypeTimecode:
+ if (!ParseMetadataTimecode()) return false;
+ break;
+ default: {
+ // metadata_type is equal to a value reserved for future use or a user
+ // private value.
+ //
+ // The Note in Section 5.8.1 says "Decoders should ignore the entire OBU
+ // if they do not understand the metadata_type." Find the trailing bit
+ // and skip all bits before the trailing bit.
+ const int i = GetLastNonzeroByteIndex(data, size);
+ if (i >= 0) {
+ // The last 1 bit in the last nonzero byte is the trailing bit. Skip
+ // all bits before the trailing bit.
+ const int n = CountTrailingZeros(data[i]);
+ bit_reader_->SkipBits(i * 8 + 7 - n);
+ }
+ break;
+ }
+ }
+ return true;
+}
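+// For illustration, the trailing-bit search used above can be sketched as a
+// helper that scans backwards for the last nonzero byte (a minimal sketch of
+// an assumed helper; the actual definition may differ):
+//
+//   int GetLastNonzeroByteIndex(const uint8_t* data, size_t size) {
+//     int i = static_cast<int>(size) - 1;
+//     while (i >= 0 && data[i] == 0) --i;
+//     return i;  // -1 means the trailing bit is missing.
+//   }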
+
+bool ObuParser::AddTileBuffers(int start, int end, size_t total_size,
+ size_t tg_header_size,
+ size_t bytes_consumed_so_far) {
+ // Validate that the tile group start and end are within the allowed range.
+ if (start != next_tile_group_start_ || start > end ||
+ end >= frame_header_.tile_info.tile_count) {
+ LIBGAV1_DLOG(ERROR,
+ "Invalid tile group start %d or end %d: expected tile group "
+ "start %d, tile_count %d.",
+ start, end, next_tile_group_start_,
+ frame_header_.tile_info.tile_count);
+ return false;
+ }
+ next_tile_group_start_ = end + 1;
+
+ if (total_size < tg_header_size) {
+    LIBGAV1_DLOG(ERROR, "total_size (%zu) is less than tg_header_size (%zu).",
+ total_size, tg_header_size);
+ return false;
+ }
+ size_t bytes_left = total_size - tg_header_size;
+ const uint8_t* data = data_ + bytes_consumed_so_far + tg_header_size;
+ for (int tile_number = start; tile_number <= end; ++tile_number) {
+ size_t tile_size = 0;
+ if (tile_number != end) {
+ RawBitReader bit_reader(data, bytes_left);
+ if (!bit_reader.ReadLittleEndian(frame_header_.tile_info.tile_size_bytes,
+ &tile_size)) {
+ LIBGAV1_DLOG(ERROR, "Could not read tile size for tile #%d",
+ tile_number);
+ return false;
+ }
+ ++tile_size;
+ data += frame_header_.tile_info.tile_size_bytes;
+ bytes_left -= frame_header_.tile_info.tile_size_bytes;
+ if (tile_size > bytes_left) {
+ LIBGAV1_DLOG(ERROR, "Invalid tile size %zu for tile #%d", tile_size,
+ tile_number);
+ return false;
+ }
+ } else {
+ tile_size = bytes_left;
+ if (tile_size == 0) {
+ LIBGAV1_DLOG(ERROR, "Invalid tile size %zu for tile #%d", tile_size,
+ tile_number);
+ return false;
+ }
+ }
+ // The memory for this has been allocated in ParseTileInfoSyntax(). So it is
+ // safe to use push_back_unchecked here.
+ tile_buffers_.push_back_unchecked({data, tile_size});
+ data += tile_size;
+ bytes_left -= tile_size;
+ }
+ bit_reader_->SkipBytes(total_size - tg_header_size);
+ return true;
+}
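+// Illustration of the tile size field handled above (hypothetical numbers):
+// with tile_info.tile_size_bytes == 2 and the next two payload bytes being
+// 0x0A 0x00, ReadLittleEndian() yields 10, so after the "+ 1" the tile data
+// occupies the 11 bytes that immediately follow the size field. The last tile
+// in the tile group carries no size field and consumes all remaining bytes.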
+
+bool ObuParser::ParseTileGroup(size_t size, size_t bytes_consumed_so_far) {
+ const TileInfo* const tile_info = &frame_header_.tile_info;
+ const size_t start_offset = bit_reader_->byte_offset();
+ const int tile_bits =
+ tile_info->tile_columns_log2 + tile_info->tile_rows_log2;
+ if (tile_bits == 0) {
+ return AddTileBuffers(0, 0, size, 0, bytes_consumed_so_far);
+ }
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ const auto tile_start_and_end_present_flag = static_cast<bool>(scratch);
+ if (!tile_start_and_end_present_flag) {
+ if (!bit_reader_->AlignToNextByte()) {
+ LIBGAV1_DLOG(ERROR, "Byte alignment has non zero bits.");
+ return false;
+ }
+ return AddTileBuffers(0, tile_info->tile_count - 1, size, 1,
+ bytes_consumed_so_far);
+ }
+ if (obu_headers_.back().type == kObuFrame) {
+ // 6.10.1: If obu_type is equal to OBU_FRAME, it is a requirement of
+ // bitstream conformance that the value of tile_start_and_end_present_flag
+ // is equal to 0.
+ LIBGAV1_DLOG(ERROR,
+ "tile_start_and_end_present_flag must be 0 in Frame OBU");
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(tile_bits);
+ const int start = static_cast<int>(scratch);
+ OBU_READ_LITERAL_OR_FAIL(tile_bits);
+ const int end = static_cast<int>(scratch);
+ if (!bit_reader_->AlignToNextByte()) {
+ LIBGAV1_DLOG(ERROR, "Byte alignment has non zero bits.");
+ return false;
+ }
+ const size_t tg_header_size = bit_reader_->byte_offset() - start_offset;
+ return AddTileBuffers(start, end, size, tg_header_size,
+ bytes_consumed_so_far);
+}
+
+bool ObuParser::ParseHeader() {
+ ObuHeader obu_header;
+ int64_t scratch = bit_reader_->ReadBit();
+ if (scratch != 0) {
+ LIBGAV1_DLOG(ERROR, "forbidden_bit is not zero.");
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(4);
+ obu_header.type = static_cast<libgav1::ObuType>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ const auto extension_flag = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ obu_header.has_size_field = static_cast<bool>(scratch);
+ OBU_READ_BIT_OR_FAIL; // reserved.
+ if (scratch != 0) {
+ LIBGAV1_DLOG(WARNING, "obu_reserved_1bit is not zero.");
+ }
+ obu_header.has_extension = extension_flag;
+ if (extension_flag) {
+ if (extension_disallowed_) {
+ LIBGAV1_DLOG(ERROR,
+ "OperatingPointIdc is 0, but obu_extension_flag is 1.");
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(3);
+ obu_header.temporal_id = scratch;
+ OBU_READ_LITERAL_OR_FAIL(2);
+ obu_header.spatial_id = scratch;
+ OBU_READ_LITERAL_OR_FAIL(3); // reserved.
+ if (scratch != 0) {
+ LIBGAV1_DLOG(WARNING, "extension_header_reserved_3bits is not zero.");
+ }
+ } else {
+ obu_header.temporal_id = 0;
+ obu_header.spatial_id = 0;
+ }
+ return obu_headers_.push_back(obu_header);
+}
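+// For reference, the fixed-size layout parsed above is:
+//   obu_header:           forbidden_bit (1) | obu_type (4) |
+//                         obu_extension_flag (1) | obu_has_size_field (1) |
+//                         obu_reserved_1bit (1)
+//   obu_extension_header: temporal_id (3) | spatial_id (2) |
+//                         extension_header_reserved_3bits (3)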
+
+#undef OBU_READ_UVLC_OR_FAIL
+#undef OBU_READ_LITERAL_OR_FAIL
+#undef OBU_READ_BIT_OR_FAIL
+#undef OBU_PARSER_FAIL
+#undef OBU_LOG_AND_RETURN_FALSE
+
+bool ObuParser::InitBitReader(const uint8_t* const data, size_t size) {
+ bit_reader_.reset(new (std::nothrow) RawBitReader(data, size));
+ return bit_reader_ != nullptr;
+}
+
+bool ObuParser::HasData() const { return size_ > 0; }
+
+StatusCode ObuParser::ParseOneFrame(RefCountedBufferPtr* const current_frame) {
+ if (data_ == nullptr || size_ == 0) return kStatusInvalidArgument;
+
+ assert(current_frame_ == nullptr);
+ // This is used to release any references held in case of parsing failure.
+ RefCountedBufferPtrCleanup current_frame_cleanup(&current_frame_);
+
+ const uint8_t* data = data_;
+ size_t size = size_;
+
+ // Clear everything except the sequence header.
+ obu_headers_.clear();
+ frame_header_ = {};
+ metadata_ = {};
+ tile_buffers_.clear();
+ next_tile_group_start_ = 0;
+
+ bool parsed_one_full_frame = false;
+ bool seen_frame_header = false;
+ const uint8_t* frame_header = nullptr;
+ size_t frame_header_size_in_bits = 0;
+ while (size > 0 && !parsed_one_full_frame) {
+ if (!InitBitReader(data, size)) {
+ LIBGAV1_DLOG(ERROR, "Failed to initialize bit reader.");
+ return kStatusOutOfMemory;
+ }
+ if (!ParseHeader()) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse OBU Header.");
+ return kStatusBitstreamError;
+ }
+ const ObuHeader& obu_header = obu_headers_.back();
+ if (!obu_header.has_size_field) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "has_size_field is zero. libgav1 does not support such streams.");
+ return kStatusUnimplemented;
+ }
+ const size_t obu_header_size = bit_reader_->byte_offset();
+ size_t obu_size;
+ if (!bit_reader_->ReadUnsignedLeb128(&obu_size)) {
+ LIBGAV1_DLOG(ERROR, "Could not read OBU size.");
+ return kStatusBitstreamError;
+ }
+ const size_t obu_length_size = bit_reader_->byte_offset() - obu_header_size;
+ if (size - bit_reader_->byte_offset() < obu_size) {
+      LIBGAV1_DLOG(ERROR, "Not enough bytes left to parse OBU %zu vs %zu.",
+                   size - bit_reader_->byte_offset(), obu_size);
+ return kStatusBitstreamError;
+ }
+
+ const ObuType obu_type = obu_header.type;
+ if (obu_type != kObuSequenceHeader && obu_type != kObuTemporalDelimiter &&
+ has_sequence_header_ &&
+ sequence_header_.operating_point_idc[operating_point_] != 0 &&
+ obu_header.has_extension &&
+ (!InTemporalLayer(
+ sequence_header_.operating_point_idc[operating_point_],
+ obu_header.temporal_id) ||
+ !InSpatialLayer(sequence_header_.operating_point_idc[operating_point_],
+ obu_header.spatial_id))) {
+ obu_headers_.pop_back();
+ bit_reader_->SkipBytes(obu_size);
+ data += bit_reader_->byte_offset();
+ size -= bit_reader_->byte_offset();
+ continue;
+ }
+
+ const size_t obu_start_position = bit_reader_->bit_offset();
+ // The bit_reader_ is byte aligned after reading obu_header and obu_size.
+ // Therefore the byte offset can be computed as obu_start_position >> 3
+ // below.
+ assert((obu_start_position & 7) == 0);
+ bool obu_skipped = false;
+ switch (obu_type) {
+ case kObuTemporalDelimiter:
+ break;
+ case kObuSequenceHeader:
+ if (!ParseSequenceHeader(seen_frame_header)) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse SequenceHeader OBU.");
+ return kStatusBitstreamError;
+ }
+ if (sequence_header_.color_config.bitdepth > LIBGAV1_MAX_BITDEPTH) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Bitdepth %d is not supported. The maximum bitdepth is %d.",
+ sequence_header_.color_config.bitdepth, LIBGAV1_MAX_BITDEPTH);
+ return kStatusUnimplemented;
+ }
+ break;
+ case kObuFrameHeader:
+ if (seen_frame_header) {
+ LIBGAV1_DLOG(ERROR,
+ "Frame header found but frame header was already seen.");
+ return kStatusBitstreamError;
+ }
+ if (!ParseFrameHeader()) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse FrameHeader OBU.");
+ return kStatusBitstreamError;
+ }
+ frame_header = &data[obu_start_position >> 3];
+ frame_header_size_in_bits =
+ bit_reader_->bit_offset() - obu_start_position;
+ seen_frame_header = true;
+ parsed_one_full_frame = frame_header_.show_existing_frame;
+ break;
+ case kObuRedundantFrameHeader: {
+ if (!seen_frame_header) {
+ LIBGAV1_DLOG(ERROR,
+ "Redundant frame header found but frame header was not "
+ "yet seen.");
+ return kStatusBitstreamError;
+ }
+ const size_t fh_size = (frame_header_size_in_bits + 7) >> 3;
+ if (obu_size < fh_size ||
+ memcmp(frame_header, &data[obu_start_position >> 3], fh_size) !=
+ 0) {
+ LIBGAV1_DLOG(ERROR,
+ "Redundant frame header differs from frame header.");
+ return kStatusBitstreamError;
+ }
+ bit_reader_->SkipBits(frame_header_size_in_bits);
+ break;
+ }
+ case kObuFrame: {
+ const size_t fh_start_offset = bit_reader_->byte_offset();
+ if (seen_frame_header) {
+ LIBGAV1_DLOG(ERROR,
+ "Frame header found but frame header was already seen.");
+ return kStatusBitstreamError;
+ }
+ if (!ParseFrameHeader()) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse FrameHeader in Frame OBU.");
+ return kStatusBitstreamError;
+ }
+ // Section 6.8.2: If obu_type is equal to OBU_FRAME, it is a
+ // requirement of bitstream conformance that show_existing_frame is
+ // equal to 0.
+ if (frame_header_.show_existing_frame) {
+ LIBGAV1_DLOG(ERROR, "Frame OBU cannot set show_existing_frame to 1.");
+ return kStatusBitstreamError;
+ }
+ if (!bit_reader_->AlignToNextByte()) {
+ LIBGAV1_DLOG(ERROR, "Byte alignment has non zero bits.");
+ return kStatusBitstreamError;
+ }
+ const size_t fh_size = bit_reader_->byte_offset() - fh_start_offset;
+ if (fh_size >= obu_size) {
+ LIBGAV1_DLOG(ERROR, "Frame header size (%zu) >= obu_size (%zu).",
+ fh_size, obu_size);
+ return kStatusBitstreamError;
+ }
+ if (!ParseTileGroup(obu_size - fh_size,
+ size_ - size + bit_reader_->byte_offset())) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse TileGroup in Frame OBU.");
+ return kStatusBitstreamError;
+ }
+ parsed_one_full_frame = true;
+ break;
+ }
+ case kObuTileGroup:
+ if (!ParseTileGroup(obu_size,
+ size_ - size + bit_reader_->byte_offset())) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse TileGroup OBU.");
+ return kStatusBitstreamError;
+ }
+ parsed_one_full_frame =
+ (next_tile_group_start_ == frame_header_.tile_info.tile_count);
+ break;
+ case kObuTileList:
+ LIBGAV1_DLOG(ERROR, "Decoding of tile list OBUs is not supported.");
+ return kStatusUnimplemented;
+ case kObuPadding:
+ if (!ParsePadding(&data[obu_start_position >> 3], obu_size)) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse Padding OBU.");
+ return kStatusBitstreamError;
+ }
+ break;
+ case kObuMetadata:
+ if (!ParseMetadata(&data[obu_start_position >> 3], obu_size)) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse Metadata OBU.");
+ return kStatusBitstreamError;
+ }
+ break;
+ default:
+ // Skip reserved OBUs. Section 6.2.2: Reserved units are for future use
+ // and shall be ignored by AV1 decoder.
+ bit_reader_->SkipBytes(obu_size);
+ obu_skipped = true;
+ break;
+ }
+ if (obu_size > 0 && !obu_skipped && obu_type != kObuFrame &&
+ obu_type != kObuTileGroup) {
+ const size_t parsed_obu_size_in_bits =
+ bit_reader_->bit_offset() - obu_start_position;
+ if (obu_size * 8 < parsed_obu_size_in_bits) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Parsed OBU size (%zu bits) is greater than expected OBU size "
+            "(%zu bytes) for obu_type: %d.",
+ parsed_obu_size_in_bits, obu_size, obu_type);
+ return kStatusBitstreamError;
+ }
+ if (!bit_reader_->VerifyAndSkipTrailingBits(obu_size * 8 -
+ parsed_obu_size_in_bits)) {
+ LIBGAV1_DLOG(ERROR,
+ "Error when verifying trailing bits for obu type: %d",
+ obu_type);
+ return kStatusBitstreamError;
+ }
+ }
+ const size_t bytes_consumed = bit_reader_->byte_offset();
+ const size_t consumed_obu_size =
+ bytes_consumed - obu_length_size - obu_header_size;
+ if (consumed_obu_size != obu_size) {
+ LIBGAV1_DLOG(ERROR,
+                   "OBU size (%zu) and consumed size (%zu) do not match for "
+ "obu_type: %d.",
+ obu_size, consumed_obu_size, obu_type);
+ return kStatusBitstreamError;
+ }
+ data += bytes_consumed;
+ size -= bytes_consumed;
+ }
+ if (!parsed_one_full_frame && seen_frame_header) {
+ LIBGAV1_DLOG(ERROR, "The last tile group in the frame was not received.");
+ return kStatusBitstreamError;
+ }
+ data_ = data;
+ size_ = size;
+ *current_frame = std::move(current_frame_);
+ return kStatusOk;
+}
+
+} // namespace libgav1
diff --git a/src/obu_parser.h b/src/obu_parser.h
new file mode 100644
index 0000000..86d165f
--- /dev/null
+++ b/src/obu_parser.h
@@ -0,0 +1,406 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_OBU_PARSER_H_
+#define LIBGAV1_SRC_OBU_PARSER_H_
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "src/buffer_pool.h"
+#include "src/decoder_state.h"
+#include "src/dsp/common.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/status_code.h"
+#include "src/quantizer.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/raw_bit_reader.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/vector.h"
+
+namespace libgav1 {
+
+// structs and enums related to Open Bitstream Units (OBU).
+
+enum {
+ kMinimumMajorBitstreamLevel = 2,
+ kSelectScreenContentTools = 2,
+ kSelectIntegerMv = 2,
+ kLoopRestorationTileSizeMax = 256,
+ kGlobalMotionAlphaBits = 12,
+ kGlobalMotionTranslationBits = 12,
+ kGlobalMotionTranslationOnlyBits = 9,
+ kGlobalMotionAlphaPrecisionBits = 15,
+ kGlobalMotionTranslationPrecisionBits = 6,
+ kGlobalMotionTranslationOnlyPrecisionBits = 3,
+ kMaxTileWidth = 4096,
+ kMaxTileArea = 4096 * 2304,
+ kPrimaryReferenceNone = 7,
+ // A special value of the scalability_mode_idc syntax element that indicates
+ // the picture prediction structure is specified in scalability_structure().
+ kScalabilitySS = 14
+}; // anonymous enum
+
+struct ObuHeader {
+ ObuType type;
+ bool has_extension;
+ bool has_size_field;
+ int8_t temporal_id;
+ int8_t spatial_id;
+};
+
+enum BitstreamProfile : uint8_t {
+ kProfile0,
+ kProfile1,
+ kProfile2,
+ kMaxProfiles
+};
+
+// In the bitstream the level is encoded in five bits: the first three bits
+// encode |major| - 2 and the last two bits encode |minor|.
+//
+// If the mapped level (major.minor) is in the tables in Annex A.3, there are
+// bitstream conformance requirements on the maximum or minimum values of
+// several variables. The encoded value of 31 (which corresponds to the mapped
+// level 9.3) is the "maximum parameters" level and imposes no level-based
+// constraints on the bitstream.
+struct BitStreamLevel {
+ uint8_t major; // Range: 2-9.
+ uint8_t minor; // Range: 0-3.
+};
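+// A minimal sketch of the mapping described above (illustrative only; the
+// hypothetical helper name is not part of the parser):
+//
+//   BitStreamLevel LevelFromSeqLevelIdx(uint8_t seq_level_idx /* 5 bits */) {
+//     BitStreamLevel level;
+//     level.major = (seq_level_idx >> 2) + 2;  // Upper 3 bits store major - 2.
+//     level.minor = seq_level_idx & 3;         // Lower 2 bits store minor.
+//     return level;
+//   }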
+
+struct ColorConfig {
+ int8_t bitdepth;
+ bool is_monochrome;
+ ColorPrimary color_primary;
+ TransferCharacteristics transfer_characteristics;
+ MatrixCoefficients matrix_coefficients;
+ // A binary value (0 or 1) that is associated with the VideoFullRangeFlag
+  // variable specified in ISO/IEC 23091-4/ITU-T H.273.
+ // * 0: the studio swing representation.
+ // * 1: the full swing representation.
+ ColorRange color_range;
+ int8_t subsampling_x;
+ int8_t subsampling_y;
+ ChromaSamplePosition chroma_sample_position;
+ bool separate_uv_delta_q;
+};
+
+struct TimingInfo {
+ uint32_t num_units_in_tick;
+ uint32_t time_scale;
+ bool equal_picture_interval;
+ uint32_t num_ticks_per_picture;
+};
+
+struct DecoderModelInfo {
+ uint8_t encoder_decoder_buffer_delay_length;
+ uint32_t num_units_in_decoding_tick;
+ uint8_t buffer_removal_time_length;
+ uint8_t frame_presentation_time_length;
+};
+
+struct OperatingParameters {
+ uint32_t decoder_buffer_delay[kMaxOperatingPoints];
+ uint32_t encoder_buffer_delay[kMaxOperatingPoints];
+ bool low_delay_mode_flag[kMaxOperatingPoints];
+};
+
+struct ObuSequenceHeader {
+ // Section 7.5:
+ // Within a particular coded video sequence, the contents of
+ // sequence_header_obu must be bit-identical each time the sequence header
+ // appears except for the contents of operating_parameters_info. A new
+ // coded video sequence is required if the sequence header parameters
+ // change.
+ //
+ // IMPORTANT: ParametersChanged() is implemented with a memcmp() call. For
+ // this to work, this object and the |old| object must be initialized with
+ // an empty brace-enclosed list, which initializes any padding to zero bits.
+ // See https://en.cppreference.com/w/cpp/language/zero_initialization.
+ bool ParametersChanged(const ObuSequenceHeader& old) const;
+
+ BitstreamProfile profile;
+ bool still_picture;
+ bool reduced_still_picture_header;
+ int operating_points;
+ int operating_point_idc[kMaxOperatingPoints];
+ BitStreamLevel level[kMaxOperatingPoints];
+ int8_t tier[kMaxOperatingPoints];
+ int8_t frame_width_bits;
+ int8_t frame_height_bits;
+ int32_t max_frame_width;
+ int32_t max_frame_height;
+ bool frame_id_numbers_present;
+ int8_t frame_id_length_bits;
+ int8_t delta_frame_id_length_bits;
+ bool use_128x128_superblock;
+ bool enable_filter_intra;
+ bool enable_intra_edge_filter;
+ bool enable_interintra_compound;
+ bool enable_masked_compound;
+ bool enable_warped_motion;
+ bool enable_dual_filter;
+ bool enable_order_hint;
+ // If enable_order_hint is true, order_hint_bits is in the range [1, 8].
+ // If enable_order_hint is false, order_hint_bits is 0.
+ int8_t order_hint_bits;
+ // order_hint_shift_bits equals (32 - order_hint_bits) % 32.
+ // This is used frequently in GetRelativeDistance().
+ uint8_t order_hint_shift_bits;
+ bool enable_jnt_comp;
+ bool enable_ref_frame_mvs;
+ bool choose_screen_content_tools;
+ int8_t force_screen_content_tools;
+ bool choose_integer_mv;
+ int8_t force_integer_mv;
+ bool enable_superres;
+ bool enable_cdef;
+ bool enable_restoration;
+ ColorConfig color_config;
+ bool timing_info_present_flag;
+ TimingInfo timing_info;
+ bool decoder_model_info_present_flag;
+ DecoderModelInfo decoder_model_info;
+ bool decoder_model_present_for_operating_point[kMaxOperatingPoints];
+ bool initial_display_delay_present_flag;
+ uint8_t initial_display_delay[kMaxOperatingPoints];
+ bool film_grain_params_present;
+
+ // IMPORTANT: the operating_parameters member must be at the end of the
+ // struct so that ParametersChanged() can be implemented with a memcmp()
+ // call.
+ OperatingParameters operating_parameters;
+};
+// Verify it is safe to use offsetof with ObuSequenceHeader and to use memcmp
+// to compare two ObuSequenceHeader objects.
+static_assert(std::is_standard_layout<ObuSequenceHeader>::value, "");
+// Verify operating_parameters is the last member of ObuSequenceHeader. The
+// second assertion assumes that ObuSequenceHeader has no padding after the
+// operating_parameters field. The first assertion is a sufficient condition
+// for ObuSequenceHeader to have no padding after the operating_parameters
+// field.
+static_assert(alignof(ObuSequenceHeader) == alignof(OperatingParameters), "");
+static_assert(sizeof(ObuSequenceHeader) ==
+ offsetof(ObuSequenceHeader, operating_parameters) +
+ sizeof(OperatingParameters),
+ "");
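+// A sketch of how the layout verified above allows ParametersChanged() to be
+// implemented with a single memcmp() (illustrative; shown only to motivate
+// the static_asserts):
+//
+//   bool ObuSequenceHeader::ParametersChanged(
+//       const ObuSequenceHeader& old) const {
+//     return memcmp(this, &old,
+//                   offsetof(ObuSequenceHeader, operating_parameters)) != 0;
+//   }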
+
+struct TileBuffer {
+ const uint8_t* data;
+ size_t size;
+};
+
+enum MetadataType : uint8_t {
+ // 0 is reserved for AOM use.
+ kMetadataTypeHdrContentLightLevel = 1,
+ kMetadataTypeHdrMasteringDisplayColorVolume = 2,
+ kMetadataTypeScalability = 3,
+ kMetadataTypeItutT35 = 4,
+ kMetadataTypeTimecode = 5,
+ // 6-31 are unregistered user private.
+ // 32 and greater are reserved for AOM use.
+};
+
+struct ObuMetadata {
+ // Maximum content light level.
+ uint16_t max_cll;
+ // Maximum frame-average light level.
+ uint16_t max_fall;
+ uint16_t primary_chromaticity_x[3];
+ uint16_t primary_chromaticity_y[3];
+ uint16_t white_point_chromaticity_x;
+ uint16_t white_point_chromaticity_y;
+ uint32_t luminance_max;
+ uint32_t luminance_min;
+ // ITU-T T.35.
+ uint8_t itu_t_t35_country_code;
+ uint8_t itu_t_t35_country_code_extension_byte; // Valid if
+ // itu_t_t35_country_code is
+ // 0xFF.
+ std::unique_ptr<uint8_t[]> itu_t_t35_payload_bytes;
+ size_t itu_t_t35_payload_size;
+};
+
+class ObuParser : public Allocable {
+ public:
+ ObuParser(const uint8_t* const data, size_t size, int operating_point,
+ BufferPool* const buffer_pool, DecoderState* const decoder_state)
+ : data_(data),
+ size_(size),
+ operating_point_(operating_point),
+ buffer_pool_(buffer_pool),
+ decoder_state_(*decoder_state) {}
+
+ // Not copyable or movable.
+ ObuParser(const ObuParser& rhs) = delete;
+ ObuParser& operator=(const ObuParser& rhs) = delete;
+
+ // Returns true if there is more data that needs to be parsed.
+ bool HasData() const;
+
+ // Parses a sequence of Open Bitstream Units until a decodable frame is found
+ // (or until the end of stream is reached). A decodable frame is considered to
+ // be found when one of the following happens:
+ // * A kObuFrame is seen.
+ // * The kObuTileGroup containing the last tile is seen.
+  //   * A kObuFrameHeader with show_existing_frame = true is seen.
+ //
+ // If the parsing is successful, relevant fields will be populated. The fields
+ // are valid only if the return value is kStatusOk. Returns kStatusOk on
+ // success, an error status otherwise. On success, |current_frame| will be
+ // populated with a valid frame buffer.
+ StatusCode ParseOneFrame(RefCountedBufferPtr* current_frame);
+
+ // Getters. Only valid if ParseOneFrame() completes successfully.
+ const Vector<ObuHeader>& obu_headers() const { return obu_headers_; }
+ const ObuSequenceHeader& sequence_header() const { return sequence_header_; }
+ const ObuFrameHeader& frame_header() const { return frame_header_; }
+ const Vector<TileBuffer>& tile_buffers() const { return tile_buffers_; }
+ const ObuMetadata& metadata() const { return metadata_; }
+
+ // Setters.
+ void set_sequence_header(const ObuSequenceHeader& sequence_header) {
+ sequence_header_ = sequence_header;
+ has_sequence_header_ = true;
+ }
+
+ // Moves |tile_buffers_| into |tile_buffers|.
+ void MoveTileBuffers(Vector<TileBuffer>* tile_buffers) {
+ *tile_buffers = std::move(tile_buffers_);
+ }
+
+ private:
+ // Initializes the bit reader. This is a function of its own to make unit
+ // testing of private functions simpler.
+ LIBGAV1_MUST_USE_RESULT bool InitBitReader(const uint8_t* data, size_t size);
+
+ // Parse helper functions.
+ bool ParseHeader(); // 5.3.2 and 5.3.3.
+ bool ParseColorConfig(ObuSequenceHeader* sequence_header); // 5.5.2.
+ bool ParseTimingInfo(ObuSequenceHeader* sequence_header); // 5.5.3.
+ bool ParseDecoderModelInfo(ObuSequenceHeader* sequence_header); // 5.5.4.
+ bool ParseOperatingParameters(ObuSequenceHeader* sequence_header,
+ int index); // 5.5.5.
+ bool ParseSequenceHeader(bool seen_frame_header); // 5.5.1.
+ bool ParseFrameParameters(); // 5.9.2, 5.9.7 and 5.9.10.
+ void MarkInvalidReferenceFrames(); // 5.9.4.
+ bool ParseFrameSizeAndRenderSize(); // 5.9.5 and 5.9.6.
+ bool ParseSuperResParametersAndComputeImageSize(); // 5.9.8 and 5.9.9.
+ // Checks the bitstream conformance requirement in Section 6.8.6.
+ bool ValidateInterFrameSize() const;
+ bool ParseReferenceOrderHint();
+ static int FindLatestBackwardReference(
+ const int current_frame_hint,
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+ const std::array<bool, kNumReferenceFrameTypes>& used_frame);
+ static int FindEarliestBackwardReference(
+ const int current_frame_hint,
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+ const std::array<bool, kNumReferenceFrameTypes>& used_frame);
+ static int FindLatestForwardReference(
+ const int current_frame_hint,
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+ const std::array<bool, kNumReferenceFrameTypes>& used_frame);
+ static int FindReferenceWithSmallestOutputOrder(
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints);
+ bool SetFrameReferences(int8_t last_frame_idx,
+ int8_t gold_frame_idx); // 7.8.
+ bool ParseLoopFilterParameters(); // 5.9.11.
+ bool ParseDeltaQuantizer(int8_t* delta); // 5.9.13.
+ bool ParseQuantizerParameters(); // 5.9.12.
+ bool ParseSegmentationParameters(); // 5.9.14.
+ bool ParseQuantizerIndexDeltaParameters(); // 5.9.17.
+ bool ParseLoopFilterDeltaParameters(); // 5.9.18.
+ void ComputeSegmentLosslessAndQIndex();
+ bool ParseCdefParameters(); // 5.9.19.
+ bool ParseLoopRestorationParameters(); // 5.9.20.
+ bool ParseTxModeSyntax(); // 5.9.21.
+ bool ParseFrameReferenceModeSyntax(); // 5.9.23.
+ // Returns whether skip mode is allowed. When it returns true, it also sets
+ // the frame_header_.skip_mode_frame array.
+ bool IsSkipModeAllowed();
+ bool ParseSkipModeParameters(); // 5.9.22.
+ bool ReadAllowWarpedMotion();
+ bool ParseGlobalParamSyntax(
+ int ref, int index,
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>&
+ prev_global_motions); // 5.9.25.
+ bool ParseGlobalMotionParameters(); // 5.9.24.
+ bool ParseFilmGrainParameters(); // 5.9.30.
+ bool ParseTileInfoSyntax(); // 5.9.15.
+ bool ParseFrameHeader(); // 5.9.
+ // |data| and |size| specify the payload data of the padding OBU.
+ // NOTE: Although the payload data is available in the bit_reader_ member,
+ // it is also passed to ParsePadding() as function parameters so that
+ // ParsePadding() can find the trailing bit of the OBU and skip over the
+ // payload data as an opaque chunk of data.
+ bool ParsePadding(const uint8_t* data, size_t size); // 5.7.
+ bool ParseMetadataScalability(); // 5.8.5 and 5.8.6.
+ bool ParseMetadataTimecode(); // 5.8.7.
+ // |data| and |size| specify the payload data of the metadata OBU.
+ // NOTE: Although the payload data is available in the bit_reader_ member,
+ // it is also passed to ParseMetadata() as function parameters so that
+ // ParseMetadata() can find the trailing bit of the OBU and either extract
+ // or skip over the payload data as an opaque chunk of data.
+ bool ParseMetadata(const uint8_t* data, size_t size); // 5.8.
+ // Adds and populates the TileBuffer for each tile in the tile group and
+  // updates |next_tile_group_start_|.
+ bool AddTileBuffers(int start, int end, size_t total_size,
+ size_t tg_header_size, size_t bytes_consumed_so_far);
+ bool ParseTileGroup(size_t size, size_t bytes_consumed_so_far); // 5.11.1.
+
+ // Parser elements.
+ std::unique_ptr<RawBitReader> bit_reader_;
+ const uint8_t* data_;
+ size_t size_;
+ const int operating_point_;
+
+ // OBU elements. Only valid if ParseOneFrame() completes successfully.
+ Vector<ObuHeader> obu_headers_;
+ ObuSequenceHeader sequence_header_ = {};
+ ObuFrameHeader frame_header_ = {};
+ Vector<TileBuffer> tile_buffers_;
+ ObuMetadata metadata_ = {};
+ // The expected starting tile number of the next Tile Group.
+ int next_tile_group_start_ = 0;
+ // If true, the sequence_header_ field is valid.
+ bool has_sequence_header_ = false;
+ // If true, the obu_extension_flag syntax element in the OBU header must be
+ // 0. Set to true when parsing a sequence header if OperatingPointIdc is 0.
+ bool extension_disallowed_ = false;
+
+ BufferPool* const buffer_pool_;
+ DecoderState& decoder_state_;
+ // Used by ParseOneFrame() to populate the current frame that is being
+ // decoded. The invariant maintained is that this variable will be nullptr at
+ // the beginning and at the end of each call to ParseOneFrame(). This ensures
+ // that the ObuParser is not holding on to any references to the current
+ // frame once the ParseOneFrame() call is complete.
+ RefCountedBufferPtr current_frame_;
+
+ // For unit testing private functions.
+ friend class ObuParserTest;
+};
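+// A usage sketch (illustrative; assumes |buffer_pool| and |decoder_state|
+// were created by the caller):
+//
+//   ObuParser parser(data, size, /*operating_point=*/0, &buffer_pool,
+//                    &decoder_state);
+//   while (parser.HasData()) {
+//     RefCountedBufferPtr current_frame;
+//     const StatusCode status = parser.ParseOneFrame(&current_frame);
+//     if (status != kStatusOk) break;
+//     // parser.frame_header(), parser.tile_buffers() etc. are now valid.
+//   }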
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_OBU_PARSER_H_
diff --git a/src/post_filter.h b/src/post_filter.h
new file mode 100644
index 0000000..800d51d
--- /dev/null
+++ b/src/post_filter.h
@@ -0,0 +1,565 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_POST_FILTER_H_
+#define LIBGAV1_SRC_POST_FILTER_H_
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/loop_restoration_info.h"
+#include "src/obu_parser.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/threadpool.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+// This class applies in-loop filtering for each frame after it is
+// reconstructed. The in-loop filtering contains all post processing filtering
+// for the reconstructed frame, including deblock filter, CDEF, superres,
+// and loop restoration.
+// Historically, for example in libaom, loop filter refers to deblock filter.
+// To avoid name conflicts, we call this class PostFilter (post processing).
+// In-loop post filtering order is:
+// deblock --> CDEF --> super resolution --> loop restoration.
+// When CDEF and super resolution are not used, deblocking and restoration can
+// be combined so that the frame buffer is only filtered once.
+class PostFilter {
+ public:
+ // This class does not take ownership of the masks/restoration_info, but it
+ // may change their values.
+ //
+ // The overall flow of data in this class (for both single and multi-threaded
+ // cases) is as follows:
+ // -> Input: |frame_buffer_|.
+ // -> Initialize |source_buffer_|, |cdef_buffer_|, |superres_buffer_| and
+ // |loop_restoration_buffer_|.
+ // -> Deblocking:
+ // * Input: |source_buffer_|
+ // * Output: |source_buffer_|
+ // -> CDEF:
+ // * Input: |source_buffer_|
+ // * Output: |cdef_buffer_|
+ // -> SuperRes:
+ // * Input: |cdef_buffer_|
+ // * Output: |superres_buffer_|
+ // -> Loop Restoration:
+ // * Input: |superres_buffer_|
+ // * Output: |loop_restoration_buffer_|.
+ // -> Now |frame_buffer_| contains the filtered frame.
+ PostFilter(const ObuFrameHeader& frame_header,
+ const ObuSequenceHeader& sequence_header,
+ FrameScratchBuffer* frame_scratch_buffer, YuvBuffer* frame_buffer,
+ const dsp::Dsp* dsp, int do_post_filter_mask);
+
+  // Not copyable or movable.
+ PostFilter(const PostFilter&) = delete;
+ PostFilter& operator=(const PostFilter&) = delete;
+ PostFilter(PostFilter&&) = delete;
+ PostFilter& operator=(PostFilter&&) = delete;
+
+ // The overall function that applies all post processing filtering with
+ // multiple threads.
+ // * The filtering order is:
+  //   deblock --> CDEF --> super resolution --> loop restoration.
+ // * The output of each filter is the input for the following filter. A
+ // special case is that loop restoration needs a few rows of the deblocked
+ // frame and the entire cdef filtered frame:
+ // deblock --> CDEF --> super resolution --> loop restoration.
+ // | ^
+ // | |
+ // -----------> super resolution -----
+ // * Any of these filters could be present or absent.
+ // * |frame_buffer_| points to the decoded frame buffer. When
+ // ApplyFilteringThreaded() is called, |frame_buffer_| is modified by each
+ // of the filters as described below.
+ // Filter behavior (multi-threaded):
+ // * Deblock: In-place filtering. The output is written to |source_buffer_|.
+ // If cdef and loop restoration are both on, then 4 rows (as
+ // specified by |kLoopRestorationBorderRows|) in every 64x64 block
+ // is copied into |loop_restoration_border_|.
+  //            are copied into |loop_restoration_border_|.
+ // the input and the output is written into |cdef_buffer_| (which is
+ // the same as |source_buffer_|).
+ // * SuperRes: Near in-place filtering. Uses the |cdef_buffer_| and
+ // |superres_line_buffer_| as the input and the output is written
+ // into |superres_buffer_| (which is just |cdef_buffer_| with a
+ // shift to the top).
+ // * Restoration: Near in-place filtering.
+ // Uses the |superres_buffer_| and |loop_restoration_border_|
+ // as the input and the output is written into
+ // |loop_restoration_buffer_| (which is just |superres_buffer_|
+ // with a shift to the left).
+ void ApplyFilteringThreaded();
+
+ // Does the overall post processing filter for one superblock row starting at
+ // |row4x4| with height 4*|sb4x4|. If |do_deblock| is false, deblocking filter
+ // will not be applied.
+ //
+ // Filter behavior (single-threaded):
+ // * Deblock: In-place filtering. The output is written to |source_buffer_|.
+ // If cdef and loop restoration are both on, then 4 rows (as
+ // specified by |kLoopRestorationBorderRows|) in every 64x64 block
+  //            are copied into |loop_restoration_border_|.
+ // * Cdef: In-place filtering. The output is written into |cdef_buffer_|
+ // (which is just |source_buffer_| with a shift to the top-left).
+ // * SuperRes: Near in-place filtering. Uses the |cdef_buffer_| as the input
+ // and the output is written into |superres_buffer_| (which is
+ // just |cdef_buffer_| with a shift to the top).
+ // * Restoration: Near in-place filtering.
+ // Uses the |superres_buffer_| and |loop_restoration_border_|
+ // as the input and the output is written into
+ // |loop_restoration_buffer_| (which is just |superres_buffer_|
+ // with a shift to the left or top-left).
+ // Returns the index of the last row whose post processing is complete and can
+ // be used for referencing.
+ int ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4, bool is_last_row,
+ bool do_deblock);
+
+ // Apply deblocking filter in one direction (specified by |loop_filter_type|)
+ // for the superblock row starting at |row4x4_start| for columns starting from
+ // |column4x4_start| in increments of 16 (or 8 for chroma with subsampling)
+ // until the smallest multiple of 16 that is >= |column4x4_end| or until
+ // |frame_header_.columns4x4|, whichever is lower. This function must be
+ // called only if |DoDeblock()| returns true.
+ void ApplyDeblockFilter(LoopFilterType loop_filter_type, int row4x4_start,
+ int column4x4_start, int column4x4_end, int sb4x4);
+
+ static bool DoCdef(const ObuFrameHeader& frame_header,
+ int do_post_filter_mask) {
+ return (frame_header.cdef.bits > 0 ||
+ frame_header.cdef.y_primary_strength[0] > 0 ||
+ frame_header.cdef.y_secondary_strength[0] > 0 ||
+ frame_header.cdef.uv_primary_strength[0] > 0 ||
+ frame_header.cdef.uv_secondary_strength[0] > 0) &&
+ (do_post_filter_mask & 0x02) != 0;
+ }
+ bool DoCdef() const { return DoCdef(frame_header_, do_post_filter_mask_); }
+  // If the filter levels for the Y plane (0 for vertical, 1 for horizontal)
+  // are all zero, the deblock filter will not be applied.
+ static bool DoDeblock(const ObuFrameHeader& frame_header,
+ uint8_t do_post_filter_mask) {
+ return (frame_header.loop_filter.level[0] > 0 ||
+ frame_header.loop_filter.level[1] > 0) &&
+ (do_post_filter_mask & 0x01) != 0;
+ }
+ bool DoDeblock() const {
+ return DoDeblock(frame_header_, do_post_filter_mask_);
+ }
+
+ uint8_t GetZeroDeltaDeblockFilterLevel(int segment_id, int level_index,
+ ReferenceFrameType type,
+ int mode_id) const {
+ return deblock_filter_levels_[segment_id][level_index][type][mode_id];
+ }
+ // Computes the deblock filter levels using |delta_lf| and stores them in
+ // |deblock_filter_levels|.
+ void ComputeDeblockFilterLevels(
+ const int8_t delta_lf[kFrameLfCount],
+ uint8_t deblock_filter_levels[kMaxSegments][kFrameLfCount]
+ [kNumReferenceFrameTypes][2]) const;
+ // Returns true if loop restoration will be performed for the given parameters
+ // and mask.
+ static bool DoRestoration(const LoopRestoration& loop_restoration,
+ uint8_t do_post_filter_mask, int num_planes) {
+ if (num_planes == kMaxPlanesMonochrome) {
+ return loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone &&
+ (do_post_filter_mask & 0x08) != 0;
+ }
+ return (loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone ||
+ loop_restoration.type[kPlaneU] != kLoopRestorationTypeNone ||
+ loop_restoration.type[kPlaneV] != kLoopRestorationTypeNone) &&
+ (do_post_filter_mask & 0x08) != 0;
+ }
+ bool DoRestoration() const {
+ return DoRestoration(loop_restoration_, do_post_filter_mask_, planes_);
+ }
+
+ // Returns a pointer to the unfiltered buffer. This is used by the Tile class
+ // to determine where to write the output of the tile decoding process taking
+ // in-place filtering offsets into consideration.
+ uint8_t* GetUnfilteredBuffer(int plane) { return source_buffer_[plane]; }
+ const YuvBuffer& frame_buffer() const { return frame_buffer_; }
+
+ // Returns true if SuperRes will be performed for the given frame header and
+ // mask.
+ static bool DoSuperRes(const ObuFrameHeader& frame_header,
+ uint8_t do_post_filter_mask) {
+ return frame_header.width != frame_header.upscaled_width &&
+ (do_post_filter_mask & 0x04) != 0;
+ }
+ bool DoSuperRes() const {
+ return DoSuperRes(frame_header_, do_post_filter_mask_);
+ }
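+  // To summarize the checks above, |do_post_filter_mask_| gates the filters
+  // with one bit each: 0x01 deblocking, 0x02 CDEF, 0x04 SuperRes and
+  // 0x08 loop restoration.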
+ LoopRestorationInfo* restoration_info() const { return restoration_info_; }
+ uint8_t* GetBufferOffset(uint8_t* base_buffer, int stride, Plane plane,
+ int row, int column) const {
+ return base_buffer + (row >> subsampling_y_[plane]) * stride +
+ ((column >> subsampling_x_[plane]) << pixel_size_log2_);
+ }
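+  // Example of the offset arithmetic above (hypothetical values): for a
+  // 10-bit chroma plane (pixel_size_log2_ == 1, subsampling 1 in both
+  // directions), row 8 and column 16 map to
+  // base_buffer + (8 >> 1) * stride + ((16 >> 1) << 1) bytes, i.e. 4 chroma
+  // rows down and 16 bytes (8 pixels) across.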
+ uint8_t* GetSourceBuffer(Plane plane, int row4x4, int column4x4) const {
+ return GetBufferOffset(source_buffer_[plane], frame_buffer_.stride(plane),
+ plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
+ }
+ uint8_t* GetCdefBuffer(Plane plane, int row4x4, int column4x4) const {
+ return GetBufferOffset(cdef_buffer_[plane], frame_buffer_.stride(plane),
+ plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
+ }
+ uint8_t* GetSuperResBuffer(Plane plane, int row4x4, int column4x4) const {
+ return GetBufferOffset(superres_buffer_[plane], frame_buffer_.stride(plane),
+ plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
+ }
+
+ template <typename Pixel>
+ static void ExtendFrame(Pixel* frame_start, int width, int height,
+ ptrdiff_t stride, int left, int right, int top,
+ int bottom);
+
+ private:
+ // The type of the HorizontalDeblockFilter and VerticalDeblockFilter member
+ // functions.
+ using DeblockFilter = void (PostFilter::*)(int row4x4_start,
+ int column4x4_start);
+ // The lookup table for picking the deblock filter, according to deblock
+ // filter type.
+ const DeblockFilter deblock_filter_func_[2] = {
+ &PostFilter::VerticalDeblockFilter, &PostFilter::HorizontalDeblockFilter};
+
+ // Functions common to all post filters.
+
+  // Extends the frame by setting each border pixel to the value of the
+  // closest pixel on the frame boundary.
+ void ExtendFrameBoundary(uint8_t* frame_start, int width, int height,
+ ptrdiff_t stride, int left, int right, int top,
+ int bottom) const;
+ // Extend frame boundary for referencing if the frame will be saved as a
+ // reference frame.
+ void ExtendBordersForReferenceFrame();
+ // Copies the deblocked pixels needed for loop restoration.
+ void CopyDeblockedPixels(Plane plane, int row4x4);
+ // Copies the border for one superblock row. If |for_loop_restoration| is
+ // true, then it assumes that the border extension is being performed for the
+ // input of the loop restoration process. If |for_loop_restoration| is false,
+ // then it assumes that the border extension is being performed for using the
+ // current frame as a reference frame. In this case, |progress_row_| is also
+ // updated.
+ void CopyBordersForOneSuperBlockRow(int row4x4, int sb4x4,
+ bool for_loop_restoration);
+ // Sets up the |loop_restoration_border_| for loop restoration.
+ // TODO(linfengz): Unify duplicates in the following two functions if
+ // possible.
+ // This is called when there is no CDEF filter. We copy rows from
+ // |superres_buffer_| and do the line extension.
+ void SetupLoopRestorationBorder(int row4x4_start);
+ // This is called when there is CDEF filter. We copy rows from
+ // |source_buffer_|, apply superres and do the line extension.
+ void SetupLoopRestorationBorder(int row4x4_start, int sb4x4);
+  // Returns true if we can perform border extension in loop (i.e., without
+  // waiting until the entire frame is decoded). If intra block copy is
+  // allowed, we do in-loop border extension only if the upscaled_width is the
+  // same as 4 * columns4x4. Otherwise, we cannot do in-loop border extension
+  // since those pixels may be used by intra block copy.
+ bool DoBorderExtensionInLoop() const {
+ return !frame_header_.allow_intrabc ||
+ frame_header_.upscaled_width ==
+ MultiplyBy4(frame_header_.columns4x4);
+ }
+ template <typename Pixel>
+ void CopyPlane(const Pixel* src, ptrdiff_t src_stride, int width, int height,
+ Pixel* dst, ptrdiff_t dst_stride) {
+ assert(height > 0);
+ do {
+ memcpy(dst, src, width * sizeof(Pixel));
+ src += src_stride;
+ dst += dst_stride;
+ } while (--height != 0);
+ }
+
+ // Worker function used for multi-threaded implementation of Deblocking, CDEF
+ // and Loop Restoration.
+ using WorkerFunction = void (PostFilter::*)(std::atomic<int>* row4x4_atomic);
+ // Schedules |worker| jobs to the |thread_pool_|, runs them in the calling
+ // thread and returns once all the jobs are completed.
+ void RunJobs(WorkerFunction worker);
+
+ // Functions for the Deblocking filter.
+
+ static int GetIndex(int row4x4) { return DivideBy4(row4x4); }
+ static int GetShift(int row4x4, int column4x4) {
+ return ((row4x4 & 3) << 4) | column4x4;
+ }
+ int GetDeblockUnitId(int row_unit, int column_unit) const {
+ return row_unit * num_64x64_blocks_per_row_ + column_unit;
+ }
+ bool GetHorizontalDeblockFilterEdgeInfo(int row4x4, int column4x4,
+ uint8_t* level, int* step,
+ int* filter_length) const;
+ void GetHorizontalDeblockFilterEdgeInfoUV(int row4x4, int column4x4,
+ uint8_t* level_u, uint8_t* level_v,
+ int* step,
+ int* filter_length) const;
+ bool GetVerticalDeblockFilterEdgeInfo(int row4x4, int column4x4,
+ BlockParameters* const* bp_ptr,
+ uint8_t* level, int* step,
+ int* filter_length) const;
+ void GetVerticalDeblockFilterEdgeInfoUV(int column4x4,
+ BlockParameters* const* bp_ptr,
+ uint8_t* level_u, uint8_t* level_v,
+ int* step, int* filter_length) const;
+ void HorizontalDeblockFilter(int row4x4_start, int column4x4_start);
+ void VerticalDeblockFilter(int row4x4_start, int column4x4_start);
+ // HorizontalDeblockFilter and VerticalDeblockFilter must have the correct
+ // signature.
+ static_assert(std::is_same<decltype(&PostFilter::HorizontalDeblockFilter),
+ DeblockFilter>::value,
+ "");
+ static_assert(std::is_same<decltype(&PostFilter::VerticalDeblockFilter),
+ DeblockFilter>::value,
+ "");
+ // Applies deblock filtering for the superblock row starting at |row4x4| with
+ // a height of 4*|sb4x4|.
+ void ApplyDeblockFilterForOneSuperBlockRow(int row4x4, int sb4x4);
+ // Worker function used for multi-threaded deblocking.
+ template <LoopFilterType loop_filter_type>
+ void DeblockFilterWorker(std::atomic<int>* row4x4_atomic);
+ static_assert(
+ std::is_same<
+ decltype(&PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>),
+ WorkerFunction>::value,
+ "");
+ static_assert(
+ std::is_same<
+ decltype(&PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>),
+ WorkerFunction>::value,
+ "");
+
+ // Functions for the cdef filter.
+
+ // Copies the deblocked pixels necessary for use by the multi-threaded cdef
+ // implementation into |cdef_border_|.
+ void SetupCdefBorder(int row4x4);
+ // This function prepares the input source block for cdef filtering. The input
+ // source block contains a 12x12 block, with the inner 8x8 as the desired
+ // filter region. It pads the block if the 12x12 block includes out of frame
+  // filter region. If the 12x12 block includes out-of-frame pixels, they are
+  // padded with a large value. This achieves the required behavior defined in
+ template <typename Pixel>
+ void PrepareCdefBlock(int block_width4x4, int block_height4x4, int row4x4,
+ int column4x4, uint16_t* cdef_source,
+ ptrdiff_t cdef_stride, bool y_plane,
+ const uint8_t border_columns[kMaxPlanes][256],
+ bool use_border_columns);
+ // Applies cdef for one 64x64 block.
+ template <typename Pixel>
+ void ApplyCdefForOneUnit(uint16_t* cdef_block, int index, int block_width4x4,
+ int block_height4x4, int row4x4_start,
+ int column4x4_start,
+ uint8_t border_columns[2][kMaxPlanes][256],
+ bool use_border_columns[2][2]);
+ // Helper function used by ApplyCdefForOneSuperBlockRow to avoid some code
+ // duplication.
+ void ApplyCdefForOneSuperBlockRowHelper(
+ uint16_t* cdef_block, uint8_t border_columns[2][kMaxPlanes][256],
+ int row4x4, int block_height4x4);
+ // Applies CDEF filtering for the superblock row starting at |row4x4| with a
+ // height of 4*|sb4x4|.
+ void ApplyCdefForOneSuperBlockRow(int row4x4, int sb4x4, bool is_last_row);
+ // Worker function used for multi-threaded CDEF.
+ void ApplyCdefWorker(std::atomic<int>* row4x4_atomic);
+ static_assert(std::is_same<decltype(&PostFilter::ApplyCdefWorker),
+ WorkerFunction>::value,
+ "");
+
+ // Functions for the SuperRes filter.
+
+  // Applies super resolution to |src| for |rows[plane]| rows of each plane.
+  // If |line_buffer_row| is larger than or equal to 0, one more row will be
+  // processed, and the line buffer indicated by |line_buffer_row| will be
+  // used as its source.
+ void ApplySuperRes(
+ const std::array<uint8_t*, kMaxPlanes>& src,
+ const std::array<int, kMaxPlanes>& rows, int line_buffer_row,
+ const std::array<uint8_t*, kMaxPlanes>& dst); // Section 7.16.
+ // Applies SuperRes for the superblock row starting at |row4x4| with a height
+ // of 4*|sb4x4|.
+ void ApplySuperResForOneSuperBlockRow(int row4x4, int sb4x4,
+ bool is_last_row);
+ void ApplySuperResThreaded();
+
+ // Functions for the Loop Restoration filter.
+
+ // Notes about Loop Restoration:
+  // (1). The loop restoration processing unit size defaults to 64x64.
+  //      Only when the remaining filtering area is smaller than 64x64 is the
+  //      processing unit size the actual area size.
+ // For U/V plane, it is (64 >> subsampling_x) x (64 >> subsampling_y).
+ // (2). Loop restoration unit size can be 64x64, 128x128, 256x256 for Y
+ // plane. The unit size for chroma can be the same or half, depending on
+ // subsampling. If either subsampling_x or subsampling_y is one, unit size
+ // is halved on both x and y sides.
+ // All loop restoration units have the same size for one plane.
+ // One loop restoration unit could contain multiple processing units.
+ // But they share the same sets of loop restoration parameters.
+ // (3). Loop restoration has a row offset, kRestorationUnitOffset = 8. The
+  //      size of the first row of loop restoration units and processing units
+  //      is shrunk by the offset.
+ // (4). Loop restoration units wrap the bottom and the right of the frame,
+  //      if the remaining area is small. The criterion is whether the number of
+ // remaining rows/columns is smaller than half of loop restoration unit
+ // size.
+ // For example, if the frame size is 140x140, loop restoration unit size is
+ // 128x128. The size of the first loop restoration unit is 128x(128-8) =
+ // 128 columns x 120 rows.
+  // Since 140 - 120 < 128/2, the remaining 20 rows will be folded into the
+  // loop restoration unit. Similarly, the remaining 12 columns will also be
+  // folded into the current loop restoration unit. So, even though the frame
+  // size is 140x140, there is only one loop restoration unit. If the
+  // processing unit is 64x64,
+ // then sizes of the first row of processing units are 64x56, 64x56, 12x56,
+ // respectively. The second row is 64x64, 64x64, 12x64.
+ // The third row is 64x20, 64x20, 12x20.
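+  // One way to express the folding rule from (4) is the following sketch,
+  // assuming the count_units_in_frame() formula from the specification
+  // (illustrative only):
+  //
+  //   int CountRestorationUnits(int frame_size, int unit_size) {
+  //     // Remainders smaller than half a unit are folded into the last unit.
+  //     return std::max(1, (frame_size + (unit_size >> 1)) / unit_size);
+  //   }
+  //
+  //   // 140-pixel frame, 128-pixel units: (140 + 64) / 128 == 1 unit.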
+
+ // |stride| is shared by |src_buffer| and |dst_buffer|.
+ template <typename Pixel>
+ void ApplyLoopRestorationForOneRow(const Pixel* src_buffer, ptrdiff_t stride,
+ Plane plane, int plane_height,
+ int plane_width, int y, int unit_row,
+ int current_process_unit_height,
+ int plane_unit_size, Pixel* dst_buffer);
+ // Applies loop restoration for the superblock row starting at |row4x4_start|
+ // with a height of 4*|sb4x4|.
+ template <typename Pixel>
+ void ApplyLoopRestorationForOneSuperBlockRow(int row4x4_start, int sb4x4);
+ // Helper function that calls the right variant of
+ // ApplyLoopRestorationForOneSuperBlockRow based on the bitdepth.
+ void ApplyLoopRestoration(int row4x4_start, int sb4x4);
+ // Worker function used for multithreaded Loop Restoration.
+ void ApplyLoopRestorationWorker(std::atomic<int>* row4x4_atomic);
+ static_assert(std::is_same<decltype(&PostFilter::ApplyLoopRestorationWorker),
+ WorkerFunction>::value,
+ "");
+
+ const ObuFrameHeader& frame_header_;
+ const LoopRestoration& loop_restoration_;
+ const dsp::Dsp& dsp_;
+ const int num_64x64_blocks_per_row_;
+ const int upscaled_width_;
+ const int width_;
+ const int height_;
+ const int8_t bitdepth_;
+ const int8_t subsampling_x_[kMaxPlanes];
+ const int8_t subsampling_y_[kMaxPlanes];
+ const int8_t planes_;
+ const int pixel_size_log2_;
+ const uint8_t* const inner_thresh_;
+ const uint8_t* const outer_thresh_;
+ const bool needs_chroma_deblock_;
+ // This stores the deblocking filter levels assuming that the delta is zero.
+ // This will be used by all superblocks whose delta is zero (without having to
+ // recompute them). The dimensions (in order) are: segment_id, level_index
+ // (based on plane and direction), reference_frame and mode_id.
+ uint8_t deblock_filter_levels_[kMaxSegments][kFrameLfCount]
+ [kNumReferenceFrameTypes][2];
+ // Stores the SuperRes info for the frame.
+ struct {
+ int upscaled_width;
+ int initial_subpixel_x;
+ int step;
+ } super_res_info_[kMaxPlanes];
+ const Array2D<int16_t>& cdef_index_;
+ const Array2D<TransformSize>& inter_transform_sizes_;
+ LoopRestorationInfo* const restoration_info_;
+ uint8_t* const superres_coefficients_[kNumPlaneTypes];
+ // Line buffer used by multi-threaded ApplySuperRes().
+ // In the multi-threaded case, this buffer will store the last downscaled row
+ // input of each thread to avoid overwrites by the first upscaled row output
+ // of the thread below it.
+ YuvBuffer& superres_line_buffer_;
+ const BlockParametersHolder& block_parameters_;
+ // Frame buffer to hold cdef filtered frame.
+ YuvBuffer cdef_filtered_buffer_;
+ // Input frame buffer.
+ YuvBuffer& frame_buffer_;
+ // A view into |frame_buffer_| that points to the input and output of the
+ // deblocking process.
+ uint8_t* source_buffer_[kMaxPlanes];
+ // A view into |frame_buffer_| that points to the output of the CDEF filtered
+ // planes (to facilitate in-place CDEF filtering).
+ uint8_t* cdef_buffer_[kMaxPlanes];
+ // A view into |frame_buffer_| that points to the planes after the SuperRes
+ // filter is applied (to facilitate in-place SuperRes).
+ uint8_t* superres_buffer_[kMaxPlanes];
+ // A view into |frame_buffer_| that points to the output of the Loop Restored
+ // planes (to facilitate in-place Loop Restoration).
+ uint8_t* loop_restoration_buffer_[kMaxPlanes];
+ YuvBuffer& cdef_border_;
+ // Buffer used to store the border pixels that are necessary for loop
+ // restoration. This buffer will store 4 rows for every 64x64 block (4 rows
+ // for every 32x32 for chroma with subsampling). The indices of the rows that
+ // are stored are specified in |kLoopRestorationBorderRows|. First 4 rows of
+ // this buffer are never populated and never used.
+ // This buffer is used only when both of the following conditions are true:
+ // (1). Loop Restoration is on.
+ // (2). Cdef is on, or multi-threading is enabled for post filter.
+ YuvBuffer& loop_restoration_border_;
+ const uint8_t do_post_filter_mask_;
+ ThreadPool* const thread_pool_;
+
+ // Tracks the progress of the post filters.
+ int progress_row_ = -1;
+
+ // A block buffer to hold the input that is converted to uint16_t before
+ // cdef filtering. Only used in single threaded case. Y plane is processed
+ // separately. U and V planes are processed together. So it is sufficient to
+ // have this buffer to accommodate 2 planes at a time.
+ uint16_t cdef_block_[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 2];
+
+ template <int bitdepth, typename Pixel>
+ friend class PostFilterSuperResTest;
+
+ template <int bitdepth, typename Pixel>
+ friend class PostFilterHelperFuncTest;
+};
+
+extern template void PostFilter::ExtendFrame<uint8_t>(uint8_t* frame_start,
+ int width, int height,
+ ptrdiff_t stride,
+ int left, int right,
+ int top, int bottom);
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+extern template void PostFilter::ExtendFrame<uint16_t>(uint16_t* frame_start,
+ int width, int height,
+ ptrdiff_t stride,
+ int left, int right,
+ int top, int bottom);
+#endif
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_POST_FILTER_H_
diff --git a/src/post_filter/cdef.cc b/src/post_filter/cdef.cc
new file mode 100644
index 0000000..994f448
--- /dev/null
+++ b/src/post_filter/cdef.cc
@@ -0,0 +1,660 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "src/post_filter.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kStep64x64 = 16; // =64/4.
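+// Bit 3 of the stored luma filtering direction marks a skipped block (see
+// |direction_y| in ApplyCdefForOneUnit()).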
+constexpr int kCdefSkip = 8;
+
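+// Maps the luma filtering direction to the direction used for the chroma
+// planes, indexed by [subsampling_x][subsampling_y][luma_direction].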
+constexpr uint8_t kCdefUvDirection[2][2][8] = {
+ {{0, 1, 2, 3, 4, 5, 6, 7}, {1, 2, 2, 2, 3, 4, 6, 0}},
+ {{7, 0, 2, 4, 5, 6, 6, 6}, {0, 1, 2, 3, 4, 5, 6, 7}}};
+
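+// Rows of each 64-pixel-high row of the frame (32 for chroma with vertical
+// subsampling) that are saved into |cdef_border_| by SetupCdefBorder(): the
+// first two and the last two rows. Indexed by [subsampling_y][i].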
+constexpr int kCdefBorderRows[2][4] = {{0, 1, 62, 63}, {0, 1, 30, 31}};
+
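+// Copies one row of the cdef input into |dst| (always uint16_t). The left
+// |kCdefBorder| columns are set to kCdefLargeValue at the left frame edge,
+// and are otherwise copied from |left_border| when it is provided or from the
+// frame. The columns beyond |block_width| (up to unit_width + kCdefBorder)
+// are set to kCdefLargeValue at the right frame edge and copied from the
+// frame otherwise. When Pixel is already 16 bits wide the copies are done
+// with memcpy; otherwise each pixel is widened to uint16_t.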
+template <typename Pixel>
+void CopyRowForCdef(const Pixel* src, int block_width, int unit_width,
+ bool is_frame_left, bool is_frame_right,
+ uint16_t* const dst, const Pixel* left_border = nullptr) {
+ if (sizeof(src[0]) == sizeof(dst[0])) {
+ if (is_frame_left) {
+ Memset(dst - kCdefBorder, kCdefLargeValue, kCdefBorder);
+ } else if (left_border == nullptr) {
+ memcpy(dst - kCdefBorder, src - kCdefBorder,
+ kCdefBorder * sizeof(dst[0]));
+ } else {
+ memcpy(dst - kCdefBorder, left_border, kCdefBorder * sizeof(dst[0]));
+ }
+ memcpy(dst, src, block_width * sizeof(dst[0]));
+ if (is_frame_right) {
+ Memset(dst + block_width, kCdefLargeValue,
+ unit_width + kCdefBorder - block_width);
+ } else {
+ memcpy(dst + block_width, src + block_width,
+ (unit_width + kCdefBorder - block_width) * sizeof(dst[0]));
+ }
+ return;
+ }
+ if (is_frame_left) {
+ for (int x = -kCdefBorder; x < 0; ++x) {
+ dst[x] = static_cast<uint16_t>(kCdefLargeValue);
+ }
+ } else if (left_border == nullptr) {
+ for (int x = -kCdefBorder; x < 0; ++x) {
+ dst[x] = src[x];
+ }
+ } else {
+ for (int x = -kCdefBorder; x < 0; ++x) {
+ dst[x] = left_border[x + kCdefBorder];
+ }
+ }
+ for (int x = 0; x < block_width; ++x) {
+ dst[x] = src[x];
+ }
+ for (int x = block_width; x < unit_width + kCdefBorder; ++x) {
+ dst[x] = is_frame_right ? static_cast<uint16_t>(kCdefLargeValue) : src[x];
+ }
+}
+
+// For |height| rows, copy |width| pixels of size |pixel_size| from |src| to
+// |dst|.
+void CopyPixels(const uint8_t* src, int src_stride, uint8_t* dst,
+ int dst_stride, int width, int height, size_t pixel_size) {
+ int y = height;
+ do {
+ memcpy(dst, src, width * pixel_size);
+ src += src_stride;
+ dst += dst_stride;
+ } while (--y != 0);
+}
+
+} // namespace
+
+void PostFilter::SetupCdefBorder(int row4x4) {
+ assert(row4x4 >= 0);
+ assert(DoCdef());
+ int plane = kPlaneY;
+ do {
+ const ptrdiff_t src_stride = frame_buffer_.stride(plane);
+ const ptrdiff_t dst_stride = cdef_border_.stride(plane);
+ const int row_offset = DivideBy4(row4x4);
+ const int num_pixels = SubsampledValue(
+ MultiplyBy4(frame_header_.columns4x4), subsampling_x_[plane]);
+ const int row_width = num_pixels << pixel_size_log2_;
+ const int plane_height = SubsampledValue(MultiplyBy4(frame_header_.rows4x4),
+ subsampling_y_[plane]);
+ for (int i = 0; i < 4; ++i) {
+ const int row = kCdefBorderRows[subsampling_y_[plane]][i];
+ const int absolute_row =
+ (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+ if (absolute_row >= plane_height) break;
+ const uint8_t* src =
+ GetSourceBuffer(static_cast<Plane>(plane), row4x4, 0) +
+ row * src_stride;
+ uint8_t* dst = cdef_border_.data(plane) + dst_stride * (row_offset + i);
+ memcpy(dst, src, row_width);
+ }
+ } while (++plane < planes_);
+}
+
+template <typename Pixel>
+void PostFilter::PrepareCdefBlock(int block_width4x4, int block_height4x4,
+ int row4x4, int column4x4,
+ uint16_t* cdef_source, ptrdiff_t cdef_stride,
+ const bool y_plane,
+ const uint8_t border_columns[kMaxPlanes][256],
+ bool use_border_columns) {
+ assert(y_plane || planes_ == kMaxPlanes);
+ const int max_planes = y_plane ? 1 : kMaxPlanes;
+ const int8_t subsampling_x = y_plane ? 0 : subsampling_x_[kPlaneU];
+ const int8_t subsampling_y = y_plane ? 0 : subsampling_y_[kPlaneU];
+ const int start_x = MultiplyBy4(column4x4) >> subsampling_x;
+ const int start_y = MultiplyBy4(row4x4) >> subsampling_y;
+ const int plane_width = SubsampledValue(width_, subsampling_x);
+ const int plane_height = SubsampledValue(height_, subsampling_y);
+ const int block_width = MultiplyBy4(block_width4x4) >> subsampling_x;
+ const int block_height = MultiplyBy4(block_height4x4) >> subsampling_y;
+  // |unit_width| and |unit_height| are the same as |block_width| and
+  // |block_height| unless the block reaches the frame boundary, where
+  // block_width < 64 or block_height < 64. |unit_width| and |unit_height|
+  // guarantee that we operate on multiples of 8 pixels (4 for chroma with
+  // subsampling).
+ const int unit_width = Align(block_width, 8 >> subsampling_x);
+ const int unit_height = Align(block_height, 8 >> subsampling_y);
+ const bool is_frame_left = column4x4 == 0;
+ const bool is_frame_right = start_x + block_width >= plane_width;
+ const bool is_frame_top = row4x4 == 0;
+ const bool is_frame_bottom = start_y + block_height >= plane_height;
+ const int y_offset = is_frame_top ? 0 : kCdefBorder;
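+  // |cdef_border_| stores 4 saved rows per 64-pixel-high row of the frame
+  // (see SetupCdefBorder()). For a block that is not at the top of the frame,
+  // the two rows directly above it are the last two rows saved for the
+  // previous 64-pixel-high row, hence the offset of -2.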
+ const int cdef_border_row_offset = DivideBy4(row4x4) - (is_frame_top ? 0 : 2);
+
+ for (int plane = y_plane ? kPlaneY : kPlaneU; plane < max_planes; ++plane) {
+ uint16_t* cdef_src = cdef_source + static_cast<int>(plane == kPlaneV) *
+ kCdefUnitSizeWithBorders *
+ kCdefUnitSizeWithBorders;
+ const int src_stride = frame_buffer_.stride(plane) / sizeof(Pixel);
+ const Pixel* src_buffer =
+ reinterpret_cast<const Pixel*>(source_buffer_[plane]) +
+ (start_y - y_offset) * src_stride + start_x;
+ const int cdef_border_stride = cdef_border_.stride(plane) / sizeof(Pixel);
+ const Pixel* cdef_border =
+ (thread_pool_ == nullptr)
+ ? nullptr
+ : reinterpret_cast<const Pixel*>(cdef_border_.data(plane)) +
+ cdef_border_row_offset * cdef_border_stride + start_x;
+
+ // All the copying code will use negative indices for populating the left
+ // border. So the starting point is set to kCdefBorder.
+ cdef_src += kCdefBorder;
+
+    // Copy the top 2 rows as follows:
+    // If is_frame_top is true, both rows are set to kCdefLargeValue.
+ // Otherwise:
+ // If multi-threaded filtering is off, the rows are copied from
+ // |src_buffer|.
+ // Otherwise, the rows are copied from |cdef_border|.
+ if (is_frame_top) {
+ for (int y = 0; y < kCdefBorder; ++y) {
+ Memset(cdef_src - kCdefBorder, kCdefLargeValue,
+ unit_width + 2 * kCdefBorder);
+ cdef_src += cdef_stride;
+ }
+ } else {
+ const Pixel* top_border =
+ (thread_pool_ == nullptr) ? src_buffer : cdef_border;
+ const int top_border_stride =
+ (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
+ for (int y = 0; y < kCdefBorder; ++y) {
+ CopyRowForCdef(top_border, block_width, unit_width, is_frame_left,
+ is_frame_right, cdef_src);
+ top_border += top_border_stride;
+ cdef_src += cdef_stride;
+ // We need to increment |src_buffer| and |cdef_border| in this loop to
+ // set them up for the subsequent loops below.
+ src_buffer += src_stride;
+ cdef_border += cdef_border_stride;
+ }
+ }
+
+    // Copy the body as follows:
+ // If multi-threaded filtering is off or if is_frame_bottom is true, all the
+ // rows are copied from |src_buffer|.
+ // Otherwise, the first |block_height|-kCdefBorder rows are copied from
+    // |src_buffer| and the last kCdefBorder rows are copied from |cdef_border|.
+ int y = block_height;
+ const int y_threshold =
+ (thread_pool_ == nullptr || is_frame_bottom) ? 0 : kCdefBorder;
+ const Pixel* left_border =
+ (thread_pool_ == nullptr || !use_border_columns)
+ ? nullptr
+ : reinterpret_cast<const Pixel*>(border_columns[plane]);
+ do {
+ CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
+ is_frame_right, cdef_src, left_border);
+ cdef_src += cdef_stride;
+ src_buffer += src_stride;
+ if (left_border != nullptr) left_border += kCdefBorder;
+ } while (--y != y_threshold);
+
+ if (y > 0) {
+ assert(y == kCdefBorder);
+ // |cdef_border| now points to the top 2 rows of the current block. For
+ // the next loop, we need it to point to the bottom 2 rows of the
+ // current block. So increment it by 2 rows.
+ cdef_border += MultiplyBy2(cdef_border_stride);
+ for (int i = 0; i < kCdefBorder; ++i) {
+ CopyRowForCdef(cdef_border, block_width, unit_width, is_frame_left,
+ is_frame_right, cdef_src);
+ cdef_src += cdef_stride;
+ cdef_border += cdef_border_stride;
+ }
+ }
+
+    // Copy the bottom 2 rows as follows:
+    // If is_frame_bottom is true, both rows are set to kCdefLargeValue.
+ // Otherwise:
+ // If multi-threaded filtering is off, the rows are copied from
+ // |src_buffer|.
+ // Otherwise, the rows are copied from |cdef_border|.
+ y = 0;
+ if (is_frame_bottom) {
+ do {
+ Memset(cdef_src - kCdefBorder, kCdefLargeValue,
+ unit_width + 2 * kCdefBorder);
+ cdef_src += cdef_stride;
+ } while (++y < kCdefBorder + unit_height - block_height);
+ } else {
+ const Pixel* bottom_border =
+ (thread_pool_ == nullptr) ? src_buffer : cdef_border;
+ const int bottom_border_stride =
+ (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
+ do {
+ CopyRowForCdef(bottom_border, block_width, unit_width, is_frame_left,
+ is_frame_right, cdef_src);
+ bottom_border += bottom_border_stride;
+ cdef_src += cdef_stride;
+ } while (++y < kCdefBorder + unit_height - block_height);
+ }
+ }
+}
+
+template <typename Pixel>
+void PostFilter::ApplyCdefForOneUnit(uint16_t* cdef_block, const int index,
+ const int block_width4x4,
+ const int block_height4x4,
+ const int row4x4_start,
+ const int column4x4_start,
+ uint8_t border_columns[2][kMaxPlanes][256],
+ bool use_border_columns[2][2]) {
+ // Cdef operates in 8x8 blocks (4x4 for chroma with subsampling).
+ static constexpr int kStep = 8;
+ static constexpr int kStep4x4 = 2;
+
+ int cdef_buffer_row_base_stride[kMaxPlanes];
+ uint8_t* cdef_buffer_row_base[kMaxPlanes];
+ int src_buffer_row_base_stride[kMaxPlanes];
+ const uint8_t* src_buffer_row_base[kMaxPlanes];
+ const uint16_t* cdef_src_row_base[kMaxPlanes];
+ int cdef_src_row_base_stride[kMaxPlanes];
+ int column_step[kMaxPlanes];
+ assert(planes_ >= 1);
+ int plane = kPlaneY;
+ do {
+ cdef_buffer_row_base[plane] =
+ GetCdefBuffer(static_cast<Plane>(plane), row4x4_start, column4x4_start);
+ cdef_buffer_row_base_stride[plane] =
+ frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
+ src_buffer_row_base[plane] = GetSourceBuffer(static_cast<Plane>(plane),
+ row4x4_start, column4x4_start);
+ src_buffer_row_base_stride[plane] =
+ frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
+ cdef_src_row_base[plane] =
+ cdef_block +
+ static_cast<int>(plane == kPlaneV) * kCdefUnitSizeWithBorders *
+ kCdefUnitSizeWithBorders +
+ kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder;
+ cdef_src_row_base_stride[plane] =
+ kCdefUnitSizeWithBorders * (kStep >> subsampling_y_[plane]);
+ column_step[plane] = (kStep >> subsampling_x_[plane]) * sizeof(Pixel);
+ } while (++plane < planes_);
+
+ // |border_columns| contains two buffers. In each call to this function, we
+ // will use one of them as the "destination" for the current call. And the
+ // other one as the "source" for the current call (which would have been the
+ // "destination" of the previous call). We will use the src_index to populate
+ // the borders which were backed up in the previous call. We will use the
+ // dst_index to populate the borders to be used in the next call.
+ const int border_columns_src_index = DivideBy16(column4x4_start) & 1;
+ const int border_columns_dst_index = border_columns_src_index ^ 1;
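+  // |column4x4_start| advances by 16 (one 64x64 unit) between consecutive
+  // calls, so DivideBy16(column4x4_start) & 1 alternates and the two buffers
+  // swap roles on every call.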
+
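+  // A cdef index of -1 means that this 64x64 unit is not cdef filtered; the
+  // input is copied through unchanged (the copy is only done when running
+  // single-threaded).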
+ if (index == -1) {
+ if (thread_pool_ == nullptr) {
+ int plane = kPlaneY;
+ do {
+ CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
+ cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
+ MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
+ MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+ sizeof(Pixel));
+ } while (++plane < planes_);
+ }
+ use_border_columns[border_columns_dst_index][0] = false;
+ use_border_columns[border_columns_dst_index][1] = false;
+ return;
+ }
+
+ const bool is_frame_right =
+ MultiplyBy4(column4x4_start) + MultiplyBy4(block_width4x4) >= width_;
+ if (!is_frame_right && thread_pool_ != nullptr) {
+ // Backup the last 2 columns for use in the next iteration.
+ use_border_columns[border_columns_dst_index][0] = true;
+ const uint8_t* src_line =
+ GetSourceBuffer(kPlaneY, row4x4_start,
+ column4x4_start + block_width4x4) -
+ kCdefBorder * sizeof(Pixel);
+ CopyPixels(src_line, frame_buffer_.stride(kPlaneY),
+ border_columns[border_columns_dst_index][kPlaneY],
+ kCdefBorder * sizeof(Pixel), kCdefBorder,
+ MultiplyBy4(block_height4x4), sizeof(Pixel));
+ }
+
+ PrepareCdefBlock<Pixel>(
+ block_width4x4, block_height4x4, row4x4_start, column4x4_start,
+ cdef_block, kCdefUnitSizeWithBorders, true,
+ (border_columns != nullptr) ? border_columns[border_columns_src_index]
+ : nullptr,
+ use_border_columns[border_columns_src_index][0]);
+
+ // Stored direction used during the u/v pass. If bit 3 is set, then block is
+ // a skip.
+ uint8_t direction_y[8 * 8];
+ int y_index = 0;
+
+ const uint8_t y_primary_strength =
+ frame_header_.cdef.y_primary_strength[index];
+ const uint8_t y_secondary_strength =
+ frame_header_.cdef.y_secondary_strength[index];
+  // The y strength index is 0 when both the primary and secondary strengths
+  // are non-zero, 1 for primary only and 2 for secondary only. Only the
+  // secondary bit is known here; the primary bit is ORed in below, after the
+  // primary strength has been adjusted by the variance.
+ int y_strength_index = static_cast<int>(y_secondary_strength == 0);
+
+ const bool compute_direction_and_variance =
+ (y_primary_strength | frame_header_.cdef.uv_primary_strength[index]) != 0;
+ BlockParameters* const* bp_row0_base =
+ block_parameters_.Address(row4x4_start, column4x4_start);
+ BlockParameters* const* bp_row1_base =
+ bp_row0_base + block_parameters_.columns4x4();
+ const int bp_stride = MultiplyBy2(block_parameters_.columns4x4());
+ int row4x4 = row4x4_start;
+ do {
+ uint8_t* cdef_buffer_base = cdef_buffer_row_base[kPlaneY];
+ const uint8_t* src_buffer_base = src_buffer_row_base[kPlaneY];
+ const uint16_t* cdef_src_base = cdef_src_row_base[kPlaneY];
+ BlockParameters* const* bp0 = bp_row0_base;
+ BlockParameters* const* bp1 = bp_row1_base;
+ int column4x4 = column4x4_start;
+ do {
+ const int block_width = kStep;
+ const int block_height = kStep;
+ const int cdef_stride = frame_buffer_.stride(kPlaneY);
+ uint8_t* const cdef_buffer = cdef_buffer_base;
+ const uint16_t* const cdef_src = cdef_src_base;
+ const int src_stride = frame_buffer_.stride(kPlaneY);
+ const uint8_t* const src_buffer = src_buffer_base;
+
+ const bool skip = (*bp0)->skip && (*(bp0 + 1))->skip && (*bp1)->skip &&
+ (*(bp1 + 1))->skip;
+
+ if (skip) { // No cdef filtering.
+ direction_y[y_index] = kCdefSkip;
+ if (thread_pool_ == nullptr) {
+ CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+ block_width, block_height, sizeof(Pixel));
+ }
+ } else {
+ // Zero out residual skip flag.
+ direction_y[y_index] = 0;
+
+ int variance = 0;
+ if (compute_direction_and_variance) {
+ if (thread_pool_ == nullptr ||
+ row4x4 + kStep4x4 < row4x4_start + block_height4x4) {
+ dsp_.cdef_direction(src_buffer, src_stride, &direction_y[y_index],
+ &variance);
+ } else if (sizeof(Pixel) == 2) {
+ dsp_.cdef_direction(cdef_src, kCdefUnitSizeWithBorders * 2,
+ &direction_y[y_index], &variance);
+ } else {
+ // If we are in the last row4x4 for this unit, then the last two
+ // input rows have to come from |cdef_border_|. Since we already
+ // have |cdef_src| populated correctly, use that as the input
+ // for the direction process.
+ uint8_t direction_src[8][8];
+ const uint16_t* cdef_src_line = cdef_src;
+ for (auto& direction_src_line : direction_src) {
+ for (int i = 0; i < 8; ++i) {
+ direction_src_line[i] = cdef_src_line[i];
+ }
+ cdef_src_line += kCdefUnitSizeWithBorders;
+ }
+ dsp_.cdef_direction(direction_src, 8, &direction_y[y_index],
+ &variance);
+ }
+ }
+ const int direction =
+ (y_primary_strength == 0) ? 0 : direction_y[y_index];
+ const int variance_strength =
+ ((variance >> 6) != 0) ? std::min(FloorLog2(variance >> 6), 12) : 0;
+ const uint8_t primary_strength =
+ (variance != 0)
+ ? (y_primary_strength * (4 + variance_strength) + 8) >> 4
+ : 0;
+ if ((primary_strength | y_secondary_strength) == 0) {
+ if (thread_pool_ == nullptr) {
+ CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+ block_width, block_height, sizeof(Pixel));
+ }
+ } else {
+ const int strength_index =
+ y_strength_index | (static_cast<int>(primary_strength == 0) << 1);
+ dsp_.cdef_filters[1][strength_index](
+ cdef_src, kCdefUnitSizeWithBorders, block_height,
+ primary_strength, y_secondary_strength,
+ frame_header_.cdef.damping, direction, cdef_buffer, cdef_stride);
+ }
+ }
+ cdef_buffer_base += column_step[kPlaneY];
+ src_buffer_base += column_step[kPlaneY];
+ cdef_src_base += column_step[kPlaneY] / sizeof(Pixel);
+
+ bp0 += kStep4x4;
+ bp1 += kStep4x4;
+ column4x4 += kStep4x4;
+ y_index++;
+ } while (column4x4 < column4x4_start + block_width4x4);
+
+ cdef_buffer_row_base[kPlaneY] += cdef_buffer_row_base_stride[kPlaneY];
+ src_buffer_row_base[kPlaneY] += src_buffer_row_base_stride[kPlaneY];
+ cdef_src_row_base[kPlaneY] += cdef_src_row_base_stride[kPlaneY];
+ bp_row0_base += bp_stride;
+ bp_row1_base += bp_stride;
+ row4x4 += kStep4x4;
+ } while (row4x4 < row4x4_start + block_height4x4);
+
+ if (planes_ == kMaxPlanesMonochrome) {
+ return;
+ }
+
+ const uint8_t uv_primary_strength =
+ frame_header_.cdef.uv_primary_strength[index];
+ const uint8_t uv_secondary_strength =
+ frame_header_.cdef.uv_secondary_strength[index];
+
+ if ((uv_primary_strength | uv_secondary_strength) == 0) {
+ if (thread_pool_ == nullptr) {
+ for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+ CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
+ cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
+ MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
+ MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+ sizeof(Pixel));
+ }
+ }
+ use_border_columns[border_columns_dst_index][1] = false;
+ return;
+ }
+
+ if (!is_frame_right && thread_pool_ != nullptr) {
+ use_border_columns[border_columns_dst_index][1] = true;
+ for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+ // Backup the last 2 columns for use in the next iteration.
+ const uint8_t* src_line =
+ GetSourceBuffer(static_cast<Plane>(plane), row4x4_start,
+ column4x4_start + block_width4x4) -
+ kCdefBorder * sizeof(Pixel);
+ CopyPixels(src_line, frame_buffer_.stride(plane),
+ border_columns[border_columns_dst_index][plane],
+ kCdefBorder * sizeof(Pixel), kCdefBorder,
+ MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+ sizeof(Pixel));
+ }
+ }
+
+ PrepareCdefBlock<Pixel>(
+ block_width4x4, block_height4x4, row4x4_start, column4x4_start,
+ cdef_block, kCdefUnitSizeWithBorders, false,
+ (border_columns != nullptr) ? border_columns[border_columns_src_index]
+ : nullptr,
+ use_border_columns[border_columns_src_index][1]);
+
+ // uv_strength_index is 0 for both primary and secondary strengths being
+ // non-zero, 1 for primary only, 2 for secondary only.
+ const int uv_strength_index =
+ (static_cast<int>(uv_primary_strength == 0) << 1) |
+ static_cast<int>(uv_secondary_strength == 0);
+ for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+ const int8_t subsampling_x = subsampling_x_[plane];
+ const int8_t subsampling_y = subsampling_y_[plane];
+ const int block_width = kStep >> subsampling_x;
+ const int block_height = kStep >> subsampling_y;
+ int row4x4 = row4x4_start;
+
+ y_index = 0;
+ do {
+ uint8_t* cdef_buffer_base = cdef_buffer_row_base[plane];
+ const uint8_t* src_buffer_base = src_buffer_row_base[plane];
+ const uint16_t* cdef_src_base = cdef_src_row_base[plane];
+ int column4x4 = column4x4_start;
+ do {
+ const int cdef_stride = frame_buffer_.stride(plane);
+ uint8_t* const cdef_buffer = cdef_buffer_base;
+ const int src_stride = frame_buffer_.stride(plane);
+ const uint8_t* const src_buffer = src_buffer_base;
+ const uint16_t* const cdef_src = cdef_src_base;
+ const bool skip = (direction_y[y_index] & kCdefSkip) != 0;
+ int dual_cdef = 0;
+
+ if (skip) { // No cdef filtering.
+ if (thread_pool_ == nullptr) {
+ CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+ block_width, block_height, sizeof(Pixel));
+ }
+ } else {
+ // Make sure block pair is not out of bounds.
+ if (column4x4 + (kStep4x4 * 2) <= column4x4_start + block_width4x4) {
+ // Enable dual processing if subsampling_x is 1.
+ dual_cdef = subsampling_x;
+ }
+
+ int direction = (uv_primary_strength == 0)
+ ? 0
+ : kCdefUvDirection[subsampling_x][subsampling_y]
+ [direction_y[y_index]];
+
+ if (dual_cdef != 0) {
+ if (uv_primary_strength &&
+ direction_y[y_index] != direction_y[y_index + 1]) {
+ // Disable dual processing if the second block of the pair does
+ // not have the same direction.
+ dual_cdef = 0;
+ }
+
+ // Disable dual processing if the second block of the pair is a
+ // skip.
+ if (direction_y[y_index + 1] == kCdefSkip) {
+ dual_cdef = 0;
+ }
+ }
+
+ // Block width is 8 if either dual_cdef is true or subsampling_x == 0.
+ const int width_index = dual_cdef | (subsampling_x ^ 1);
+ dsp_.cdef_filters[width_index][uv_strength_index](
+ cdef_src, kCdefUnitSizeWithBorders, block_height,
+ uv_primary_strength, uv_secondary_strength,
+ frame_header_.cdef.damping - 1, direction, cdef_buffer,
+ cdef_stride);
+ }
+ // When dual_cdef is set, the above cdef_filter() will process 2 blocks,
+ // so adjust the pointers and indexes for 2 blocks.
+ cdef_buffer_base += column_step[plane] << dual_cdef;
+ src_buffer_base += column_step[plane] << dual_cdef;
+ cdef_src_base += (column_step[plane] / sizeof(Pixel)) << dual_cdef;
+ column4x4 += kStep4x4 << dual_cdef;
+ y_index += 1 << dual_cdef;
+ } while (column4x4 < column4x4_start + block_width4x4);
+
+ cdef_buffer_row_base[plane] += cdef_buffer_row_base_stride[plane];
+ src_buffer_row_base[plane] += src_buffer_row_base_stride[plane];
+ cdef_src_row_base[plane] += cdef_src_row_base_stride[plane];
+ row4x4 += kStep4x4;
+ } while (row4x4 < row4x4_start + block_height4x4);
+ }
+}
+
+void PostFilter::ApplyCdefForOneSuperBlockRowHelper(
+ uint16_t* cdef_block, uint8_t border_columns[2][kMaxPlanes][256],
+ int row4x4, int block_height4x4) {
+ bool use_border_columns[2][2] = {};
+ for (int column4x4 = 0; column4x4 < frame_header_.columns4x4;
+ column4x4 += kStep64x64) {
+ const int index = cdef_index_[DivideBy16(row4x4)][DivideBy16(column4x4)];
+ const int block_width4x4 =
+ std::min(kStep64x64, frame_header_.columns4x4 - column4x4);
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ ApplyCdefForOneUnit<uint16_t>(cdef_block, index, block_width4x4,
+ block_height4x4, row4x4, column4x4,
+ border_columns, use_border_columns);
+ continue;
+ }
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ ApplyCdefForOneUnit<uint8_t>(cdef_block, index, block_width4x4,
+ block_height4x4, row4x4, column4x4,
+ border_columns, use_border_columns);
+ }
+}
+
+void PostFilter::ApplyCdefForOneSuperBlockRow(int row4x4_start, int sb4x4,
+ bool is_last_row) {
+ assert(row4x4_start >= 0);
+ assert(DoCdef());
+ for (int y = 0; y < sb4x4; y += kStep64x64) {
+ const int row4x4 = row4x4_start + y;
+ if (row4x4 >= frame_header_.rows4x4) return;
+
+ // Apply cdef for the last 8 rows of the previous superblock row.
+ // One exception: If the superblock size is 128x128 and is_last_row is true,
+ // then we simply apply cdef for the entire superblock row without any lag.
+ // In that case, apply cdef for the previous superblock row only during the
+ // first iteration (y == 0).
+ if (row4x4 > 0 && (!is_last_row || y == 0)) {
+ assert(row4x4 >= 16);
+ ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4 - 2, 2);
+ }
+
+ // Apply cdef for the current superblock row. If this is the last superblock
+ // row we apply cdef for all the rows, otherwise we leave out the last 8
+ // rows.
+ const int block_height4x4 =
+ std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
+ const int height4x4 = block_height4x4 - (is_last_row ? 0 : 2);
+ if (height4x4 > 0) {
+ ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4,
+ height4x4);
+ }
+ }
+}
+
+void PostFilter::ApplyCdefWorker(std::atomic<int>* row4x4_atomic) {
+ int row4x4;
+ uint16_t cdef_block[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 2];
+ // Each border_column buffer has to store 64 rows and 2 columns for each
+ // plane. For 10bit, that is 64*2*2 = 256 bytes.
+ alignas(kMaxAlignment) uint8_t border_columns[2][kMaxPlanes][256];
+ while ((row4x4 = row4x4_atomic->fetch_add(
+ kStep64x64, std::memory_order_relaxed)) < frame_header_.rows4x4) {
+ const int block_height4x4 =
+ std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
+ ApplyCdefForOneSuperBlockRowHelper(cdef_block, border_columns, row4x4,
+ block_height4x4);
+ }
+}
+
+} // namespace libgav1
diff --git a/src/post_filter/deblock.cc b/src/post_filter/deblock.cc
new file mode 100644
index 0000000..9b5ed0f
--- /dev/null
+++ b/src/post_filter/deblock.cc
@@ -0,0 +1,523 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <atomic>
+
+#include "src/post_filter.h"
+
+namespace libgav1 {
+namespace {
+
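+// High edge variance threshold passed to the loop filters: level >> 4.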
+constexpr uint8_t HevThresh(int level) { return DivideBy16(level); }
+
+// GetLoopFilterSize* functions depend on this exact ordering of the
+// LoopFilterSize enums.
+static_assert(dsp::kLoopFilterSize4 == 0, "");
+static_assert(dsp::kLoopFilterSize6 == 1, "");
+static_assert(dsp::kLoopFilterSize8 == 2, "");
+static_assert(dsp::kLoopFilterSize14 == 3, "");
+
+dsp::LoopFilterSize GetLoopFilterSizeY(int filter_length) {
+ // |filter_length| must be a power of 2.
+ assert((filter_length & (filter_length - 1)) == 0);
+ // This code is the branch free equivalent of:
+ // if (filter_length == 4) return kLoopFilterSize4;
+ // if (filter_length == 8) return kLoopFilterSize8;
+ // return kLoopFilterSize14;
+ return static_cast<dsp::LoopFilterSize>(
+ MultiplyBy2(static_cast<int>(filter_length > 4)) +
+ static_cast<int>(filter_length > 8));
+}
+
+constexpr dsp::LoopFilterSize GetLoopFilterSizeUV(int filter_length) {
+ // For U & V planes, size is kLoopFilterSize4 if |filter_length| is 4,
+ // otherwise size is kLoopFilterSize6.
+ return static_cast<dsp::LoopFilterSize>(filter_length != 4);
+}
+
+bool NonBlockBorderNeedsFilter(const BlockParameters& bp, int filter_id,
+ uint8_t* const level) {
+ if (bp.deblock_filter_level[filter_id] == 0 || (bp.skip && bp.is_inter)) {
+ return false;
+ }
+ *level = bp.deblock_filter_level[filter_id];
+ return true;
+}
+
+// 7.14.5.
+void ComputeDeblockFilterLevelsHelper(
+ const ObuFrameHeader& frame_header, int segment_id, int level_index,
+ const int8_t delta_lf[kFrameLfCount],
+ uint8_t deblock_filter_levels[kNumReferenceFrameTypes][2]) {
+ const int delta = delta_lf[frame_header.delta_lf.multi ? level_index : 0];
+ uint8_t level = Clip3(frame_header.loop_filter.level[level_index] + delta, 0,
+ kMaxLoopFilterValue);
+ const auto feature = static_cast<SegmentFeature>(
+ kSegmentFeatureLoopFilterYVertical + level_index);
+ level =
+ Clip3(level + frame_header.segmentation.feature_data[segment_id][feature],
+ 0, kMaxLoopFilterValue);
+ if (!frame_header.loop_filter.delta_enabled) {
+ static_assert(sizeof(deblock_filter_levels[0][0]) == 1, "");
+ memset(deblock_filter_levels, level, kNumReferenceFrameTypes * 2);
+ return;
+ }
+ assert(frame_header.loop_filter.delta_enabled);
+ const int shift = level >> 5;
+ deblock_filter_levels[kReferenceFrameIntra][0] = Clip3(
+ level +
+ LeftShift(frame_header.loop_filter.ref_deltas[kReferenceFrameIntra],
+ shift),
+ 0, kMaxLoopFilterValue);
+ // deblock_filter_levels[kReferenceFrameIntra][1] is never used. So it does
+ // not have to be populated.
+ for (int reference_frame = kReferenceFrameIntra + 1;
+ reference_frame < kNumReferenceFrameTypes; ++reference_frame) {
+ for (int mode_id = 0; mode_id < 2; ++mode_id) {
+ deblock_filter_levels[reference_frame][mode_id] = Clip3(
+ level +
+ LeftShift(frame_header.loop_filter.ref_deltas[reference_frame] +
+ frame_header.loop_filter.mode_deltas[mode_id],
+ shift),
+ 0, kMaxLoopFilterValue);
+ }
+ }
+}
+
+} // namespace
+
+void PostFilter::ComputeDeblockFilterLevels(
+ const int8_t delta_lf[kFrameLfCount],
+ uint8_t deblock_filter_levels[kMaxSegments][kFrameLfCount]
+ [kNumReferenceFrameTypes][2]) const {
+ if (!DoDeblock()) return;
+ for (int segment_id = 0;
+ segment_id < (frame_header_.segmentation.enabled ? kMaxSegments : 1);
+ ++segment_id) {
+ int level_index = 0;
+ for (; level_index < 2; ++level_index) {
+ ComputeDeblockFilterLevelsHelper(
+ frame_header_, segment_id, level_index, delta_lf,
+ deblock_filter_levels[segment_id][level_index]);
+ }
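+    // The U (level_index 2) and V (level_index 3) filter levels are computed
+    // only when the corresponding loop filter level is non-zero.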
+ for (; level_index < kFrameLfCount; ++level_index) {
+ if (frame_header_.loop_filter.level[level_index] != 0) {
+ ComputeDeblockFilterLevelsHelper(
+ frame_header_, segment_id, level_index, delta_lf,
+ deblock_filter_levels[segment_id][level_index]);
+ }
+ }
+ }
+}
+
+bool PostFilter::GetHorizontalDeblockFilterEdgeInfo(int row4x4, int column4x4,
+ uint8_t* level, int* step,
+ int* filter_length) const {
+ *step = kTransformHeight[inter_transform_sizes_[row4x4][column4x4]];
+ if (row4x4 == 0) return false;
+
+ const BlockParameters* bp = block_parameters_.Find(row4x4, column4x4);
+ const int row4x4_prev = row4x4 - 1;
+ assert(row4x4_prev >= 0);
+ const BlockParameters* bp_prev =
+ block_parameters_.Find(row4x4_prev, column4x4);
+
+ if (bp == bp_prev) {
+ // Not a border.
+ if (!NonBlockBorderNeedsFilter(*bp, 1, level)) return false;
+ } else {
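+    // It is a border. Use this block's level and fall back to the level of
+    // the block above when it is zero.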
+ const uint8_t level_this = bp->deblock_filter_level[1];
+ *level = level_this;
+ if (level_this == 0) {
+ const uint8_t level_prev = bp_prev->deblock_filter_level[1];
+ if (level_prev == 0) return false;
+ *level = level_prev;
+ }
+ }
+ const int step_prev =
+ kTransformHeight[inter_transform_sizes_[row4x4_prev][column4x4]];
+ *filter_length = std::min(*step, step_prev);
+ return true;
+}
+
+void PostFilter::GetHorizontalDeblockFilterEdgeInfoUV(
+ int row4x4, int column4x4, uint8_t* level_u, uint8_t* level_v, int* step,
+ int* filter_length) const {
+ const int subsampling_x = subsampling_x_[kPlaneU];
+ const int subsampling_y = subsampling_y_[kPlaneU];
+ row4x4 = GetDeblockPosition(row4x4, subsampling_y);
+ column4x4 = GetDeblockPosition(column4x4, subsampling_x);
+ const BlockParameters* bp = block_parameters_.Find(row4x4, column4x4);
+ *level_u = 0;
+ *level_v = 0;
+ *step = kTransformHeight[bp->uv_transform_size];
+ if (row4x4 == subsampling_y) {
+ return;
+ }
+
+ bool need_filter_u = frame_header_.loop_filter.level[kPlaneU + 1] != 0;
+ bool need_filter_v = frame_header_.loop_filter.level[kPlaneV + 1] != 0;
+ assert(need_filter_u || need_filter_v);
+ const int filter_id_u =
+ kDeblockFilterLevelIndex[kPlaneU][kLoopFilterTypeHorizontal];
+ const int filter_id_v =
+ kDeblockFilterLevelIndex[kPlaneV][kLoopFilterTypeHorizontal];
+ const int row4x4_prev = row4x4 - (1 << subsampling_y);
+ assert(row4x4_prev >= 0);
+ const BlockParameters* bp_prev =
+ block_parameters_.Find(row4x4_prev, column4x4);
+
+ if (bp == bp_prev) {
+ // Not a border.
+ const bool skip = bp->skip && bp->is_inter;
+ need_filter_u =
+ need_filter_u && bp->deblock_filter_level[filter_id_u] != 0 && !skip;
+ need_filter_v =
+ need_filter_v && bp->deblock_filter_level[filter_id_v] != 0 && !skip;
+ if (!need_filter_u && !need_filter_v) return;
+ if (need_filter_u) *level_u = bp->deblock_filter_level[filter_id_u];
+ if (need_filter_v) *level_v = bp->deblock_filter_level[filter_id_v];
+ *filter_length = *step;
+ return;
+ }
+
+ // It is a border.
+ if (need_filter_u) {
+ const uint8_t level_u_this = bp->deblock_filter_level[filter_id_u];
+ *level_u = level_u_this;
+ if (level_u_this == 0) {
+ *level_u = bp_prev->deblock_filter_level[filter_id_u];
+ }
+ }
+ if (need_filter_v) {
+ const uint8_t level_v_this = bp->deblock_filter_level[filter_id_v];
+ *level_v = level_v_this;
+ if (level_v_this == 0) {
+ *level_v = bp_prev->deblock_filter_level[filter_id_v];
+ }
+ }
+ const int step_prev = kTransformHeight[bp_prev->uv_transform_size];
+ *filter_length = std::min(*step, step_prev);
+}
+
+bool PostFilter::GetVerticalDeblockFilterEdgeInfo(
+ int row4x4, int column4x4, BlockParameters* const* bp_ptr, uint8_t* level,
+ int* step, int* filter_length) const {
+ const BlockParameters* bp = *bp_ptr;
+ *step = kTransformWidth[inter_transform_sizes_[row4x4][column4x4]];
+ if (column4x4 == 0) return false;
+
+ const int filter_id = 0;
+ const int column4x4_prev = column4x4 - 1;
+ assert(column4x4_prev >= 0);
+ const BlockParameters* bp_prev = *(bp_ptr - 1);
+ if (bp == bp_prev) {
+ // Not a border.
+ if (!NonBlockBorderNeedsFilter(*bp, filter_id, level)) return false;
+ } else {
+ // It is a border.
+ const uint8_t level_this = bp->deblock_filter_level[filter_id];
+ *level = level_this;
+ if (level_this == 0) {
+ const uint8_t level_prev = bp_prev->deblock_filter_level[filter_id];
+ if (level_prev == 0) return false;
+ *level = level_prev;
+ }
+ }
+ const int step_prev =
+ kTransformWidth[inter_transform_sizes_[row4x4][column4x4_prev]];
+ *filter_length = std::min(*step, step_prev);
+ return true;
+}
+
+void PostFilter::GetVerticalDeblockFilterEdgeInfoUV(
+ int column4x4, BlockParameters* const* bp_ptr, uint8_t* level_u,
+ uint8_t* level_v, int* step, int* filter_length) const {
+ const int subsampling_x = subsampling_x_[kPlaneU];
+ column4x4 = GetDeblockPosition(column4x4, subsampling_x);
+ const BlockParameters* bp = *bp_ptr;
+ *level_u = 0;
+ *level_v = 0;
+ *step = kTransformWidth[bp->uv_transform_size];
+ if (column4x4 == subsampling_x) {
+ return;
+ }
+
+ bool need_filter_u = frame_header_.loop_filter.level[kPlaneU + 1] != 0;
+ bool need_filter_v = frame_header_.loop_filter.level[kPlaneV + 1] != 0;
+ assert(need_filter_u || need_filter_v);
+ const int filter_id_u =
+ kDeblockFilterLevelIndex[kPlaneU][kLoopFilterTypeVertical];
+ const int filter_id_v =
+ kDeblockFilterLevelIndex[kPlaneV][kLoopFilterTypeVertical];
+ const BlockParameters* bp_prev = *(bp_ptr - (ptrdiff_t{1} << subsampling_x));
+
+ if (bp == bp_prev) {
+ // Not a border.
+ const bool skip = bp->skip && bp->is_inter;
+ need_filter_u =
+ need_filter_u && bp->deblock_filter_level[filter_id_u] != 0 && !skip;
+ need_filter_v =
+ need_filter_v && bp->deblock_filter_level[filter_id_v] != 0 && !skip;
+ if (!need_filter_u && !need_filter_v) return;
+ if (need_filter_u) *level_u = bp->deblock_filter_level[filter_id_u];
+ if (need_filter_v) *level_v = bp->deblock_filter_level[filter_id_v];
+ *filter_length = *step;
+ return;
+ }
+
+ // It is a border.
+ if (need_filter_u) {
+ const uint8_t level_u_this = bp->deblock_filter_level[filter_id_u];
+ *level_u = level_u_this;
+ if (level_u_this == 0) {
+ *level_u = bp_prev->deblock_filter_level[filter_id_u];
+ }
+ }
+ if (need_filter_v) {
+ const uint8_t level_v_this = bp->deblock_filter_level[filter_id_v];
+ *level_v = level_v_this;
+ if (level_v_this == 0) {
+ *level_v = bp_prev->deblock_filter_level[filter_id_v];
+ }
+ }
+ const int step_prev = kTransformWidth[bp_prev->uv_transform_size];
+ *filter_length = std::min(*step, step_prev);
+}
+
+void PostFilter::HorizontalDeblockFilter(int row4x4_start,
+ int column4x4_start) {
+ const int column_step = 1;
+ const int src_step = 4 << pixel_size_log2_;
+ const ptrdiff_t src_stride = frame_buffer_.stride(kPlaneY);
+ uint8_t* src = GetSourceBuffer(kPlaneY, row4x4_start, column4x4_start);
+ int row_step;
+ uint8_t level;
+ int filter_length;
+
+ for (int column4x4 = 0; column4x4 < kNum4x4InLoopFilterUnit &&
+ MultiplyBy4(column4x4_start + column4x4) < width_;
+ column4x4 += column_step, src += src_step) {
+ uint8_t* src_row = src;
+ for (int row4x4 = 0; row4x4 < kNum4x4InLoopFilterUnit &&
+ MultiplyBy4(row4x4_start + row4x4) < height_;
+ row4x4 += row_step) {
+ const bool need_filter = GetHorizontalDeblockFilterEdgeInfo(
+ row4x4_start + row4x4, column4x4_start + column4x4, &level, &row_step,
+ &filter_length);
+ if (need_filter) {
+ const dsp::LoopFilterSize size = GetLoopFilterSizeY(filter_length);
+ dsp_.loop_filters[size][kLoopFilterTypeHorizontal](
+ src_row, src_stride, outer_thresh_[level], inner_thresh_[level],
+ HevThresh(level));
+ }
+ // TODO(chengchen): use shifts instead of multiplication.
+ src_row += row_step * src_stride;
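+      // |row_step| is the transform height in pixels; convert it to units of
+      // 4x4 blocks for the row counter.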
+ row_step = DivideBy4(row_step);
+ }
+ }
+
+ if (needs_chroma_deblock_) {
+ const int8_t subsampling_x = subsampling_x_[kPlaneU];
+ const int8_t subsampling_y = subsampling_y_[kPlaneU];
+ const int column_step = 1 << subsampling_x;
+ const ptrdiff_t src_stride_u = frame_buffer_.stride(kPlaneU);
+ const ptrdiff_t src_stride_v = frame_buffer_.stride(kPlaneV);
+ uint8_t* src_u = GetSourceBuffer(kPlaneU, row4x4_start, column4x4_start);
+ uint8_t* src_v = GetSourceBuffer(kPlaneV, row4x4_start, column4x4_start);
+ int row_step;
+ uint8_t level_u;
+ uint8_t level_v;
+ int filter_length;
+
+ for (int column4x4 = 0; column4x4 < kNum4x4InLoopFilterUnit &&
+ MultiplyBy4(column4x4_start + column4x4) < width_;
+ column4x4 += column_step, src_u += src_step, src_v += src_step) {
+ uint8_t* src_row_u = src_u;
+ uint8_t* src_row_v = src_v;
+ for (int row4x4 = 0; row4x4 < kNum4x4InLoopFilterUnit &&
+ MultiplyBy4(row4x4_start + row4x4) < height_;
+ row4x4 += row_step) {
+ GetHorizontalDeblockFilterEdgeInfoUV(
+ row4x4_start + row4x4, column4x4_start + column4x4, &level_u,
+ &level_v, &row_step, &filter_length);
+ if (level_u != 0) {
+ const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length);
+ dsp_.loop_filters[size][kLoopFilterTypeHorizontal](
+ src_row_u, src_stride_u, outer_thresh_[level_u],
+ inner_thresh_[level_u], HevThresh(level_u));
+ }
+ if (level_v != 0) {
+ const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length);
+ dsp_.loop_filters[size][kLoopFilterTypeHorizontal](
+ src_row_v, src_stride_v, outer_thresh_[level_v],
+ inner_thresh_[level_v], HevThresh(level_v));
+ }
+ src_row_u += row_step * src_stride_u;
+ src_row_v += row_step * src_stride_v;
+ row_step = DivideBy4(row_step << subsampling_y);
+ }
+ }
+ }
+}
+
+void PostFilter::VerticalDeblockFilter(int row4x4_start, int column4x4_start) {
+ const ptrdiff_t row_stride = MultiplyBy4(frame_buffer_.stride(kPlaneY));
+ const ptrdiff_t src_stride = frame_buffer_.stride(kPlaneY);
+ uint8_t* src = GetSourceBuffer(kPlaneY, row4x4_start, column4x4_start);
+ int column_step;
+ uint8_t level;
+ int filter_length;
+
+ BlockParameters* const* bp_row_base =
+ block_parameters_.Address(row4x4_start, column4x4_start);
+ const int bp_stride = block_parameters_.columns4x4();
+ const int column_step_shift = pixel_size_log2_;
+ for (int row4x4 = 0; row4x4 < kNum4x4InLoopFilterUnit &&
+ MultiplyBy4(row4x4_start + row4x4) < height_;
+ ++row4x4, src += row_stride, bp_row_base += bp_stride) {
+ uint8_t* src_row = src;
+ BlockParameters* const* bp = bp_row_base;
+ for (int column4x4 = 0; column4x4 < kNum4x4InLoopFilterUnit &&
+ MultiplyBy4(column4x4_start + column4x4) < width_;
+ column4x4 += column_step, bp += column_step) {
+ const bool need_filter = GetVerticalDeblockFilterEdgeInfo(
+ row4x4_start + row4x4, column4x4_start + column4x4, bp, &level,
+ &column_step, &filter_length);
+ if (need_filter) {
+ const dsp::LoopFilterSize size = GetLoopFilterSizeY(filter_length);
+ dsp_.loop_filters[size][kLoopFilterTypeVertical](
+ src_row, src_stride, outer_thresh_[level], inner_thresh_[level],
+ HevThresh(level));
+ }
+ src_row += column_step << column_step_shift;
+ column_step = DivideBy4(column_step);
+ }
+ }
+
+ if (needs_chroma_deblock_) {
+ const int8_t subsampling_x = subsampling_x_[kPlaneU];
+ const int8_t subsampling_y = subsampling_y_[kPlaneU];
+ const int row_step = 1 << subsampling_y;
+ uint8_t* src_u = GetSourceBuffer(kPlaneU, row4x4_start, column4x4_start);
+ uint8_t* src_v = GetSourceBuffer(kPlaneV, row4x4_start, column4x4_start);
+ const ptrdiff_t src_stride_u = frame_buffer_.stride(kPlaneU);
+ const ptrdiff_t src_stride_v = frame_buffer_.stride(kPlaneV);
+ const ptrdiff_t row_stride_u = MultiplyBy4(frame_buffer_.stride(kPlaneU));
+ const ptrdiff_t row_stride_v = MultiplyBy4(frame_buffer_.stride(kPlaneV));
+ const LoopFilterType type = kLoopFilterTypeVertical;
+ int column_step;
+ uint8_t level_u;
+ uint8_t level_v;
+ int filter_length;
+
+ BlockParameters* const* bp_row_base = block_parameters_.Address(
+ GetDeblockPosition(row4x4_start, subsampling_y),
+ GetDeblockPosition(column4x4_start, subsampling_x));
+ const int bp_stride = block_parameters_.columns4x4() << subsampling_y;
+ for (int row4x4 = 0; row4x4 < kNum4x4InLoopFilterUnit &&
+ MultiplyBy4(row4x4_start + row4x4) < height_;
+ row4x4 += row_step, src_u += row_stride_u, src_v += row_stride_v,
+ bp_row_base += bp_stride) {
+ uint8_t* src_row_u = src_u;
+ uint8_t* src_row_v = src_v;
+ BlockParameters* const* bp = bp_row_base;
+ for (int column4x4 = 0; column4x4 < kNum4x4InLoopFilterUnit &&
+ MultiplyBy4(column4x4_start + column4x4) < width_;
+ column4x4 += column_step, bp += column_step) {
+ GetVerticalDeblockFilterEdgeInfoUV(column4x4_start + column4x4, bp,
+ &level_u, &level_v, &column_step,
+ &filter_length);
+ if (level_u != 0) {
+ const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length);
+ dsp_.loop_filters[size][type](
+ src_row_u, src_stride_u, outer_thresh_[level_u],
+ inner_thresh_[level_u], HevThresh(level_u));
+ }
+ if (level_v != 0) {
+ const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length);
+ dsp_.loop_filters[size][type](
+ src_row_v, src_stride_v, outer_thresh_[level_v],
+ inner_thresh_[level_v], HevThresh(level_v));
+ }
+ src_row_u += column_step << column_step_shift;
+ src_row_v += column_step << column_step_shift;
+ column_step = DivideBy4(column_step << subsampling_x);
+ }
+ }
+ }
+}
+
+void PostFilter::ApplyDeblockFilterForOneSuperBlockRow(int row4x4_start,
+ int sb4x4) {
+ assert(row4x4_start >= 0);
+ assert(DoDeblock());
+ for (int y = 0; y < sb4x4; y += 16) {
+ const int row4x4 = row4x4_start + y;
+ if (row4x4 >= frame_header_.rows4x4) break;
+ int column4x4;
+ for (column4x4 = 0; column4x4 < frame_header_.columns4x4;
+ column4x4 += kNum4x4InLoopFilterUnit) {
+ // First apply vertical filtering
+ VerticalDeblockFilter(row4x4, column4x4);
+
+ // Delay one superblock to apply horizontal filtering.
+ if (column4x4 != 0) {
+ HorizontalDeblockFilter(row4x4, column4x4 - kNum4x4InLoopFilterUnit);
+ }
+ }
+ // Horizontal filtering for the last 64x64 block.
+ HorizontalDeblockFilter(row4x4, column4x4 - kNum4x4InLoopFilterUnit);
+ }
+}
+
+template <LoopFilterType loop_filter_type>
+void PostFilter::DeblockFilterWorker(std::atomic<int>* row4x4_atomic) {
+ int row4x4;
+ while ((row4x4 = row4x4_atomic->fetch_add(kNum4x4InLoopFilterUnit,
+ std::memory_order_relaxed)) <
+ frame_header_.rows4x4) {
+ for (int column4x4 = 0; column4x4 < frame_header_.columns4x4;
+ column4x4 += kNum4x4InLoopFilterUnit) {
+ (this->*deblock_filter_func_[loop_filter_type])(row4x4, column4x4);
+ }
+ }
+}
+
+template void PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>(
+ std::atomic<int>* row4x4_atomic);
+template void PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>(
+ std::atomic<int>* row4x4_atomic);
+
+void PostFilter::ApplyDeblockFilter(LoopFilterType loop_filter_type,
+ int row4x4_start, int column4x4_start,
+ int column4x4_end, int sb4x4) {
+ assert(row4x4_start >= 0);
+ assert(DoDeblock());
+
+ column4x4_end = std::min(column4x4_end, frame_header_.columns4x4);
+ if (column4x4_start >= column4x4_end) return;
+
+ const DeblockFilter deblock_filter = deblock_filter_func_[loop_filter_type];
+ const int sb_height4x4 =
+ std::min(sb4x4, frame_header_.rows4x4 - row4x4_start);
+ for (int y = 0; y < sb_height4x4; y += kNum4x4InLoopFilterUnit) {
+ const int row4x4 = row4x4_start + y;
+ for (int column4x4 = column4x4_start; column4x4 < column4x4_end;
+ column4x4 += kNum4x4InLoopFilterUnit) {
+ (this->*deblock_filter)(row4x4, column4x4);
+ }
+ }
+}
+
+} // namespace libgav1
diff --git a/src/post_filter/deblock_thresholds.inc b/src/post_filter/deblock_thresholds.inc
new file mode 100644
index 0000000..ca12aaa
--- /dev/null
+++ b/src/post_filter/deblock_thresholds.inc
@@ -0,0 +1,85 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Thresholds for the deblocking filter. Precomputed values of part of Section
+// 7.14.4 for all possible values of sharpness.
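+// For filter level l and sharpness s, the inner ("limit") threshold below
+// works out to l >> ((s > 0) + (s > 4)), clamped to the range [1, 9 - s] when
+// s > 0 (and to a minimum of 1 when s == 0), and the outer ("blimit")
+// threshold is 2 * (l + 2) + inner.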
+
+constexpr uint8_t kInnerThresh[8][kMaxLoopFilterValue + 1] = {
+ {1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63},
+ {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8},
+ {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7},
+ {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6},
+ {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5},
+ {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4},
+ {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3},
+ {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}};
+
+constexpr uint8_t kOuterThresh[8][kMaxLoopFilterValue + 1] = {
+ {5, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40,
+ 43, 46, 49, 52, 55, 58, 61, 64, 67, 70, 73, 76, 79,
+ 82, 85, 88, 91, 94, 97, 100, 103, 106, 109, 112, 115, 118,
+ 121, 124, 127, 130, 133, 136, 139, 142, 145, 148, 151, 154, 157,
+ 160, 163, 166, 169, 172, 175, 178, 181, 184, 187, 190, 193},
+ {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34,
+ 36, 39, 41, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88,
+ 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114,
+ 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138},
+ {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34,
+ 36, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61,
+ 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87,
+ 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113,
+ 115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137},
+ {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34,
+ 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60,
+ 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86,
+ 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112,
+ 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136},
+ {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 33,
+ 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59,
+ 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85,
+ 87, 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111,
+ 113, 115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135},
+ {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 31,
+ 33, 35, 37, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58,
+ 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84,
+ 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110,
+ 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134},
+ {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 31,
+ 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57,
+ 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83,
+ 85, 87, 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109,
+ 111, 113, 115, 117, 119, 121, 123, 125, 127, 129, 131, 133},
+ {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56,
+ 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82,
+ 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108,
+ 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132}};
diff --git a/src/post_filter/loop_restoration.cc b/src/post_filter/loop_restoration.cc
new file mode 100644
index 0000000..3d5da90
--- /dev/null
+++ b/src/post_filter/loop_restoration.cc
@@ -0,0 +1,172 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "src/post_filter.h"
+#include "src/utils/blocking_counter.h"
+
+namespace libgav1 {
+
+template <typename Pixel>
+void PostFilter::ApplyLoopRestorationForOneRow(
+ const Pixel* src_buffer, const ptrdiff_t stride, const Plane plane,
+ const int plane_height, const int plane_width, const int unit_y,
+ const int unit_row, const int current_process_unit_height,
+ const int plane_unit_size, Pixel* dst_buffer) {
+ const int num_horizontal_units =
+ restoration_info_->num_horizontal_units(static_cast<Plane>(plane));
+ const RestorationUnitInfo* const restoration_info =
+ restoration_info_->loop_restoration_info(static_cast<Plane>(plane),
+ unit_row * num_horizontal_units);
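+  // When cdef is applied or filtering is multi-threaded, loop restoration runs
+  // in place and the rows it needs above and below each unit may already have
+  // been overwritten, so they are read from |loop_restoration_border_| (see
+  // the comment in post_filter.h).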
+ const bool in_place = DoCdef() || thread_pool_ != nullptr;
+ const Pixel* border = nullptr;
+ src_buffer += unit_y * stride;
+ if (in_place) {
+ assert(loop_restoration_border_.stride(plane) ==
+ static_cast<int>(sizeof(Pixel) * stride));
+ const int border_unit_y = std::max(
+ RightShiftWithCeiling(unit_y, 4 - subsampling_y_[plane]) - 4, 0);
+ border =
+ reinterpret_cast<const Pixel*>(loop_restoration_border_.data(plane)) +
+ border_unit_y * stride;
+ }
+ int unit_column = 0;
+ int column = 0;
+ do {
+ const int current_process_unit_width =
+ std::min(plane_unit_size, plane_width - column);
+ const Pixel* src = src_buffer + column;
+ unit_column = std::min(unit_column, num_horizontal_units - 1);
+ if (restoration_info[unit_column].type == kLoopRestorationTypeNone) {
+ Pixel* dst = dst_buffer + column;
+ if (in_place) {
+ int k = current_process_unit_height;
+ do {
+ memmove(dst, src, current_process_unit_width * sizeof(Pixel));
+ src += stride;
+ dst += stride;
+ } while (--k != 0);
+ } else {
+ CopyPlane(src, stride, current_process_unit_width,
+ current_process_unit_height, dst, stride);
+ }
+ } else {
+ const Pixel* top_border = src - kRestorationVerticalBorder * stride;
+ const Pixel* bottom_border = src + current_process_unit_height * stride;
+ const bool frame_bottom_border =
+ (unit_y + current_process_unit_height >= plane_height);
+ if (in_place && (unit_y != 0 || !frame_bottom_border)) {
+ const Pixel* loop_restoration_border = border + column;
+ if (unit_y != 0) {
+ top_border = loop_restoration_border;
+ loop_restoration_border += 4 * stride;
+ }
+ if (!frame_bottom_border) {
+ bottom_border =
+ loop_restoration_border + kRestorationVerticalBorder * stride;
+ }
+ }
+ RestorationBuffer restoration_buffer;
+ const LoopRestorationType type = restoration_info[unit_column].type;
+ assert(type == kLoopRestorationTypeSgrProj ||
+ type == kLoopRestorationTypeWiener);
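+      // |type| is either kLoopRestorationTypeWiener or
+      // kLoopRestorationTypeSgrProj here; subtracting 2 maps the two values
+      // to the two entries of |loop_restorations|.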
+ const dsp::LoopRestorationFunc restoration_func =
+ dsp_.loop_restorations[type - 2];
+ restoration_func(restoration_info[unit_column], src, top_border,
+ bottom_border, stride, current_process_unit_width,
+ current_process_unit_height, &restoration_buffer,
+ dst_buffer + column);
+ }
+ ++unit_column;
+ column += plane_unit_size;
+ } while (column < plane_width);
+}
+
+template <typename Pixel>
+void PostFilter::ApplyLoopRestorationForOneSuperBlockRow(const int row4x4_start,
+ const int sb4x4) {
+ assert(row4x4_start >= 0);
+ assert(DoRestoration());
+ int plane = kPlaneY;
+ do {
+ if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+ continue;
+ }
+ const ptrdiff_t stride = frame_buffer_.stride(plane) / sizeof(Pixel);
+ const int unit_height_offset =
+ kRestorationUnitOffset >> subsampling_y_[plane];
+ const int plane_height = SubsampledValue(height_, subsampling_y_[plane]);
+ const int plane_width =
+ SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+ const int plane_unit_size = 1 << loop_restoration_.unit_size_log2[plane];
+ const int plane_process_unit_height =
+ kRestorationUnitHeight >> subsampling_y_[plane];
+ int y = (row4x4_start == 0)
+ ? 0
+ : (MultiplyBy4(row4x4_start) >> subsampling_y_[plane]) -
+ unit_height_offset;
+ int expected_height = plane_process_unit_height -
+ ((row4x4_start == 0) ? unit_height_offset : 0);
+ int current_process_unit_height;
+ for (int sb_y = 0; sb_y < sb4x4;
+ sb_y += 16, y += current_process_unit_height) {
+ if (y >= plane_height) break;
+ const int unit_row = std::min(
+ (y + unit_height_offset) >> loop_restoration_.unit_size_log2[plane],
+ restoration_info_->num_vertical_units(static_cast<Plane>(plane)) - 1);
+ current_process_unit_height = std::min(expected_height, plane_height - y);
+ expected_height = plane_process_unit_height;
+ ApplyLoopRestorationForOneRow<Pixel>(
+ reinterpret_cast<Pixel*>(superres_buffer_[plane]), stride,
+ static_cast<Plane>(plane), plane_height, plane_width, y, unit_row,
+ current_process_unit_height, plane_unit_size,
+ reinterpret_cast<Pixel*>(loop_restoration_buffer_[plane]) +
+ y * stride);
+ }
+ } while (++plane < planes_);
+}
+
+void PostFilter::ApplyLoopRestoration(const int row4x4_start, const int sb4x4) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ ApplyLoopRestorationForOneSuperBlockRow<uint16_t>(row4x4_start, sb4x4);
+ return;
+ }
+#endif
+ ApplyLoopRestorationForOneSuperBlockRow<uint8_t>(row4x4_start, sb4x4);
+}
+
+void PostFilter::ApplyLoopRestorationWorker(std::atomic<int>* row4x4_atomic) {
+ int row4x4;
+ // Loop Restoration operates with a lag of 8 rows (4 for chroma with
+ // subsampling) and hence we need to make sure to cover the last 8 rows of the
+ // last superblock row. So we run this loop for an extra iteration to
+ // accomplish that.
+ const int row4x4_end = frame_header_.rows4x4 + kNum4x4InLoopRestorationUnit;
+ while ((row4x4 = row4x4_atomic->fetch_add(kNum4x4InLoopRestorationUnit,
+ std::memory_order_relaxed)) <
+ row4x4_end) {
+ CopyBordersForOneSuperBlockRow(row4x4, kNum4x4InLoopRestorationUnit,
+ /*for_loop_restoration=*/true);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ ApplyLoopRestorationForOneSuperBlockRow<uint16_t>(
+ row4x4, kNum4x4InLoopRestorationUnit);
+ continue;
+ }
+#endif
+ ApplyLoopRestorationForOneSuperBlockRow<uint8_t>(
+ row4x4, kNum4x4InLoopRestorationUnit);
+ }
+}
+
+} // namespace libgav1
diff --git a/src/post_filter/post_filter.cc b/src/post_filter/post_filter.cc
new file mode 100644
index 0000000..0eacf34
--- /dev/null
+++ b/src/post_filter/post_filter.cc
@@ -0,0 +1,601 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/post_filter.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/post_filter/deblock_thresholds.inc"
+
+// Row indices of the loop restoration border. These are used to populate
+// |loop_restoration_border_| when either cdef is on or multithreading is
+// enabled. The array is indexed by subsampling_y.
+constexpr int kLoopRestorationBorderRows[2] = {54, 26};
+
+} // namespace
+
+// The following example illustrates how ExtendFrame() extends a frame.
+// Suppose the frame width is 8 and height is 4, and left, right, top, and
+// bottom are all equal to 3.
+//
+// Before:
+//
+// ABCDEFGH
+// IJKLMNOP
+// QRSTUVWX
+// YZabcdef
+//
+// After:
+//
+// AAA|ABCDEFGH|HHH [3]
+// AAA|ABCDEFGH|HHH
+// AAA|ABCDEFGH|HHH
+// ---+--------+---
+// AAA|ABCDEFGH|HHH [1]
+// III|IJKLMNOP|PPP
+// QQQ|QRSTUVWX|XXX
+// YYY|YZabcdef|fff
+// ---+--------+---
+// YYY|YZabcdef|fff [2]
+// YYY|YZabcdef|fff
+// YYY|YZabcdef|fff
+//
+// ExtendFrame() first extends the rows to the left and to the right[1]. Then
+// it copies the extended last row to the bottom borders[2]. Finally it copies
+// the extended first row to the top borders[3].
+// static
+template <typename Pixel>
+void PostFilter::ExtendFrame(Pixel* const frame_start, const int width,
+ const int height, const ptrdiff_t stride,
+ const int left, const int right, const int top,
+ const int bottom) {
+ Pixel* src = frame_start;
+ // Copy to left and right borders.
+ int y = height;
+ do {
+ ExtendLine<Pixel>(src, width, left, right);
+ src += stride;
+ } while (--y != 0);
+ // Copy to bottom borders. For performance we copy |stride| pixels
+ // (potentially including some padding pixels) in each row, ending at the
+ // bottom right border pixel. In the diagram the asterisks indicate padding
+ // pixels.
+ //
+ // |<--- stride --->|
+ // **YYY|YZabcdef|fff <-- Copy from the extended last row.
+ // -----+--------+---
+ // **YYY|YZabcdef|fff
+ // **YYY|YZabcdef|fff
+ // **YYY|YZabcdef|fff <-- bottom right border pixel
+ assert(src == frame_start + height * stride);
+ Pixel* dst = src - left;
+ src = dst - stride;
+ for (int y = 0; y < bottom; ++y) {
+ memcpy(dst, src, sizeof(Pixel) * stride);
+ dst += stride;
+ }
+ // Copy to top borders. For performance we copy |stride| pixels (potentially
+ // including some padding pixels) in each row, starting from the top left
+ // border pixel. In the diagram the asterisks indicate padding pixels.
+ //
+ // +-- top left border pixel
+ // |
+ // v
+ // AAA|ABCDEFGH|HHH**
+ // AAA|ABCDEFGH|HHH**
+ // AAA|ABCDEFGH|HHH**
+ // ---+--------+-----
+ // AAA|ABCDEFGH|HHH** <-- Copy from the extended first row.
+ // |<--- stride --->|
+ src = frame_start - left;
+ dst = frame_start - left - top * stride;
+ for (int y = 0; y < top; ++y) {
+ memcpy(dst, src, sizeof(Pixel) * stride);
+ dst += stride;
+ }
+}
+
+template void PostFilter::ExtendFrame<uint8_t>(uint8_t* const frame_start,
+ const int width,
+ const int height,
+ const ptrdiff_t stride,
+ const int left, const int right,
+ const int top, const int bottom);
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void PostFilter::ExtendFrame<uint16_t>(
+ uint16_t* const frame_start, const int width, const int height,
+ const ptrdiff_t stride, const int left, const int right, const int top,
+ const int bottom);
+#endif
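The diagram above is easiest to internalize with a tiny standalone version of the same edge-replication pattern. The sketch below is not the library's ExtendFrame(); it assumes a single plane stored in a std::vector sized (height + 2 * border) * (width + 2 * border), and the function name is made up for illustration.

#include <cstdint>
#include <cstring>
#include <vector>

// Replicates the visible edges of one plane into its borders: side borders
// first, then the bottom rows, then the top rows, mirroring the order used
// by ExtendFrame() above.
void ExtendPlaneSketch(std::vector<uint8_t>* plane, int width, int height,
                       int border) {
  const int stride = width + 2 * border;
  uint8_t* const start = plane->data() + border * stride + border;
  // Replicate the leftmost and rightmost visible pixels of every row.
  for (int y = 0; y < height; ++y) {
    uint8_t* const row = start + y * stride;
    memset(row - border, row[0], border);
    memset(row + width, row[width - 1], border);
  }
  // Copy the fully extended last row into each bottom border row.
  const uint8_t* const last = start + (height - 1) * stride - border;
  for (int y = 0; y < border; ++y) {
    memcpy(start + (height + y) * stride - border, last, stride);
  }
  // Copy the fully extended first row into each top border row.
  const uint8_t* const first = start - border;
  for (int y = 1; y <= border; ++y) {
    memcpy(start - y * stride - border, first, stride);
  }
}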
+
+PostFilter::PostFilter(const ObuFrameHeader& frame_header,
+ const ObuSequenceHeader& sequence_header,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ YuvBuffer* const frame_buffer, const dsp::Dsp* dsp,
+ int do_post_filter_mask)
+ : frame_header_(frame_header),
+ loop_restoration_(frame_header.loop_restoration),
+ dsp_(*dsp),
+ // Deblocking filter always uses 64x64 as step size.
+ num_64x64_blocks_per_row_(DivideBy64(frame_header.width + 63)),
+ upscaled_width_(frame_header.upscaled_width),
+ width_(frame_header.width),
+ height_(frame_header.height),
+ bitdepth_(sequence_header.color_config.bitdepth),
+ subsampling_x_{0, sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_x},
+ subsampling_y_{0, sequence_header.color_config.subsampling_y,
+ sequence_header.color_config.subsampling_y},
+ planes_(sequence_header.color_config.is_monochrome ? kMaxPlanesMonochrome
+ : kMaxPlanes),
+ pixel_size_log2_(static_cast<int>((bitdepth_ == 8) ? sizeof(uint8_t)
+ : sizeof(uint16_t)) -
+ 1),
+ inner_thresh_(kInnerThresh[frame_header.loop_filter.sharpness]),
+ outer_thresh_(kOuterThresh[frame_header.loop_filter.sharpness]),
+ needs_chroma_deblock_(frame_header.loop_filter.level[kPlaneU + 1] != 0 ||
+ frame_header.loop_filter.level[kPlaneV + 1] != 0),
+ cdef_index_(frame_scratch_buffer->cdef_index),
+ inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes),
+ restoration_info_(&frame_scratch_buffer->loop_restoration_info),
+ superres_coefficients_{
+ frame_scratch_buffer->superres_coefficients[kPlaneTypeY].get(),
+ frame_scratch_buffer
+ ->superres_coefficients
+ [(sequence_header.color_config.is_monochrome ||
+ sequence_header.color_config.subsampling_x == 0)
+ ? kPlaneTypeY
+ : kPlaneTypeUV]
+ .get()},
+ superres_line_buffer_(frame_scratch_buffer->superres_line_buffer),
+ block_parameters_(frame_scratch_buffer->block_parameters_holder),
+ frame_buffer_(*frame_buffer),
+ cdef_border_(frame_scratch_buffer->cdef_border),
+ loop_restoration_border_(frame_scratch_buffer->loop_restoration_border),
+ do_post_filter_mask_(do_post_filter_mask),
+ thread_pool_(
+ frame_scratch_buffer->threading_strategy.post_filter_thread_pool()) {
+ const int8_t zero_delta_lf[kFrameLfCount] = {};
+ ComputeDeblockFilterLevels(zero_delta_lf, deblock_filter_levels_);
+ if (DoSuperRes()) {
+ int plane = kPlaneY;
+ do {
+ const int downscaled_width =
+ SubsampledValue(width_, subsampling_x_[plane]);
+ const int upscaled_width =
+ SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+ const int superres_width = downscaled_width << kSuperResScaleBits;
+ super_res_info_[plane].step =
+ (superres_width + upscaled_width / 2) / upscaled_width;
+ const int error =
+ super_res_info_[plane].step * upscaled_width - superres_width;
+ super_res_info_[plane].initial_subpixel_x =
+ ((-((upscaled_width - downscaled_width) << (kSuperResScaleBits - 1)) +
+ DivideBy2(upscaled_width)) /
+ upscaled_width +
+ (1 << (kSuperResExtraBits - 1)) - error / 2) &
+ kSuperResScaleMask;
+ super_res_info_[plane].upscaled_width = upscaled_width;
+ } while (++plane < planes_);
+ if (dsp->super_res_coefficients != nullptr) {
+ int plane = kPlaneY;
+ const int number_loops = (superres_coefficients_[kPlaneTypeY] ==
+ superres_coefficients_[kPlaneTypeUV])
+ ? kMaxPlanesMonochrome
+ : static_cast<int>(kNumPlaneTypes);
+ do {
+ dsp->super_res_coefficients(
+ SubsampledValue(upscaled_width_, subsampling_x_[plane]),
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step, superres_coefficients_[plane]);
+ } while (++plane < number_loops);
+ }
+ }
+ int plane = kPlaneY;
+ do {
+ loop_restoration_buffer_[plane] = frame_buffer_.data(plane);
+ cdef_buffer_[plane] = frame_buffer_.data(plane);
+ superres_buffer_[plane] = frame_buffer_.data(plane);
+ source_buffer_[plane] = frame_buffer_.data(plane);
+ } while (++plane < planes_);
+ if (DoCdef() || DoRestoration() || DoSuperRes()) {
+ plane = kPlaneY;
+ const int pixel_size_log2 = pixel_size_log2_;
+ do {
+ int horizontal_shift = 0;
+ int vertical_shift = 0;
+ if (DoRestoration() &&
+ loop_restoration_.type[plane] != kLoopRestorationTypeNone) {
+ horizontal_shift += frame_buffer_.alignment();
+ if (!DoCdef() && thread_pool_ == nullptr) {
+ vertical_shift += kRestorationVerticalBorder;
+ }
+ superres_buffer_[plane] +=
+ vertical_shift * frame_buffer_.stride(plane) +
+ (horizontal_shift << pixel_size_log2);
+ }
+ if (DoSuperRes()) {
+ vertical_shift += kSuperResVerticalBorder;
+ }
+ cdef_buffer_[plane] += vertical_shift * frame_buffer_.stride(plane) +
+ (horizontal_shift << pixel_size_log2);
+ if (DoCdef() && thread_pool_ == nullptr) {
+ horizontal_shift += frame_buffer_.alignment();
+ vertical_shift += kCdefBorder;
+ }
+ assert(horizontal_shift <= frame_buffer_.right_border(plane));
+ assert(vertical_shift <= frame_buffer_.bottom_border(plane));
+ source_buffer_[plane] += vertical_shift * frame_buffer_.stride(plane) +
+ (horizontal_shift << pixel_size_log2);
+ } while (++plane < planes_);
+ }
+}
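The SuperRes setup in the constructor above reduces to a rounded fixed-point ratio between the downscaled and upscaled widths. A minimal sketch of that arithmetic, assuming kSuperResScaleBits is 14; the helper name and the example widths are illustrative:

// Source step per upscaled output pixel: downscaled/upscaled rounded to
// nearest, in units of 2^-14 input pixels.
constexpr int kScaleBits = 14;  // assumed value of kSuperResScaleBits

constexpr int SuperResStep(int downscaled_width, int upscaled_width) {
  return (downscaled_width * (1 << kScaleBits) + upscaled_width / 2) /
         upscaled_width;
}

// Example: upscaling a 480-pixel row back to 640 pixels advances the source
// position by 3/4 of an input pixel per output pixel.
static_assert(SuperResStep(480, 640) == 3 * (1 << kScaleBits) / 4,
              "step is 0.75 input pixels in Q14");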
+
+void PostFilter::ExtendFrameBoundary(uint8_t* const frame_start,
+ const int width, const int height,
+ const ptrdiff_t stride, const int left,
+ const int right, const int top,
+ const int bottom) const {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ ExtendFrame<uint16_t>(reinterpret_cast<uint16_t*>(frame_start), width,
+ height, stride / sizeof(uint16_t), left, right, top,
+ bottom);
+ return;
+ }
+#endif
+ ExtendFrame<uint8_t>(frame_start, width, height, stride, left, right, top,
+ bottom);
+}
+
+void PostFilter::ExtendBordersForReferenceFrame() {
+ if (frame_header_.refresh_frame_flags == 0) return;
+ int plane = kPlaneY;
+ do {
+ const int plane_width =
+ SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+ const int plane_height = SubsampledValue(height_, subsampling_y_[plane]);
+ assert(frame_buffer_.left_border(plane) >= kMinLeftBorderPixels &&
+ frame_buffer_.right_border(plane) >= kMinRightBorderPixels &&
+ frame_buffer_.top_border(plane) >= kMinTopBorderPixels &&
+ frame_buffer_.bottom_border(plane) >= kMinBottomBorderPixels);
+ // plane subsampling_x_ left_border
+ // Y N/A 64, 48
+ // U,V 0 64, 48
+ // U,V 1 32, 16
+ assert(frame_buffer_.left_border(plane) >= 16);
+ // The |left| argument to ExtendFrameBoundary() must be at least
+ // kMinLeftBorderPixels (13) for warp.
+ static_assert(16 >= kMinLeftBorderPixels, "");
+ ExtendFrameBoundary(
+ frame_buffer_.data(plane), plane_width, plane_height,
+ frame_buffer_.stride(plane), frame_buffer_.left_border(plane),
+ frame_buffer_.right_border(plane), frame_buffer_.top_border(plane),
+ frame_buffer_.bottom_border(plane));
+ } while (++plane < planes_);
+}
+
+void PostFilter::CopyDeblockedPixels(Plane plane, int row4x4) {
+ assert(frame_buffer_.stride(plane) == loop_restoration_border_.stride(plane));
+ const ptrdiff_t stride = frame_buffer_.stride(plane);
+ const uint8_t* const src = GetSourceBuffer(plane, row4x4, 0);
+ const int row_offset = DivideBy4(row4x4);
+ uint8_t* dst = loop_restoration_border_.data(plane) + row_offset * stride;
+ const int num_pixels = SubsampledValue(MultiplyBy4(frame_header_.columns4x4),
+ subsampling_x_[plane]);
+ const int row_width = num_pixels << pixel_size_log2_;
+ int last_valid_row = -1;
+ const int plane_height =
+ SubsampledValue(frame_header_.height, subsampling_y_[plane]);
+ int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
+ const int absolute_row = (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+ for (int i = 0; i < 4; ++i, ++row) {
+ if (absolute_row + i >= plane_height) {
+ if (last_valid_row == -1) break;
+ // If we run out of rows, copy the last valid row (mimics the bottom
+ // border extension).
+ row = last_valid_row;
+ }
+ memcpy(dst, src + row * stride, row_width);
+ last_valid_row = row;
+ dst += stride;
+ }
+}
+
+void PostFilter::CopyBordersForOneSuperBlockRow(int row4x4, int sb4x4,
+ bool for_loop_restoration) {
+ // Number of rows to be subtracted from the start position described by
+ // row4x4. We always lag by 8 rows (to account for in-loop post filters).
+ const int row_offset = (row4x4 == 0) ? 0 : 8;
+ // Number of rows to be subtracted from the height described by sb4x4.
+ const int height_offset = (row4x4 == 0) ? 8 : 0;
+ // If cdef is off and post filter multithreading is off, then loop restoration
+ // needs 2 extra rows for the bottom border in each plane.
+ const int extra_rows =
+ (for_loop_restoration && thread_pool_ == nullptr && !DoCdef()) ? 2 : 0;
+ int plane = kPlaneY;
+ do {
+ const int plane_width =
+ SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+ const int plane_height = SubsampledValue(height_, subsampling_y_[plane]);
+ const int row = (MultiplyBy4(row4x4) - row_offset) >> subsampling_y_[plane];
+ assert(row >= 0);
+ if (row >= plane_height) break;
+ const int num_rows =
+ std::min(SubsampledValue(MultiplyBy4(sb4x4) - height_offset,
+ subsampling_y_[plane]) +
+ extra_rows,
+ plane_height - row);
+ // We only need to track the progress of the Y plane since the progress of
+ // the U and V planes will be inferred from the progress of the Y plane.
+ if (!for_loop_restoration && plane == kPlaneY) {
+ progress_row_ = row + num_rows;
+ }
+ const bool copy_bottom = row + num_rows == plane_height;
+ const int stride = frame_buffer_.stride(plane);
+ uint8_t* const start = (for_loop_restoration ? superres_buffer_[plane]
+ : frame_buffer_.data(plane)) +
+ row * stride;
+ const int left_border = for_loop_restoration
+ ? kRestorationHorizontalBorder
+ : frame_buffer_.left_border(plane);
+ const int right_border = for_loop_restoration
+ ? kRestorationHorizontalBorder
+ : frame_buffer_.right_border(plane);
+ const int top_border =
+ (row == 0) ? (for_loop_restoration ? kRestorationVerticalBorder
+ : frame_buffer_.top_border(plane))
+ : 0;
+ const int bottom_border =
+ copy_bottom
+ ? (for_loop_restoration ? kRestorationVerticalBorder
+ : frame_buffer_.bottom_border(plane))
+ : 0;
+ ExtendFrameBoundary(start, plane_width, num_rows, stride, left_border,
+ right_border, top_border, bottom_border);
+ } while (++plane < planes_);
+}
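The row_offset/height_offset bookkeeping above amounts to tiling the plane with an 8-row lag. A small sketch for a non-subsampled plane, ignoring the loop-restoration extra rows and the clamp against the plane height; the helper names are made up for illustration:

// Start row and row count covered by one call for a non-subsampled plane.
constexpr int StartRow(int row4x4) {
  return row4x4 * 4 - ((row4x4 == 0) ? 0 : 8);
}
constexpr int RowCount(int row4x4, int sb4x4) {
  return sb4x4 * 4 - ((row4x4 == 0) ? 8 : 0);
}

// With 64x64 superblocks (sb4x4 == 16) the first call covers rows [0, 56),
// the second covers [56, 120), the third [120, 184), and so on, so
// successive calls tile the plane without gaps or overlap.
static_assert(StartRow(0) == 0 && RowCount(0, 16) == 56, "");
static_assert(StartRow(16) == 56 && RowCount(16, 16) == 64, "");
static_assert(StartRow(32) == 120, "");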
+
+void PostFilter::SetupLoopRestorationBorder(const int row4x4) {
+ assert(row4x4 >= 0);
+ assert(!DoCdef());
+ assert(DoRestoration());
+ int plane = kPlaneY;
+ do {
+ if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+ continue;
+ }
+ assert(frame_buffer_.stride(plane) ==
+ loop_restoration_border_.stride(plane));
+ const ptrdiff_t stride = frame_buffer_.stride(plane);
+ const int row_offset = DivideBy4(row4x4);
+ const int num_pixels =
+ SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+ const int row_width = num_pixels << pixel_size_log2_;
+ const int plane_height = SubsampledValue(height_, subsampling_y_[plane]);
+ const int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
+ const int absolute_row =
+ (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+ const uint8_t* src =
+ GetSuperResBuffer(static_cast<Plane>(plane), row4x4, 0) + row * stride;
+ uint8_t* dst = loop_restoration_border_.data(plane) + row_offset * stride;
+ for (int i = 0; i < 4; ++i) {
+ memcpy(dst, src, row_width);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ ExtendLine<uint16_t>(dst, num_pixels, kRestorationHorizontalBorder,
+ kRestorationHorizontalBorder);
+ } else // NOLINT.
+#endif
+ ExtendLine<uint8_t>(dst, num_pixels, kRestorationHorizontalBorder,
+ kRestorationHorizontalBorder);
+ // If we run out of rows, copy the last valid row (mimics the bottom
+ // border extension).
+ if (absolute_row + i < plane_height - 1) src += stride;
+ dst += stride;
+ }
+ } while (++plane < planes_);
+}
+
+void PostFilter::SetupLoopRestorationBorder(int row4x4_start, int sb4x4) {
+ assert(row4x4_start >= 0);
+ assert(DoCdef());
+ assert(DoRestoration());
+ for (int sb_y = 0; sb_y < sb4x4; sb_y += 16) {
+ const int row4x4 = row4x4_start + sb_y;
+ const int row_offset_start = DivideBy4(row4x4);
+ std::array<uint8_t*, kMaxPlanes> dst = {
+ loop_restoration_border_.data(kPlaneY) +
+ row_offset_start * loop_restoration_border_.stride(kPlaneY),
+ loop_restoration_border_.data(kPlaneU) +
+ row_offset_start * loop_restoration_border_.stride(kPlaneU),
+ loop_restoration_border_.data(kPlaneV) +
+ row_offset_start * loop_restoration_border_.stride(kPlaneV)};
+ // If SuperRes is enabled, then we apply SuperRes for the rows to be copied
+ // directly with |loop_restoration_border_| as the destination. Otherwise,
+ // we simply copy the rows.
+ if (DoSuperRes()) {
+ std::array<uint8_t*, kMaxPlanes> src;
+ std::array<int, kMaxPlanes> rows;
+ int plane = kPlaneY;
+ do {
+ if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+ rows[plane] = 0;
+ continue;
+ }
+ const int plane_height =
+ SubsampledValue(frame_header_.height, subsampling_y_[plane]);
+ const int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
+ const int absolute_row =
+ (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+ src[plane] = GetSourceBuffer(static_cast<Plane>(plane), row4x4, 0) +
+ row * frame_buffer_.stride(plane);
+ rows[plane] = Clip3(plane_height - absolute_row, 0, 4);
+ } while (++plane < planes_);
+ ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst);
+ // If we run out of rows, copy the last valid row (mimics the bottom
+ // border extension).
+ plane = kPlaneY;
+ do {
+ if (rows[plane] == 0 || rows[plane] >= 4) continue;
+ const ptrdiff_t stride = frame_buffer_.stride(plane);
+ uint8_t* dst_line = dst[plane] + rows[plane] * stride;
+ const uint8_t* const src_line = dst_line - stride;
+ const int upscaled_width = super_res_info_[plane].upscaled_width
+ << pixel_size_log2_;
+ for (int i = rows[plane]; i < 4; ++i) {
+ memcpy(dst_line, src_line, upscaled_width);
+ dst_line += stride;
+ }
+ } while (++plane < planes_);
+ } else {
+ int plane = kPlaneY;
+ do {
+ CopyDeblockedPixels(static_cast<Plane>(plane), row4x4);
+ } while (++plane < planes_);
+ }
+ // Extend the left and right boundaries needed for loop restoration.
+ int plane = kPlaneY;
+ do {
+ if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+ continue;
+ }
+ uint8_t* dst_line = dst[plane];
+ const int plane_width =
+ SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+ for (int i = 0; i < 4; ++i) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ ExtendLine<uint16_t>(dst_line, plane_width,
+ kRestorationHorizontalBorder,
+ kRestorationHorizontalBorder);
+ } else // NOLINT.
+#endif
+ {
+ ExtendLine<uint8_t>(dst_line, plane_width,
+ kRestorationHorizontalBorder,
+ kRestorationHorizontalBorder);
+ }
+ dst_line += loop_restoration_border_.stride(plane);
+ }
+ } while (++plane < planes_);
+ }
+}
+
+void PostFilter::RunJobs(WorkerFunction worker) {
+ std::atomic<int> row4x4(0);
+ const int num_workers = thread_pool_->num_threads();
+ BlockingCounter pending_workers(num_workers);
+ for (int i = 0; i < num_workers; ++i) {
+ thread_pool_->Schedule([this, &row4x4, &pending_workers, worker]() {
+ (this->*worker)(&row4x4);
+ pending_workers.Decrement();
+ });
+ }
+ // Run the jobs on the current thread.
+ (this->*worker)(&row4x4);
+ // Wait for the threadpool jobs to finish.
+ pending_workers.Wait();
+}
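RunJobs() and the worker functions above follow a claim-rows-with-fetch_add pattern: every pool thread plus the calling thread repeatedly grabs the next chunk of rows until the frame is exhausted. The sketch below reproduces the pattern with plain std::thread in place of the library's ThreadPool and BlockingCounter, which are assumed here to behave like a fixed pool plus a countdown latch.

#include <atomic>
#include <thread>
#include <vector>

void RunRowJobs(int num_rows, int rows_per_job, int num_threads) {
  std::atomic<int> next_row(0);
  const auto worker = [&]() {
    int row;
    while ((row = next_row.fetch_add(rows_per_job,
                                     std::memory_order_relaxed)) < num_rows) {
      // Process rows [row, row + rows_per_job) here.
    }
  };
  std::vector<std::thread> threads;
  // Fan out to the pool threads, then participate on the calling thread.
  for (int i = 1; i < num_threads; ++i) threads.emplace_back(worker);
  worker();
  for (auto& t : threads) t.join();
}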
+
+void PostFilter::ApplyFilteringThreaded() {
+ if (DoDeblock()) {
+ RunJobs(&PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>);
+ RunJobs(&PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>);
+ }
+ if (DoCdef() && DoRestoration()) {
+ for (int row4x4 = 0; row4x4 < frame_header_.rows4x4;
+ row4x4 += kNum4x4InLoopFilterUnit) {
+ SetupLoopRestorationBorder(row4x4, kNum4x4InLoopFilterUnit);
+ }
+ }
+ if (DoCdef()) {
+ for (int row4x4 = 0; row4x4 < frame_header_.rows4x4;
+ row4x4 += kNum4x4InLoopFilterUnit) {
+ SetupCdefBorder(row4x4);
+ }
+ RunJobs(&PostFilter::ApplyCdefWorker);
+ }
+ if (DoSuperRes()) ApplySuperResThreaded();
+ if (DoRestoration()) {
+ if (!DoCdef()) {
+ int row4x4 = 0;
+ do {
+ SetupLoopRestorationBorder(row4x4);
+ row4x4 += kNum4x4InLoopFilterUnit;
+ } while (row4x4 < frame_header_.rows4x4);
+ }
+ RunJobs(&PostFilter::ApplyLoopRestorationWorker);
+ }
+ ExtendBordersForReferenceFrame();
+}
+
+int PostFilter::ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4,
+ bool is_last_row,
+ bool do_deblock) {
+ if (row4x4 < 0) return -1;
+ if (DoDeblock() && do_deblock) {
+ ApplyDeblockFilterForOneSuperBlockRow(row4x4, sb4x4);
+ }
+ if (DoRestoration() && DoCdef()) {
+ SetupLoopRestorationBorder(row4x4, sb4x4);
+ }
+ if (DoCdef()) {
+ ApplyCdefForOneSuperBlockRow(row4x4, sb4x4, is_last_row);
+ }
+ if (DoSuperRes()) {
+ ApplySuperResForOneSuperBlockRow(row4x4, sb4x4, is_last_row);
+ }
+ if (DoRestoration()) {
+ CopyBordersForOneSuperBlockRow(row4x4, sb4x4, true);
+ ApplyLoopRestoration(row4x4, sb4x4);
+ if (is_last_row) {
+ // Loop restoration operates with a lag of 8 rows. So make sure to cover
+ // all the rows of the last superblock row.
+ CopyBordersForOneSuperBlockRow(row4x4 + sb4x4, 16, true);
+ ApplyLoopRestoration(row4x4 + sb4x4, 16);
+ }
+ }
+ if (frame_header_.refresh_frame_flags != 0 && DoBorderExtensionInLoop()) {
+ CopyBordersForOneSuperBlockRow(row4x4, sb4x4, false);
+ if (is_last_row) {
+ CopyBordersForOneSuperBlockRow(row4x4 + sb4x4, 16, false);
+ }
+ }
+ if (is_last_row && !DoBorderExtensionInLoop()) {
+ ExtendBordersForReferenceFrame();
+ }
+ return is_last_row ? height_ : progress_row_;
+}
+
+} // namespace libgav1
diff --git a/src/post_filter/super_res.cc b/src/post_filter/super_res.cc
new file mode 100644
index 0000000..a70e4ed
--- /dev/null
+++ b/src/post_filter/super_res.cc
@@ -0,0 +1,199 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "src/post_filter.h"
+#include "src/utils/blocking_counter.h"
+
+namespace libgav1 {
+
+void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& src,
+ const std::array<int, kMaxPlanes>& rows,
+ const int line_buffer_row,
+ const std::array<uint8_t*, kMaxPlanes>& dst) {
+ int plane = kPlaneY;
+ do {
+ const int plane_width =
+ MultiplyBy4(frame_header_.columns4x4) >> subsampling_x_[plane];
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ auto* input = reinterpret_cast<uint16_t*>(src[plane]);
+ auto* output = reinterpret_cast<uint16_t*>(dst[plane]);
+ const ptrdiff_t stride = frame_buffer_.stride(plane) / sizeof(uint16_t);
+ if (rows[plane] > 0) {
+ dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+ input, stride, rows[plane], plane_width,
+ super_res_info_[plane].upscaled_width,
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step, output);
+ }
+ // In the multi-threaded case, the |superres_line_buffer_| holds the last
+ // input row. Apply SuperRes for that row.
+ if (line_buffer_row >= 0) {
+ auto* const line_buffer_start =
+ reinterpret_cast<uint16_t*>(superres_line_buffer_.data(plane)) +
+ line_buffer_row * superres_line_buffer_.stride(plane) /
+ sizeof(uint16_t) +
+ kSuperResHorizontalBorder;
+ dsp_.super_res(
+ superres_coefficients_[static_cast<int>(plane != 0)],
+ line_buffer_start, /*stride=*/0,
+ /*height=*/1, plane_width, super_res_info_[plane].upscaled_width,
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step, output + rows[plane] * stride);
+ }
+ continue;
+ }
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ uint8_t* input = src[plane];
+ uint8_t* output = dst[plane];
+ if (rows[plane] > 0) {
+ dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+ input, frame_buffer_.stride(plane), rows[plane],
+ plane_width, super_res_info_[plane].upscaled_width,
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step, output);
+ }
+ // In the multi-threaded case, the |superres_line_buffer_| holds the last
+ // input row. Apply SuperRes for that row.
+ if (line_buffer_row >= 0) {
+ uint8_t* const line_buffer_start =
+ superres_line_buffer_.data(plane) +
+ line_buffer_row * superres_line_buffer_.stride(plane) +
+ kSuperResHorizontalBorder;
+ dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+ line_buffer_start, /*stride=*/0,
+ /*height=*/1, plane_width,
+ super_res_info_[plane].upscaled_width,
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step,
+ output + rows[plane] * frame_buffer_.stride(plane));
+ }
+ } while (++plane < planes_);
+}
+
+void PostFilter::ApplySuperResForOneSuperBlockRow(int row4x4_start, int sb4x4,
+ bool is_last_row) {
+ assert(row4x4_start >= 0);
+ assert(DoSuperRes());
+ // If not doing cdef, then LR needs two rows of border with superres applied.
+ const int num_rows_extra = (DoCdef() || !DoRestoration()) ? 0 : 2;
+ std::array<uint8_t*, kMaxPlanes> src;
+ std::array<uint8_t*, kMaxPlanes> dst;
+ std::array<int, kMaxPlanes> rows;
+ const int num_rows4x4 =
+ std::min(sb4x4, frame_header_.rows4x4 - row4x4_start) -
+ (is_last_row ? 0 : 2);
+ if (row4x4_start > 0) {
+ const int row4x4 = row4x4_start - 2;
+ int plane = kPlaneY;
+ do {
+ const int row =
+ (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + num_rows_extra;
+ const ptrdiff_t row_offset = row * frame_buffer_.stride(plane);
+ src[plane] = cdef_buffer_[plane] + row_offset;
+ dst[plane] = superres_buffer_[plane] + row_offset;
+ // Note that the |num_rows_extra| subtraction is done after the value is
+ // subsampled since we always need to work on |num_rows_extra| extra rows
+ // irrespective of the plane subsampling.
+ // Apply superres for the last 8-|num_rows_extra| rows of the previous
+ // superblock.
+ rows[plane] = (8 >> subsampling_y_[plane]) - num_rows_extra;
+ // Apply superres for the current superblock row (except for the last
+ // 8-|num_rows_extra| rows).
+ rows[plane] += (MultiplyBy4(num_rows4x4) >> subsampling_y_[plane]) +
+ (is_last_row ? 0 : num_rows_extra);
+ } while (++plane < planes_);
+ } else {
+ // Apply superres for the current superblock row (except for the last
+ // 8-|num_rows_extra| rows).
+ int plane = kPlaneY;
+ do {
+ const ptrdiff_t row_offset =
+ (MultiplyBy4(row4x4_start) >> subsampling_y_[plane]) *
+ frame_buffer_.stride(plane);
+ src[plane] = cdef_buffer_[plane] + row_offset;
+ dst[plane] = superres_buffer_[plane] + row_offset;
+ // Note that the |num_rows_extra| addition is done after the value is
+ // subsampled since we always need to work on |num_rows_extra| extra rows
+ // irrespective of the plane subsampling.
+ rows[plane] = (MultiplyBy4(num_rows4x4) >> subsampling_y_[plane]) +
+ (is_last_row ? 0 : num_rows_extra);
+ } while (++plane < planes_);
+ }
+ ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst);
+}
+
+void PostFilter::ApplySuperResThreaded() {
+ int num_threads = thread_pool_->num_threads() + 1;
+ // The number of rows that will be processed by each thread in the thread pool
+ // (other than the current thread).
+ int thread_pool_rows = height_ / num_threads;
+ thread_pool_rows = std::max(thread_pool_rows, 1);
+ // Keep the Y plane row count even when the other planes are vertically
+ // subsampled.
+ if ((thread_pool_rows & 1) != 0 && subsampling_y_[kPlaneU] != 0) {
+ ++thread_pool_rows;
+ }
+ // Adjust the number of threads to what we really need.
+ num_threads = Clip3(height_ / thread_pool_rows, 1, num_threads);
+ // For the current thread, we round up to process all the remaining rows.
+ int current_thread_rows = height_ - thread_pool_rows * (num_threads - 1);
+ // Keep the Y plane row count even when the other planes are vertically
+ // subsampled.
+ if ((current_thread_rows & 1) != 0 && subsampling_y_[kPlaneU] != 0) {
+ ++current_thread_rows;
+ }
+ assert(current_thread_rows > 0);
+ BlockingCounter pending_workers(num_threads - 1);
+ for (int line_buffer_row = 0, row_start = 0; line_buffer_row < num_threads;
+ ++line_buffer_row, row_start += thread_pool_rows) {
+ std::array<uint8_t*, kMaxPlanes> src;
+ std::array<uint8_t*, kMaxPlanes> dst;
+ std::array<int, kMaxPlanes> rows;
+ int plane = kPlaneY;
+ const int pixel_size_log2 = pixel_size_log2_;
+ do {
+ src[plane] =
+ GetBufferOffset(cdef_buffer_[plane], frame_buffer_.stride(plane),
+ static_cast<Plane>(plane), row_start, 0);
+ dst[plane] =
+ GetBufferOffset(superres_buffer_[plane], frame_buffer_.stride(plane),
+ static_cast<Plane>(plane), row_start, 0);
+ rows[plane] =
+ (((line_buffer_row < num_threads - 1) ? thread_pool_rows
+ : current_thread_rows) >>
+ subsampling_y_[plane]) -
+ 1;
+ const int plane_width =
+ MultiplyBy4(frame_header_.columns4x4) >> subsampling_x_[plane];
+ uint8_t* const input =
+ src[plane] + rows[plane] * frame_buffer_.stride(plane);
+ uint8_t* const line_buffer_start =
+ superres_line_buffer_.data(plane) +
+ line_buffer_row * superres_line_buffer_.stride(plane) +
+ (kSuperResHorizontalBorder << pixel_size_log2);
+ memcpy(line_buffer_start, input, plane_width << pixel_size_log2);
+ } while (++plane < planes_);
+ if (line_buffer_row < num_threads - 1) {
+ thread_pool_->Schedule(
+ [this, src, rows, line_buffer_row, dst, &pending_workers]() {
+ ApplySuperRes(src, rows, line_buffer_row, dst);
+ pending_workers.Decrement();
+ });
+ } else {
+ ApplySuperRes(src, rows, line_buffer_row, dst);
+ }
+ }
+ // Wait for the threadpool jobs to finish.
+ pending_workers.Wait();
+}
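The row partition above is easier to follow with concrete numbers. A worked example, assuming a 726-row frame with 4:2:0 subsampling and a pool of three threads plus the calling thread; the values are illustrative only.

constexpr int kHeight = 726;
constexpr int kThreads = 4;     // three pool threads + the calling thread
constexpr int kPoolRows = 182;  // 726 / 4 == 181, bumped to the next even value
constexpr int kUsedThreads = kHeight / kPoolRows;  // 3, already within [1, 4]
constexpr int kCurrentRows = kHeight - kPoolRows * (kUsedThreads - 1);  // 362
static_assert(kUsedThreads == 3 && kCurrentRows == 362,
              "two pool threads take 182 rows each; the calling thread takes "
              "the larger, already even remainder");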
+
+} // namespace libgav1
diff --git a/src/prediction_mask.cc b/src/prediction_mask.cc
new file mode 100644
index 0000000..ab4d849
--- /dev/null
+++ b/src/prediction_mask.cc
@@ -0,0 +1,236 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/prediction_mask.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kWedgeDirectionTypes = 16;
+
+enum kWedgeDirection : uint8_t {
+ kWedgeHorizontal,
+ kWedgeVertical,
+ kWedgeOblique27,
+ kWedgeOblique63,
+ kWedgeOblique117,
+ kWedgeOblique153,
+};
+
+constexpr uint8_t kWedgeCodebook[3][16][3] = {{{kWedgeOblique27, 4, 4},
+ {kWedgeOblique63, 4, 4},
+ {kWedgeOblique117, 4, 4},
+ {kWedgeOblique153, 4, 4},
+ {kWedgeHorizontal, 4, 2},
+ {kWedgeHorizontal, 4, 4},
+ {kWedgeHorizontal, 4, 6},
+ {kWedgeVertical, 4, 4},
+ {kWedgeOblique27, 4, 2},
+ {kWedgeOblique27, 4, 6},
+ {kWedgeOblique153, 4, 2},
+ {kWedgeOblique153, 4, 6},
+ {kWedgeOblique63, 2, 4},
+ {kWedgeOblique63, 6, 4},
+ {kWedgeOblique117, 2, 4},
+ {kWedgeOblique117, 6, 4}},
+ {{kWedgeOblique27, 4, 4},
+ {kWedgeOblique63, 4, 4},
+ {kWedgeOblique117, 4, 4},
+ {kWedgeOblique153, 4, 4},
+ {kWedgeVertical, 2, 4},
+ {kWedgeVertical, 4, 4},
+ {kWedgeVertical, 6, 4},
+ {kWedgeHorizontal, 4, 4},
+ {kWedgeOblique27, 4, 2},
+ {kWedgeOblique27, 4, 6},
+ {kWedgeOblique153, 4, 2},
+ {kWedgeOblique153, 4, 6},
+ {kWedgeOblique63, 2, 4},
+ {kWedgeOblique63, 6, 4},
+ {kWedgeOblique117, 2, 4},
+ {kWedgeOblique117, 6, 4}},
+ {{kWedgeOblique27, 4, 4},
+ {kWedgeOblique63, 4, 4},
+ {kWedgeOblique117, 4, 4},
+ {kWedgeOblique153, 4, 4},
+ {kWedgeHorizontal, 4, 2},
+ {kWedgeHorizontal, 4, 6},
+ {kWedgeVertical, 2, 4},
+ {kWedgeVertical, 6, 4},
+ {kWedgeOblique27, 4, 2},
+ {kWedgeOblique27, 4, 6},
+ {kWedgeOblique153, 4, 2},
+ {kWedgeOblique153, 4, 6},
+ {kWedgeOblique63, 2, 4},
+ {kWedgeOblique63, 6, 4},
+ {kWedgeOblique117, 2, 4},
+ {kWedgeOblique117, 6, 4}}};
+
+constexpr BitMaskSet kWedgeFlipSignMasks[9] = {
+ BitMaskSet(0xBBFF), // kBlock8x8
+ BitMaskSet(0xBBEF), // kBlock8x16
+ BitMaskSet(0xBAEF), // kBlock8x32
+ BitMaskSet(0xBBEF), // kBlock16x8
+ BitMaskSet(0xBBFF), // kBlock16x16
+ BitMaskSet(0xBBEF), // kBlock16x32
+ BitMaskSet(0xABEF), // kBlock32x8
+ BitMaskSet(0xBBEF), // kBlock32x16
+ BitMaskSet(0xBBFF) // kBlock32x32
+};
+
+// This table (and the one below) contains a few leading zeros and trailing 64s
+// to avoid some additional memcpys where it is actually used.
+constexpr uint8_t kWedgeMasterObliqueOdd[kWedgeMaskMasterSize * 3 / 2] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 6, 18, 37,
+ 53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64};
+
+constexpr uint8_t kWedgeMasterObliqueEven[kWedgeMaskMasterSize * 3 / 2] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 11, 27,
+ 46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64};
+
+constexpr uint8_t kWedgeMasterVertical[kWedgeMaskMasterSize] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 7, 21,
+ 43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64};
+
+int BlockShape(BlockSize block_size) {
+ const int width = kNum4x4BlocksWide[block_size];
+ const int height = kNum4x4BlocksHigh[block_size];
+ if (height > width) return 0;
+ if (height < width) return 1;
+ return 2;
+}
+
+uint8_t GetWedgeDirection(BlockSize block_size, int index) {
+ return kWedgeCodebook[BlockShape(block_size)][index][0];
+}
+
+uint8_t GetWedgeOffsetX(BlockSize block_size, int index) {
+ return kWedgeCodebook[BlockShape(block_size)][index][1];
+}
+
+uint8_t GetWedgeOffsetY(BlockSize block_size, int index) {
+ return kWedgeCodebook[BlockShape(block_size)][index][2];
+}
+
+} // namespace
+
+bool GenerateWedgeMask(WedgeMaskArray* const wedge_masks) {
+ // Generate master masks.
+ uint8_t master_mask[6][kWedgeMaskMasterSize][kWedgeMaskMasterSize];
+ for (int y = 0; y < kWedgeMaskMasterSize; ++y) {
+ memcpy(master_mask[kWedgeVertical][y], kWedgeMasterVertical,
+ kWedgeMaskMasterSize);
+ }
+
+ for (int y = 0, shift = 0; y < kWedgeMaskMasterSize; y += 2, ++shift) {
+ memcpy(master_mask[kWedgeOblique63][y], kWedgeMasterObliqueEven + shift,
+ kWedgeMaskMasterSize);
+ memcpy(master_mask[kWedgeOblique63][y + 1], kWedgeMasterObliqueOdd + shift,
+ kWedgeMaskMasterSize);
+ }
+
+ for (int y = 0; y < kWedgeMaskMasterSize; ++y) {
+ for (int x = 0; x < kWedgeMaskMasterSize; ++x) {
+ const uint8_t mask_value = master_mask[kWedgeOblique63][y][x];
+ master_mask[kWedgeHorizontal][x][y] = master_mask[kWedgeVertical][y][x];
+ master_mask[kWedgeOblique27][x][y] = mask_value;
+ master_mask[kWedgeOblique117][y][kWedgeMaskMasterSize - 1 - x] =
+ 64 - mask_value;
+ master_mask[kWedgeOblique153][(kWedgeMaskMasterSize - 1 - x)][y] =
+ 64 - mask_value;
+ }
+ }
+
+ // Generate wedge masks.
+ int block_size_index = 0;
+ for (int size = kBlock8x8; size <= kBlock32x32; ++size) {
+ if (!kIsWedgeCompoundModeAllowed.Contains(size)) continue;
+
+ const int width = kBlockWidthPixels[size];
+ const int height = kBlockHeightPixels[size];
+ assert(width >= 8);
+ assert(width <= 32);
+ assert(height >= 8);
+ assert(height <= 32);
+
+ const auto block_size = static_cast<BlockSize>(size);
+ for (int wedge_index = 0; wedge_index < kWedgeDirectionTypes;
+ ++wedge_index) {
+ const uint8_t direction = GetWedgeDirection(block_size, wedge_index);
+ const uint8_t offset_x =
+ DivideBy2(kWedgeMaskMasterSize) -
+ ((GetWedgeOffsetX(block_size, wedge_index) * width) >> 3);
+ const uint8_t offset_y =
+ DivideBy2(kWedgeMaskMasterSize) -
+ ((GetWedgeOffsetY(block_size, wedge_index) * height) >> 3);
+
+ // Allocate the 2d array.
+ for (int flip_sign = 0; flip_sign < 2; ++flip_sign) {
+ if (!((*wedge_masks)[block_size_index][flip_sign][wedge_index].Reset(
+ height, width, /*zero_initialize=*/false))) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate memory for wedge masks.");
+ return false;
+ }
+ }
+
+ const auto flip_sign = static_cast<uint8_t>(
+ kWedgeFlipSignMasks[block_size_index].Contains(wedge_index));
+ uint8_t* wedge_masks_row =
+ (*wedge_masks)[block_size_index][flip_sign][wedge_index][0];
+ uint8_t* wedge_masks_row_flip =
+ (*wedge_masks)[block_size_index][1 - flip_sign][wedge_index][0];
+ uint8_t* master_mask_row = &master_mask[direction][offset_y][offset_x];
+ for (int y = 0; y < height; ++y) {
+ memcpy(wedge_masks_row, master_mask_row, width);
+ for (int x = 0; x < width; ++x) {
+ wedge_masks_row_flip[x] = 64 - wedge_masks_row[x];
+ }
+ wedge_masks_row += width;
+ wedge_masks_row_flip += width;
+ master_mask_row += kWedgeMaskMasterSize;
+ }
+ }
+
+ block_size_index++;
+ }
+ return true;
+}
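The masks generated above hold weights in the range [0, 64], and the flipped copy stores 64 minus each weight. The per-pixel idea behind the blend they feed into can be sketched as below; the real dsp mask-blend functions operate on whole blocks of higher-precision predictions, so this is only an illustration of the weighting.

#include <cstdint>

// Weights two predictors by a wedge mask value in [0, 64]. Swapping the two
// predictors together with the flipped mask (64 - mask) gives the same
// result, which is why only the flip sign needs to be tracked above.
inline uint8_t BlendPixelSketch(uint8_t pred0, uint8_t pred1, uint8_t mask) {
  return static_cast<uint8_t>((mask * pred0 + (64 - mask) * pred1 + 32) >> 6);
}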
+
+} // namespace libgav1
diff --git a/src/prediction_mask.h b/src/prediction_mask.h
new file mode 100644
index 0000000..0134a0d
--- /dev/null
+++ b/src/prediction_mask.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_PREDICTION_MASK_H_
+#define LIBGAV1_SRC_PREDICTION_MASK_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+constexpr BitMaskSet kIsWedgeCompoundModeAllowed(kBlock8x8, kBlock8x16,
+ kBlock8x32, kBlock16x8,
+ kBlock16x16, kBlock16x32,
+ kBlock32x8, kBlock32x16,
+ kBlock32x32);
+
+// This function generates wedge masks. It should be called only once per
+// decoder instance. If the video contains only key frames, this function does
+// not need to be called. Returns true on success, false on allocation
+// failure. Section 7.11.3.11.
+bool GenerateWedgeMask(WedgeMaskArray* wedge_masks);
+
+} // namespace libgav1
+#endif // LIBGAV1_SRC_PREDICTION_MASK_H_
diff --git a/src/quantizer.cc b/src/quantizer.cc
new file mode 100644
index 0000000..cd720d6
--- /dev/null
+++ b/src/quantizer.cc
@@ -0,0 +1,269 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/quantizer.h"
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+#if LIBGAV1_MAX_BITDEPTH != 8 && LIBGAV1_MAX_BITDEPTH != 10
+#error LIBGAV1_MAX_BITDEPTH must be 8 or 10
+#endif
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/quantizer_tables.inc"
+
+// Format the kDcLookup and kAcLookup arrays manually for easier comparison
+// with the Dc_Qlookup and Ac_Qlookup arrays in Section 7.12.2.
+
+// clang-format off
+constexpr int16_t kDcLookup[][256] = {
+ // Lookup table for 8 bit.
+ {
+ 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16,
+ 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 26,
+ 27, 28, 29, 30, 31, 32, 32, 33, 34, 35, 36, 37,
+ 38, 38, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47,
+ 48, 48, 49, 50, 51, 52, 53, 53, 54, 55, 56, 57,
+ 57, 58, 59, 60, 61, 62, 62, 63, 64, 65, 66, 66,
+ 67, 68, 69, 70, 70, 71, 72, 73, 74, 74, 75, 76,
+ 77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85,
+ 87, 88, 90, 92, 93, 95, 96, 98, 99, 101, 102, 104,
+ 105, 107, 108, 110, 111, 113, 114, 116, 117, 118, 120, 121,
+ 123, 125, 127, 129, 131, 134, 136, 138, 140, 142, 144, 146,
+ 148, 150, 152, 154, 156, 158, 161, 164, 166, 169, 172, 174,
+ 177, 180, 182, 185, 187, 190, 192, 195, 199, 202, 205, 208,
+ 211, 214, 217, 220, 223, 226, 230, 233, 237, 240, 243, 247,
+ 250, 253, 257, 261, 265, 269, 272, 276, 280, 284, 288, 292,
+ 296, 300, 304, 309, 313, 317, 322, 326, 330, 335, 340, 344,
+ 349, 354, 359, 364, 369, 374, 379, 384, 389, 395, 400, 406,
+ 411, 417, 423, 429, 435, 441, 447, 454, 461, 467, 475, 482,
+ 489, 497, 505, 513, 522, 530, 539, 549, 559, 569, 579, 590,
+ 602, 614, 626, 640, 654, 668, 684, 700, 717, 736, 755, 775,
+ 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139,
+ 1184, 1232, 1282, 1336
+ },
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ // Lookup table for 10 bit.
+ {
+ 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34,
+ 37, 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75,
+ 78, 82, 86, 90, 93, 97, 101, 105, 109, 113, 116, 120,
+ 124, 128, 132, 136, 140, 143, 147, 151, 155, 159, 163, 166,
+ 170, 174, 178, 182, 185, 189, 193, 197, 200, 204, 208, 212,
+ 215, 219, 223, 226, 230, 233, 237, 241, 244, 248, 251, 255,
+ 259, 262, 266, 269, 273, 276, 280, 283, 287, 290, 293, 297,
+ 300, 304, 307, 310, 314, 317, 321, 324, 327, 331, 334, 337,
+ 343, 350, 356, 362, 369, 375, 381, 387, 394, 400, 406, 412,
+ 418, 424, 430, 436, 442, 448, 454, 460, 466, 472, 478, 484,
+ 490, 499, 507, 516, 525, 533, 542, 550, 559, 567, 576, 584,
+ 592, 601, 609, 617, 625, 634, 644, 655, 666, 676, 687, 698,
+ 708, 718, 729, 739, 749, 759, 770, 782, 795, 807, 819, 831,
+ 844, 856, 868, 880, 891, 906, 920, 933, 947, 961, 975, 988,
+ 1001, 1015, 1030, 1045, 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170,
+ 1186, 1202, 1218, 1236, 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379,
+ 1398, 1416, 1436, 1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624,
+ 1647, 1670, 1692, 1717, 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929,
+ 1958, 1990, 2021, 2054, 2088, 2123, 2159, 2197, 2236, 2276, 2319, 2363,
+ 2410, 2458, 2508, 2561, 2616, 2675, 2737, 2802, 2871, 2944, 3020, 3102,
+ 3188, 3280, 3375, 3478, 3586, 3702, 3823, 3953, 4089, 4236, 4394, 4559,
+ 4737, 4929, 5130, 5347
+ },
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+};
+
+constexpr int16_t kAcLookup[][256] = {
+ // Lookup table for 8 bit.
+ {
+ 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
+ 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
+ 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78,
+ 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102,
+ 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126,
+ 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150,
+ 152, 155, 158, 161, 164, 167, 170, 173, 176, 179, 182, 185,
+ 188, 191, 194, 197, 200, 203, 207, 211, 215, 219, 223, 227,
+ 231, 235, 239, 243, 247, 251, 255, 260, 265, 270, 275, 280,
+ 285, 290, 295, 300, 305, 311, 317, 323, 329, 335, 341, 347,
+ 353, 359, 366, 373, 380, 387, 394, 401, 408, 416, 424, 432,
+ 440, 448, 456, 465, 474, 483, 492, 501, 510, 520, 530, 540,
+ 550, 560, 571, 582, 593, 604, 615, 627, 639, 651, 663, 676,
+ 689, 702, 715, 729, 743, 757, 771, 786, 801, 816, 832, 848,
+ 864, 881, 898, 915, 933, 951, 969, 988, 1007, 1026, 1046, 1066,
+ 1087, 1108, 1129, 1151, 1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343,
+ 1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567, 1597, 1628, 1660, 1692,
+ 1725, 1759, 1793, 1828
+ },
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ // Lookup table for 10 bit.
+ {
+ 4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37,
+ 40, 44, 48, 51, 55, 59, 63, 67, 71, 75, 79, 83,
+ 88, 92, 96, 100, 105, 109, 114, 118, 122, 127, 131, 136,
+ 140, 145, 149, 154, 158, 163, 168, 172, 177, 181, 186, 190,
+ 195, 199, 204, 208, 213, 217, 222, 226, 231, 235, 240, 244,
+ 249, 253, 258, 262, 267, 271, 275, 280, 284, 289, 293, 297,
+ 302, 306, 311, 315, 319, 324, 328, 332, 337, 341, 345, 349,
+ 354, 358, 362, 367, 371, 375, 379, 384, 388, 392, 396, 401,
+ 409, 417, 425, 433, 441, 449, 458, 466, 474, 482, 490, 498,
+ 506, 514, 523, 531, 539, 547, 555, 563, 571, 579, 588, 596,
+ 604, 616, 628, 640, 652, 664, 676, 688, 700, 713, 725, 737,
+ 749, 761, 773, 785, 797, 809, 825, 841, 857, 873, 889, 905,
+ 922, 938, 954, 970, 986, 1002, 1018, 1038, 1058, 1078, 1098, 1118,
+ 1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386,
+ 1411, 1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727,
+ 1759, 1791, 1823, 1859, 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159,
+ 2199, 2239, 2283, 2327, 2371, 2415, 2459, 2507, 2555, 2603, 2651, 2703,
+ 2755, 2807, 2859, 2915, 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391,
+ 3455, 3523, 3591, 3659, 3731, 3803, 3876, 3952, 4028, 4104, 4184, 4264,
+ 4348, 4432, 4516, 4604, 4692, 4784, 4876, 4972, 5068, 5168, 5268, 5372,
+ 5476, 5584, 5692, 5804, 5916, 6032, 6148, 6268, 6388, 6512, 6640, 6768,
+ 6900, 7036, 7172, 7312
+ },
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+};
+// clang-format on
+
+void Transpose(uint8_t* const dst, const uint8_t* const src, int src_width,
+ int src_height) {
+ const int dst_width = src_height;
+ const int dst_height = src_width;
+ Array2DView<const uint8_t> source(src_height, src_width, src);
+ Array2DView<uint8_t> dest(dst_height, dst_width, dst);
+ for (int y = 0; y < dst_height; ++y) {
+ for (int x = 0; x < dst_width; ++x) {
+ dest[y][x] = source[x][y];
+ }
+ }
+}
+
+// Copies the packed lower-triangle values in |src| into |dst| and mirrors
+// them across the main diagonal to fill the upper triangle.
+void FillUpperTriangle(uint8_t* dst, const uint8_t* src, int size) {
+ Array2DView<uint8_t> dest(size, size, dst);
+ int k = 0;
+ for (int y = 0; y < size; ++y) {
+ for (int x = 0; x <= y; ++x) {
+ dest[y][x] = dest[x][y] = src[k++];
+ }
+ }
+}
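A worked example of the packed layout FillUpperTriangle() consumes, restated standalone rather than by calling into the library:

#include <cassert>
#include <cstdint>

// The six values of a 3x3 symmetric matrix, stored lower-triangle-first,
// expand to:
//   1 2 4
//   2 3 5
//   4 5 6
inline void FillUpperTriangleExample() {
  const uint8_t src[6] = {1, 2, 3, 4, 5, 6};
  uint8_t dst[3][3];
  int k = 0;
  for (int y = 0; y < 3; ++y) {
    for (int x = 0; x <= y; ++x) dst[y][x] = dst[x][y] = src[k++];
  }
  assert(dst[0][2] == 4 && dst[2][0] == 4);  // mirrored across the diagonal
  assert(dst[2][1] == 5 && dst[2][2] == 6);
  static_cast<void>(dst);
}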
+
+} // namespace
+
+bool InitializeQuantizerMatrix(QuantizerMatrix* quantizer_matrix_ptr) {
+ for (int level = 0; level < kNumQuantizerLevelsForQuantizerMatrix; ++level) {
+ for (int plane_type = kPlaneTypeY; plane_type < kNumPlaneTypes;
+ ++plane_type) {
+ auto& quantizer_matrix = (*quantizer_matrix_ptr)[level][plane_type];
+ // Notes about how these matrices are populated:
+ // * For square transforms, we store only the lower left triangle (the
+ // matrix is symmetric about the main diagonal), so when populating the
+ // matrix we have to fill in the upper right triangle.
+ // * For rectangular transforms, the w x h matrix is the transpose of the
+ // h x w matrix. So we populate with memcpy when w < h and by transposing
+ // when w > h.
+ // * There is a special case for 16x16 where the matrix is the same as
+ // 32x32 with some offsets.
+ // * We use the "adjusted transform size" when using these matrices, so we
+ // won't have to populate them for transform sizes with one of the
+ // dimensions equal to 64.
+ for (int tx_size = 0; tx_size < kNumTransformSizes; ++tx_size) {
+ if (kTransformWidth[tx_size] == 64 || kTransformHeight[tx_size] == 64) {
+ continue;
+ }
+ const int size = kTransformWidth[tx_size] * kTransformHeight[tx_size];
+ if (!quantizer_matrix[tx_size].Resize(size)) {
+ return false;
+ }
+ }
+#define QUANTIZER_MEMCPY(W, H) \
+ memcpy(quantizer_matrix[kTransformSize##W##x##H].get(), \
+ kQuantizerMatrix##W##x##H[level][plane_type], (W) * (H))
+#define QUANTIZER_TRANSPOSE(W, H) \
+ Transpose(quantizer_matrix[kTransformSize##W##x##H].get(), \
+ kQuantizerMatrix##H##x##W[level][plane_type], H, W)
+#define QUANTIZER_FILL_UPPER_TRIANGLE(SIZE) \
+ FillUpperTriangle(quantizer_matrix[kTransformSize##SIZE##x##SIZE].get(), \
+ kQuantizerMatrix##SIZE##x##SIZE[level][plane_type], SIZE)
+ QUANTIZER_FILL_UPPER_TRIANGLE(4); // 4x4
+ QUANTIZER_MEMCPY(4, 8); // 4x8
+ QUANTIZER_MEMCPY(4, 16); // 4x16
+ QUANTIZER_TRANSPOSE(8, 4); // 8x4
+ QUANTIZER_FILL_UPPER_TRIANGLE(8); // 8x8
+ QUANTIZER_MEMCPY(8, 16); // 8x16
+ QUANTIZER_MEMCPY(8, 32); // 8x32
+ QUANTIZER_TRANSPOSE(16, 4); // 16x4
+ QUANTIZER_TRANSPOSE(16, 8); // 16x8
+ QUANTIZER_MEMCPY(16, 32); // 16x32
+ QUANTIZER_TRANSPOSE(32, 8); // 32x8
+ QUANTIZER_TRANSPOSE(32, 16); // 32x16
+ QUANTIZER_FILL_UPPER_TRIANGLE(32); // 32x32
+ // 16x16.
+ Array2DView<uint8_t> dst16x16(
+ 16, 16, quantizer_matrix[kTransformSize16x16].get());
+ Array2DView<const uint8_t> src32x32(
+ 32, 32, quantizer_matrix[kTransformSize32x32].get());
+ for (int y = 0; y < 16; ++y) {
+ for (int x = 0; x < 16; ++x) {
+ dst16x16[y][x] = src32x32[MultiplyBy2(y)][MultiplyBy2(x)];
+ }
+ }
+#undef QUANTIZER_FILL_UPPER_TRIANGLE
+#undef QUANTIZER_TRANSPOSE
+#undef QUANTIZER_MEMCPY
+ }
+ }
+ return true;
+}
+
+int GetQIndex(const Segmentation& segmentation, int index, int base_qindex) {
+ if (segmentation.FeatureActive(index, kSegmentFeatureQuantizer)) {
+ const int segment_qindex =
+ base_qindex +
+ segmentation.feature_data[index][kSegmentFeatureQuantizer];
+ return Clip3(segment_qindex, kMinQuantizer, kMaxQuantizer);
+ }
+ return base_qindex;
+}
+
+Quantizer::Quantizer(int bitdepth, const QuantizerParameters* params)
+ : params_(*params) {
+ assert(bitdepth >= 8 && bitdepth <= LIBGAV1_MAX_BITDEPTH);
+ const int index = BitdepthToArrayIndex(bitdepth);
+ dc_lookup_ = kDcLookup[index];
+ ac_lookup_ = kAcLookup[index];
+}
+
+int Quantizer::GetDcValue(Plane plane, int qindex) const {
+ return dc_lookup_[Clip3(qindex + params_.delta_dc[plane], kMinQuantizer,
+ kMaxQuantizer)];
+}
+
+int Quantizer::GetAcValue(Plane plane, int qindex) const {
+ return ac_lookup_[Clip3(qindex + params_.delta_ac[plane], kMinQuantizer,
+ kMaxQuantizer)];
+}
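A sketch of the calling pattern described by the comments in quantizer.h: derive the effective qindex for a block's segment, then look up the DC and AC scale values. The wrapper name and its in/out parameters are illustrative, and the declarations from src/quantizer.h are assumed to be in scope.

#include "src/quantizer.h"

namespace libgav1 {

// |segment_id| and |base_qindex| stand in for the values a Tile would supply.
inline void LookupQuantizerValues(const Segmentation& segmentation,
                                  const Quantizer& quantizer, int segment_id,
                                  int base_qindex, int* dc_scale,
                                  int* ac_scale) {
  const int qindex = GetQIndex(segmentation, segment_id, base_qindex);
  *dc_scale = quantizer.GetDcValue(kPlaneY, qindex);
  *ac_scale = quantizer.GetAcValue(kPlaneY, qindex);
}

}  // namespace libgav1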
+
+} // namespace libgav1
diff --git a/src/quantizer.h b/src/quantizer.h
new file mode 100644
index 0000000..00c53ab
--- /dev/null
+++ b/src/quantizer.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_QUANTIZER_H_
+#define LIBGAV1_SRC_QUANTIZER_H_
+
+#include <cstdint>
+
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+using QuantizerMatrix = std::array<
+ std::array<std::array<DynamicBuffer<uint8_t>, kNumTransformSizes>,
+ kNumPlaneTypes>,
+ kNumQuantizerLevelsForQuantizerMatrix>;
+
+// Implements the dequantization functions of Section 7.12.2.
+class Quantizer {
+ public:
+ Quantizer(int bitdepth, const QuantizerParameters* params);
+
+ // Returns the quantizer value for the dc coefficient for the given plane.
+ // The caller should call GetQIndex() with Tile::current_quantizer_index_ as
+ // the |base_qindex| argument, and pass the return value as the |qindex|
+ // argument to this method.
+ int GetDcValue(Plane plane, int qindex) const;
+
+ // Returns the quantizer value for the ac coefficient for the given plane.
+ // The caller should call GetQIndex() with Tile::current_quantizer_index_ as
+ // the |base_qindex| argument, and pass the return value as the |qindex|
+ // argument to this method.
+ int GetAcValue(Plane plane, int qindex) const;
+
+ private:
+ const QuantizerParameters& params_;
+ const int16_t* dc_lookup_;
+ const int16_t* ac_lookup_;
+};
+
+// Initializes the quantizer matrix. Returns true on success, false on
+// allocation failure.
+bool InitializeQuantizerMatrix(QuantizerMatrix* quantizer_matrix);
+
+// Get the quantizer index for the |index|th segment.
+//
+// This function has two use cases. What should be passed as the |base_qindex|
+// argument depends on the use case.
+// 1. While parsing the uncompressed header or transform type, pass
+// Quantizer::base_index.
+// Note: In this use case, the caller only cares about whether the return
+// value is zero.
+// 2. To generate the |qindex| argument to Quantizer::GetDcValue() or
+// Quantizer::GetAcValue(), pass Tile::current_quantizer_index_.
+int GetQIndex(const Segmentation& segmentation, int index, int base_qindex);
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_QUANTIZER_H_
diff --git a/src/quantizer_tables.inc b/src/quantizer_tables.inc
new file mode 100644
index 0000000..34342c4
--- /dev/null
+++ b/src/quantizer_tables.inc
@@ -0,0 +1,3080 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file is just a convenience to separate out all the quantizer table
+// definitions from the quantizer functions.
+
+constexpr uint8_t kQuantizerMatrix4x8
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][32] = {
+ {{32, 42, 75, 91, 33, 42, 69, 86, 37, 58, 84,
+ 91, 49, 71, 103, 110, 65, 84, 125, 128, 80, 97,
+ 142, 152, 91, 100, 145, 178, 104, 112, 146, 190},
+ {31, 47, 60, 66, 40, 45, 54, 61, 46, 56, 64, 64, 48, 61, 75, 73,
+ 54, 65, 85, 82, 61, 69, 92, 92, 64, 68, 90, 102, 68, 71, 87, 105}},
+ {{32, 42, 69, 88, 33, 42, 64, 83, 36, 56, 77,
+ 88, 46, 67, 93, 105, 60, 79, 112, 122, 75, 92,
+ 130, 144, 86, 95, 136, 167, 98, 105, 136, 177},
+ {31, 47, 57, 65, 40, 45, 52, 61, 46, 55, 61, 63, 47, 60, 70, 72,
+ 52, 64, 79, 81, 59, 68, 87, 90, 63, 66, 88, 99, 66, 69, 85, 102}},
+ {{32, 38, 62, 86, 32, 40, 58, 80, 34, 51, 68,
+ 85, 44, 61, 85, 101, 54, 69, 98, 117, 72, 84,
+ 118, 136, 82, 89, 129, 157, 92, 98, 127, 165},
+ {31, 47, 54, 64, 38, 46, 50, 60, 46, 53, 57, 62, 46, 56, 66, 71,
+ 50, 59, 74, 79, 57, 64, 82, 88, 61, 65, 85, 97, 65, 67, 82, 99}},
+ {{32, 35, 59, 83, 32, 36, 57, 78, 34, 47, 65,
+ 82, 41, 53, 78, 97, 51, 61, 92, 111, 65, 73,
+ 108, 129, 75, 81, 117, 148, 86, 92, 119, 154},
+ {31, 47, 53, 63, 36, 47, 50, 59, 46, 52, 55, 61, 45, 53, 63, 70,
+ 49, 55, 71, 77, 54, 58, 77, 86, 59, 61, 81, 94, 63, 65, 80, 95}},
+ {{32, 35, 51, 77, 32, 36, 50, 72, 34, 42, 54, 75, 38, 51, 67, 87,
+ 48, 59, 80, 103, 60, 68, 92, 119, 72, 79, 104, 135, 81, 86, 112, 144},
+ {31, 47, 50, 61, 36, 47, 47, 57, 43, 50, 50, 58, 45, 53, 58, 65,
+ 47, 54, 66, 74, 52, 56, 70, 82, 57, 60, 75, 90, 61, 63, 77, 93}},
+ {{32, 35, 51, 75, 32, 36, 50, 71, 34, 42, 54, 73, 37, 50, 65, 84,
+ 45, 56, 76, 96, 54, 63, 87, 110, 65, 73, 97, 125, 75, 81, 106, 136},
+ {31, 47, 50, 60, 36, 47, 47, 56, 43, 50, 50, 57, 46, 53, 57, 64,
+ 46, 54, 64, 71, 50, 55, 68, 78, 54, 58, 72, 85, 59, 61, 75, 90}},
+ {{32, 34, 43, 62, 32, 34, 42, 59, 33, 37, 44, 58, 35, 43, 54, 68,
+ 41, 48, 64, 79, 49, 54, 71, 91, 57, 60, 78, 101, 66, 68, 86, 111},
+ {31, 42, 47, 54, 33, 44, 45, 51, 40, 47, 46, 50, 47, 50, 54, 57,
+ 45, 49, 59, 64, 48, 50, 61, 70, 51, 52, 63, 75, 55, 55, 66, 79}},
+ {{32, 32, 42, 56, 32, 33, 41, 53, 32, 35, 42, 52, 34, 37, 50, 59,
+ 38, 40, 58, 68, 44, 45, 66, 78, 50, 50, 71, 86, 61, 58, 79, 97},
+ {31, 38, 47, 52, 32, 40, 45, 49, 39, 47, 45, 48, 44, 47, 51, 53,
+ 46, 47, 56, 58, 47, 46, 59, 64, 48, 47, 61, 68, 53, 50, 64, 73}},
+ {{32, 32, 37, 52, 32, 33, 36, 49, 32, 34, 38, 49, 34, 37, 44, 54,
+ 35, 38, 49, 60, 40, 42, 55, 69, 46, 46, 59, 76, 52, 51, 64, 83},
+ {31, 38, 47, 50, 31, 40, 46, 48, 36, 44, 47, 47, 42, 47, 50, 50,
+ 47, 48, 53, 54, 46, 46, 54, 60, 48, 46, 55, 64, 50, 48, 56, 67}},
+ {{31, 32, 35, 43, 32, 33, 34, 41, 32, 34, 36, 42, 32, 35, 38, 42,
+ 34, 37, 43, 49, 37, 40, 49, 56, 42, 43, 53, 63, 46, 46, 56, 67},
+ {31, 38, 47, 48, 31, 40, 46, 45, 35, 43, 47, 46, 39, 47, 47, 45,
+ 43, 47, 50, 50, 47, 47, 53, 55, 46, 46, 53, 58, 48, 46, 54, 59}},
+ {{31, 32, 34, 39, 32, 32, 34, 38, 32, 33, 34, 38, 32, 33, 36, 40,
+ 33, 34, 38, 42, 34, 36, 41, 47, 37, 38, 44, 52, 40, 40, 46, 56},
+ {31, 34, 42, 48, 31, 35, 42, 46, 33, 37, 44, 46, 36, 41, 46, 46,
+ 40, 44, 48, 48, 45, 46, 49, 51, 47, 47, 50, 54, 47, 46, 49, 55}},
+ {{31, 31, 32, 35, 32, 32, 32, 35, 32, 32, 33, 34, 32, 32, 34, 36,
+ 32, 33, 35, 38, 33, 33, 36, 40, 34, 34, 37, 42, 35, 34, 38, 48},
+ {31, 31, 37, 48, 31, 31, 38, 47, 31, 32, 40, 46, 34, 36, 43, 47,
+ 37, 39, 46, 47, 39, 41, 47, 48, 42, 43, 47, 50, 48, 46, 48, 53}},
+ {{31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 32, 32, 33, 34, 32, 32, 34, 34, 32, 33, 34, 35, 33, 33, 35, 36},
+ {31, 31, 35, 37, 31, 31, 36, 38, 31, 32, 37, 39, 31, 32, 37, 40,
+ 34, 36, 40, 43, 35, 37, 42, 44, 38, 40, 45, 47, 41, 42, 45, 47}},
+ {{31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33},
+ {31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36,
+ 31, 32, 32, 36, 31, 33, 33, 37, 34, 36, 36, 40, 34, 36, 36, 40}},
+ {{31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32},
+ {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 31, 32, 32}}};
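Each table in this file is flattened: the inner array holds one width-by-height block of byte weights per (quantizer level, plane type) pair, so kQuantizerMatrix4x8 stores 32 bytes per pair. The helper below is an editorial sketch rather than upstream code; it assumes the 32 values are laid out row by row with 4 entries per row, which matches how the values appear to be grouped above, but the authoritative traversal order is defined by the code that consumes this .inc file.

  // Editorial sketch, not upstream code: index into the flattened 4x8 table.
  // Assumes row-major storage with 4 columns per row (an assumption; the
  // consuming dequantization code defines the real layout).
  inline uint8_t QuantizerMatrix4x8Entry(int level, int plane_type, int row,
                                         int column) {
    constexpr int kColumns = 4;  // assumed row width
    return kQuantizerMatrix4x8[level][plane_type][row * kColumns + column];
  }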
+constexpr uint8_t kQuantizerMatrix4x16
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][64] = {
+ {{31, 44, 79, 96, 32, 41, 72, 90, 32, 42, 71, 86, 34,
+ 48, 73, 83, 34, 54, 78, 89, 41, 63, 90, 95, 45, 67,
+ 96, 102, 54, 75, 110, 111, 60, 79, 118, 123, 72, 90, 133,
+ 135, 75, 92, 136, 149, 83, 100, 142, 160, 88, 100, 140, 173,
+ 94, 101, 144, 180, 101, 108, 141, 188, 108, 115, 151, 197},
+ {31, 49, 63, 69, 32, 45, 57, 65, 36, 46, 56, 62, 43, 49, 57, 60,
+ 46, 53, 60, 63, 45, 58, 67, 66, 46, 59, 71, 70, 50, 62, 78, 74,
+ 52, 64, 82, 80, 57, 67, 89, 85, 59, 68, 90, 91, 62, 71, 91, 96,
+ 63, 69, 89, 101, 65, 68, 89, 103, 67, 70, 86, 105, 69, 72, 88, 107}},
+ {{31, 44, 73, 93, 32, 41, 67, 87, 32, 42, 65, 83, 33,
+ 44, 66, 81, 34, 54, 74, 86, 37, 58, 79, 92, 44, 66,
+ 90, 98, 49, 71, 99, 107, 56, 77, 107, 117, 65, 84, 119,
+ 129, 72, 90, 127, 141, 78, 95, 133, 151, 84, 95, 132, 163,
+ 89, 95, 136, 169, 95, 101, 132, 175, 101, 108, 141, 183},
+ {31, 49, 61, 69, 32, 45, 55, 64, 36, 46, 54, 61, 41, 47, 54, 59,
+ 46, 53, 59, 62, 46, 56, 62, 65, 46, 59, 68, 68, 48, 61, 73, 73,
+ 51, 63, 77, 78, 54, 65, 82, 84, 57, 67, 86, 89, 60, 69, 88, 93,
+ 62, 67, 86, 98, 64, 66, 87, 100, 65, 68, 83, 102, 67, 70, 86, 103}},
+ {{31, 39, 65, 90, 32, 38, 60, 84, 32, 39, 59, 81, 33,
+ 40, 58, 78, 34, 47, 65, 83, 37, 54, 73, 89, 41, 58,
+ 79, 94, 46, 62, 86, 102, 53, 68, 97, 112, 60, 73, 105,
+ 123, 65, 78, 111, 134, 74, 85, 120, 143, 79, 90, 125, 154,
+ 84, 90, 128, 158, 89, 95, 124, 164, 94, 101, 131, 170},
+ {31, 48, 57, 68, 32, 46, 53, 63, 36, 46, 51, 60, 40, 46, 50, 58,
+ 44, 51, 54, 61, 46, 54, 60, 64, 45, 56, 64, 67, 47, 57, 68, 71,
+ 49, 58, 73, 77, 52, 60, 76, 82, 54, 62, 79, 87, 58, 64, 82, 91,
+ 60, 66, 84, 95, 62, 64, 84, 97, 64, 66, 81, 99, 65, 68, 83, 100}},
+ {{31, 36, 62, 88, 32, 35, 58, 82, 32, 36, 57, 79, 33,
+ 38, 56, 76, 34, 42, 61, 81, 34, 48, 66, 85, 39, 51,
+ 74, 91, 44, 56, 82, 98, 49, 60, 90, 107, 54, 63, 95,
+ 117, 60, 68, 102, 127, 68, 75, 110, 135, 75, 81, 117, 145,
+ 79, 85, 120, 148, 84, 89, 116, 153, 88, 94, 123, 159},
+ {31, 48, 56, 67, 32, 46, 52, 62, 35, 47, 50, 60, 40, 47, 49, 57,
+ 43, 50, 53, 60, 46, 53, 56, 63, 45, 53, 61, 66, 46, 54, 65, 70,
+ 48, 54, 70, 75, 50, 55, 72, 80, 52, 56, 75, 85, 56, 59, 79, 89,
+ 58, 61, 81, 93, 60, 63, 82, 94, 62, 64, 79, 96, 63, 66, 81, 97}},
+ {{31, 36, 53, 81, 32, 35, 51, 76, 32, 35, 49, 73, 32,
+ 37, 49, 71, 33, 41, 53, 74, 34, 48, 60, 80, 37, 50,
+ 65, 85, 41, 53, 71, 91, 45, 56, 76, 98, 49, 60, 82,
+ 105, 54, 63, 87, 112, 61, 69, 93, 121, 68, 75, 100, 130,
+ 74, 80, 105, 137, 78, 84, 109, 142, 83, 88, 114, 148},
+ {31, 48, 52, 64, 31, 47, 49, 60, 33, 46, 48, 57, 38, 47, 47, 56,
+ 42, 49, 50, 57, 46, 53, 54, 61, 46, 53, 57, 64, 45, 53, 61, 68,
+ 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, 52, 57, 71, 83,
+ 56, 59, 73, 87, 58, 61, 75, 90, 60, 62, 76, 92, 62, 64, 78, 94}},
+ {{31, 36, 53, 79, 32, 35, 51, 75, 32, 34, 49, 72, 32, 36, 50, 71,
+ 33, 38, 49, 69, 34, 42, 54, 73, 34, 48, 60, 78, 37, 50, 65, 84,
+ 41, 53, 71, 90, 45, 56, 76, 96, 49, 60, 82, 103, 54, 63, 87, 110,
+ 60, 68, 92, 118, 65, 73, 97, 125, 72, 79, 104, 133, 75, 81, 106, 136},
+ {31, 48, 52, 63, 31, 47, 50, 60, 32, 46, 48, 57, 36, 47, 47, 56,
+ 40, 47, 47, 54, 43, 50, 50, 57, 46, 53, 54, 60, 46, 53, 57, 64,
+ 45, 53, 61, 67, 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78,
+ 52, 56, 70, 82, 54, 58, 72, 85, 57, 60, 75, 89, 59, 61, 75, 90}},
+ {{31, 34, 44, 65, 32, 34, 43, 62, 32, 33, 41, 59, 32, 35, 43, 59,
+ 32, 37, 43, 58, 34, 39, 48, 63, 34, 42, 53, 67, 36, 44, 57, 71,
+ 39, 46, 60, 76, 42, 48, 64, 81, 45, 51, 67, 85, 50, 54, 72, 92,
+ 54, 58, 76, 98, 60, 63, 80, 105, 66, 68, 85, 111, 73, 74, 91, 118},
+ {31, 42, 49, 57, 31, 42, 47, 54, 32, 42, 45, 52, 35, 45, 46, 51,
+ 40, 47, 46, 50, 43, 48, 49, 53, 46, 50, 53, 56, 46, 50, 55, 58,
+ 46, 49, 57, 61, 46, 49, 59, 64, 47, 50, 60, 67, 48, 50, 61, 71,
+ 50, 52, 63, 73, 52, 53, 64, 76, 55, 55, 66, 79, 58, 58, 68, 82}},
+ {{31, 32, 44, 58, 32, 32, 42, 55, 32, 33, 41, 53, 32, 34, 42, 53,
+ 32, 34, 42, 53, 32, 35, 42, 52, 34, 37, 48, 57, 35, 38, 54, 63,
+ 37, 40, 57, 67, 39, 41, 60, 70, 41, 43, 63, 74, 45, 46, 67, 79,
+ 50, 50, 71, 86, 54, 53, 74, 90, 57, 56, 77, 93, 61, 58, 79, 97},
+ {31, 37, 49, 54, 31, 38, 47, 51, 32, 40, 45, 49, 34, 42, 45, 49,
+ 37, 44, 45, 48, 39, 47, 45, 48, 42, 47, 49, 51, 47, 48, 53, 55,
+ 46, 47, 55, 58, 46, 46, 57, 60, 46, 46, 58, 62, 47, 46, 59, 65,
+ 48, 47, 61, 68, 50, 48, 62, 70, 51, 49, 63, 71, 53, 50, 64, 73}},
+ {{31, 32, 38, 53, 32, 32, 37, 51, 32, 32, 36, 49, 32, 33, 36, 49,
+ 32, 34, 38, 50, 32, 35, 39, 49, 33, 36, 41, 51, 34, 37, 44, 54,
+ 35, 38, 49, 60, 37, 40, 51, 63, 38, 40, 52, 65, 42, 43, 56, 71,
+ 45, 45, 58, 75, 47, 47, 60, 77, 51, 50, 63, 82, 55, 54, 67, 87},
+ {31, 37, 48, 52, 31, 38, 47, 50, 31, 39, 46, 48, 32, 40, 46, 48,
+ 35, 43, 46, 47, 39, 47, 47, 47, 40, 47, 48, 48, 42, 47, 50, 50,
+ 47, 48, 53, 54, 47, 47, 53, 56, 46, 47, 54, 57, 46, 46, 55, 61,
+ 47, 46, 55, 63, 48, 47, 55, 64, 49, 47, 56, 66, 51, 49, 57, 68}},
+ {{31, 32, 36, 44, 32, 32, 35, 42, 32, 32, 35, 41, 32, 33, 34, 41,
+ 32, 34, 36, 42, 32, 34, 36, 42, 32, 35, 38, 42, 33, 36, 40, 44,
+ 34, 37, 42, 48, 35, 38, 47, 52, 35, 38, 48, 54, 38, 40, 50, 58,
+ 40, 41, 51, 60, 42, 43, 53, 63, 45, 45, 56, 66, 46, 46, 56, 67},
+ {31, 37, 48, 49, 31, 38, 47, 47, 31, 39, 46, 46, 31, 40, 46, 45,
+ 34, 42, 47, 45, 35, 43, 47, 46, 39, 47, 47, 45, 40, 47, 48, 47,
+ 42, 47, 50, 49, 46, 48, 52, 53, 47, 48, 53, 53, 47, 47, 53, 56,
+ 47, 46, 53, 57, 46, 46, 53, 58, 48, 46, 54, 59, 48, 46, 54, 59}},
+ {{31, 32, 34, 39, 32, 32, 34, 38, 32, 32, 34, 38, 32, 32, 33, 37,
+ 32, 32, 33, 37, 32, 33, 35, 39, 32, 33, 35, 39, 32, 34, 37, 40,
+ 32, 34, 37, 40, 34, 35, 39, 45, 34, 35, 39, 45, 35, 36, 43, 51,
+ 35, 36, 43, 51, 38, 39, 45, 54, 38, 39, 45, 54, 42, 42, 48, 58},
+ {31, 33, 42, 48, 31, 34, 42, 47, 31, 34, 42, 47, 31, 35, 42, 45,
+ 31, 35, 42, 45, 34, 39, 45, 46, 34, 39, 45, 46, 38, 43, 47, 46,
+ 38, 43, 47, 46, 42, 45, 48, 50, 42, 45, 48, 50, 48, 47, 50, 53,
+ 48, 47, 50, 53, 47, 46, 50, 54, 47, 46, 50, 54, 47, 45, 49, 56}},
+ {{31, 31, 32, 36, 31, 32, 32, 35, 32, 32, 32, 35, 32, 32, 32, 35,
+ 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 36, 32, 32, 34, 36,
+ 32, 32, 34, 37, 32, 33, 35, 38, 32, 33, 35, 38, 33, 33, 36, 41,
+ 34, 34, 37, 42, 34, 34, 37, 44, 35, 34, 38, 48, 35, 34, 38, 48},
+ {31, 31, 37, 48, 31, 31, 38, 47, 31, 31, 38, 47, 31, 32, 39, 46,
+ 31, 32, 40, 46, 31, 32, 40, 46, 34, 35, 42, 47, 34, 36, 43, 47,
+ 36, 37, 44, 47, 38, 40, 47, 47, 38, 40, 47, 47, 41, 42, 47, 49,
+ 42, 43, 47, 50, 44, 44, 47, 51, 48, 46, 48, 53, 48, 46, 48, 53}},
+ {{31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 33, 33,
+ 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 35,
+ 32, 33, 34, 35, 32, 33, 34, 35, 33, 33, 35, 36, 34, 34, 36, 37},
+ {31, 31, 35, 37, 31, 31, 35, 38, 31, 31, 36, 38, 31, 31, 36, 38,
+ 31, 32, 36, 39, 31, 32, 37, 40, 31, 32, 37, 40, 31, 33, 38, 40,
+ 33, 35, 40, 42, 34, 36, 40, 43, 34, 36, 40, 43, 36, 38, 43, 45,
+ 38, 40, 45, 47, 38, 40, 45, 47, 39, 41, 45, 47, 42, 43, 46, 47}},
+ {{31, 31, 31, 32, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33,
+ 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33},
+ {31, 31, 31, 34, 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35,
+ 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, 32, 36,
+ 31, 32, 32, 36, 31, 32, 32, 36, 31, 32, 32, 36, 32, 33, 33, 37,
+ 33, 35, 35, 39, 34, 36, 36, 40, 34, 36, 36, 40, 34, 36, 36, 40}},
+ {{31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32},
+ {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 30, 31, 32, 32}}};
+constexpr uint8_t kQuantizerMatrix8x16
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][128] = {
+ {{32, 32, 36, 53, 65, 87, 93, 99, 31, 33, 34, 49, 59,
+ 78, 86, 93, 32, 34, 36, 50, 59, 77, 82, 89, 34, 37,
+ 42, 54, 63, 79, 80, 88, 36, 38, 48, 60, 68, 84, 86,
+ 90, 44, 43, 53, 71, 79, 95, 94, 97, 48, 46, 56, 76,
+ 85, 102, 105, 105, 58, 54, 63, 87, 98, 116, 112, 115, 65,
+ 58, 68, 92, 105, 124, 122, 124, 79, 70, 79, 104, 118, 141,
+ 135, 135, 82, 72, 81, 106, 121, 144, 149, 146, 91, 80, 88,
+ 106, 130, 148, 162, 159, 97, 86, 94, 107, 128, 157, 167, 171,
+ 103, 93, 98, 114, 131, 150, 174, 186, 110, 100, 101, 117, 138,
+ 161, 183, 193, 118, 107, 105, 118, 136, 157, 182, 203},
+ {32, 37, 48, 52, 57, 66, 68, 71, 30, 40, 46, 48, 52, 60, 63, 66,
+ 33, 43, 47, 47, 51, 59, 60, 63, 42, 47, 50, 50, 53, 60, 59, 62,
+ 49, 48, 53, 54, 57, 62, 62, 62, 49, 46, 53, 61, 64, 69, 66, 66,
+ 50, 46, 54, 64, 67, 73, 72, 70, 54, 49, 55, 68, 73, 80, 76, 75,
+ 57, 50, 56, 70, 76, 84, 80, 79, 63, 55, 60, 75, 82, 92, 87, 84,
+ 64, 56, 61, 75, 83, 93, 93, 89, 68, 59, 64, 74, 86, 94, 98, 94,
+ 70, 62, 66, 73, 83, 96, 99, 98, 72, 64, 66, 75, 83, 92, 101, 104,
+ 74, 67, 66, 74, 84, 94, 103, 106, 76, 69, 67, 73, 82, 91, 101, 109}},
+ {{32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 44, 60,
+ 72, 84, 90, 32, 34, 36, 45, 59, 71, 80, 87, 32, 35,
+ 40, 47, 60, 71, 78, 85, 36, 37, 48, 56, 68, 78, 83,
+ 87, 39, 40, 50, 60, 73, 84, 91, 94, 47, 45, 56, 69,
+ 84, 95, 101, 101, 53, 50, 60, 75, 92, 103, 108, 110, 61,
+ 56, 65, 81, 100, 113, 116, 118, 71, 64, 73, 89, 111, 125,
+ 129, 129, 79, 70, 79, 95, 118, 133, 142, 138, 86, 76, 84,
+ 100, 124, 140, 153, 150, 92, 82, 89, 101, 121, 148, 157, 161,
+ 98, 88, 93, 108, 124, 141, 163, 174, 104, 94, 95, 110, 129,
+ 151, 171, 181, 110, 100, 98, 111, 127, 147, 169, 188},
+ {32, 35, 48, 50, 57, 63, 68, 70, 30, 38, 46, 46, 52, 58, 63, 65,
+ 33, 41, 47, 46, 51, 56, 60, 63, 39, 46, 48, 47, 51, 55, 58, 61,
+ 49, 48, 53, 54, 57, 60, 61, 61, 48, 46, 53, 56, 60, 64, 65, 65,
+ 50, 46, 54, 61, 66, 70, 71, 69, 52, 47, 54, 63, 71, 75, 75, 74,
+ 55, 49, 56, 65, 74, 79, 79, 78, 60, 53, 58, 68, 79, 85, 85, 82,
+ 63, 55, 60, 70, 82, 89, 91, 87, 66, 58, 62, 72, 84, 91, 95, 91,
+ 68, 60, 64, 71, 81, 94, 97, 96, 70, 62, 65, 73, 81, 89, 98, 101,
+ 72, 65, 65, 72, 82, 92, 100, 103, 74, 67, 65, 71, 79, 89, 98, 105}},
+ {{32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 41, 54,
+ 73, 81, 88, 32, 33, 36, 42, 53, 71, 78, 84, 32, 34,
+ 38, 42, 52, 69, 76, 82, 34, 36, 44, 50, 59, 75, 81,
+ 84, 39, 39, 50, 58, 68, 84, 88, 90, 44, 42, 53, 63,
+ 74, 90, 97, 97, 49, 46, 57, 67, 81, 97, 104, 105, 57,
+ 53, 63, 74, 90, 108, 111, 113, 65, 59, 68, 79, 97, 118,
+ 123, 122, 71, 64, 73, 84, 102, 125, 135, 131, 81, 72, 80,
+ 91, 110, 135, 145, 141, 87, 77, 85, 96, 114, 140, 148, 151,
+ 92, 83, 88, 102, 117, 133, 153, 163, 98, 88, 89, 103, 121,
+ 141, 160, 169, 103, 94, 92, 103, 119, 137, 158, 175},
+ {32, 34, 48, 49, 54, 63, 67, 69, 31, 36, 46, 46, 50, 58, 62, 65,
+ 33, 40, 47, 46, 49, 56, 59, 62, 37, 44, 47, 45, 48, 54, 57, 60,
+ 44, 46, 51, 51, 53, 59, 60, 61, 48, 46, 53, 56, 58, 64, 64, 64,
+ 49, 45, 53, 58, 62, 67, 70, 68, 51, 47, 54, 60, 65, 71, 73, 72,
+ 54, 49, 55, 62, 70, 77, 77, 76, 57, 51, 56, 64, 73, 82, 83, 81,
+ 60, 53, 58, 65, 75, 85, 89, 85, 64, 57, 61, 68, 78, 89, 93, 89,
+ 66, 59, 63, 69, 79, 91, 94, 93, 68, 61, 63, 71, 79, 87, 96, 98,
+ 70, 63, 63, 70, 80, 89, 97, 100, 72, 65, 63, 69, 77, 86, 95, 102}},
+ {{32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 34, 41, 50, 61, 76,
+ 85, 31, 33, 35, 42, 49, 59, 73, 81, 32, 34, 37, 42, 49, 58,
+ 71, 79, 34, 35, 41, 48, 54, 63, 76, 81, 36, 36, 46, 54, 60,
+ 68, 80, 87, 41, 40, 49, 60, 67, 76, 88, 93, 47, 44, 53, 66,
+ 75, 84, 97, 101, 53, 50, 57, 71, 82, 92, 106, 108, 58, 54, 61,
+ 75, 87, 98, 112, 116, 65, 59, 66, 79, 92, 105, 120, 124, 74, 67,
+ 73, 86, 100, 113, 131, 134, 82, 73, 79, 92, 105, 120, 139, 142, 87,
+ 78, 83, 96, 110, 125, 144, 153, 92, 83, 84, 97, 114, 132, 150, 157,
+ 97, 88, 86, 97, 111, 128, 147, 163},
+ {32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 46, 49, 53, 60, 64,
+ 33, 37, 46, 45, 47, 51, 57, 61, 37, 43, 47, 45, 47, 50, 55, 59,
+ 42, 44, 49, 49, 50, 53, 58, 60, 49, 47, 52, 53, 54, 57, 61, 63,
+ 48, 46, 51, 57, 59, 61, 66, 67, 50, 46, 52, 59, 63, 66, 71, 71,
+ 52, 47, 53, 61, 66, 71, 75, 74, 54, 49, 54, 62, 68, 73, 79, 79,
+ 57, 51, 55, 64, 70, 76, 83, 83, 61, 55, 58, 66, 73, 80, 87, 87,
+ 64, 57, 60, 68, 75, 83, 91, 91, 66, 59, 61, 69, 77, 84, 93, 95,
+ 68, 61, 61, 68, 77, 86, 94, 97, 70, 63, 61, 67, 75, 83, 92, 98}},
+ {{32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 61, 74,
+ 82, 31, 32, 34, 38, 47, 59, 71, 79, 32, 33, 36, 40, 48, 58,
+ 69, 77, 33, 34, 38, 44, 52, 62, 72, 78, 36, 35, 42, 51, 58,
+ 68, 78, 84, 39, 38, 44, 54, 63, 73, 84, 89, 44, 41, 46, 59,
+ 69, 79, 90, 96, 48, 45, 50, 62, 74, 85, 96, 103, 53, 49, 53,
+ 66, 79, 92, 103, 111, 58, 54, 57, 70, 84, 98, 110, 118, 66, 60,
+ 63, 75, 90, 106, 119, 126, 74, 67, 69, 81, 97, 113, 128, 134, 81,
+ 73, 75, 86, 102, 120, 135, 143, 86, 78, 78, 90, 106, 124, 140, 147,
+ 91, 82, 80, 90, 103, 119, 137, 151},
+ {32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 59, 63,
+ 31, 35, 43, 46, 47, 51, 57, 60, 35, 39, 46, 46, 47, 50, 55, 58,
+ 41, 43, 48, 49, 49, 52, 57, 59, 49, 47, 50, 53, 54, 57, 60, 62,
+ 48, 46, 49, 54, 57, 60, 64, 65, 49, 45, 48, 56, 61, 64, 67, 69,
+ 50, 46, 49, 57, 63, 67, 71, 73, 52, 48, 50, 58, 65, 71, 75, 77,
+ 54, 50, 51, 59, 67, 73, 78, 81, 57, 52, 53, 61, 69, 77, 82, 85,
+ 61, 55, 56, 63, 72, 80, 86, 88, 64, 58, 58, 65, 73, 82, 89, 92,
+ 66, 59, 59, 66, 75, 84, 91, 94, 68, 61, 59, 65, 72, 81, 89, 95}},
+ {{32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75,
+ 31, 32, 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71,
+ 32, 33, 35, 38, 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73,
+ 36, 34, 38, 48, 54, 60, 68, 78, 39, 37, 40, 50, 58, 65, 73, 84,
+ 44, 41, 43, 53, 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96,
+ 53, 49, 50, 60, 71, 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110,
+ 65, 60, 58, 68, 79, 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125,
+ 79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136},
+ {32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60,
+ 30, 32, 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56,
+ 37, 40, 47, 47, 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57,
+ 49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, 56, 57, 60, 64,
+ 49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71,
+ 52, 48, 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78,
+ 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, 65, 72, 79, 85,
+ 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, 83, 90}},
+ {{32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 59, 69,
+ 31, 32, 33, 34, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50, 57, 65,
+ 32, 33, 35, 38, 42, 49, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69,
+ 35, 34, 38, 47, 52, 59, 65, 73, 38, 36, 40, 49, 56, 63, 69, 77,
+ 41, 39, 41, 51, 60, 67, 74, 81, 44, 42, 43, 54, 64, 72, 79, 86,
+ 48, 45, 46, 56, 67, 76, 83, 91, 53, 49, 50, 60, 71, 82, 90, 99,
+ 58, 54, 54, 63, 75, 87, 95, 105, 65, 60, 58, 68, 79, 92, 102, 112,
+ 71, 65, 63, 73, 84, 97, 108, 119, 79, 72, 70, 79, 90, 104, 115, 127},
+ {32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 53, 57,
+ 30, 32, 40, 46, 45, 48, 51, 55, 33, 36, 43, 47, 46, 47, 50, 54,
+ 37, 40, 47, 47, 45, 47, 49, 52, 42, 43, 47, 50, 49, 50, 53, 56,
+ 47, 46, 48, 52, 53, 53, 55, 58, 48, 46, 47, 53, 55, 56, 58, 61,
+ 48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 62, 64, 66,
+ 50, 46, 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 70, 73,
+ 54, 50, 49, 55, 62, 68, 72, 76, 57, 52, 50, 56, 64, 70, 75, 79,
+ 60, 54, 52, 58, 65, 72, 77, 82, 63, 57, 55, 60, 67, 75, 80, 86}},
+ {{32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 51, 62,
+ 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42, 49, 59,
+ 32, 32, 34, 36, 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58,
+ 34, 34, 37, 41, 44, 48, 54, 63, 36, 34, 38, 46, 50, 54, 60, 68,
+ 38, 37, 40, 47, 52, 57, 64, 72, 41, 39, 41, 49, 54, 60, 67, 76,
+ 44, 41, 43, 51, 57, 63, 71, 79, 48, 45, 46, 54, 60, 67, 76, 85,
+ 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60, 67, 74, 86, 97,
+ 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, 92, 105},
+ {32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 54,
+ 30, 32, 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, 47, 51,
+ 35, 37, 44, 46, 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50,
+ 42, 43, 47, 49, 50, 49, 50, 53, 49, 46, 48, 52, 53, 53, 54, 57,
+ 48, 46, 47, 51, 54, 55, 57, 59, 48, 45, 46, 51, 54, 57, 59, 61,
+ 49, 45, 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 64, 67,
+ 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54, 58, 62, 68, 73,
+ 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, 70, 76}},
+ {{32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 42, 45, 51,
+ 31, 32, 32, 33, 35, 41, 44, 49, 31, 32, 33, 33, 35, 41, 44, 49,
+ 32, 32, 34, 34, 36, 42, 45, 50, 32, 33, 35, 36, 38, 42, 45, 49,
+ 32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, 42, 48, 50, 54,
+ 36, 34, 37, 40, 48, 54, 56, 60, 38, 36, 39, 41, 49, 56, 58, 63,
+ 39, 37, 40, 42, 50, 58, 60, 65, 44, 41, 42, 45, 53, 63, 66, 71,
+ 47, 44, 45, 47, 56, 66, 69, 75, 49, 46, 47, 48, 57, 67, 71, 77,
+ 53, 49, 50, 51, 60, 71, 75, 82, 58, 54, 54, 55, 63, 75, 79, 87},
+ {32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 37, 40, 47, 47, 48, 50,
+ 30, 32, 38, 40, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, 46, 48,
+ 33, 36, 41, 44, 47, 46, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47,
+ 39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, 50, 49, 50, 50,
+ 49, 46, 48, 49, 53, 53, 54, 54, 48, 46, 47, 48, 53, 55, 55, 56,
+ 48, 46, 46, 48, 53, 56, 56, 57, 49, 45, 45, 47, 53, 58, 59, 61,
+ 50, 46, 46, 48, 54, 59, 61, 63, 51, 47, 47, 48, 54, 60, 61, 64,
+ 52, 48, 47, 48, 54, 61, 63, 66, 54, 50, 49, 50, 55, 62, 65, 68}},
+ {{32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 42, 45,
+ 31, 32, 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44,
+ 31, 32, 33, 34, 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45,
+ 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 36, 39, 40, 44, 47,
+ 34, 34, 35, 37, 41, 42, 48, 50, 35, 34, 36, 38, 45, 47, 52, 55,
+ 36, 34, 36, 38, 46, 48, 54, 56, 39, 37, 39, 40, 48, 50, 58, 60,
+ 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, 51, 53, 63, 66,
+ 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, 67, 70},
+ {32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 34, 38, 45, 47, 47, 48,
+ 31, 32, 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46,
+ 33, 35, 37, 42, 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46,
+ 37, 40, 43, 47, 47, 47, 45, 46, 39, 41, 43, 47, 48, 48, 47, 47,
+ 42, 43, 44, 47, 49, 50, 49, 50, 47, 46, 46, 48, 51, 52, 53, 53,
+ 49, 46, 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 51, 53, 56, 56,
+ 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, 51, 53, 58, 59,
+ 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61}},
+ {{32, 31, 31, 32, 32, 36, 36, 44, 31, 32, 32, 32, 32, 35, 35, 42,
+ 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 33, 33, 34, 34, 41,
+ 31, 32, 32, 33, 33, 34, 34, 41, 32, 32, 32, 34, 34, 36, 36, 42,
+ 32, 32, 32, 34, 34, 36, 36, 42, 32, 33, 33, 35, 35, 38, 38, 42,
+ 32, 33, 33, 35, 35, 38, 38, 42, 34, 34, 34, 37, 37, 42, 42, 48,
+ 34, 34, 34, 37, 37, 42, 42, 48, 36, 34, 34, 38, 38, 48, 48, 54,
+ 36, 34, 34, 38, 38, 48, 48, 54, 39, 37, 37, 40, 40, 50, 50, 58,
+ 39, 37, 37, 40, 40, 50, 50, 58, 44, 41, 41, 43, 43, 53, 53, 63},
+ {32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 38, 38, 47, 47, 47,
+ 31, 31, 31, 38, 38, 47, 47, 47, 30, 32, 32, 40, 40, 46, 46, 45,
+ 30, 32, 32, 40, 40, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46,
+ 33, 36, 36, 43, 43, 47, 47, 46, 37, 40, 40, 47, 47, 47, 47, 45,
+ 37, 40, 40, 47, 47, 47, 47, 45, 42, 43, 43, 47, 47, 50, 50, 49,
+ 42, 43, 43, 47, 47, 50, 50, 49, 49, 46, 46, 48, 48, 53, 53, 53,
+ 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, 47, 53, 53, 56,
+ 48, 46, 46, 47, 47, 53, 53, 56, 49, 45, 45, 46, 46, 53, 53, 58}},
+ {{32, 31, 31, 31, 32, 32, 35, 36, 31, 32, 32, 32, 32, 32, 35, 35,
+ 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 34, 35,
+ 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34,
+ 31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, 34, 34, 36, 36,
+ 32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 34, 35, 35, 37, 38,
+ 32, 32, 33, 34, 35, 35, 37, 38, 33, 33, 33, 35, 36, 36, 40, 41,
+ 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 43, 44,
+ 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48},
+ {32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 34, 38, 38, 45, 47,
+ 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, 45, 46,
+ 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46,
+ 33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47,
+ 35, 37, 37, 40, 44, 44, 46, 47, 37, 39, 40, 43, 47, 47, 47, 47,
+ 37, 39, 40, 43, 47, 47, 47, 47, 41, 42, 42, 44, 47, 47, 49, 49,
+ 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45, 47, 47, 50, 51,
+ 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53}},
+ {{32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34,
+ 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 35, 35, 35,
+ 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36,
+ 32, 33, 33, 33, 34, 36, 36, 36, 34, 34, 34, 34, 35, 37, 37, 38},
+ {32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 38, 38, 39,
+ 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40,
+ 31, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41,
+ 30, 31, 32, 32, 35, 40, 40, 41, 31, 32, 33, 33, 35, 40, 40, 41,
+ 33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44,
+ 33, 35, 36, 36, 38, 43, 43, 44, 35, 37, 38, 38, 41, 45, 45, 46,
+ 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+ 39, 40, 41, 41, 43, 47, 47, 47, 42, 42, 43, 43, 44, 47, 47, 48}},
+ {{32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 33, 33,
+ 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34},
+ {32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 36,
+ 31, 31, 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 32, 32, 32, 34, 37, 30, 31, 31, 32, 32, 32, 34, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 31, 32, 33, 33, 33, 33, 36, 39,
+ 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, 36, 36, 38, 41,
+ 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+ 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32}}};
+constexpr uint8_t kQuantizerMatrix8x32
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][256] = {
+ {{32, 32, 36, 53, 65, 87, 93, 99, 31, 32, 35, 51, 62, 82,
+ 88, 94, 31, 33, 34, 49, 59, 78, 86, 93, 31, 33, 35, 49,
+ 59, 78, 84, 90, 32, 34, 36, 50, 59, 77, 82, 89, 32, 35,
+ 38, 49, 58, 75, 82, 89, 34, 37, 42, 54, 63, 79, 80, 88,
+ 35, 37, 45, 57, 65, 82, 84, 87, 36, 38, 48, 60, 68, 84,
+ 86, 90, 39, 40, 50, 65, 73, 89, 91, 93, 44, 43, 53, 71,
+ 79, 95, 94, 97, 46, 44, 55, 73, 82, 98, 98, 99, 48, 46,
+ 56, 76, 85, 102, 105, 105, 53, 50, 60, 82, 92, 109, 107, 107,
+ 58, 54, 63, 87, 98, 116, 112, 115, 61, 56, 66, 89, 101, 120,
+ 119, 116, 65, 58, 68, 92, 105, 124, 122, 124, 71, 63, 73, 97,
+ 111, 132, 130, 127, 79, 70, 79, 104, 118, 141, 135, 135, 81, 71,
+ 80, 105, 119, 142, 140, 139, 82, 72, 81, 106, 121, 144, 149, 146,
+ 88, 77, 85, 108, 126, 149, 153, 152, 91, 80, 88, 106, 130, 148,
+ 162, 159, 94, 83, 91, 105, 131, 153, 165, 166, 97, 86, 94, 107,
+ 128, 157, 167, 171, 100, 89, 97, 111, 127, 152, 173, 182, 103, 93,
+ 98, 114, 131, 150, 174, 186, 107, 96, 100, 117, 136, 155, 177, 191,
+ 110, 100, 101, 117, 138, 161, 183, 193, 114, 104, 103, 117, 137, 159,
+ 185, 201, 118, 107, 105, 118, 136, 157, 182, 203, 122, 111, 107, 119,
+ 136, 156, 179, 204},
+ {32, 37, 48, 52, 57, 66, 68, 71, 31, 38, 47, 50, 54, 63, 65, 67,
+ 30, 40, 46, 48, 52, 60, 63, 66, 32, 41, 46, 48, 51, 59, 62, 64,
+ 33, 43, 47, 47, 51, 59, 60, 63, 37, 47, 47, 47, 50, 57, 60, 62,
+ 42, 47, 50, 50, 53, 60, 59, 62, 45, 47, 51, 52, 55, 61, 61, 61,
+ 49, 48, 53, 54, 57, 62, 62, 62, 48, 47, 53, 57, 60, 66, 65, 64,
+ 49, 46, 53, 61, 64, 69, 66, 66, 49, 46, 53, 62, 65, 71, 68, 67,
+ 50, 46, 54, 64, 67, 73, 72, 70, 52, 47, 54, 66, 71, 77, 73, 71,
+ 54, 49, 55, 68, 73, 80, 76, 75, 55, 49, 56, 69, 75, 82, 79, 76,
+ 57, 50, 56, 70, 76, 84, 80, 79, 60, 52, 58, 72, 79, 88, 84, 81,
+ 63, 55, 60, 75, 82, 92, 87, 84, 64, 55, 61, 75, 82, 92, 89, 86,
+ 64, 56, 61, 75, 83, 93, 93, 89, 67, 58, 63, 76, 85, 95, 94, 91,
+ 68, 59, 64, 74, 86, 94, 98, 94, 69, 60, 65, 72, 85, 95, 99, 97,
+ 70, 62, 66, 73, 83, 96, 99, 98, 71, 63, 67, 74, 82, 93, 102, 102,
+ 72, 64, 66, 75, 83, 92, 101, 104, 73, 65, 66, 75, 84, 93, 102, 106,
+ 74, 67, 66, 74, 84, 94, 103, 106, 75, 68, 66, 74, 83, 93, 103, 109,
+ 76, 69, 67, 73, 82, 91, 101, 109, 77, 70, 67, 73, 81, 90, 99, 108}},
+ {{32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 45, 62, 75,
+ 86, 91, 31, 32, 35, 44, 60, 72, 84, 90, 31, 33, 35, 44,
+ 59, 71, 82, 87, 32, 34, 36, 45, 59, 71, 80, 87, 32, 35,
+ 38, 45, 58, 69, 80, 86, 32, 35, 40, 47, 60, 71, 78, 85,
+ 34, 36, 42, 50, 63, 73, 82, 84, 36, 37, 48, 56, 68, 78,
+ 83, 87, 38, 39, 49, 58, 71, 81, 88, 90, 39, 40, 50, 60,
+ 73, 84, 91, 94, 44, 42, 53, 66, 79, 90, 94, 96, 47, 45,
+ 56, 69, 84, 95, 101, 101, 49, 47, 57, 71, 86, 97, 103, 102,
+ 53, 50, 60, 75, 92, 103, 108, 110, 58, 54, 63, 79, 98, 110,
+ 114, 111, 61, 56, 65, 81, 100, 113, 116, 118, 65, 59, 68, 84,
+ 105, 118, 124, 121, 71, 64, 73, 89, 111, 125, 129, 129, 76, 68,
+ 76, 92, 115, 130, 134, 132, 79, 70, 79, 95, 118, 133, 142, 138,
+ 82, 73, 81, 97, 121, 136, 145, 144, 86, 76, 84, 100, 124, 140,
+ 153, 150, 89, 79, 87, 99, 124, 145, 156, 156, 92, 82, 89, 101,
+ 121, 148, 157, 161, 95, 85, 92, 105, 120, 143, 163, 171, 98, 88,
+ 93, 108, 124, 141, 163, 174, 101, 91, 94, 110, 128, 146, 166, 179,
+ 104, 94, 95, 110, 129, 151, 171, 181, 107, 97, 96, 110, 128, 149,
+ 173, 188, 110, 100, 98, 111, 127, 147, 169, 188, 114, 104, 100, 111,
+ 127, 145, 166, 190},
+ {32, 35, 48, 50, 57, 63, 68, 70, 31, 37, 47, 48, 54, 60, 64, 66,
+ 30, 38, 46, 46, 52, 58, 63, 65, 31, 38, 46, 46, 52, 57, 61, 63,
+ 33, 41, 47, 46, 51, 56, 60, 63, 37, 45, 47, 46, 50, 54, 59, 62,
+ 39, 46, 48, 47, 51, 55, 58, 61, 42, 46, 50, 50, 53, 57, 60, 60,
+ 49, 48, 53, 54, 57, 60, 61, 61, 48, 47, 53, 55, 58, 62, 64, 63,
+ 48, 46, 53, 56, 60, 64, 65, 65, 49, 45, 53, 59, 64, 67, 67, 66,
+ 50, 46, 54, 61, 66, 70, 71, 69, 51, 47, 54, 61, 68, 71, 72, 70,
+ 52, 47, 54, 63, 71, 75, 75, 74, 54, 49, 55, 65, 73, 78, 78, 74,
+ 55, 49, 56, 65, 74, 79, 79, 78, 57, 50, 56, 66, 76, 82, 83, 79,
+ 60, 53, 58, 68, 79, 85, 85, 82, 62, 54, 60, 69, 81, 87, 87, 84,
+ 63, 55, 60, 70, 82, 89, 91, 87, 64, 56, 61, 71, 83, 90, 92, 89,
+ 66, 58, 62, 72, 84, 91, 95, 91, 67, 59, 63, 71, 83, 93, 96, 94,
+ 68, 60, 64, 71, 81, 94, 97, 96, 69, 61, 65, 72, 80, 91, 99, 100,
+ 70, 62, 65, 73, 81, 89, 98, 101, 71, 64, 65, 73, 82, 90, 99, 103,
+ 72, 65, 65, 72, 82, 92, 100, 103, 73, 66, 65, 72, 81, 90, 100, 105,
+ 74, 67, 65, 71, 79, 89, 98, 105, 75, 68, 65, 71, 78, 87, 96, 105}},
+ {{32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 42, 55, 75,
+ 83, 88, 31, 32, 35, 41, 54, 73, 81, 88, 31, 32, 34, 41,
+ 53, 72, 79, 84, 32, 33, 36, 42, 53, 71, 78, 84, 32, 34,
+ 37, 42, 53, 70, 77, 83, 32, 34, 38, 42, 52, 69, 76, 82,
+ 34, 35, 42, 48, 57, 73, 79, 81, 34, 36, 44, 50, 59, 75,
+ 81, 84, 36, 37, 48, 54, 63, 78, 85, 86, 39, 39, 50, 58,
+ 68, 84, 88, 90, 40, 40, 51, 59, 70, 85, 91, 92, 44, 42,
+ 53, 63, 74, 90, 97, 97, 47, 45, 56, 66, 79, 95, 99, 98,
+ 49, 46, 57, 67, 81, 97, 104, 105, 53, 50, 60, 71, 86, 103,
+ 109, 106, 57, 53, 63, 74, 90, 108, 111, 113, 59, 54, 64, 75,
+ 91, 111, 119, 115, 65, 59, 68, 79, 97, 118, 123, 122, 69, 62,
+ 71, 83, 100, 122, 127, 125, 71, 64, 73, 84, 102, 125, 135, 131,
+ 79, 71, 79, 90, 109, 133, 137, 136, 81, 72, 80, 91, 110, 135,
+ 145, 141, 82, 73, 81, 92, 111, 136, 147, 147, 87, 77, 85, 96,
+ 114, 140, 148, 151, 90, 80, 87, 99, 113, 135, 153, 160, 92, 83,
+ 88, 102, 117, 133, 153, 163, 95, 85, 88, 103, 120, 137, 155, 168,
+ 98, 88, 89, 103, 121, 141, 160, 169, 100, 91, 90, 103, 120, 139,
+ 161, 175, 103, 94, 92, 103, 119, 137, 158, 175, 106, 97, 93, 104,
+ 118, 135, 155, 176},
+ {32, 34, 48, 49, 54, 63, 67, 69, 31, 35, 47, 47, 51, 60, 63, 65,
+ 31, 36, 46, 46, 50, 58, 62, 65, 30, 36, 46, 45, 49, 57, 60, 62,
+ 33, 40, 47, 46, 49, 56, 59, 62, 35, 42, 47, 45, 48, 55, 58, 61,
+ 37, 44, 47, 45, 48, 54, 57, 60, 42, 45, 50, 49, 51, 57, 59, 59,
+ 44, 46, 51, 51, 53, 59, 60, 61, 49, 47, 53, 53, 55, 60, 63, 62,
+ 48, 46, 53, 56, 58, 64, 64, 64, 48, 46, 53, 56, 59, 65, 66, 65,
+ 49, 45, 53, 58, 62, 67, 70, 68, 50, 46, 54, 59, 65, 70, 70, 68,
+ 51, 47, 54, 60, 65, 71, 73, 72, 52, 47, 54, 61, 68, 75, 76, 73,
+ 54, 49, 55, 62, 70, 77, 77, 76, 54, 49, 55, 62, 70, 78, 81, 77,
+ 57, 51, 56, 64, 73, 82, 83, 81, 59, 52, 58, 65, 74, 84, 85, 82,
+ 60, 53, 58, 65, 75, 85, 89, 85, 63, 56, 60, 67, 77, 89, 90, 87,
+ 64, 57, 61, 68, 78, 89, 93, 89, 64, 57, 61, 68, 78, 90, 94, 92,
+ 66, 59, 63, 69, 79, 91, 94, 93, 67, 60, 63, 70, 78, 88, 96, 97,
+ 68, 61, 63, 71, 79, 87, 96, 98, 69, 62, 63, 71, 80, 88, 96, 100,
+ 70, 63, 63, 70, 80, 89, 97, 100, 71, 64, 63, 70, 78, 88, 97, 102,
+ 72, 65, 63, 69, 77, 86, 95, 102, 73, 66, 63, 69, 76, 84, 93, 101}},
+ {{32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 35, 42, 51, 62,
+ 78, 86, 31, 32, 34, 41, 50, 61, 76, 85, 31, 32, 34, 41,
+ 49, 59, 74, 82, 31, 33, 35, 42, 49, 59, 73, 81, 32, 33,
+ 36, 42, 50, 59, 73, 80, 32, 34, 37, 42, 49, 58, 71, 79,
+ 32, 34, 39, 44, 51, 60, 73, 78, 34, 35, 41, 48, 54, 63,
+ 76, 81, 35, 36, 45, 52, 59, 67, 79, 83, 36, 36, 46, 54,
+ 60, 68, 80, 87, 39, 39, 48, 58, 65, 73, 86, 88, 41, 40,
+ 49, 60, 67, 76, 88, 93, 44, 42, 51, 63, 71, 79, 92, 94,
+ 47, 44, 53, 66, 75, 84, 97, 101, 48, 45, 54, 67, 76, 85,
+ 98, 101, 53, 50, 57, 71, 82, 92, 106, 108, 55, 51, 59, 72,
+ 84, 94, 108, 110, 58, 54, 61, 75, 87, 98, 112, 116, 63, 58,
+ 65, 78, 91, 103, 118, 119, 65, 59, 66, 79, 92, 105, 120, 124,
+ 71, 64, 71, 84, 97, 111, 127, 129, 74, 67, 73, 86, 100, 113,
+ 131, 134, 79, 71, 77, 90, 104, 118, 136, 139, 82, 73, 79, 92,
+ 105, 120, 139, 142, 82, 74, 79, 92, 106, 121, 139, 150, 87, 78,
+ 83, 96, 110, 125, 144, 153, 89, 81, 83, 97, 113, 128, 145, 157,
+ 92, 83, 84, 97, 114, 132, 150, 157, 94, 85, 85, 97, 112, 130,
+ 151, 163, 97, 88, 86, 97, 111, 128, 147, 163, 99, 91, 87, 97,
+ 110, 126, 144, 163},
+ {32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 47, 50, 54, 61, 64,
+ 31, 34, 45, 46, 49, 53, 60, 64, 30, 35, 44, 45, 48, 52, 58, 61,
+ 33, 37, 46, 45, 47, 51, 57, 61, 33, 38, 46, 46, 47, 51, 57, 60,
+ 37, 43, 47, 45, 47, 50, 55, 59, 39, 43, 48, 47, 48, 51, 56, 58,
+ 42, 44, 49, 49, 50, 53, 58, 60, 47, 46, 51, 53, 53, 56, 61, 61,
+ 49, 47, 52, 53, 54, 57, 61, 63, 48, 46, 51, 56, 57, 60, 64, 64,
+ 48, 46, 51, 57, 59, 61, 66, 67, 49, 45, 51, 58, 61, 64, 68, 67,
+ 50, 46, 52, 59, 63, 66, 71, 71, 50, 46, 52, 59, 64, 67, 71, 71,
+ 52, 47, 53, 61, 66, 71, 75, 74, 53, 48, 53, 61, 67, 72, 77, 75,
+ 54, 49, 54, 62, 68, 73, 79, 79, 56, 51, 55, 63, 70, 76, 82, 80,
+ 57, 51, 55, 64, 70, 76, 83, 83, 60, 54, 57, 65, 72, 79, 86, 85,
+ 61, 55, 58, 66, 73, 80, 87, 87, 63, 56, 59, 67, 75, 82, 90, 89,
+ 64, 57, 60, 68, 75, 83, 91, 91, 64, 58, 60, 68, 75, 83, 91, 94,
+ 66, 59, 61, 69, 77, 84, 93, 95, 67, 60, 61, 69, 78, 85, 93, 97,
+ 68, 61, 61, 68, 77, 86, 94, 97, 69, 62, 61, 68, 76, 85, 94, 99,
+ 70, 63, 61, 67, 75, 83, 92, 98, 70, 64, 61, 67, 74, 82, 90, 98}},
+ {{32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 62,
+ 75, 83, 31, 32, 33, 39, 49, 61, 74, 82, 31, 32, 33, 38,
+ 47, 59, 72, 79, 31, 32, 34, 38, 47, 59, 71, 79, 32, 33,
+ 35, 39, 48, 59, 71, 78, 32, 33, 36, 40, 48, 58, 69, 77,
+ 32, 33, 36, 41, 48, 58, 69, 75, 33, 34, 38, 44, 52, 62,
+ 72, 78, 34, 34, 39, 45, 53, 63, 73, 80, 36, 35, 42, 51,
+ 58, 68, 78, 84, 36, 35, 42, 51, 59, 68, 79, 85, 39, 38,
+ 44, 54, 63, 73, 84, 89, 40, 39, 45, 56, 65, 75, 85, 90,
+ 44, 41, 46, 59, 69, 79, 90, 96, 46, 43, 48, 60, 72, 82,
+ 93, 97, 48, 45, 50, 62, 74, 85, 96, 103, 52, 48, 52, 65,
+ 78, 90, 101, 105, 53, 49, 53, 66, 79, 92, 103, 111, 58, 53,
+ 57, 69, 83, 97, 109, 113, 58, 54, 57, 70, 84, 98, 110, 118,
+ 65, 59, 62, 74, 89, 105, 118, 122, 66, 60, 63, 75, 90, 106,
+ 119, 126, 71, 65, 67, 79, 94, 111, 125, 131, 74, 67, 69, 81,
+ 97, 113, 128, 134, 79, 72, 73, 85, 101, 118, 133, 141, 81, 73,
+ 75, 86, 102, 120, 135, 143, 82, 74, 75, 87, 103, 121, 136, 147,
+ 86, 78, 78, 90, 106, 124, 140, 147, 88, 80, 80, 90, 105, 122,
+ 140, 152, 91, 82, 80, 90, 103, 119, 137, 151, 93, 85, 81, 90,
+ 103, 117, 134, 152},
+ {32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 60, 63,
+ 31, 33, 41, 47, 49, 54, 59, 63, 30, 33, 42, 45, 47, 52, 57, 60,
+ 31, 35, 43, 46, 47, 51, 57, 60, 33, 37, 44, 46, 47, 51, 56, 59,
+ 35, 39, 46, 46, 47, 50, 55, 58, 37, 41, 47, 46, 46, 50, 54, 57,
+ 41, 43, 48, 49, 49, 52, 57, 59, 42, 43, 48, 49, 50, 53, 57, 60,
+ 49, 47, 50, 53, 54, 57, 60, 62, 49, 47, 50, 53, 54, 57, 61, 63,
+ 48, 46, 49, 54, 57, 60, 64, 65, 48, 46, 49, 55, 58, 61, 65, 66,
+ 49, 45, 48, 56, 61, 64, 67, 69, 49, 46, 49, 57, 62, 65, 69, 70,
+ 50, 46, 49, 57, 63, 67, 71, 73, 51, 47, 49, 58, 64, 69, 73, 74,
+ 52, 48, 50, 58, 65, 71, 75, 77, 54, 49, 51, 59, 67, 73, 77, 78,
+ 54, 50, 51, 59, 67, 73, 78, 81, 57, 52, 52, 60, 69, 76, 82, 83,
+ 57, 52, 53, 61, 69, 77, 82, 85, 60, 54, 55, 62, 71, 79, 85, 87,
+ 61, 55, 56, 63, 72, 80, 86, 88, 63, 57, 57, 64, 73, 82, 89, 92,
+ 64, 58, 58, 65, 73, 82, 89, 92, 64, 58, 58, 65, 74, 83, 90, 94,
+ 66, 59, 59, 66, 75, 84, 91, 94, 67, 60, 59, 66, 74, 82, 91, 96,
+ 68, 61, 59, 65, 72, 81, 89, 95, 68, 62, 59, 65, 71, 79, 87, 95}},
+ {{32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75,
+ 31, 32, 32, 35, 42, 51, 62, 75, 31, 32, 33, 34, 41, 49, 59, 72,
+ 31, 32, 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71,
+ 32, 32, 34, 36, 42, 50, 59, 71, 32, 33, 35, 38, 42, 49, 58, 69,
+ 32, 33, 35, 38, 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73,
+ 34, 34, 37, 42, 48, 54, 63, 73, 36, 34, 38, 48, 54, 60, 68, 78,
+ 36, 34, 38, 48, 54, 60, 68, 78, 39, 37, 40, 50, 58, 65, 73, 84,
+ 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71, 79, 90,
+ 44, 41, 43, 53, 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96,
+ 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103,
+ 53, 49, 50, 60, 71, 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110,
+ 58, 54, 54, 63, 75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118,
+ 65, 60, 58, 68, 79, 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125,
+ 71, 65, 63, 73, 84, 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133,
+ 79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136,
+ 82, 75, 72, 81, 92, 106, 121, 136, 87, 79, 76, 84, 96, 109, 124, 141},
+ {32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60,
+ 31, 31, 38, 47, 47, 50, 54, 60, 30, 32, 40, 46, 45, 48, 52, 57,
+ 30, 32, 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56,
+ 33, 36, 43, 47, 46, 47, 51, 56, 37, 40, 47, 47, 45, 47, 50, 54,
+ 37, 40, 47, 47, 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57,
+ 42, 43, 47, 50, 49, 50, 53, 57, 49, 46, 48, 53, 53, 54, 57, 60,
+ 49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, 56, 57, 60, 64,
+ 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61, 64, 67,
+ 49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71,
+ 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75,
+ 52, 48, 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78,
+ 54, 50, 49, 55, 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82,
+ 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, 65, 72, 79, 85,
+ 60, 54, 52, 58, 65, 72, 79, 85, 63, 57, 55, 60, 67, 75, 82, 89,
+ 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, 83, 90,
+ 64, 59, 56, 61, 68, 75, 83, 90, 66, 60, 57, 63, 69, 77, 84, 92}},
+ {{32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 60, 70,
+ 31, 32, 32, 35, 42, 51, 59, 69, 31, 32, 32, 35, 41, 50, 58, 67,
+ 31, 32, 33, 34, 41, 49, 57, 66, 31, 32, 33, 35, 41, 49, 57, 66,
+ 32, 32, 34, 36, 42, 50, 57, 65, 32, 32, 34, 37, 42, 49, 56, 65,
+ 32, 33, 35, 38, 42, 49, 56, 64, 32, 33, 35, 39, 43, 50, 56, 64,
+ 34, 34, 37, 42, 48, 54, 61, 69, 34, 34, 37, 42, 48, 54, 61, 69,
+ 35, 34, 38, 47, 52, 59, 65, 73, 36, 34, 38, 48, 54, 60, 66, 74,
+ 38, 36, 40, 49, 56, 63, 69, 77, 39, 37, 40, 50, 58, 65, 71, 79,
+ 41, 39, 41, 51, 60, 67, 74, 81, 44, 41, 43, 53, 63, 71, 78, 85,
+ 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67, 76, 83, 91,
+ 48, 45, 46, 56, 67, 76, 83, 91, 53, 49, 49, 59, 71, 81, 89, 98,
+ 53, 49, 50, 60, 71, 82, 90, 99, 57, 52, 52, 62, 74, 85, 94, 103,
+ 58, 54, 54, 63, 75, 87, 95, 105, 61, 57, 56, 66, 77, 89, 98, 108,
+ 65, 60, 58, 68, 79, 92, 102, 112, 67, 61, 60, 69, 81, 94, 103, 114,
+ 71, 65, 63, 73, 84, 97, 108, 119, 72, 66, 64, 73, 85, 98, 108, 119,
+ 79, 72, 70, 79, 90, 104, 115, 127, 79, 72, 70, 79, 90, 104, 115, 127},
+ {32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 54, 58,
+ 31, 31, 38, 47, 47, 50, 53, 57, 30, 32, 39, 46, 46, 48, 52, 56,
+ 30, 32, 40, 46, 45, 48, 51, 55, 32, 34, 41, 46, 45, 48, 51, 54,
+ 33, 36, 43, 47, 46, 47, 50, 54, 34, 37, 44, 47, 45, 47, 50, 53,
+ 37, 40, 47, 47, 45, 47, 49, 52, 37, 40, 47, 48, 46, 47, 49, 53,
+ 42, 43, 47, 50, 49, 50, 53, 56, 42, 43, 47, 50, 49, 50, 53, 56,
+ 47, 46, 48, 52, 53, 53, 55, 58, 49, 46, 48, 53, 53, 54, 56, 59,
+ 48, 46, 47, 53, 55, 56, 58, 61, 48, 46, 47, 53, 56, 57, 59, 62,
+ 48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 61, 63, 66,
+ 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69,
+ 50, 46, 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 69, 72,
+ 52, 48, 47, 54, 61, 66, 70, 73, 53, 49, 48, 55, 62, 68, 71, 75,
+ 54, 50, 49, 55, 62, 68, 72, 76, 55, 51, 49, 56, 63, 69, 74, 78,
+ 57, 52, 50, 56, 64, 70, 75, 79, 58, 53, 51, 57, 64, 71, 76, 80,
+ 60, 54, 52, 58, 65, 72, 77, 82, 60, 55, 53, 59, 65, 73, 78, 83,
+ 63, 57, 55, 60, 67, 75, 80, 86, 63, 57, 55, 60, 67, 75, 80, 86}},
+ {{32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 52, 63,
+ 31, 32, 32, 35, 38, 42, 51, 62, 31, 32, 32, 34, 37, 41, 50, 61,
+ 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 33, 34, 37, 41, 49, 59,
+ 31, 32, 34, 35, 38, 42, 49, 59, 32, 32, 34, 36, 38, 42, 50, 59,
+ 32, 32, 34, 36, 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58,
+ 32, 33, 35, 37, 40, 42, 49, 58, 33, 33, 36, 40, 43, 46, 53, 62,
+ 34, 34, 37, 41, 44, 48, 54, 63, 34, 34, 37, 43, 46, 50, 56, 65,
+ 36, 34, 38, 46, 50, 54, 60, 68, 36, 34, 38, 46, 50, 54, 60, 68,
+ 38, 37, 40, 47, 52, 57, 64, 72, 39, 37, 40, 48, 53, 58, 65, 73,
+ 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79,
+ 44, 41, 43, 51, 57, 63, 71, 79, 47, 44, 45, 53, 59, 66, 75, 84,
+ 48, 45, 46, 54, 60, 67, 76, 85, 50, 46, 47, 55, 61, 68, 78, 88,
+ 53, 49, 50, 57, 64, 71, 82, 92, 53, 49, 50, 57, 64, 71, 82, 92,
+ 57, 53, 53, 60, 67, 74, 86, 97, 58, 54, 54, 61, 68, 75, 87, 98,
+ 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, 92, 105,
+ 65, 60, 58, 66, 72, 79, 92, 105, 70, 64, 62, 70, 76, 83, 96, 109},
+ {32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 55,
+ 31, 31, 38, 45, 47, 47, 50, 54, 31, 32, 39, 45, 46, 46, 49, 53,
+ 30, 32, 40, 44, 45, 45, 48, 52, 30, 32, 40, 44, 45, 45, 48, 52,
+ 33, 35, 42, 46, 46, 45, 47, 51, 33, 36, 43, 46, 46, 46, 47, 51,
+ 35, 37, 44, 46, 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50,
+ 37, 40, 47, 47, 47, 45, 47, 50, 41, 42, 47, 49, 49, 48, 50, 52,
+ 42, 43, 47, 49, 50, 49, 50, 53, 44, 44, 47, 50, 51, 51, 52, 54,
+ 49, 46, 48, 52, 53, 53, 54, 57, 49, 46, 48, 52, 53, 53, 54, 57,
+ 48, 46, 47, 51, 54, 55, 57, 59, 48, 46, 47, 51, 54, 56, 57, 60,
+ 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64,
+ 49, 45, 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 63, 66,
+ 50, 46, 46, 52, 56, 59, 64, 67, 51, 47, 47, 52, 56, 60, 65, 68,
+ 52, 48, 47, 53, 57, 61, 66, 71, 52, 48, 47, 53, 57, 61, 66, 71,
+ 54, 49, 48, 54, 58, 62, 68, 73, 54, 50, 49, 54, 58, 62, 68, 73,
+ 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, 70, 76,
+ 57, 52, 50, 55, 59, 64, 70, 76, 59, 54, 52, 57, 61, 65, 72, 78}},
+ {{32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 43, 46, 52,
+ 31, 32, 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, 35, 42, 45, 51,
+ 31, 32, 32, 33, 35, 41, 44, 49, 31, 32, 32, 33, 34, 41, 44, 49,
+ 31, 32, 33, 33, 35, 41, 44, 49, 32, 32, 33, 34, 36, 42, 45, 49,
+ 32, 32, 34, 34, 36, 42, 45, 50, 32, 32, 34, 35, 37, 42, 45, 49,
+ 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 38, 42, 45, 49,
+ 32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, 42, 48, 50, 54,
+ 34, 34, 36, 38, 42, 48, 50, 54, 35, 34, 37, 39, 45, 50, 53, 57,
+ 36, 34, 37, 40, 48, 54, 56, 60, 36, 34, 37, 40, 48, 54, 56, 60,
+ 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65,
+ 39, 37, 40, 42, 50, 58, 60, 65, 42, 40, 42, 44, 52, 61, 64, 69,
+ 44, 41, 42, 45, 53, 63, 66, 71, 44, 41, 43, 45, 54, 63, 66, 72,
+ 47, 44, 45, 47, 56, 66, 69, 75, 48, 45, 46, 48, 56, 67, 70, 76,
+ 49, 46, 47, 48, 57, 67, 71, 77, 53, 49, 49, 51, 59, 71, 74, 81,
+ 53, 49, 50, 51, 60, 71, 75, 82, 55, 51, 51, 53, 61, 72, 76, 83,
+ 58, 54, 54, 55, 63, 75, 79, 87, 58, 54, 54, 55, 63, 75, 79, 87},
+ {32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 36, 39, 47, 48, 49, 50,
+ 31, 31, 37, 40, 47, 47, 48, 50, 31, 31, 37, 40, 47, 47, 48, 50,
+ 30, 32, 38, 40, 46, 45, 46, 48, 30, 32, 38, 41, 46, 45, 46, 48,
+ 31, 33, 38, 41, 46, 45, 46, 48, 33, 35, 41, 43, 47, 45, 46, 47,
+ 33, 36, 41, 44, 47, 46, 46, 47, 34, 37, 42, 45, 47, 45, 46, 47,
+ 37, 40, 45, 47, 47, 45, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47,
+ 39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, 50, 49, 50, 50,
+ 42, 43, 46, 48, 50, 49, 50, 50, 45, 44, 47, 48, 51, 51, 52, 52,
+ 49, 46, 48, 49, 53, 53, 54, 54, 49, 46, 48, 49, 53, 53, 54, 54,
+ 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57,
+ 48, 46, 46, 48, 53, 56, 56, 57, 49, 45, 46, 47, 53, 57, 58, 60,
+ 49, 45, 45, 47, 53, 58, 59, 61, 49, 45, 46, 47, 53, 58, 60, 61,
+ 50, 46, 46, 48, 54, 59, 61, 63, 50, 46, 46, 48, 54, 59, 61, 64,
+ 51, 47, 47, 48, 54, 60, 61, 64, 52, 48, 47, 48, 54, 61, 63, 66,
+ 52, 48, 47, 48, 54, 61, 63, 66, 53, 48, 48, 49, 54, 61, 63, 67,
+ 54, 50, 49, 50, 55, 62, 65, 68, 54, 50, 49, 50, 55, 62, 65, 68}},
+ {{32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 43, 46,
+ 31, 32, 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, 35, 35, 42, 45,
+ 31, 32, 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44,
+ 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 32, 33, 34, 35, 41, 44,
+ 31, 32, 33, 34, 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45,
+ 32, 32, 33, 34, 36, 36, 42, 45, 32, 32, 33, 35, 37, 37, 42, 45,
+ 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 35, 37, 38, 42, 45,
+ 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42, 48, 50,
+ 34, 34, 35, 37, 41, 42, 48, 50, 34, 34, 35, 37, 42, 43, 49, 51,
+ 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56,
+ 36, 34, 36, 38, 46, 48, 54, 56, 38, 36, 37, 40, 47, 49, 56, 58,
+ 39, 37, 39, 40, 48, 50, 58, 60, 39, 37, 39, 40, 48, 50, 58, 60,
+ 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, 51, 53, 63, 66,
+ 44, 41, 42, 43, 51, 53, 63, 66, 44, 42, 42, 43, 51, 54, 64, 67,
+ 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, 67, 70,
+ 48, 45, 45, 46, 54, 56, 67, 70, 51, 47, 48, 48, 56, 58, 69, 73},
+ {32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 33, 38, 45, 47, 48, 49,
+ 31, 31, 34, 38, 45, 47, 47, 48, 31, 31, 34, 38, 45, 47, 47, 48,
+ 31, 32, 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46,
+ 30, 32, 35, 40, 44, 46, 45, 46, 31, 33, 35, 40, 45, 46, 45, 46,
+ 33, 35, 37, 42, 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46,
+ 33, 36, 38, 43, 46, 47, 46, 46, 35, 38, 41, 45, 47, 47, 45, 46,
+ 37, 40, 43, 47, 47, 47, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46,
+ 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50, 49, 50,
+ 42, 43, 44, 47, 49, 50, 49, 50, 43, 43, 45, 47, 50, 50, 50, 50,
+ 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54,
+ 49, 46, 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 52, 53, 55, 55,
+ 48, 46, 46, 47, 51, 53, 56, 56, 48, 46, 46, 47, 51, 53, 56, 56,
+ 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, 51, 53, 58, 59,
+ 49, 45, 45, 46, 51, 53, 58, 59, 49, 45, 45, 46, 52, 53, 58, 60,
+ 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61,
+ 50, 46, 46, 46, 52, 54, 59, 61, 51, 47, 47, 47, 52, 54, 60, 62}},
+ {{32, 31, 31, 32, 32, 36, 36, 44, 31, 31, 31, 32, 32, 35, 35, 43,
+ 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 42,
+ 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 41,
+ 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 34, 34, 41,
+ 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 35, 35, 41,
+ 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, 36, 42,
+ 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 37, 37, 42,
+ 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, 38, 42,
+ 32, 33, 33, 35, 35, 38, 38, 42, 33, 33, 33, 36, 36, 40, 40, 45,
+ 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48,
+ 34, 34, 34, 37, 37, 42, 42, 48, 35, 34, 34, 37, 37, 45, 45, 50,
+ 36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54,
+ 36, 34, 34, 38, 38, 48, 48, 54, 37, 36, 36, 39, 39, 49, 49, 56,
+ 39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58,
+ 39, 37, 37, 40, 40, 50, 50, 58, 41, 39, 39, 42, 42, 52, 52, 60,
+ 44, 41, 41, 43, 43, 53, 53, 63, 44, 41, 41, 43, 43, 53, 53, 63},
+ {32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 37, 37, 47, 47, 48,
+ 31, 31, 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, 38, 47, 47, 47,
+ 31, 31, 31, 38, 38, 47, 47, 47, 31, 32, 32, 39, 39, 46, 46, 46,
+ 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, 40, 46, 46, 45,
+ 30, 32, 32, 40, 40, 46, 46, 45, 32, 34, 34, 41, 41, 46, 46, 45,
+ 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, 47, 46,
+ 33, 36, 36, 43, 43, 47, 47, 46, 35, 38, 38, 45, 45, 47, 47, 45,
+ 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45,
+ 37, 40, 40, 47, 47, 47, 47, 45, 39, 41, 41, 47, 47, 49, 49, 47,
+ 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49,
+ 42, 43, 43, 47, 47, 50, 50, 49, 45, 44, 44, 47, 47, 51, 51, 51,
+ 49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53,
+ 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, 47, 53, 53, 54,
+ 48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56,
+ 48, 46, 46, 47, 47, 53, 53, 56, 48, 45, 45, 46, 46, 53, 53, 57,
+ 49, 45, 45, 46, 46, 53, 53, 58, 49, 45, 45, 46, 46, 53, 53, 58}},
+ {{32, 31, 31, 31, 32, 32, 35, 36, 31, 31, 31, 32, 32, 32, 35, 35,
+ 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35,
+ 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35,
+ 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, 32, 32, 34, 35,
+ 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34,
+ 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 33, 33, 33, 35, 35,
+ 31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, 34, 34, 36, 36,
+ 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 36,
+ 32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 33, 35, 35, 37, 38,
+ 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38,
+ 32, 32, 33, 34, 35, 35, 37, 38, 32, 33, 33, 34, 36, 36, 39, 40,
+ 33, 33, 33, 35, 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42,
+ 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 41, 42,
+ 34, 34, 34, 35, 37, 37, 43, 44, 35, 34, 34, 36, 38, 38, 45, 47,
+ 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48,
+ 36, 35, 34, 36, 38, 38, 46, 48, 37, 36, 36, 37, 39, 39, 46, 49},
+ {32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 33, 37, 37, 45, 48,
+ 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47,
+ 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47,
+ 31, 31, 32, 34, 39, 39, 45, 46, 30, 31, 32, 34, 39, 39, 44, 46,
+ 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46,
+ 30, 32, 32, 35, 40, 40, 44, 46, 31, 33, 33, 36, 41, 41, 45, 46,
+ 33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47,
+ 33, 35, 36, 38, 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47,
+ 35, 37, 37, 40, 44, 44, 46, 47, 36, 38, 39, 42, 46, 46, 47, 47,
+ 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47,
+ 37, 39, 40, 43, 47, 47, 47, 47, 39, 40, 41, 43, 47, 47, 48, 48,
+ 41, 42, 42, 44, 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50,
+ 42, 42, 43, 44, 47, 47, 49, 50, 42, 42, 43, 44, 47, 47, 49, 50,
+ 44, 44, 44, 45, 47, 47, 50, 51, 47, 46, 46, 46, 48, 48, 51, 52,
+ 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53,
+ 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 47, 47, 52, 53}},
+ {{32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 33,
+ 31, 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 33, 33, 34,
+ 31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34,
+ 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34,
+ 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 35,
+ 32, 32, 32, 32, 33, 35, 35, 35, 32, 32, 33, 33, 33, 35, 35, 36,
+ 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36,
+ 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36,
+ 32, 33, 33, 33, 34, 36, 36, 36, 33, 33, 33, 33, 34, 36, 36, 37,
+ 34, 34, 34, 34, 35, 37, 37, 38, 34, 34, 34, 34, 35, 37, 37, 38},
+ {32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 37, 37, 39,
+ 31, 31, 31, 31, 33, 38, 38, 39, 31, 31, 31, 31, 34, 38, 38, 40,
+ 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40,
+ 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40,
+ 31, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 34, 39, 39, 40,
+ 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41,
+ 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41,
+ 31, 32, 33, 33, 35, 40, 40, 41, 32, 33, 34, 34, 36, 41, 41, 42,
+ 33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44,
+ 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44,
+ 33, 35, 36, 36, 38, 43, 43, 44, 34, 36, 37, 37, 39, 44, 44, 45,
+ 35, 37, 38, 38, 41, 45, 45, 46, 36, 38, 39, 39, 42, 47, 47, 47,
+ 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+ 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+ 39, 40, 41, 41, 43, 47, 47, 47, 40, 41, 42, 42, 44, 47, 47, 47,
+ 42, 42, 43, 43, 44, 47, 47, 48, 42, 42, 43, 43, 44, 47, 47, 48}},
+ {{32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33,
+ 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34},
+ {32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 35,
+ 31, 31, 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, 31, 31, 33, 36,
+ 31, 31, 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 32, 32, 32, 34, 37, 31, 31, 31, 32, 32, 32, 34, 37,
+ 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 31, 31, 32, 33, 33, 33, 35, 38,
+ 31, 32, 33, 33, 33, 33, 36, 39, 32, 33, 34, 34, 34, 34, 37, 40,
+ 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, 36, 36, 38, 41,
+ 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41,
+ 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41,
+ 33, 34, 35, 36, 36, 36, 38, 41, 34, 35, 36, 36, 36, 36, 39, 42}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32,
+ 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32,
+ 30, 31, 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32}}};
+constexpr uint8_t kQuantizerMatrix16x32
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][512] = {
+ {{32, 31, 32, 34, 36, 44, 53, 59, 65, 79, 87, 90, 93, 96,
+ 99, 102, 31, 32, 32, 34, 35, 42, 51, 56, 62, 75, 82, 85,
+ 88, 91, 94, 97, 31, 32, 33, 33, 34, 41, 49, 54, 59, 72,
+ 78, 82, 86, 90, 93, 97, 31, 32, 33, 34, 35, 41, 49, 54,
+ 59, 71, 78, 81, 84, 87, 90, 93, 32, 32, 34, 35, 36, 42,
+ 50, 54, 59, 71, 77, 80, 82, 86, 89, 93, 32, 33, 35, 37,
+ 38, 42, 49, 53, 58, 69, 75, 78, 82, 86, 89, 92, 34, 34,
+ 37, 39, 42, 48, 54, 58, 63, 73, 79, 78, 80, 83, 88, 92,
+ 35, 34, 37, 41, 45, 50, 57, 61, 65, 76, 82, 83, 84, 84,
+ 87, 90, 36, 34, 38, 43, 48, 54, 60, 64, 68, 78, 84, 87,
+ 86, 89, 90, 90, 39, 37, 40, 45, 50, 58, 65, 69, 73, 84,
+ 89, 89, 91, 91, 93, 96, 44, 41, 43, 48, 53, 63, 71, 75,
+ 79, 90, 95, 93, 94, 95, 97, 97, 46, 43, 44, 49, 55, 65,
+ 73, 78, 82, 93, 98, 100, 98, 100, 99, 103, 48, 45, 46, 51,
+ 56, 67, 76, 80, 85, 96, 102, 102, 105, 102, 105, 104, 53, 49,
+ 50, 54, 60, 71, 82, 87, 92, 103, 109, 107, 107, 110, 107, 111,
+ 58, 54, 54, 58, 63, 75, 87, 92, 98, 110, 116, 115, 112, 111,
+ 115, 112, 61, 57, 56, 60, 66, 77, 89, 95, 101, 114, 120, 118,
+ 119, 118, 116, 120, 65, 60, 58, 63, 68, 79, 92, 98, 105, 118,
+ 124, 123, 122, 123, 124, 121, 71, 65, 63, 68, 73, 84, 97, 103,
+ 111, 125, 132, 132, 130, 128, 127, 130, 79, 72, 70, 74, 79, 90,
+ 104, 110, 118, 133, 141, 136, 135, 135, 135, 131, 81, 74, 71, 75,
+ 80, 91, 105, 112, 119, 135, 142, 140, 140, 138, 139, 142, 82, 75,
+ 72, 76, 81, 92, 106, 113, 121, 136, 144, 151, 149, 149, 146, 143,
+ 88, 80, 77, 80, 85, 97, 108, 115, 126, 142, 149, 153, 153, 152,
+ 152, 154, 91, 83, 80, 81, 88, 100, 106, 114, 130, 142, 148, 155,
+ 162, 160, 159, 155, 94, 85, 83, 82, 91, 100, 105, 118, 131, 137,
+ 153, 160, 165, 167, 166, 168, 97, 88, 86, 85, 94, 100, 107, 123,
+ 128, 140, 157, 161, 167, 173, 171, 169, 100, 91, 89, 87, 97, 100,
+ 111, 121, 127, 145, 152, 164, 173, 178, 182, 181, 103, 94, 93, 90,
+ 98, 101, 114, 120, 131, 144, 150, 170, 174, 180, 186, 183, 107, 97,
+ 96, 93, 100, 104, 117, 119, 136, 142, 155, 168, 177, 187, 191, 198,
+ 110, 101, 100, 97, 101, 108, 117, 123, 138, 141, 161, 165, 183, 188,
+ 193, 200, 114, 104, 104, 100, 103, 112, 117, 127, 137, 146, 159, 167,
+ 185, 190, 201, 206, 118, 108, 107, 103, 105, 115, 118, 131, 136, 151,
+ 157, 172, 182, 197, 203, 208, 122, 111, 111, 107, 107, 119, 119, 136,
+ 136, 156, 156, 178, 179, 203, 204, 217},
+ {32, 31, 37, 42, 48, 49, 52, 54, 57, 63, 66, 67, 68, 69, 71, 72,
+ 31, 31, 38, 42, 47, 47, 50, 52, 54, 60, 63, 64, 65, 66, 67, 68,
+ 30, 32, 40, 42, 46, 45, 48, 50, 52, 57, 60, 62, 63, 65, 66, 68,
+ 32, 34, 41, 44, 46, 45, 48, 49, 51, 57, 59, 61, 62, 63, 64, 65,
+ 33, 36, 43, 45, 47, 46, 47, 49, 51, 56, 59, 60, 60, 62, 63, 65,
+ 37, 40, 47, 47, 47, 45, 47, 48, 50, 54, 57, 58, 60, 61, 62, 63,
+ 42, 43, 47, 48, 50, 49, 50, 52, 53, 57, 60, 58, 59, 60, 62, 63,
+ 45, 44, 47, 49, 51, 51, 52, 54, 55, 59, 61, 61, 61, 60, 61, 61,
+ 49, 46, 48, 50, 53, 53, 54, 55, 57, 60, 62, 63, 62, 63, 62, 62,
+ 48, 46, 47, 50, 53, 56, 57, 59, 60, 64, 66, 65, 65, 64, 64, 65,
+ 49, 45, 46, 49, 53, 58, 61, 62, 64, 67, 69, 67, 66, 66, 66, 65,
+ 49, 46, 46, 49, 53, 59, 62, 64, 65, 69, 71, 70, 68, 68, 67, 68,
+ 50, 46, 46, 50, 54, 59, 64, 65, 67, 71, 73, 72, 72, 70, 70, 69,
+ 52, 48, 47, 50, 54, 61, 66, 68, 71, 75, 77, 74, 73, 73, 71, 72,
+ 54, 50, 49, 52, 55, 62, 68, 71, 73, 78, 80, 78, 76, 74, 75, 73,
+ 55, 51, 49, 52, 56, 63, 69, 72, 75, 80, 82, 80, 79, 78, 76, 77,
+ 57, 52, 50, 53, 56, 64, 70, 73, 76, 82, 84, 82, 80, 80, 79, 77,
+ 60, 54, 52, 55, 58, 65, 72, 75, 79, 85, 88, 86, 84, 82, 81, 81,
+ 63, 57, 55, 58, 60, 67, 75, 78, 82, 89, 92, 88, 87, 85, 84, 81,
+ 64, 58, 55, 58, 61, 68, 75, 78, 82, 89, 92, 90, 89, 87, 86, 86,
+ 64, 59, 56, 58, 61, 68, 75, 79, 83, 90, 93, 95, 93, 91, 89, 87,
+ 67, 61, 58, 60, 63, 69, 76, 79, 85, 92, 95, 96, 94, 92, 91, 91,
+ 68, 62, 59, 60, 64, 71, 74, 78, 86, 91, 94, 96, 98, 96, 94, 91,
+ 69, 62, 60, 60, 65, 70, 72, 79, 85, 88, 95, 98, 99, 98, 97, 96,
+ 70, 63, 62, 60, 66, 69, 73, 81, 83, 89, 96, 97, 99, 101, 98, 97,
+ 71, 64, 63, 61, 67, 68, 74, 79, 82, 90, 93, 98, 102, 102, 102, 101,
+ 72, 65, 64, 62, 66, 68, 75, 78, 83, 89, 92, 100, 101, 103, 104, 102,
+ 73, 66, 65, 63, 66, 69, 75, 76, 84, 87, 93, 98, 102, 105, 106, 107,
+ 74, 67, 67, 64, 66, 70, 74, 77, 84, 86, 94, 96, 103, 105, 106, 107,
+ 75, 68, 68, 65, 66, 71, 74, 78, 83, 87, 93, 96, 103, 105, 109, 109,
+ 76, 69, 69, 66, 67, 72, 73, 80, 82, 88, 91, 97, 101, 107, 109, 110,
+ 77, 70, 70, 67, 67, 73, 73, 81, 81, 90, 90, 99, 99, 108, 108, 113}},
+ {{32, 31, 32, 32, 36, 44, 47, 53, 65, 73, 79, 87, 90, 93,
+ 96, 99, 31, 32, 32, 33, 35, 42, 45, 51, 62, 69, 75, 83,
+ 86, 88, 91, 94, 31, 32, 32, 33, 35, 41, 44, 49, 60, 67,
+ 72, 80, 84, 87, 90, 94, 31, 32, 33, 33, 35, 41, 44, 49,
+ 59, 66, 71, 79, 82, 84, 87, 90, 32, 32, 34, 34, 36, 42,
+ 45, 50, 59, 65, 71, 78, 80, 83, 87, 90, 32, 33, 35, 36,
+ 38, 42, 45, 49, 58, 64, 69, 76, 80, 83, 86, 88, 32, 33,
+ 35, 36, 40, 44, 47, 51, 60, 66, 71, 76, 78, 81, 85, 89,
+ 34, 34, 36, 38, 42, 48, 50, 54, 63, 69, 73, 80, 82, 81,
+ 84, 86, 36, 34, 37, 40, 48, 54, 56, 60, 68, 74, 78, 84,
+ 83, 86, 87, 87, 38, 36, 39, 41, 49, 56, 58, 63, 71, 77,
+ 81, 86, 88, 88, 90, 93, 39, 37, 40, 42, 50, 58, 60, 65,
+ 73, 79, 84, 90, 91, 92, 94, 93, 44, 41, 42, 45, 53, 63,
+ 66, 71, 79, 85, 90, 96, 94, 96, 96, 99, 47, 44, 45, 47,
+ 56, 66, 69, 75, 84, 90, 95, 99, 101, 98, 101, 99, 49, 46,
+ 47, 48, 57, 67, 71, 77, 86, 93, 97, 103, 103, 105, 102, 106,
+ 53, 49, 50, 51, 60, 71, 75, 82, 92, 99, 103, 111, 108, 107,
+ 110, 107, 58, 54, 54, 55, 63, 75, 79, 87, 98, 105, 110, 114,
+ 114, 113, 111, 115, 61, 56, 56, 57, 65, 77, 81, 89, 100, 107,
+ 113, 118, 116, 117, 118, 116, 65, 60, 59, 60, 68, 79, 84, 92,
+ 105, 112, 118, 126, 124, 122, 121, 124, 71, 65, 64, 65, 73, 84,
+ 89, 97, 111, 119, 125, 130, 129, 129, 129, 125, 76, 69, 68, 69,
+ 76, 88, 92, 101, 115, 123, 130, 134, 134, 131, 132, 135, 79, 72,
+ 70, 71, 79, 90, 95, 104, 118, 127, 133, 143, 142, 141, 138, 136,
+ 82, 75, 73, 74, 81, 92, 97, 106, 121, 130, 136, 146, 145, 144,
+ 144, 145, 86, 78, 76, 77, 84, 95, 100, 109, 124, 133, 140, 147,
+ 153, 151, 150, 146, 89, 81, 79, 78, 87, 95, 99, 112, 124, 130,
+ 145, 152, 156, 157, 156, 158, 92, 84, 82, 80, 89, 95, 101, 116,
+ 121, 132, 148, 151, 157, 163, 161, 159, 95, 86, 85, 83, 92, 95,
+ 105, 114, 120, 136, 143, 155, 163, 167, 171, 170, 98, 89, 88, 85,
+ 93, 95, 108, 113, 124, 136, 141, 160, 163, 169, 174, 171, 101, 92,
+ 91, 88, 94, 98, 110, 112, 128, 133, 146, 158, 166, 175, 179, 185,
+ 104, 95, 94, 91, 95, 101, 110, 115, 129, 132, 151, 154, 171, 175,
+ 181, 186, 107, 98, 97, 94, 96, 105, 110, 119, 128, 136, 149, 156,
+ 173, 177, 188, 192, 110, 101, 100, 97, 98, 108, 111, 123, 127, 141,
+ 147, 161, 169, 183, 188, 193, 114, 104, 104, 100, 100, 111, 111, 126,
+ 127, 145, 145, 166, 166, 189, 190, 201},
+ {32, 31, 35, 38, 48, 49, 50, 52, 57, 61, 63, 67, 68, 69, 70, 71,
+ 31, 31, 37, 40, 47, 47, 48, 50, 54, 57, 60, 63, 64, 65, 66, 67,
+ 30, 32, 38, 40, 46, 45, 46, 48, 52, 55, 58, 61, 63, 64, 65, 67,
+ 31, 33, 38, 41, 46, 45, 46, 48, 52, 55, 57, 60, 61, 62, 63, 64,
+ 33, 36, 41, 44, 47, 46, 46, 47, 51, 54, 56, 59, 60, 61, 63, 64,
+ 37, 40, 45, 47, 47, 45, 46, 47, 50, 52, 54, 57, 59, 61, 62, 62,
+ 39, 41, 46, 47, 48, 47, 47, 48, 51, 54, 55, 57, 58, 59, 61, 62,
+ 42, 43, 46, 48, 50, 49, 50, 50, 53, 56, 57, 60, 60, 59, 60, 60,
+ 49, 46, 48, 49, 53, 53, 54, 54, 57, 59, 60, 63, 61, 62, 61, 61,
+ 48, 46, 47, 48, 53, 55, 55, 56, 58, 61, 62, 64, 64, 63, 63, 64,
+ 48, 46, 46, 48, 53, 56, 56, 57, 60, 62, 64, 66, 65, 65, 65, 64,
+ 49, 45, 45, 47, 53, 58, 59, 61, 64, 66, 67, 69, 67, 67, 66, 67,
+ 50, 46, 46, 48, 54, 59, 61, 63, 66, 68, 70, 71, 71, 68, 69, 67,
+ 51, 47, 47, 48, 54, 60, 61, 64, 68, 70, 71, 73, 72, 72, 70, 71,
+ 52, 48, 47, 48, 54, 61, 63, 66, 71, 73, 75, 77, 75, 73, 74, 71,
+ 54, 50, 49, 50, 55, 62, 65, 68, 73, 76, 78, 79, 78, 76, 74, 75,
+ 55, 51, 49, 50, 56, 63, 65, 69, 74, 77, 79, 81, 79, 78, 78, 75,
+ 57, 52, 50, 51, 56, 64, 66, 70, 76, 79, 82, 85, 83, 81, 79, 79,
+ 60, 54, 53, 53, 58, 65, 68, 72, 79, 82, 85, 87, 85, 84, 82, 80,
+ 62, 56, 54, 55, 60, 66, 69, 74, 81, 84, 87, 88, 87, 85, 84, 84,
+ 63, 57, 55, 56, 60, 67, 70, 75, 82, 86, 89, 92, 91, 89, 87, 84,
+ 64, 59, 56, 57, 61, 68, 71, 75, 83, 87, 90, 93, 92, 90, 89, 89,
+ 66, 60, 58, 58, 62, 69, 72, 76, 84, 88, 91, 94, 95, 93, 91, 89,
+ 67, 61, 59, 58, 63, 68, 71, 78, 83, 86, 93, 96, 96, 96, 94, 94,
+ 68, 62, 60, 59, 64, 67, 71, 79, 81, 86, 94, 95, 97, 98, 96, 94,
+ 69, 63, 61, 60, 65, 66, 72, 77, 80, 88, 91, 96, 99, 99, 100, 98,
+ 70, 64, 62, 60, 65, 66, 73, 76, 81, 87, 89, 97, 98, 100, 101, 99,
+ 71, 65, 64, 61, 65, 67, 73, 74, 82, 85, 90, 95, 99, 102, 103, 104,
+ 72, 65, 65, 62, 65, 68, 72, 75, 82, 83, 92, 93, 100, 102, 103, 104,
+ 73, 66, 66, 63, 65, 69, 72, 76, 81, 85, 90, 93, 100, 102, 105, 106,
+ 74, 67, 67, 64, 65, 70, 71, 77, 79, 86, 89, 94, 98, 103, 105, 106,
+ 75, 68, 68, 65, 65, 71, 71, 78, 78, 87, 87, 96, 96, 105, 105, 109}},
+ {{32, 31, 32, 32, 36, 39, 44, 53, 58, 65, 79, 81, 88, 90,
+ 93, 96, 31, 32, 32, 32, 35, 38, 42, 51, 55, 62, 75, 77,
+ 83, 86, 88, 91, 31, 32, 32, 32, 35, 38, 41, 50, 54, 60,
+ 73, 75, 81, 84, 88, 91, 31, 32, 32, 33, 34, 37, 41, 49,
+ 53, 59, 72, 74, 79, 82, 84, 87, 32, 32, 33, 34, 36, 39,
+ 42, 50, 53, 59, 71, 72, 78, 81, 84, 87, 32, 32, 34, 34,
+ 37, 40, 42, 49, 53, 58, 70, 71, 77, 80, 83, 85, 32, 33,
+ 34, 35, 38, 40, 42, 49, 52, 58, 69, 70, 76, 78, 82, 86,
+ 34, 34, 35, 37, 42, 45, 48, 54, 57, 63, 73, 75, 79, 79,
+ 81, 83, 34, 34, 36, 37, 44, 47, 50, 56, 59, 65, 75, 77,
+ 81, 83, 84, 84, 36, 34, 37, 38, 48, 51, 54, 60, 63, 68,
+ 78, 80, 85, 85, 86, 89, 39, 37, 39, 40, 50, 54, 58, 65,
+ 68, 73, 84, 85, 88, 89, 90, 89, 40, 38, 40, 41, 51, 55,
+ 59, 67, 70, 75, 85, 87, 91, 92, 92, 95, 44, 41, 42, 43,
+ 53, 58, 63, 71, 74, 79, 90, 91, 97, 94, 97, 95, 47, 44,
+ 45, 46, 56, 61, 66, 75, 79, 85, 95, 97, 99, 101, 98, 102,
+ 49, 46, 46, 47, 57, 62, 67, 77, 81, 86, 97, 99, 104, 102,
+ 105, 102, 53, 49, 50, 50, 60, 65, 71, 82, 86, 92, 103, 105,
+ 109, 108, 106, 110, 57, 53, 53, 53, 63, 68, 74, 86, 90, 97,
+ 108, 110, 111, 112, 113, 110, 59, 54, 54, 54, 64, 69, 75, 87,
+ 91, 98, 111, 112, 119, 117, 115, 118, 65, 60, 59, 58, 68, 73,
+ 79, 92, 97, 105, 118, 119, 123, 123, 122, 119, 69, 63, 62, 62,
+ 71, 76, 83, 96, 100, 109, 122, 124, 127, 125, 125, 128, 71, 65,
+ 64, 63, 73, 78, 84, 97, 102, 111, 125, 127, 135, 134, 131, 129,
+ 79, 72, 71, 70, 79, 84, 90, 104, 109, 118, 133, 135, 137, 136,
+ 136, 137, 81, 74, 72, 71, 80, 85, 91, 105, 110, 120, 135, 137,
+ 145, 143, 141, 138, 82, 75, 73, 72, 81, 86, 92, 106, 111, 121,
+ 136, 139, 147, 148, 147, 149, 87, 79, 77, 76, 85, 90, 96, 110,
+ 114, 125, 140, 143, 148, 154, 151, 149, 90, 82, 80, 78, 87, 89,
+ 99, 108, 113, 129, 135, 146, 153, 157, 160, 159, 92, 84, 83, 81,
+ 88, 90, 102, 106, 117, 128, 133, 150, 153, 158, 163, 160, 95, 87,
+ 85, 83, 88, 92, 103, 105, 120, 125, 137, 148, 155, 164, 168, 173,
+ 98, 89, 88, 85, 89, 95, 103, 108, 121, 124, 141, 144, 160, 164,
+ 169, 174, 100, 92, 91, 88, 90, 98, 103, 111, 120, 127, 139, 146,
+ 161, 165, 175, 179, 103, 94, 94, 90, 92, 101, 103, 114, 119, 131,
+ 137, 150, 158, 170, 175, 180, 106, 97, 97, 93, 93, 104, 104, 118,
+ 118, 135, 135, 154, 155, 175, 176, 187},
+ {32, 31, 34, 37, 48, 48, 49, 52, 54, 57, 63, 64, 67, 68, 69, 69,
+ 31, 31, 35, 38, 47, 47, 47, 50, 51, 54, 60, 61, 63, 64, 65, 66,
+ 31, 32, 36, 39, 46, 46, 46, 48, 50, 53, 58, 59, 62, 63, 65, 66,
+ 30, 32, 36, 40, 46, 45, 45, 48, 49, 52, 57, 58, 60, 61, 62, 63,
+ 33, 36, 40, 43, 47, 46, 46, 47, 49, 51, 56, 57, 59, 60, 62, 63,
+ 35, 38, 42, 45, 47, 46, 45, 47, 48, 50, 55, 56, 58, 60, 61, 61,
+ 37, 40, 44, 47, 47, 46, 45, 47, 48, 50, 54, 55, 57, 58, 60, 61,
+ 42, 43, 45, 47, 50, 50, 49, 50, 51, 53, 57, 58, 59, 58, 59, 59,
+ 44, 44, 46, 47, 51, 51, 51, 52, 53, 54, 59, 59, 60, 61, 61, 60,
+ 49, 46, 47, 48, 53, 53, 53, 54, 55, 57, 60, 61, 63, 62, 62, 63,
+ 48, 46, 46, 47, 53, 54, 56, 57, 58, 60, 64, 64, 64, 64, 64, 63,
+ 48, 45, 46, 46, 53, 55, 56, 58, 59, 61, 65, 65, 66, 66, 65, 66,
+ 49, 45, 45, 46, 53, 56, 58, 61, 62, 64, 67, 68, 70, 67, 68, 66,
+ 50, 46, 46, 46, 54, 56, 59, 63, 65, 66, 70, 71, 70, 71, 68, 70,
+ 51, 47, 47, 47, 54, 57, 60, 64, 65, 68, 71, 72, 73, 71, 72, 70,
+ 52, 48, 47, 47, 54, 57, 61, 66, 68, 71, 75, 75, 76, 75, 73, 73,
+ 54, 49, 49, 48, 55, 58, 62, 68, 70, 73, 77, 78, 77, 77, 76, 74,
+ 54, 50, 49, 49, 55, 59, 62, 68, 70, 74, 78, 79, 81, 79, 77, 78,
+ 57, 52, 51, 50, 56, 60, 64, 70, 73, 76, 82, 82, 83, 82, 81, 78,
+ 59, 54, 52, 52, 58, 61, 65, 72, 74, 78, 84, 85, 85, 83, 82, 82,
+ 60, 54, 53, 52, 58, 62, 65, 72, 75, 79, 85, 86, 89, 87, 85, 82,
+ 63, 57, 56, 55, 60, 64, 67, 75, 77, 82, 89, 90, 90, 88, 87, 86,
+ 64, 58, 57, 55, 61, 64, 68, 75, 78, 82, 89, 90, 93, 91, 89, 87,
+ 64, 59, 57, 56, 61, 65, 68, 75, 78, 83, 90, 91, 94, 93, 92, 91,
+ 66, 60, 59, 57, 63, 66, 69, 77, 79, 84, 91, 93, 94, 95, 93, 91,
+ 67, 61, 60, 58, 63, 65, 70, 75, 78, 85, 88, 93, 96, 97, 97, 95,
+ 68, 62, 61, 59, 63, 64, 71, 74, 79, 84, 87, 94, 96, 97, 98, 96,
+ 69, 63, 62, 60, 63, 65, 71, 72, 80, 82, 88, 93, 96, 99, 100, 101,
+ 70, 64, 63, 60, 63, 66, 70, 73, 80, 81, 89, 90, 97, 99, 100, 101,
+ 71, 65, 64, 61, 63, 67, 70, 74, 78, 82, 88, 90, 97, 99, 102, 103,
+ 72, 65, 65, 62, 63, 68, 69, 75, 77, 83, 86, 92, 95, 100, 102, 103,
+ 73, 66, 66, 63, 63, 69, 69, 76, 76, 84, 84, 93, 93, 101, 101, 105}},
+ {{32, 31, 31, 32, 35, 36, 44, 47, 53, 62, 65, 79, 82, 88,
+ 90, 93, 31, 32, 32, 32, 35, 35, 42, 45, 51, 59, 62, 75,
+ 78, 83, 86, 88, 31, 32, 32, 32, 34, 35, 41, 45, 50, 58,
+ 61, 74, 76, 82, 85, 88, 31, 32, 32, 33, 34, 34, 41, 44,
+ 49, 57, 59, 72, 74, 79, 82, 84, 31, 32, 33, 34, 35, 36,
+ 42, 44, 49, 57, 59, 71, 73, 79, 81, 84, 32, 32, 33, 34,
+ 36, 36, 42, 45, 50, 57, 59, 71, 73, 78, 80, 82, 32, 33,
+ 34, 35, 37, 38, 42, 45, 49, 56, 58, 69, 71, 76, 79, 83,
+ 32, 33, 34, 36, 39, 40, 44, 47, 51, 58, 60, 71, 73, 76,
+ 78, 80, 34, 34, 35, 37, 41, 42, 48, 50, 54, 61, 63, 73,
+ 76, 81, 81, 80, 35, 34, 36, 38, 45, 47, 52, 55, 59, 65,
+ 67, 77, 79, 82, 83, 86, 36, 34, 36, 38, 46, 48, 54, 56,
+ 60, 66, 68, 78, 80, 85, 87, 86, 39, 37, 39, 40, 48, 50,
+ 58, 60, 65, 71, 73, 84, 86, 89, 88, 91, 41, 39, 40, 41,
+ 49, 51, 60, 62, 67, 74, 76, 86, 88, 91, 93, 91, 44, 41,
+ 42, 43, 51, 53, 63, 66, 71, 78, 79, 90, 92, 97, 94, 97,
+ 47, 44, 44, 45, 53, 56, 66, 69, 75, 82, 84, 95, 97, 98,
+ 101, 98, 48, 45, 45, 46, 54, 56, 67, 70, 76, 83, 85, 96,
+ 98, 104, 101, 105, 53, 49, 50, 50, 57, 60, 71, 75, 82, 90,
+ 92, 103, 106, 107, 108, 105, 55, 51, 51, 51, 59, 61, 72, 77,
+ 84, 92, 94, 106, 108, 111, 110, 112, 58, 54, 54, 54, 61, 63,
+ 75, 79, 87, 95, 98, 110, 112, 117, 116, 113, 63, 58, 58, 57,
+ 65, 67, 78, 83, 91, 100, 103, 116, 118, 119, 119, 121, 65, 60,
+ 59, 58, 66, 68, 79, 84, 92, 102, 105, 118, 120, 127, 124, 122,
+ 71, 65, 64, 63, 71, 73, 84, 89, 97, 108, 111, 125, 127, 129,
+ 129, 130, 74, 68, 67, 66, 73, 75, 86, 91, 100, 110, 113, 128,
+ 131, 135, 134, 130, 79, 72, 71, 70, 77, 79, 90, 95, 104, 115,
+ 118, 133, 136, 140, 139, 140, 82, 75, 73, 72, 79, 81, 92, 97,
+ 105, 117, 120, 136, 139, 145, 142, 140, 82, 75, 74, 72, 79, 81,
+ 92, 97, 106, 117, 121, 136, 139, 148, 150, 149, 87, 79, 78, 76,
+ 83, 85, 96, 100, 110, 120, 125, 141, 144, 148, 153, 150, 89, 82,
+ 81, 78, 83, 87, 97, 99, 113, 118, 128, 139, 145, 153, 157, 161,
+ 92, 84, 83, 80, 84, 89, 97, 101, 114, 116, 132, 135, 150, 153,
+ 157, 162, 94, 86, 85, 82, 85, 92, 97, 104, 112, 119, 130, 136,
+ 151, 154, 163, 166, 97, 88, 88, 85, 86, 94, 97, 107, 111, 123,
+ 128, 140, 147, 159, 163, 167, 99, 91, 91, 87, 87, 97, 97, 110,
+ 110, 126, 126, 144, 144, 163, 163, 173},
+ {32, 31, 33, 37, 45, 48, 49, 50, 52, 56, 57, 63, 64, 67, 68, 68, 31,
+ 31, 34, 38, 45, 47, 47, 48, 50, 53, 54, 60, 61, 63, 64, 65, 31, 32,
+ 34, 39, 45, 46, 46, 47, 49, 52, 53, 59, 60, 62, 64, 65, 30, 32, 35,
+ 40, 44, 46, 45, 46, 48, 51, 52, 57, 58, 60, 61, 62, 33, 35, 37, 42,
+ 46, 47, 45, 46, 47, 50, 51, 56, 57, 60, 61, 62, 33, 36, 38, 43, 46,
+ 47, 46, 46, 47, 50, 51, 56, 57, 59, 60, 60, 37, 40, 43, 47, 47, 47,
+ 45, 46, 47, 49, 50, 54, 55, 57, 59, 61, 39, 41, 43, 47, 48, 48, 47,
+ 47, 48, 50, 51, 55, 56, 57, 58, 59, 42, 43, 44, 47, 49, 50, 49, 50,
+ 50, 53, 53, 57, 58, 60, 60, 59, 47, 46, 46, 48, 51, 52, 53, 53, 53,
+ 55, 56, 60, 61, 61, 61, 62, 49, 46, 47, 48, 52, 53, 53, 54, 54, 56,
+ 57, 60, 61, 63, 63, 62, 48, 46, 46, 47, 51, 53, 56, 56, 57, 59, 60,
+ 64, 64, 65, 64, 65, 48, 45, 46, 46, 51, 53, 57, 57, 59, 61, 61, 65,
+ 66, 66, 67, 65, 49, 45, 45, 46, 51, 53, 58, 59, 61, 63, 64, 67, 68,
+ 70, 67, 68, 50, 46, 46, 46, 52, 54, 59, 61, 63, 65, 66, 70, 71, 70,
+ 71, 68, 50, 46, 46, 46, 52, 54, 59, 61, 64, 66, 67, 71, 71, 73, 71,
+ 72, 52, 48, 47, 47, 53, 54, 61, 63, 66, 70, 71, 75, 75, 75, 74, 72,
+ 53, 49, 48, 48, 53, 55, 61, 64, 67, 71, 72, 76, 77, 77, 75, 76, 54,
+ 50, 49, 49, 54, 55, 62, 65, 68, 72, 73, 78, 79, 80, 79, 76, 56, 51,
+ 51, 50, 55, 56, 63, 66, 70, 74, 76, 81, 82, 81, 80, 80, 57, 52, 51,
+ 50, 55, 56, 64, 66, 70, 75, 76, 82, 83, 85, 83, 80, 60, 54, 54, 52,
+ 57, 58, 65, 68, 72, 77, 79, 85, 86, 86, 85, 84, 61, 56, 55, 53, 58,
+ 59, 66, 69, 73, 79, 80, 86, 87, 89, 87, 84, 63, 57, 56, 55, 59, 60,
+ 67, 70, 75, 80, 82, 89, 90, 91, 89, 89, 64, 58, 57, 56, 60, 61, 68,
+ 71, 75, 81, 83, 90, 91, 93, 91, 89, 64, 59, 58, 56, 60, 61, 68, 71,
+ 75, 81, 83, 90, 91, 94, 94, 93, 66, 60, 59, 57, 61, 63, 69, 72, 77,
+ 82, 84, 92, 93, 94, 95, 93, 67, 61, 60, 58, 61, 63, 69, 70, 78, 80,
+ 85, 90, 93, 96, 97, 97, 68, 62, 61, 59, 61, 64, 68, 71, 77, 79, 86,
+ 88, 94, 96, 97, 98, 69, 63, 62, 59, 61, 65, 68, 72, 76, 80, 85, 88,
+ 94, 95, 99, 99, 70, 63, 63, 60, 61, 66, 67, 73, 75, 81, 83, 89, 92,
+ 97, 98, 99, 70, 64, 64, 61, 61, 67, 67, 74, 74, 82, 82, 90, 90, 98,
+ 98, 102}},
+ {{32, 31, 31, 32, 33, 36, 40, 44, 51, 53, 65, 66, 79, 81,
+ 87, 90, 31, 32, 32, 32, 33, 35, 39, 42, 49, 51, 62, 63,
+ 75, 77, 83, 85, 31, 32, 32, 32, 33, 35, 39, 42, 49, 51,
+ 61, 62, 74, 76, 82, 85, 31, 32, 32, 33, 33, 34, 38, 41,
+ 47, 49, 59, 60, 72, 74, 79, 81, 31, 32, 32, 33, 34, 35,
+ 38, 41, 47, 49, 59, 60, 71, 73, 79, 81, 32, 32, 33, 34,
+ 35, 36, 39, 42, 48, 50, 59, 60, 71, 72, 78, 80, 32, 32,
+ 33, 35, 36, 37, 40, 42, 48, 49, 58, 59, 69, 71, 77, 80,
+ 32, 33, 33, 35, 36, 38, 41, 42, 48, 49, 58, 59, 69, 70,
+ 75, 77, 33, 33, 34, 36, 38, 41, 44, 46, 52, 53, 62, 63,
+ 72, 74, 78, 78, 34, 34, 34, 37, 39, 42, 45, 48, 53, 54,
+ 63, 64, 73, 75, 80, 83, 36, 34, 35, 38, 42, 48, 51, 54,
+ 58, 60, 68, 69, 78, 80, 84, 83, 36, 35, 35, 38, 42, 48,
+ 51, 54, 59, 60, 68, 69, 79, 80, 85, 87, 39, 37, 38, 40,
+ 44, 50, 54, 58, 63, 65, 73, 74, 84, 85, 89, 88, 40, 38,
+ 39, 41, 45, 51, 56, 59, 65, 67, 75, 76, 85, 87, 90, 93,
+ 44, 41, 41, 43, 46, 53, 59, 63, 69, 71, 79, 80, 90, 91,
+ 96, 93, 46, 43, 43, 44, 48, 55, 60, 65, 72, 73, 82, 83,
+ 93, 94, 97, 100, 48, 45, 45, 46, 50, 56, 62, 67, 74, 76,
+ 85, 86, 96, 98, 103, 100, 52, 48, 48, 49, 52, 59, 65, 70,
+ 78, 80, 90, 91, 101, 103, 105, 107, 53, 49, 49, 50, 53, 60,
+ 66, 71, 79, 82, 92, 93, 103, 105, 111, 107, 58, 53, 53, 53,
+ 57, 63, 69, 74, 83, 86, 97, 98, 109, 111, 113, 115, 58, 54,
+ 54, 54, 57, 63, 70, 75, 84, 87, 98, 99, 110, 112, 118, 115,
+ 65, 60, 59, 58, 62, 68, 74, 79, 89, 92, 105, 106, 118, 119,
+ 122, 123, 66, 61, 60, 59, 63, 69, 75, 80, 90, 93, 106, 107,
+ 119, 121, 126, 123, 71, 65, 65, 63, 67, 73, 79, 84, 94, 97,
+ 111, 112, 125, 127, 131, 132, 74, 68, 67, 66, 69, 75, 81, 86,
+ 97, 100, 113, 115, 128, 130, 134, 132, 79, 72, 72, 70, 73, 79,
+ 85, 90, 101, 104, 118, 119, 133, 135, 141, 140, 81, 74, 73, 71,
+ 75, 80, 86, 91, 102, 105, 120, 121, 135, 137, 143, 140, 82, 75,
+ 74, 72, 75, 81, 87, 92, 103, 106, 121, 122, 136, 139, 147, 151,
+ 86, 78, 78, 75, 78, 84, 90, 95, 106, 109, 124, 125, 140, 142,
+ 147, 151, 88, 81, 80, 77, 80, 86, 90, 98, 105, 112, 122, 127,
+ 140, 144, 152, 155, 91, 83, 82, 79, 80, 88, 90, 100, 103, 114,
+ 119, 130, 137, 148, 151, 155, 93, 85, 85, 81, 81, 90, 90, 102,
+ 103, 117, 117, 134, 134, 151, 152, 160},
+ {32, 31, 32, 37, 40, 48, 49, 49, 51, 52, 57, 58, 63, 64, 67, 67, 31,
+ 31, 33, 38, 41, 47, 47, 47, 49, 50, 54, 55, 60, 61, 63, 64, 31, 31,
+ 33, 38, 41, 47, 47, 47, 49, 49, 54, 54, 59, 60, 63, 64, 30, 32, 33,
+ 40, 42, 46, 45, 45, 47, 48, 52, 52, 57, 58, 60, 61, 31, 33, 35, 41,
+ 43, 46, 46, 45, 47, 48, 51, 52, 57, 57, 60, 61, 33, 36, 37, 43, 44,
+ 47, 46, 46, 47, 47, 51, 52, 56, 57, 59, 60, 35, 38, 39, 45, 46, 47,
+ 46, 45, 47, 47, 50, 51, 55, 56, 58, 60, 37, 40, 41, 47, 47, 47, 46,
+ 45, 46, 47, 50, 50, 54, 55, 57, 58, 41, 42, 43, 47, 48, 49, 49, 48,
+ 49, 50, 52, 53, 57, 57, 59, 58, 42, 43, 43, 47, 48, 50, 49, 49, 50,
+ 50, 53, 54, 57, 58, 60, 61, 49, 46, 47, 48, 50, 53, 53, 53, 54, 54,
+ 57, 57, 60, 61, 62, 61, 49, 46, 47, 48, 50, 53, 53, 54, 54, 55, 57,
+ 57, 61, 61, 63, 64, 48, 46, 46, 47, 49, 53, 54, 56, 57, 57, 60, 60,
+ 64, 64, 65, 64, 48, 45, 46, 46, 49, 53, 55, 56, 58, 58, 61, 61, 65,
+ 65, 66, 67, 49, 45, 45, 46, 48, 53, 56, 58, 61, 61, 64, 64, 67, 68,
+ 69, 67, 49, 46, 46, 46, 49, 53, 57, 59, 62, 62, 65, 66, 69, 69, 70,
+ 70, 50, 46, 46, 46, 49, 54, 57, 59, 63, 64, 67, 67, 71, 71, 73, 71,
+ 51, 47, 47, 47, 49, 54, 58, 61, 64, 66, 69, 70, 73, 74, 74, 74, 52,
+ 48, 48, 47, 50, 54, 58, 61, 65, 66, 71, 71, 75, 75, 77, 74, 54, 50,
+ 49, 48, 51, 55, 59, 62, 67, 68, 73, 73, 77, 78, 78, 78, 54, 50, 50,
+ 49, 51, 55, 59, 62, 67, 68, 73, 74, 78, 78, 81, 78, 57, 52, 52, 50,
+ 52, 56, 60, 64, 69, 70, 76, 77, 82, 82, 83, 82, 57, 52, 52, 51, 53,
+ 57, 61, 64, 69, 71, 77, 77, 82, 83, 85, 82, 60, 54, 54, 52, 55, 58,
+ 62, 65, 71, 72, 79, 79, 85, 86, 87, 86, 61, 56, 55, 53, 56, 59, 63,
+ 66, 72, 73, 80, 81, 86, 87, 88, 86, 63, 57, 57, 55, 57, 60, 64, 67,
+ 73, 75, 82, 82, 89, 90, 92, 90, 64, 58, 58, 55, 58, 61, 65, 68, 73,
+ 75, 82, 83, 89, 90, 92, 90, 64, 59, 58, 56, 58, 61, 65, 68, 74, 75,
+ 83, 83, 90, 91, 94, 95, 66, 60, 59, 57, 59, 62, 66, 69, 75, 76, 84,
+ 85, 91, 92, 94, 95, 67, 61, 60, 58, 59, 63, 66, 70, 74, 77, 82, 85,
+ 91, 93, 96, 96, 68, 62, 61, 58, 59, 64, 65, 71, 72, 78, 81, 86, 89,
+ 94, 95, 96, 68, 62, 62, 59, 59, 65, 65, 71, 71, 79, 79, 87, 87, 95,
+ 95, 98}},
+ {{32, 31, 31, 32, 32, 36, 36, 44, 44, 53, 53, 65, 65, 79,
+ 79, 87, 31, 32, 32, 32, 32, 35, 35, 42, 42, 51, 51, 62,
+ 62, 75, 75, 82, 31, 32, 32, 32, 32, 35, 35, 42, 42, 51,
+ 51, 62, 62, 75, 75, 82, 31, 32, 32, 33, 33, 34, 34, 41,
+ 41, 49, 49, 59, 59, 72, 72, 78, 31, 32, 32, 33, 33, 34,
+ 34, 41, 41, 49, 49, 59, 59, 72, 72, 78, 32, 32, 32, 34,
+ 34, 36, 36, 42, 42, 50, 50, 59, 59, 71, 71, 77, 32, 32,
+ 32, 34, 34, 36, 36, 42, 42, 50, 50, 59, 59, 71, 71, 77,
+ 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69,
+ 69, 75, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58,
+ 58, 69, 69, 75, 34, 34, 34, 37, 37, 42, 42, 48, 48, 54,
+ 54, 63, 63, 73, 73, 79, 34, 34, 34, 37, 37, 42, 42, 48,
+ 48, 54, 54, 63, 63, 73, 73, 79, 36, 34, 34, 38, 38, 48,
+ 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 36, 34, 34, 38,
+ 38, 48, 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 39, 37,
+ 37, 40, 40, 50, 50, 58, 58, 65, 65, 73, 73, 84, 84, 89,
+ 39, 37, 37, 40, 40, 50, 50, 58, 58, 65, 65, 73, 73, 84,
+ 84, 89, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79,
+ 79, 90, 90, 95, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71,
+ 71, 79, 79, 90, 90, 95, 48, 45, 45, 46, 46, 56, 56, 67,
+ 67, 76, 76, 85, 85, 96, 96, 102, 48, 45, 45, 46, 46, 56,
+ 56, 67, 67, 76, 76, 85, 85, 96, 96, 102, 53, 49, 49, 50,
+ 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 53, 49,
+ 49, 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109,
+ 58, 54, 54, 54, 54, 63, 63, 75, 75, 87, 87, 98, 98, 110,
+ 110, 116, 58, 54, 54, 54, 54, 63, 63, 75, 75, 87, 87, 98,
+ 98, 110, 110, 116, 65, 60, 60, 58, 58, 68, 68, 79, 79, 92,
+ 92, 105, 105, 118, 118, 124, 65, 60, 60, 58, 58, 68, 68, 79,
+ 79, 92, 92, 105, 105, 118, 118, 124, 71, 65, 65, 63, 63, 73,
+ 73, 84, 84, 97, 97, 111, 111, 125, 125, 132, 71, 65, 65, 63,
+ 63, 73, 73, 84, 84, 97, 97, 111, 111, 125, 125, 132, 79, 72,
+ 72, 70, 70, 79, 79, 90, 90, 104, 104, 118, 118, 133, 133, 141,
+ 79, 72, 72, 70, 70, 79, 79, 90, 90, 104, 104, 118, 118, 133,
+ 133, 141, 82, 75, 75, 72, 72, 81, 81, 92, 92, 106, 106, 121,
+ 121, 136, 136, 144, 82, 75, 75, 72, 72, 81, 81, 92, 92, 106,
+ 106, 121, 121, 136, 136, 144, 87, 79, 79, 76, 76, 84, 84, 96,
+ 96, 109, 109, 124, 124, 141, 141, 149},
+ {32, 31, 31, 37, 37, 48, 48, 49, 49, 52, 52, 57, 57, 63, 63, 66, 31,
+ 31, 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 31, 31,
+ 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 30, 32, 32,
+ 40, 40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 30, 32, 32, 40,
+ 40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 33, 36, 36, 43, 43,
+ 47, 47, 46, 46, 47, 47, 51, 51, 56, 56, 59, 33, 36, 36, 43, 43, 47,
+ 47, 46, 46, 47, 47, 51, 51, 56, 56, 59, 37, 40, 40, 47, 47, 47, 47,
+ 45, 45, 47, 47, 50, 50, 54, 54, 57, 37, 40, 40, 47, 47, 47, 47, 45,
+ 45, 47, 47, 50, 50, 54, 54, 57, 42, 43, 43, 47, 47, 50, 50, 49, 49,
+ 50, 50, 53, 53, 57, 57, 60, 42, 43, 43, 47, 47, 50, 50, 49, 49, 50,
+ 50, 53, 53, 57, 57, 60, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54,
+ 57, 57, 60, 60, 62, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54, 57,
+ 57, 60, 60, 62, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60,
+ 64, 64, 66, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60, 64,
+ 64, 66, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67,
+ 69, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67, 69,
+ 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 50,
+ 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 52, 48,
+ 48, 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 52, 48, 48,
+ 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 54, 50, 50, 49,
+ 49, 55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 54, 50, 50, 49, 49,
+ 55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 57, 52, 52, 50, 50, 56,
+ 56, 64, 64, 70, 70, 76, 76, 82, 82, 84, 57, 52, 52, 50, 50, 56, 56,
+ 64, 64, 70, 70, 76, 76, 82, 82, 84, 60, 54, 54, 52, 52, 58, 58, 65,
+ 65, 72, 72, 79, 79, 85, 85, 88, 60, 54, 54, 52, 52, 58, 58, 65, 65,
+ 72, 72, 79, 79, 85, 85, 88, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75,
+ 75, 82, 82, 89, 89, 92, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75, 75,
+ 82, 82, 89, 89, 92, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83,
+ 83, 90, 90, 93, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83, 83,
+ 90, 90, 93, 66, 60, 60, 57, 57, 63, 63, 69, 69, 77, 77, 84, 84, 92,
+ 92, 95}},
+ {{32, 31, 31, 32, 32, 34, 36, 38, 44, 44, 53, 53, 62, 65, 73, 79,
+ 31, 32, 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 60, 62, 70, 75,
+ 31, 32, 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 59, 62, 69, 75,
+ 31, 32, 32, 32, 32, 33, 35, 36, 41, 42, 50, 50, 58, 60, 67, 73,
+ 31, 32, 32, 32, 33, 33, 34, 36, 41, 41, 49, 49, 57, 59, 66, 72,
+ 31, 32, 32, 33, 33, 34, 35, 37, 41, 42, 49, 49, 57, 59, 66, 71,
+ 32, 32, 32, 33, 34, 35, 36, 38, 42, 43, 50, 50, 57, 59, 65, 71,
+ 32, 32, 32, 34, 34, 35, 37, 38, 42, 43, 49, 49, 56, 59, 65, 70,
+ 32, 32, 33, 34, 35, 37, 38, 39, 42, 43, 49, 49, 56, 58, 64, 69,
+ 32, 33, 33, 34, 35, 37, 39, 40, 43, 44, 50, 50, 56, 58, 64, 69,
+ 34, 34, 34, 36, 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73,
+ 34, 34, 34, 36, 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73,
+ 35, 34, 34, 37, 38, 42, 47, 48, 52, 53, 59, 59, 65, 67, 73, 77,
+ 36, 35, 34, 37, 38, 43, 48, 49, 54, 54, 60, 60, 66, 68, 74, 78,
+ 38, 36, 36, 38, 40, 44, 49, 51, 56, 57, 63, 63, 69, 71, 77, 81,
+ 39, 38, 37, 40, 40, 45, 50, 52, 58, 58, 65, 65, 71, 73, 79, 84,
+ 41, 39, 39, 41, 41, 46, 51, 54, 60, 60, 67, 67, 74, 76, 81, 86,
+ 44, 41, 41, 42, 43, 48, 53, 56, 63, 64, 71, 71, 78, 79, 85, 90,
+ 44, 42, 42, 43, 43, 48, 54, 56, 64, 64, 72, 72, 79, 81, 86, 91,
+ 48, 45, 45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96,
+ 48, 45, 45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96,
+ 53, 49, 49, 49, 49, 54, 59, 62, 71, 71, 81, 81, 89, 91, 98, 103,
+ 53, 50, 49, 50, 50, 54, 60, 63, 71, 72, 82, 82, 90, 92, 99, 103,
+ 57, 53, 52, 52, 52, 57, 62, 65, 74, 75, 85, 85, 94, 96, 103, 108,
+ 58, 54, 54, 54, 54, 58, 63, 67, 75, 76, 87, 87, 95, 98, 105, 110,
+ 61, 57, 57, 56, 56, 60, 66, 69, 77, 78, 89, 89, 98, 101, 108, 114,
+ 65, 60, 60, 59, 58, 63, 68, 71, 79, 80, 92, 92, 102, 105, 112, 118,
+ 67, 62, 61, 60, 60, 64, 69, 72, 81, 82, 94, 94, 103, 106, 114, 120,
+ 71, 66, 65, 64, 63, 68, 73, 76, 84, 85, 97, 97, 108, 111, 119, 125,
+ 72, 66, 66, 64, 64, 68, 73, 76, 85, 86, 98, 98, 108, 111, 119, 125,
+ 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133,
+ 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133},
+ {32, 31, 31, 35, 37, 42, 48, 48, 49, 49, 52, 52, 56, 57, 61, 63, 31,
+ 31, 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 54, 54, 58, 60, 31, 31,
+ 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 53, 54, 57, 60, 30, 32, 32,
+ 37, 39, 42, 46, 46, 46, 46, 48, 48, 52, 52, 56, 58, 30, 32, 32, 37,
+ 40, 42, 46, 46, 45, 45, 48, 48, 51, 52, 55, 57, 32, 33, 34, 39, 41,
+ 44, 46, 46, 45, 45, 48, 48, 51, 51, 54, 57, 33, 35, 36, 40, 43, 45,
+ 47, 46, 46, 46, 47, 47, 50, 51, 54, 56, 34, 37, 37, 42, 44, 45, 47,
+ 47, 45, 46, 47, 47, 50, 51, 53, 55, 37, 40, 40, 45, 47, 47, 47, 47,
+ 45, 46, 47, 47, 49, 50, 52, 54, 37, 40, 40, 45, 47, 47, 48, 47, 46,
+ 46, 47, 47, 49, 50, 53, 55, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49,
+ 50, 50, 53, 53, 56, 57, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49, 50,
+ 50, 53, 53, 56, 57, 47, 46, 46, 47, 48, 50, 52, 52, 53, 53, 53, 53,
+ 55, 56, 58, 60, 49, 47, 46, 47, 48, 50, 53, 53, 53, 54, 54, 54, 56,
+ 57, 59, 60, 48, 46, 46, 47, 47, 50, 53, 53, 55, 55, 56, 56, 58, 58,
+ 61, 62, 48, 46, 46, 46, 47, 50, 53, 54, 56, 56, 57, 57, 59, 60, 62,
+ 64, 48, 46, 45, 46, 46, 49, 53, 54, 57, 57, 59, 59, 61, 61, 63, 65,
+ 49, 45, 45, 45, 46, 49, 53, 55, 58, 59, 61, 61, 63, 64, 66, 67, 49,
+ 46, 45, 46, 46, 49, 53, 55, 58, 59, 62, 62, 64, 64, 66, 68, 50, 47,
+ 46, 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 50, 47, 46,
+ 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 52, 48, 48, 47,
+ 47, 50, 54, 56, 61, 61, 66, 66, 69, 70, 72, 74, 52, 48, 48, 47, 47,
+ 50, 54, 56, 61, 61, 66, 66, 70, 71, 73, 75, 53, 50, 49, 48, 48, 51,
+ 55, 57, 62, 62, 68, 68, 71, 72, 75, 77, 54, 50, 50, 49, 49, 52, 55,
+ 57, 62, 63, 68, 68, 72, 73, 76, 78, 55, 51, 51, 50, 49, 52, 56, 58,
+ 63, 63, 69, 69, 74, 75, 78, 80, 57, 52, 52, 51, 50, 53, 56, 58, 64,
+ 64, 70, 70, 75, 76, 79, 82, 58, 53, 53, 51, 51, 54, 57, 59, 64, 65,
+ 71, 71, 76, 77, 80, 83, 60, 55, 54, 53, 52, 55, 58, 60, 65, 66, 72,
+ 72, 77, 79, 82, 85, 60, 55, 55, 53, 53, 55, 59, 60, 65, 66, 73, 73,
+ 78, 79, 83, 85, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80,
+ 82, 86, 89, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80, 82,
+ 86, 89}},
+ {{32, 31, 31, 31, 32, 32, 35, 36, 39, 44, 44, 51, 53, 58, 65, 65,
+ 31, 32, 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 52, 56, 63, 63,
+ 31, 32, 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 51, 55, 62, 62,
+ 31, 32, 32, 32, 32, 32, 34, 35, 37, 41, 41, 48, 50, 54, 61, 61,
+ 31, 32, 32, 32, 33, 33, 34, 34, 37, 41, 41, 47, 49, 53, 59, 59,
+ 31, 32, 32, 32, 33, 33, 34, 34, 37, 41, 41, 47, 49, 53, 59, 59,
+ 31, 32, 32, 33, 34, 34, 35, 36, 38, 42, 42, 48, 49, 53, 59, 59,
+ 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 48, 50, 53, 59, 59,
+ 32, 32, 32, 33, 34, 34, 36, 37, 39, 42, 42, 48, 49, 53, 58, 58,
+ 32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58,
+ 32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58,
+ 33, 33, 33, 35, 36, 36, 40, 41, 43, 46, 46, 52, 53, 56, 62, 62,
+ 34, 34, 34, 35, 37, 37, 41, 42, 44, 48, 48, 53, 54, 57, 63, 63,
+ 34, 34, 34, 35, 37, 37, 43, 44, 46, 50, 50, 55, 56, 59, 65, 65,
+ 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63, 68, 68,
+ 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63, 68, 68,
+ 38, 37, 37, 38, 40, 40, 47, 50, 52, 57, 57, 62, 64, 67, 72, 72,
+ 39, 38, 37, 39, 40, 40, 48, 50, 53, 58, 58, 63, 65, 68, 73, 73,
+ 41, 39, 39, 40, 41, 41, 49, 51, 54, 60, 60, 66, 67, 70, 76, 76,
+ 44, 41, 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79,
+ 44, 41, 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79,
+ 47, 44, 44, 44, 45, 45, 53, 56, 59, 66, 66, 73, 75, 78, 84, 84,
+ 48, 45, 45, 45, 46, 46, 54, 56, 60, 67, 67, 74, 76, 79, 85, 85,
+ 50, 47, 46, 47, 47, 47, 55, 58, 61, 68, 68, 76, 78, 82, 88, 88,
+ 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86, 92, 92,
+ 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86, 92, 92,
+ 57, 54, 53, 53, 53, 53, 60, 63, 67, 74, 74, 83, 86, 90, 97, 97,
+ 58, 55, 54, 54, 54, 54, 61, 63, 68, 75, 75, 84, 87, 91, 98, 98,
+ 61, 57, 56, 56, 56, 56, 63, 65, 69, 77, 77, 86, 89, 93, 100, 100,
+ 65, 61, 60, 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105,
+ 65, 61, 60, 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105,
+ 70, 65, 64, 63, 62, 62, 70, 72, 76, 83, 83, 93, 96, 101, 109, 109},
+ {32, 31, 31, 33, 37, 37, 45, 48, 48, 49, 49, 51, 52, 54, 57, 57, 31,
+ 31, 31, 34, 38, 38, 45, 47, 47, 47, 47, 50, 50, 52, 55, 55, 31, 31,
+ 31, 34, 38, 38, 45, 47, 47, 47, 47, 49, 50, 51, 54, 54, 31, 31, 32,
+ 34, 39, 39, 45, 46, 46, 46, 46, 48, 49, 51, 53, 53, 30, 32, 32, 35,
+ 40, 40, 44, 46, 45, 45, 45, 47, 48, 49, 52, 52, 30, 32, 32, 35, 40,
+ 40, 44, 46, 45, 45, 45, 47, 48, 49, 52, 52, 33, 34, 35, 37, 42, 42,
+ 46, 47, 46, 45, 45, 47, 47, 49, 51, 51, 33, 35, 36, 38, 43, 43, 46,
+ 47, 46, 46, 46, 47, 47, 49, 51, 51, 35, 37, 37, 40, 44, 44, 46, 47,
+ 46, 45, 45, 47, 47, 48, 51, 51, 37, 39, 40, 43, 47, 47, 47, 47, 47,
+ 45, 45, 46, 47, 48, 50, 50, 37, 39, 40, 43, 47, 47, 47, 47, 47, 45,
+ 45, 46, 47, 48, 50, 50, 41, 42, 42, 44, 47, 47, 49, 49, 49, 48, 48,
+ 49, 50, 51, 52, 52, 42, 42, 43, 44, 47, 47, 49, 50, 50, 49, 49, 50,
+ 50, 51, 53, 53, 44, 44, 44, 45, 47, 47, 50, 51, 51, 51, 51, 52, 52,
+ 53, 54, 54, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55,
+ 57, 57, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55, 57,
+ 57, 48, 46, 46, 46, 47, 47, 51, 53, 54, 55, 55, 56, 57, 58, 59, 59,
+ 48, 46, 46, 46, 47, 47, 51, 53, 54, 56, 56, 57, 57, 58, 60, 60, 48,
+ 46, 45, 46, 46, 46, 51, 53, 54, 57, 57, 58, 59, 60, 61, 61, 49, 46,
+ 45, 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 49, 46, 45,
+ 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 50, 47, 46, 46,
+ 46, 46, 52, 54, 56, 59, 59, 62, 63, 64, 66, 66, 50, 47, 46, 46, 46,
+ 46, 52, 54, 56, 59, 59, 63, 64, 65, 67, 67, 51, 48, 47, 47, 47, 47,
+ 52, 54, 56, 60, 60, 64, 65, 66, 68, 68, 52, 48, 48, 47, 47, 47, 53,
+ 54, 57, 61, 61, 65, 66, 68, 71, 71, 52, 48, 48, 47, 47, 47, 53, 54,
+ 57, 61, 61, 65, 66, 68, 71, 71, 54, 50, 49, 49, 48, 48, 54, 55, 58,
+ 62, 62, 67, 68, 70, 73, 73, 54, 51, 50, 49, 49, 49, 54, 55, 58, 62,
+ 62, 67, 68, 70, 73, 73, 55, 51, 51, 50, 49, 49, 54, 56, 58, 63, 63,
+ 68, 69, 71, 74, 74, 57, 53, 52, 51, 50, 50, 55, 56, 59, 64, 64, 69,
+ 70, 73, 76, 76, 57, 53, 52, 51, 50, 50, 55, 56, 59, 64, 64, 69, 70,
+ 73, 76, 76, 59, 55, 54, 53, 52, 52, 57, 58, 61, 65, 65, 70, 72, 74,
+ 78, 78}},
+ {{32, 31, 31, 31, 32, 32, 32, 35, 36, 38, 44, 44, 47, 53, 53, 59, 31,
+ 32, 32, 32, 32, 32, 33, 35, 35, 37, 43, 43, 46, 52, 52, 57, 31, 32,
+ 32, 32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32,
+ 32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32,
+ 32, 32, 33, 34, 35, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 32,
+ 33, 33, 34, 34, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 33, 33,
+ 33, 35, 35, 36, 41, 41, 44, 49, 49, 54, 32, 32, 32, 32, 33, 34, 34,
+ 36, 36, 38, 42, 42, 45, 49, 49, 54, 32, 32, 32, 33, 34, 34, 34, 36,
+ 36, 38, 42, 42, 45, 50, 50, 54, 32, 32, 32, 33, 34, 34, 35, 37, 37,
+ 38, 42, 42, 45, 49, 49, 54, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39,
+ 42, 42, 45, 49, 49, 53, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39, 42,
+ 42, 45, 49, 49, 53, 32, 33, 33, 33, 35, 36, 36, 39, 40, 41, 44, 44,
+ 47, 51, 51, 55, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50,
+ 54, 54, 58, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50, 54,
+ 54, 58, 35, 34, 34, 34, 37, 37, 39, 44, 45, 46, 50, 50, 53, 57, 57,
+ 61, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64,
+ 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64, 38,
+ 37, 36, 37, 39, 40, 41, 48, 49, 51, 56, 56, 58, 63, 63, 67, 39, 38,
+ 37, 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 39, 38, 37,
+ 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 42, 40, 40, 40,
+ 42, 42, 44, 51, 52, 55, 61, 61, 64, 69, 69, 73, 44, 42, 41, 41, 42,
+ 43, 45, 52, 53, 56, 63, 63, 66, 71, 71, 75, 44, 42, 41, 41, 43, 43,
+ 45, 52, 54, 56, 63, 63, 66, 72, 72, 76, 47, 45, 44, 44, 45, 45, 47,
+ 54, 56, 58, 66, 66, 69, 75, 75, 79, 48, 46, 45, 45, 46, 46, 48, 55,
+ 56, 59, 67, 67, 70, 76, 76, 80, 49, 47, 46, 46, 47, 47, 48, 56, 57,
+ 60, 67, 67, 71, 77, 77, 81, 53, 50, 49, 49, 49, 49, 51, 58, 59, 62,
+ 71, 71, 74, 81, 81, 86, 53, 51, 49, 49, 50, 50, 51, 59, 60, 63, 71,
+ 71, 75, 82, 82, 87, 55, 52, 51, 51, 51, 51, 53, 60, 61, 64, 72, 72,
+ 76, 83, 83, 88, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79,
+ 87, 87, 92, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79, 87,
+ 87, 92},
+ {32, 31, 31, 31, 35, 37, 38, 47, 48, 48, 49, 49, 50, 52, 52, 54, 31,
+ 31, 31, 32, 36, 38, 39, 46, 47, 47, 48, 48, 49, 50, 50, 53, 31, 31,
+ 31, 32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 31, 31, 31,
+ 32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 30, 31, 32, 32,
+ 38, 39, 40, 45, 46, 46, 45, 45, 46, 48, 48, 50, 30, 31, 32, 33, 38,
+ 40, 41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 31, 32, 33, 33, 38, 40,
+ 41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 33, 35, 35, 36, 41, 43, 43,
+ 46, 47, 46, 45, 45, 46, 47, 47, 49, 33, 35, 36, 36, 41, 43, 44, 46,
+ 47, 46, 46, 46, 46, 47, 47, 49, 34, 36, 37, 37, 42, 44, 45, 47, 47,
+ 47, 45, 45, 46, 47, 47, 49, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47,
+ 45, 45, 46, 47, 47, 48, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47, 45,
+ 45, 46, 47, 47, 48, 39, 40, 41, 42, 46, 47, 47, 48, 48, 48, 47, 47,
+ 47, 48, 48, 50, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50,
+ 50, 50, 52, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50, 50,
+ 50, 52, 45, 45, 44, 45, 47, 47, 48, 51, 51, 51, 51, 51, 52, 52, 52,
+ 54, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55,
+ 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55, 48,
+ 47, 46, 46, 47, 47, 48, 52, 53, 53, 55, 55, 55, 56, 56, 57, 48, 46,
+ 46, 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 48, 46, 46,
+ 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 49, 46, 45, 45,
+ 46, 46, 47, 52, 53, 54, 57, 57, 58, 60, 60, 61, 49, 46, 45, 45, 45,
+ 46, 47, 52, 53, 55, 58, 58, 59, 61, 61, 62, 49, 46, 45, 45, 46, 46,
+ 47, 52, 53, 55, 58, 58, 60, 61, 61, 63, 50, 47, 46, 46, 46, 46, 48,
+ 53, 54, 55, 59, 59, 61, 63, 63, 65, 50, 48, 46, 46, 46, 46, 48, 53,
+ 54, 55, 59, 59, 61, 64, 64, 65, 51, 48, 47, 47, 47, 47, 48, 53, 54,
+ 55, 60, 60, 61, 64, 64, 66, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56,
+ 61, 61, 63, 66, 66, 68, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56, 61,
+ 61, 63, 66, 66, 68, 53, 50, 48, 48, 48, 48, 49, 54, 54, 56, 61, 61,
+ 63, 67, 67, 69, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65,
+ 68, 68, 71, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65, 68,
+ 68, 71}},
+ {{32, 31, 31, 31, 31, 32, 32, 32, 35, 36, 36, 40, 44, 44, 47, 53, 31,
+ 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 43, 43, 46, 52, 31, 32,
+ 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32,
+ 32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32,
+ 32, 32, 32, 33, 34, 35, 35, 39, 41, 41, 45, 50, 31, 32, 32, 32, 32,
+ 33, 33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33,
+ 33, 34, 35, 35, 38, 41, 41, 44, 49, 31, 32, 32, 32, 33, 34, 34, 34,
+ 35, 36, 36, 39, 42, 42, 44, 49, 32, 32, 32, 32, 33, 34, 34, 34, 36,
+ 36, 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, 33, 34, 34, 34, 36, 36,
+ 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, 33, 35, 35, 35, 37, 37, 37,
+ 40, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41,
+ 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41, 42,
+ 42, 45, 49, 32, 33, 33, 33, 34, 36, 36, 36, 39, 40, 40, 42, 44, 44,
+ 47, 51, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50,
+ 54, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50, 54,
+ 34, 34, 34, 34, 35, 37, 37, 38, 42, 43, 43, 46, 49, 49, 51, 55, 35,
+ 35, 34, 34, 36, 38, 38, 39, 45, 47, 47, 50, 52, 52, 55, 59, 36, 35,
+ 34, 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 36, 35, 34,
+ 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 38, 37, 36, 36,
+ 37, 40, 40, 41, 47, 49, 49, 53, 56, 56, 58, 63, 39, 38, 37, 37, 39,
+ 40, 40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 39, 38, 37, 37, 39, 40,
+ 40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 41, 40, 39, 39, 40, 41, 41,
+ 43, 49, 51, 51, 56, 60, 60, 62, 67, 44, 42, 41, 41, 42, 43, 43, 45,
+ 51, 53, 53, 59, 63, 63, 66, 71, 44, 42, 41, 41, 42, 43, 43, 45, 51,
+ 53, 53, 59, 63, 63, 66, 71, 44, 43, 42, 42, 42, 43, 43, 45, 51, 54,
+ 54, 59, 64, 64, 67, 72, 47, 45, 44, 44, 44, 45, 45, 47, 53, 56, 56,
+ 61, 66, 66, 69, 75, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62,
+ 67, 67, 70, 76, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62, 67,
+ 67, 70, 76, 51, 49, 47, 47, 48, 48, 48, 50, 56, 58, 58, 64, 69, 69,
+ 73, 79},
+ {32, 31, 31, 31, 33, 37, 37, 38, 45, 48, 48, 49, 49, 49, 50, 52, 31,
+ 31, 31, 31, 33, 38, 38, 39, 45, 47, 47, 48, 48, 48, 49, 51, 31, 31,
+ 31, 31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 31,
+ 31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 32, 32,
+ 34, 39, 39, 40, 45, 46, 46, 46, 46, 46, 47, 49, 30, 31, 32, 32, 35,
+ 40, 40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 30, 31, 32, 32, 35, 40,
+ 40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 31, 32, 33, 33, 35, 40, 40,
+ 41, 45, 46, 46, 45, 45, 45, 46, 48, 33, 34, 35, 35, 37, 42, 42, 43,
+ 46, 47, 47, 46, 45, 45, 46, 47, 33, 35, 36, 36, 38, 43, 43, 44, 46,
+ 47, 47, 46, 46, 46, 46, 47, 33, 35, 36, 36, 38, 43, 43, 44, 46, 47,
+ 47, 46, 46, 46, 46, 47, 35, 37, 38, 38, 41, 45, 45, 46, 47, 47, 47,
+ 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46,
+ 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46, 45,
+ 45, 46, 47, 39, 40, 41, 41, 43, 47, 47, 47, 48, 48, 48, 47, 47, 47,
+ 47, 48, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50,
+ 50, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50, 50,
+ 43, 43, 43, 43, 45, 47, 47, 48, 50, 50, 50, 50, 50, 50, 50, 51, 47,
+ 46, 46, 46, 46, 48, 48, 48, 51, 52, 52, 52, 53, 53, 53, 53, 49, 47,
+ 46, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 49, 47, 46,
+ 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 48, 47, 46, 46,
+ 46, 47, 47, 48, 52, 53, 53, 54, 55, 55, 55, 56, 48, 47, 46, 46, 46,
+ 47, 47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 46, 46, 46, 47,
+ 47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 45, 45, 46, 46, 46,
+ 47, 51, 53, 53, 55, 57, 57, 57, 59, 49, 46, 45, 45, 45, 46, 46, 47,
+ 51, 53, 53, 56, 58, 58, 59, 61, 49, 46, 45, 45, 45, 46, 46, 47, 51,
+ 53, 53, 56, 58, 58, 59, 61, 49, 47, 45, 45, 45, 46, 46, 47, 52, 53,
+ 53, 56, 58, 58, 60, 62, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54,
+ 57, 59, 59, 61, 63, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57,
+ 59, 59, 61, 64, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, 59,
+ 59, 61, 64, 51, 49, 47, 47, 47, 47, 47, 48, 52, 54, 54, 58, 60, 60,
+ 62, 65}},
+ {{32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 39, 44, 44, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 34, 35, 35, 35, 39, 43, 43, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 34, 35, 35, 35, 38, 41, 41, 31, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 33, 33, 33, 33,
+ 34, 35, 35, 35, 38, 41, 41, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35,
+ 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36,
+ 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36,
+ 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 36, 37, 37, 37,
+ 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40,
+ 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42,
+ 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42,
+ 33, 33, 33, 33, 33, 34, 36, 36, 36, 38, 40, 40, 40, 42, 45, 45, 34,
+ 34, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34,
+ 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, 34,
+ 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 35, 34, 34, 34,
+ 34, 36, 37, 37, 37, 41, 45, 45, 45, 47, 50, 50, 36, 35, 34, 34, 34,
+ 36, 38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36,
+ 38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38,
+ 38, 38, 43, 48, 48, 48, 51, 54, 54, 37, 37, 36, 36, 36, 38, 39, 39,
+ 39, 44, 49, 49, 49, 52, 56, 56, 39, 38, 37, 37, 37, 39, 40, 40, 40,
+ 45, 50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45,
+ 50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50,
+ 50, 50, 54, 58, 58, 41, 40, 39, 39, 39, 40, 42, 42, 42, 46, 52, 52,
+ 52, 56, 60, 60, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53,
+ 58, 63, 63, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53, 58,
+ 63, 63},
+ {32, 31, 31, 31, 31, 33, 37, 37, 37, 42, 48, 48, 48, 48, 49, 49, 31,
+ 31, 31, 31, 31, 34, 37, 37, 37, 42, 47, 47, 47, 48, 48, 48, 31, 31,
+ 31, 31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31,
+ 31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31,
+ 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 32, 32, 32,
+ 35, 39, 39, 39, 42, 46, 46, 46, 46, 46, 46, 30, 31, 32, 32, 32, 35,
+ 40, 40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40,
+ 40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40, 40,
+ 40, 42, 46, 46, 46, 45, 45, 45, 32, 33, 34, 34, 34, 37, 41, 41, 41,
+ 44, 46, 46, 46, 46, 45, 45, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45,
+ 47, 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47,
+ 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47, 47,
+ 47, 46, 46, 46, 35, 36, 38, 38, 38, 41, 45, 45, 45, 46, 47, 47, 47,
+ 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46,
+ 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45,
+ 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45, 45,
+ 39, 40, 41, 41, 41, 44, 47, 47, 47, 48, 49, 49, 49, 48, 47, 47, 42,
+ 42, 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42,
+ 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, 43,
+ 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 45, 45, 44, 44,
+ 44, 46, 47, 47, 47, 49, 51, 51, 51, 51, 51, 51, 49, 48, 46, 46, 46,
+ 47, 48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47,
+ 48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48,
+ 48, 48, 50, 53, 53, 53, 53, 53, 53, 48, 47, 46, 46, 46, 47, 47, 47,
+ 47, 50, 53, 53, 53, 54, 54, 54, 48, 47, 46, 46, 46, 46, 47, 47, 47,
+ 50, 53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50,
+ 53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53,
+ 53, 53, 54, 56, 56, 48, 47, 45, 45, 45, 46, 46, 46, 46, 49, 53, 53,
+ 53, 55, 57, 57, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53,
+ 56, 58, 58, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53, 56,
+ 58, 58}},
+ {{32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 34, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34,
+ 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36,
+ 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36,
+ 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36,
+ 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 37, 37, 37,
+ 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 37, 38, 38, 38, 32,
+ 32, 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32,
+ 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, 32,
+ 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 33, 33, 33,
+ 33, 33, 34, 35, 36, 36, 36, 37, 39, 40, 40, 40, 33, 33, 33, 33, 33,
+ 33, 35, 36, 36, 36, 36, 38, 40, 41, 41, 41, 34, 34, 34, 34, 34, 34,
+ 35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35,
+ 36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36,
+ 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 37, 37,
+ 37, 37, 40, 43, 44, 44, 44, 35, 35, 34, 34, 34, 34, 36, 37, 38, 38,
+ 38, 41, 45, 47, 47, 47, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38,
+ 42, 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42,
+ 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, 46,
+ 48, 48, 48, 37, 36, 36, 36, 36, 36, 37, 38, 39, 39, 39, 42, 46, 49,
+ 49, 49},
+ {32, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 40, 45, 48, 48, 48, 31,
+ 31, 31, 31, 31, 31, 33, 36, 37, 37, 37, 41, 45, 48, 48, 48, 31, 31,
+ 31, 31, 31, 31, 34, 36, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31,
+ 31, 31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31,
+ 31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31,
+ 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 32, 32, 32,
+ 34, 37, 39, 39, 39, 41, 45, 46, 46, 46, 30, 31, 31, 32, 32, 32, 34,
+ 38, 39, 39, 39, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38,
+ 40, 40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40,
+ 40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, 40,
+ 40, 42, 44, 46, 46, 46, 31, 32, 33, 33, 33, 33, 36, 39, 41, 41, 41,
+ 43, 45, 46, 46, 46, 33, 34, 34, 35, 35, 35, 37, 40, 42, 42, 42, 44,
+ 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46,
+ 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47,
+ 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47, 47,
+ 47, 35, 36, 37, 37, 37, 37, 40, 43, 44, 44, 44, 45, 46, 47, 47, 47,
+ 36, 37, 38, 39, 39, 39, 42, 44, 46, 46, 46, 47, 47, 47, 47, 47, 37,
+ 38, 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38,
+ 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, 39,
+ 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 39, 39, 40, 41,
+ 41, 41, 43, 46, 47, 47, 47, 48, 48, 48, 48, 48, 41, 41, 42, 42, 42,
+ 42, 44, 46, 47, 47, 47, 48, 49, 49, 49, 49, 42, 42, 42, 43, 43, 43,
+ 44, 46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44,
+ 46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46,
+ 47, 47, 47, 48, 49, 50, 50, 50, 44, 44, 44, 44, 44, 44, 45, 47, 47,
+ 47, 47, 49, 50, 51, 51, 51, 47, 46, 46, 46, 46, 46, 46, 47, 48, 48,
+ 48, 49, 51, 52, 52, 52, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48,
+ 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50,
+ 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, 52,
+ 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 52, 53,
+ 53, 53}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34,
+ 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 34, 35, 35, 35, 35, 35, 36, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 33,
+ 33, 34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33,
+ 34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34,
+ 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34,
+ 35, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 35, 36,
+ 36, 36, 36, 36, 38, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36,
+ 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37,
+ 37, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 37,
+ 38, 39},
+ {32, 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 38, 42, 31,
+ 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 39, 42, 31, 31,
+ 31, 31, 31, 31, 31, 32, 33, 35, 38, 38, 38, 38, 39, 42, 31, 31, 31,
+ 31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31,
+ 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31,
+ 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31,
+ 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31,
+ 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 32, 32, 32, 32,
+ 34, 36, 39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 32, 34,
+ 37, 39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37,
+ 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40,
+ 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40,
+ 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, 40,
+ 40, 41, 42, 31, 31, 32, 32, 33, 33, 33, 33, 35, 38, 40, 40, 40, 40,
+ 41, 43, 32, 32, 33, 33, 34, 34, 34, 34, 36, 39, 41, 41, 41, 41, 42,
+ 44, 33, 33, 34, 35, 35, 35, 35, 35, 37, 40, 42, 42, 42, 42, 43, 44,
+ 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33,
+ 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34,
+ 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, 35,
+ 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 34, 35, 36, 37,
+ 37, 37, 37, 37, 39, 42, 44, 44, 44, 44, 45, 45, 35, 36, 37, 38, 38,
+ 38, 38, 39, 41, 43, 45, 45, 45, 45, 46, 46, 36, 37, 38, 39, 39, 39,
+ 39, 40, 42, 44, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40,
+ 41, 43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41,
+ 43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43,
+ 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45,
+ 47, 47, 47, 47, 47, 47, 39, 39, 40, 41, 41, 41, 41, 42, 43, 45, 47,
+ 47, 47, 47, 47, 48, 40, 41, 41, 42, 42, 42, 42, 42, 44, 45, 47, 47,
+ 47, 47, 47, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47,
+ 47, 48, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47,
+ 48, 48}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 34, 34},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 36, 37, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 35, 36, 38, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 36, 38, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33,
+ 34, 36, 37, 39, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34,
+ 36, 37, 39, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 36,
+ 38, 39, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38,
+ 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40,
+ 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 31, 31, 31, 32,
+ 32, 33, 33, 33, 33, 33, 33, 34, 35, 37, 38, 40, 31, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 35, 36, 37, 39, 41, 32, 32, 33, 33, 34, 34,
+ 34, 34, 34, 34, 34, 35, 37, 38, 40, 41, 33, 33, 34, 34, 34, 35, 35,
+ 35, 35, 35, 35, 36, 37, 39, 40, 42, 33, 34, 34, 35, 35, 36, 36, 36,
+ 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36,
+ 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36,
+ 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36,
+ 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37,
+ 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38,
+ 40, 41, 43, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 36, 38, 39, 40,
+ 42, 44}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32}}};
+constexpr uint8_t
+ kQuantizerMatrix4x4[kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes]
+ [10] = {{{32, 43, 67, 73, 94, 137, 97, 110, 150, 200},
+ {35, 46, 60, 57, 69, 90, 66, 71, 90, 109}},
+ {{32, 41, 63, 69, 88, 127, 92, 103, 140, 184},
+ {33, 45, 58, 56, 66, 86, 64, 69, 87, 105}},
+ {{32, 38, 56, 63, 78, 113, 86, 97, 130, 169},
+ {32, 45, 55, 53, 62, 80, 63, 67, 84, 101}},
+ {{32, 37, 54, 58, 72, 102, 81, 91, 121, 156},
+ {32, 45, 54, 51, 59, 75, 61, 65, 81, 97}},
+ {{32, 34, 49, 53, 64, 91, 75, 81, 112, 140},
+ {32, 46, 53, 49, 55, 70, 58, 62, 78, 91}},
+ {{32, 34, 48, 49, 60, 82, 72, 79, 104, 134},
+ {32, 46, 53, 47, 54, 66, 57, 60, 75, 89}},
+ {{32, 33, 39, 45, 51, 71, 62, 64, 87, 108},
+ {31, 42, 48, 47, 50, 61, 53, 54, 67, 78}},
+ {{32, 33, 38, 42, 46, 63, 55, 57, 75, 92},
+ {31, 41, 48, 46, 48, 58, 51, 51, 62, 71}},
+ {{32, 32, 35, 38, 40, 54, 51, 49, 64, 81},
+ {31, 38, 47, 47, 46, 54, 49, 46, 57, 66}},
+ {{32, 32, 34, 35, 37, 48, 43, 43, 54, 65},
+ {31, 37, 44, 47, 47, 53, 47, 45, 53, 59}},
+ {{32, 32, 33, 34, 35, 39, 38, 39, 45, 54},
+ {31, 34, 39, 42, 45, 48, 47, 46, 49, 54}},
+ {{32, 32, 32, 32, 33, 35, 35, 35, 38, 46},
+ {31, 32, 34, 38, 41, 47, 46, 46, 47, 52}},
+ {{31, 32, 32, 32, 32, 33, 32, 33, 34, 35},
+ {31, 31, 32, 34, 35, 39, 38, 40, 43, 47}},
+ {{31, 31, 32, 31, 32, 32, 32, 32, 32, 33},
+ {31, 31, 31, 31, 31, 32, 34, 35, 35, 39}},
+ {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32},
+ {31, 31, 31, 31, 31, 31, 31, 31, 31, 31}}};
+constexpr uint8_t kQuantizerMatrix8x8
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][36] = {
+ {{32, 32, 35, 38, 40, 54, 51, 49, 65, 82, 68, 63,
+ 78, 97, 117, 84, 76, 91, 111, 134, 152, 95, 89, 98,
+ 113, 138, 159, 183, 109, 102, 106, 121, 142, 168, 199, 220},
+ {31, 38, 47, 47, 46, 54, 50, 47, 57, 66, 57, 52,
+ 61, 72, 82, 63, 57, 66, 77, 88, 96, 67, 62, 67,
+ 75, 86, 95, 104, 71, 67, 68, 75, 84, 95, 107, 113}},
+ {{32, 32, 35, 37, 39, 51, 47, 46, 60, 73, 62, 58,
+ 71, 87, 105, 78, 72, 84, 100, 121, 140, 90, 84, 93,
+ 106, 129, 148, 169, 102, 96, 100, 113, 132, 155, 183, 201},
+ {31, 38, 47, 47, 47, 53, 48, 46, 55, 62, 54, 50,
+ 58, 67, 76, 61, 55, 63, 72, 83, 91, 66, 61, 65,
+ 73, 84, 92, 101, 69, 65, 66, 73, 82, 92, 103, 109}},
+ {{32, 32, 34, 35, 37, 48, 46, 45, 56, 70, 57, 54,
+ 64, 80, 93, 76, 70, 79, 96, 111, 134, 85, 79, 87,
+ 100, 121, 138, 156, 96, 90, 93, 105, 122, 144, 168, 184},
+ {31, 36, 43, 47, 47, 53, 48, 46, 54, 61, 52, 49,
+ 55, 65, 71, 60, 55, 60, 70, 78, 89, 64, 59, 63,
+ 71, 81, 89, 97, 67, 63, 64, 71, 79, 89, 99, 104}},
+ {{32, 32, 33, 35, 36, 46, 42, 42, 52, 63, 53, 51,
+ 60, 73, 86, 68, 64, 72, 84, 100, 117, 78, 74, 80,
+ 92, 109, 128, 140, 90, 84, 87, 98, 114, 133, 155, 168},
+ {31, 34, 39, 46, 47, 52, 47, 45, 52, 58, 50, 48,
+ 54, 62, 68, 57, 53, 58, 65, 73, 82, 61, 57, 61,
+ 68, 77, 86, 91, 65, 61, 62, 68, 76, 86, 95, 100}},
+ {{32, 32, 33, 34, 35, 39, 39, 40, 46, 56, 50, 48,
+ 53, 65, 78, 62, 59, 63, 75, 90, 105, 76, 71, 74,
+ 86, 101, 118, 134, 84, 79, 81, 92, 106, 123, 142, 153},
+ {31, 34, 39, 42, 45, 48, 47, 46, 49, 55, 49, 47,
+ 50, 58, 65, 54, 51, 53, 61, 69, 76, 60, 56, 57,
+ 65, 73, 82, 89, 64, 59, 60, 66, 74, 83, 92, 96}},
+ {{32, 32, 33, 34, 35, 39, 38, 39, 45, 54, 46, 45,
+ 51, 61, 71, 56, 54, 58, 69, 80, 92, 68, 64, 68,
+ 78, 90, 103, 117, 78, 74, 76, 86, 99, 113, 128, 140},
+ {31, 34, 39, 42, 45, 48, 47, 46, 49, 54, 48, 46,
+ 50, 56, 61, 52, 49, 52, 58, 65, 71, 57, 53, 55,
+ 61, 68, 75, 82, 61, 57, 58, 64, 71, 79, 86, 91}},
+ {{31, 32, 32, 32, 33, 35, 35, 35, 38, 48, 42, 41,
+ 43, 54, 63, 51, 49, 49, 59, 71, 81, 59, 56, 56,
+ 66, 77, 89, 98, 69, 65, 64, 73, 85, 97, 108, 119},
+ {31, 32, 35, 38, 42, 47, 48, 47, 48, 53, 47, 45,
+ 45, 53, 58, 50, 47, 47, 54, 61, 66, 53, 50, 49,
+ 56, 63, 69, 73, 57, 54, 52, 58, 65, 72, 77, 82}},
+ {{31, 32, 32, 32, 32, 35, 34, 34, 37, 42, 38, 37,
+ 40, 47, 54, 46, 44, 45, 52, 60, 69, 52, 49, 49,
+ 56, 65, 75, 82, 63, 59, 58, 65, 73, 84, 92, 105},
+ {31, 31, 32, 38, 40, 47, 44, 44, 47, 50, 47, 45,
+ 46, 51, 54, 48, 46, 46, 51, 56, 61, 50, 47, 47,
+ 52, 57, 63, 66, 55, 52, 50, 54, 60, 66, 70, 76}},
+ {{31, 32, 32, 32, 32, 34, 34, 33, 35, 39, 35, 34,
+ 37, 42, 48, 41, 40, 41, 47, 53, 60, 47, 44, 45,
+ 51, 57, 65, 71, 53, 50, 51, 55, 61, 70, 77, 85},
+ {31, 31, 32, 35, 36, 41, 42, 42, 45, 48, 48, 46,
+ 47, 50, 53, 47, 45, 45, 49, 53, 57, 49, 46, 46,
+ 50, 54, 59, 61, 51, 48, 48, 51, 54, 60, 64, 68}},
+ {{31, 31, 32, 32, 32, 33, 32, 32, 34, 35, 34, 34,
+ 35, 37, 41, 37, 36, 38, 39, 45, 51, 43, 41, 42,
+ 42, 49, 56, 63, 47, 44, 45, 46, 52, 59, 67, 71},
+ {31, 31, 32, 34, 35, 39, 37, 40, 43, 47, 43, 43,
+ 45, 47, 49, 48, 46, 46, 47, 50, 53, 47, 45, 45,
+ 45, 50, 55, 58, 49, 46, 46, 46, 50, 55, 60, 61}},
+ {{31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 33, 33,
+ 34, 35, 37, 34, 34, 35, 36, 39, 43, 37, 36, 37,
+ 38, 41, 46, 51, 41, 39, 40, 41, 44, 49, 54, 58},
+ {31, 31, 31, 32, 33, 35, 35, 37, 39, 43, 39, 41,
+ 42, 45, 47, 45, 44, 45, 47, 48, 50, 48, 46, 46,
+ 47, 48, 51, 53, 48, 46, 45, 46, 47, 51, 54, 56}},
+ {{31, 31, 32, 31, 32, 32, 32, 32, 32, 33, 32, 32,
+ 32, 34, 35, 32, 33, 33, 34, 35, 36, 34, 34, 33,
+ 35, 36, 38, 39, 35, 35, 34, 36, 38, 40, 42, 48},
+ {31, 31, 31, 30, 31, 32, 34, 34, 35, 39, 36, 37,
+ 39, 42, 46, 39, 40, 41, 44, 47, 47, 42, 42, 42,
+ 45, 47, 48, 48, 48, 47, 46, 47, 47, 49, 50, 53}},
+ {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 33, 32, 32, 32, 32, 33, 34, 32, 32, 32,
+ 32, 34, 34, 35, 33, 33, 33, 33, 35, 35, 36, 38},
+ {31, 31, 31, 31, 31, 31, 30, 31, 31, 32, 34, 34,
+ 35, 35, 39, 35, 35, 36, 36, 40, 41, 37, 38, 39,
+ 40, 43, 44, 47, 40, 41, 41, 42, 44, 45, 47, 48}},
+ {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32,
+ 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 33, 33},
+ {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31,
+ 31, 31, 32, 31, 32, 32, 32, 32, 33, 33, 34, 34,
+ 35, 35, 36, 39, 33, 34, 34, 35, 35, 36, 39, 39}},
+ {{31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+ 32, 32, 32, 31, 31, 32, 32, 32, 32, 31, 31, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32},
+ {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31}}};
+constexpr uint8_t kQuantizerMatrix32x32
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][528] = {
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 33,
+ 33, 32, 32, 32, 33, 34, 35, 34, 34, 33, 34, 35, 37, 39,
+ 35, 34, 34, 35, 36, 37, 41, 43, 36, 35, 34, 35, 36, 38,
+ 42, 45, 48, 39, 38, 37, 38, 39, 40, 45, 47, 50, 54, 44,
+ 42, 41, 41, 42, 42, 47, 50, 54, 58, 63, 46, 44, 42, 43,
+ 44, 44, 49, 52, 55, 59, 65, 67, 48, 46, 44, 45, 45, 46,
+ 51, 53, 57, 61, 67, 69, 71, 54, 51, 49, 49, 50, 49, 54,
+ 57, 60, 65, 71, 74, 76, 82, 59, 56, 54, 54, 54, 53, 58,
+ 61, 64, 69, 75, 78, 80, 87, 92, 62, 59, 56, 56, 56, 55,
+ 60, 63, 66, 71, 77, 80, 83, 89, 95, 98, 65, 62, 59, 59,
+ 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98, 101, 105, 71,
+ 68, 65, 64, 64, 63, 68, 70, 73, 78, 84, 87, 90, 97, 103,
+ 107, 111, 117, 80, 76, 72, 72, 71, 69, 74, 76, 79, 84, 90,
+ 93, 96, 104, 110, 114, 118, 125, 134, 81, 77, 73, 73, 72, 70,
+ 75, 77, 80, 85, 91, 94, 97, 105, 111, 115, 119, 126, 135, 137,
+ 83, 78, 75, 74, 74, 72, 76, 79, 81, 86, 92, 95, 99, 106,
+ 113, 117, 121, 128, 137, 138, 140, 88, 84, 80, 79, 78, 76, 80,
+ 82, 85, 91, 95, 98, 103, 111, 115, 119, 126, 134, 139, 144, 147,
+ 152, 91, 86, 83, 82, 81, 79, 81, 84, 88, 92, 95, 100, 107,
+ 110, 115, 123, 127, 132, 140, 147, 151, 154, 159, 94, 89, 86, 85,
+ 84, 82, 82, 86, 90, 92, 97, 103, 105, 111, 119, 121, 128, 136,
+ 139, 146, 156, 158, 161, 166, 97, 92, 90, 88, 86, 85, 84, 89,
+ 91, 95, 100, 102, 108, 114, 116, 125, 130, 133, 143, 148, 152, 163,
+ 166, 168, 174, 101, 95, 93, 91, 89, 89, 87, 91, 93, 98, 101,
+ 105, 111, 113, 120, 126, 130, 138, 142, 149, 157, 159, 171, 174, 176,
+ 183, 104, 99, 97, 94, 93, 93, 90, 92, 96, 100, 102, 108, 111,
+ 116, 122, 125, 134, 137, 144, 151, 155, 165, 169, 179, 182, 184, 191,
+ 107, 102, 101, 97, 96, 96, 93, 93, 99, 101, 105, 110, 113, 120,
+ 122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191, 193, 200,
+ 111, 105, 104, 101, 100, 99, 97, 96, 102, 103, 109, 111, 117, 120,
+ 125, 131, 135, 143, 146, 156, 158, 168, 173, 180, 189, 195, 200, 202,
+ 210, 115, 109, 108, 104, 104, 102, 101, 100, 103, 106, 111, 113, 119,
+ 121, 129, 131, 140, 142, 151, 155, 162, 168, 176, 183, 188, 199, 204,
+ 210, 212, 220, 119, 113, 112, 107, 107, 106, 105, 103, 105, 110, 112,
+ 117, 120, 125, 130, 135, 140, 145, 152, 157, 165, 169, 179, 183, 193,
+ 197, 210, 214, 220, 222, 231, 123, 116, 116, 111, 111, 109, 110, 107,
+ 107, 114, 114, 121, 122, 130, 130, 140, 140, 150, 151, 163, 164, 176,
+ 177, 190, 191, 204, 206, 222, 224, 230, 232, 242},
+ {32, 31, 31, 30, 31, 32, 32, 33, 33, 35, 33, 34, 35, 37,
+ 39, 36, 38, 40, 41, 43, 47, 41, 42, 42, 43, 45, 47, 48,
+ 45, 45, 44, 45, 46, 47, 49, 50, 49, 47, 46, 47, 47, 48,
+ 50, 51, 53, 48, 47, 45, 46, 46, 46, 49, 51, 53, 54, 49,
+ 47, 45, 45, 45, 45, 49, 51, 53, 55, 58, 50, 47, 45, 46,
+ 46, 46, 49, 51, 54, 56, 59, 60, 50, 48, 46, 46, 46, 46,
+ 50, 52, 54, 56, 60, 60, 61, 52, 50, 47, 47, 47, 47, 50,
+ 52, 54, 57, 61, 62, 63, 66, 54, 52, 49, 49, 49, 48, 52,
+ 53, 55, 58, 62, 64, 65, 68, 71, 56, 53, 51, 50, 50, 49,
+ 52, 54, 56, 59, 63, 64, 66, 69, 72, 73, 57, 54, 52, 51,
+ 51, 50, 53, 55, 56, 60, 63, 65, 67, 70, 73, 75, 76, 60,
+ 57, 54, 54, 53, 52, 55, 57, 58, 61, 65, 67, 68, 72, 75,
+ 77, 79, 82, 63, 60, 57, 57, 56, 54, 57, 59, 60, 63, 67,
+ 69, 71, 75, 78, 80, 82, 85, 89, 64, 61, 58, 57, 57, 55,
+ 58, 59, 61, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 90,
+ 65, 61, 58, 58, 57, 55, 58, 60, 61, 64, 68, 70, 71, 75,
+ 79, 81, 83, 86, 90, 91, 91, 67, 63, 61, 60, 59, 57, 60,
+ 61, 63, 66, 69, 70, 73, 77, 79, 81, 85, 88, 90, 92, 94,
+ 96, 68, 64, 62, 61, 60, 58, 59, 61, 64, 66, 67, 71, 74,
+ 75, 78, 82, 84, 86, 90, 93, 94, 96, 98, 69, 65, 63, 62,
+ 61, 59, 59, 62, 64, 65, 68, 71, 72, 75, 79, 80, 83, 87,
+ 89, 92, 96, 97, 98, 100, 70, 66, 64, 63, 62, 61, 60, 63,
+ 64, 66, 69, 70, 73, 76, 77, 81, 84, 85, 89, 92, 93, 98,
+ 99, 100, 102, 71, 67, 66, 64, 63, 62, 61, 63, 64, 67, 68,
+ 70, 74, 75, 78, 81, 83, 86, 88, 91, 94, 95, 100, 101, 102,
+ 104, 72, 68, 67, 65, 64, 64, 61, 63, 65, 67, 68, 71, 73,
+ 75, 78, 79, 84, 85, 88, 91, 93, 97, 98, 102, 103, 104, 106,
+ 73, 69, 68, 66, 65, 65, 63, 63, 66, 67, 69, 71, 73, 76,
+ 77, 81, 82, 85, 88, 90, 94, 95, 99, 101, 104, 105, 106, 109,
+ 74, 70, 70, 67, 66, 66, 64, 63, 66, 67, 70, 71, 74, 75,
+ 78, 80, 82, 86, 87, 91, 92, 96, 98, 101, 104, 106, 108, 108,
+ 111, 75, 71, 71, 68, 68, 67, 66, 64, 66, 68, 70, 71, 74,
+ 75, 79, 79, 84, 84, 88, 90, 93, 95, 98, 101, 103, 107, 108,
+ 110, 111, 113, 76, 72, 72, 69, 69, 68, 67, 65, 66, 69, 70,
+ 72, 74, 76, 78, 81, 83, 85, 88, 90, 93, 95, 98, 100, 104,
+ 105, 109, 111, 112, 113, 116, 78, 74, 74, 70, 70, 69, 69, 66,
+ 66, 70, 70, 74, 74, 77, 78, 82, 82, 86, 87, 92, 92, 96,
+ 97, 102, 102, 107, 107, 112, 113, 115, 115, 118}},
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 33, 32, 32, 32, 33, 34, 35, 32, 33, 33, 33, 34, 36, 36,
+ 34, 34, 33, 34, 35, 37, 38, 39, 36, 35, 34, 35, 36, 38,
+ 40, 42, 48, 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 39,
+ 38, 37, 38, 39, 40, 42, 45, 50, 52, 54, 44, 42, 41, 41,
+ 42, 42, 44, 47, 54, 56, 58, 63, 47, 45, 44, 44, 45, 45,
+ 47, 50, 56, 58, 60, 66, 69, 49, 47, 46, 45, 46, 46, 48,
+ 51, 57, 60, 62, 68, 71, 73, 54, 51, 50, 49, 50, 49, 51,
+ 54, 60, 63, 65, 71, 75, 77, 82, 59, 56, 54, 54, 54, 53,
+ 55, 58, 64, 67, 69, 75, 79, 81, 87, 92, 61, 58, 56, 56,
+ 56, 55, 57, 60, 65, 68, 70, 77, 81, 83, 89, 94, 97, 65,
+ 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 87, 92,
+ 98, 101, 105, 71, 68, 65, 65, 64, 63, 65, 68, 73, 76, 78,
+ 84, 89, 92, 97, 103, 106, 111, 117, 76, 72, 70, 69, 68, 66,
+ 68, 71, 76, 79, 81, 88, 92, 95, 101, 107, 110, 115, 122, 127,
+ 80, 76, 73, 72, 71, 69, 71, 74, 79, 82, 84, 90, 95, 98,
+ 104, 110, 113, 118, 125, 130, 134, 83, 78, 76, 75, 74, 72, 73,
+ 76, 81, 84, 86, 92, 97, 100, 106, 113, 116, 121, 128, 133, 137,
+ 140, 86, 82, 79, 78, 77, 74, 76, 79, 84, 87, 89, 95, 100,
+ 103, 109, 116, 119, 124, 131, 136, 140, 144, 147, 89, 85, 82, 81,
+ 79, 78, 78, 82, 86, 87, 92, 97, 100, 105, 112, 114, 120, 128,
+ 131, 136, 146, 147, 150, 155, 92, 88, 85, 84, 82, 81, 80, 85,
+ 86, 90, 95, 97, 102, 107, 110, 117, 122, 125, 134, 138, 142, 152,
+ 154, 156, 162, 95, 90, 88, 86, 85, 84, 82, 86, 88, 93, 95,
+ 99, 105, 106, 113, 118, 121, 129, 132, 139, 146, 148, 159, 161, 163,
+ 169, 98, 93, 91, 89, 88, 87, 85, 87, 90, 94, 96, 102, 104,
+ 109, 114, 117, 126, 128, 134, 141, 145, 154, 157, 166, 168, 170, 176,
+ 101, 96, 95, 92, 91, 90, 88, 88, 93, 95, 99, 103, 106, 112,
+ 114, 121, 124, 131, 136, 140, 149, 151, 160, 165, 173, 176, 178, 184,
+ 104, 99, 98, 95, 94, 93, 91, 90, 95, 96, 102, 103, 109, 112,
+ 117, 122, 125, 133, 136, 145, 146, 156, 160, 167, 174, 180, 184, 186,
+ 193, 108, 102, 101, 98, 97, 96, 95, 93, 97, 100, 104, 106, 111,
+ 113, 121, 122, 130, 132, 140, 143, 150, 155, 162, 169, 174, 183, 188,
+ 192, 194, 201, 111, 105, 105, 101, 100, 99, 98, 96, 98, 103, 105,
+ 109, 112, 117, 121, 125, 130, 135, 141, 146, 152, 156, 165, 169, 178,
+ 181, 193, 196, 201, 202, 210, 114, 109, 109, 104, 104, 102, 102, 99,
+ 100, 106, 106, 113, 113, 120, 121, 129, 130, 139, 140, 151, 151, 162,
+ 162, 175, 176, 187, 188, 203, 204, 210, 211, 219},
+ {32, 31, 31, 30, 31, 31, 31, 32, 32, 33, 33, 34, 35, 36, 39,
+ 36, 38, 39, 40, 43, 47, 38, 40, 41, 41, 44, 47, 47, 41, 42,
+ 42, 43, 45, 47, 48, 48, 49, 47, 46, 46, 47, 48, 49, 50, 53,
+ 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 48, 47, 46, 45, 46,
+ 46, 48, 49, 53, 54, 54, 49, 47, 45, 45, 45, 45, 47, 49, 53,
+ 55, 55, 58, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59,
+ 61, 51, 48, 47, 46, 47, 46, 47, 50, 54, 55, 56, 60, 61, 62,
+ 52, 50, 48, 47, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66,
+ 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62, 64, 66, 68,
+ 71, 55, 53, 51, 50, 50, 49, 50, 52, 56, 58, 59, 63, 65, 66,
+ 69, 72, 73, 57, 54, 52, 51, 51, 50, 51, 53, 56, 58, 60, 63,
+ 66, 67, 70, 73, 74, 76, 60, 57, 55, 54, 53, 52, 53, 55, 58,
+ 60, 61, 65, 68, 69, 72, 75, 77, 79, 82, 62, 59, 57, 56, 55,
+ 53, 54, 56, 59, 61, 63, 66, 69, 70, 74, 77, 78, 80, 84, 86,
+ 63, 60, 58, 57, 56, 54, 55, 57, 60, 62, 63, 67, 70, 71, 75,
+ 78, 79, 82, 85, 87, 89, 65, 61, 59, 58, 57, 55, 56, 58, 61,
+ 63, 64, 68, 71, 72, 75, 79, 80, 83, 86, 88, 90, 91, 66, 63,
+ 60, 59, 58, 56, 58, 59, 62, 64, 65, 69, 72, 73, 76, 80, 81,
+ 84, 87, 90, 91, 93, 94, 67, 64, 62, 61, 59, 58, 58, 60, 63,
+ 64, 66, 69, 71, 73, 77, 78, 81, 85, 86, 89, 93, 94, 95, 97,
+ 68, 65, 63, 62, 60, 59, 58, 61, 62, 64, 67, 68, 71, 74, 75,
+ 79, 81, 83, 87, 89, 91, 95, 96, 97, 99, 69, 66, 64, 63, 61,
+ 61, 59, 61, 62, 65, 66, 68, 72, 73, 76, 78, 80, 84, 85, 88,
+ 91, 92, 97, 98, 98, 101, 70, 67, 65, 63, 62, 62, 60, 61, 63,
+ 65, 66, 69, 71, 73, 76, 77, 81, 83, 85, 88, 90, 94, 95, 99,
+ 100, 100, 103, 71, 67, 67, 64, 63, 63, 61, 61, 64, 65, 67, 69,
+ 71, 74, 75, 78, 80, 83, 85, 87, 91, 92, 95, 97, 100, 102, 102,
+ 105, 72, 68, 68, 65, 65, 64, 62, 62, 64, 65, 68, 69, 72, 73,
+ 76, 78, 80, 83, 84, 88, 89, 93, 95, 97, 100, 102, 104, 104, 107,
+ 73, 69, 69, 66, 66, 65, 64, 63, 64, 66, 68, 69, 72, 73, 77,
+ 77, 81, 82, 86, 87, 90, 92, 95, 97, 99, 103, 104, 106, 106, 109,
+ 74, 70, 70, 67, 67, 66, 65, 63, 64, 67, 68, 70, 72, 74, 76,
+ 78, 80, 82, 85, 87, 90, 91, 95, 96, 100, 101, 105, 106, 108, 108,
+ 111, 75, 71, 71, 68, 68, 66, 66, 64, 64, 68, 68, 71, 71, 75,
+ 75, 79, 79, 83, 84, 88, 89, 93, 93, 98, 98, 102, 103, 108, 108,
+ 110, 110, 113}},
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 33, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 34, 34, 35,
+ 34, 34, 33, 33, 35, 36, 37, 39, 34, 34, 34, 34, 36, 36,
+ 37, 41, 42, 36, 35, 34, 34, 36, 37, 38, 42, 45, 48, 39,
+ 38, 38, 37, 39, 40, 40, 45, 47, 50, 54, 41, 39, 39, 38,
+ 40, 40, 41, 46, 48, 51, 55, 56, 44, 42, 41, 41, 42, 42,
+ 42, 47, 50, 54, 58, 59, 63, 48, 46, 45, 44, 45, 45, 45,
+ 50, 53, 56, 61, 62, 66, 70, 49, 47, 46, 45, 46, 46, 46,
+ 51, 53, 57, 62, 63, 68, 71, 73, 54, 51, 50, 49, 50, 49,
+ 49, 54, 56, 60, 65, 67, 71, 76, 77, 82, 58, 55, 54, 53,
+ 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 59,
+ 57, 55, 54, 54, 54, 54, 59, 61, 64, 69, 71, 75, 80, 82,
+ 87, 91, 93, 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73,
+ 75, 79, 85, 87, 92, 97, 99, 105, 69, 66, 64, 63, 63, 62,
+ 61, 66, 68, 71, 76, 78, 83, 88, 90, 96, 100, 102, 109, 113,
+ 71, 68, 66, 65, 64, 63, 63, 68, 70, 73, 78, 80, 84, 90,
+ 92, 97, 102, 104, 111, 115, 117, 80, 76, 73, 72, 71, 70, 69,
+ 74, 76, 79, 84, 86, 90, 96, 98, 104, 109, 111, 118, 123, 125,
+ 134, 81, 77, 75, 74, 73, 72, 71, 75, 77, 80, 85, 87, 91,
+ 97, 99, 105, 110, 112, 120, 125, 127, 136, 137, 83, 78, 76, 75,
+ 74, 73, 72, 76, 78, 81, 86, 88, 92, 98, 100, 106, 111, 113,
+ 121, 126, 128, 137, 139, 140, 87, 83, 81, 79, 78, 77, 75, 80,
+ 82, 85, 90, 91, 96, 101, 103, 110, 114, 117, 125, 129, 133, 142,
+ 143, 145, 150, 90, 85, 83, 81, 80, 79, 78, 81, 83, 87, 89,
+ 93, 98, 100, 106, 110, 114, 121, 124, 130, 136, 138, 148, 149, 151,
+ 156, 93, 88, 86, 84, 83, 82, 80, 82, 85, 89, 90, 96, 98,
+ 102, 107, 109, 118, 120, 125, 131, 134, 143, 145, 153, 156, 157, 163,
+ 95, 90, 89, 86, 85, 85, 83, 83, 88, 89, 93, 97, 99, 105,
+ 106, 113, 116, 122, 127, 130, 139, 140, 148, 153, 159, 162, 164, 169,
+ 98, 93, 92, 89, 88, 87, 86, 85, 89, 90, 96, 97, 102, 105,
+ 109, 114, 117, 124, 126, 134, 136, 144, 148, 154, 160, 166, 169, 170,
+ 176, 101, 96, 95, 91, 91, 90, 89, 87, 90, 93, 97, 99, 104,
+ 105, 112, 113, 121, 122, 130, 133, 139, 144, 150, 155, 160, 168, 172,
+ 176, 177, 184, 104, 99, 98, 94, 94, 92, 92, 90, 92, 96, 98,
+ 102, 104, 109, 112, 116, 121, 125, 130, 135, 141, 144, 152, 155, 163,
+ 166, 177, 179, 184, 185, 191, 107, 101, 101, 97, 97, 95, 95, 93,
+ 93, 99, 99, 105, 105, 112, 112, 120, 120, 129, 129, 139, 140, 149,
+ 149, 161, 161, 172, 172, 185, 186, 191, 192, 199},
+ {32, 31, 31, 30, 31, 31, 30, 31, 31, 32, 33, 34, 35, 35, 39,
+ 35, 36, 37, 37, 41, 43, 36, 38, 39, 40, 43, 45, 47, 41, 42,
+ 42, 42, 45, 46, 47, 48, 44, 44, 44, 44, 46, 46, 47, 49, 50,
+ 49, 47, 47, 46, 47, 47, 48, 50, 51, 53, 48, 47, 46, 45, 46,
+ 46, 46, 49, 51, 53, 54, 48, 47, 46, 45, 46, 46, 46, 49, 51,
+ 53, 54, 55, 49, 47, 46, 45, 45, 45, 45, 49, 51, 53, 55, 56,
+ 58, 50, 48, 47, 46, 46, 46, 46, 50, 51, 54, 56, 57, 59, 61,
+ 51, 48, 47, 46, 47, 46, 46, 50, 51, 54, 56, 57, 60, 62, 62,
+ 52, 50, 48, 47, 47, 47, 47, 50, 52, 54, 57, 58, 61, 63, 64,
+ 66, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 64,
+ 65, 68, 70, 55, 52, 51, 50, 49, 49, 48, 52, 53, 55, 59, 60,
+ 62, 65, 66, 68, 70, 71, 57, 54, 53, 52, 51, 50, 50, 53, 54,
+ 56, 60, 61, 63, 66, 67, 70, 73, 73, 76, 59, 56, 54, 53, 53,
+ 52, 51, 54, 56, 58, 61, 62, 65, 68, 69, 72, 74, 75, 78, 80,
+ 60, 57, 55, 54, 53, 53, 52, 55, 56, 58, 61, 63, 65, 68, 69,
+ 72, 75, 76, 79, 81, 82, 63, 60, 58, 57, 56, 55, 54, 57, 59,
+ 60, 63, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 64, 61,
+ 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78,
+ 79, 82, 85, 86, 89, 90, 65, 61, 60, 58, 57, 56, 55, 58, 59,
+ 61, 64, 65, 68, 71, 72, 75, 78, 79, 83, 85, 86, 90, 91, 91,
+ 67, 63, 61, 60, 59, 58, 57, 60, 61, 63, 65, 66, 69, 72, 73,
+ 77, 79, 80, 84, 86, 88, 92, 93, 93, 95, 68, 64, 63, 61, 60,
+ 59, 58, 60, 61, 63, 65, 67, 70, 71, 74, 76, 78, 81, 83, 86,
+ 88, 89, 94, 94, 95, 97, 68, 65, 64, 62, 61, 60, 58, 59, 61,
+ 64, 64, 68, 69, 71, 74, 75, 79, 80, 83, 86, 87, 91, 92, 95,
+ 96, 97, 99, 69, 66, 65, 63, 62, 61, 59, 59, 62, 63, 65, 67,
+ 69, 72, 72, 76, 78, 80, 83, 84, 88, 89, 92, 94, 97, 98, 99,
+ 101, 70, 67, 66, 63, 63, 62, 61, 60, 63, 63, 66, 67, 69, 71,
+ 73, 76, 77, 81, 82, 85, 86, 90, 91, 94, 96, 99, 100, 100, 103,
+ 71, 67, 67, 64, 64, 63, 62, 61, 62, 64, 66, 67, 70, 71, 74,
+ 74, 78, 79, 83, 84, 87, 89, 91, 94, 95, 99, 100, 102, 102, 104,
+ 72, 68, 68, 65, 65, 64, 63, 61, 62, 65, 66, 68, 69, 71, 73,
+ 75, 77, 79, 82, 84, 87, 88, 92, 93, 96, 97, 101, 102, 104, 104,
+ 106, 73, 69, 69, 66, 66, 64, 64, 62, 62, 66, 66, 69, 69, 72,
+ 73, 76, 77, 81, 81, 85, 85, 89, 90, 94, 94, 99, 99, 104, 104,
+ 106, 106, 108}},
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 33, 31, 32, 32, 32, 33, 33, 32, 32, 32, 32, 33, 34, 35,
+ 32, 33, 33, 33, 34, 34, 36, 36, 34, 34, 34, 33, 35, 35,
+ 37, 38, 39, 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 36,
+ 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 39, 38, 38, 37,
+ 39, 39, 40, 42, 45, 49, 50, 54, 41, 40, 39, 38, 40, 40,
+ 41, 43, 46, 50, 52, 55, 57, 44, 42, 42, 41, 42, 42, 42,
+ 44, 47, 52, 54, 58, 60, 63, 47, 45, 45, 44, 44, 45, 45,
+ 47, 50, 55, 56, 60, 62, 66, 69, 48, 46, 45, 44, 45, 45,
+ 46, 47, 51, 55, 57, 61, 63, 67, 70, 71, 54, 51, 50, 49,
+ 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 56,
+ 53, 52, 51, 51, 51, 51, 53, 56, 60, 61, 66, 69, 73, 77,
+ 78, 84, 86, 59, 56, 55, 54, 54, 54, 53, 55, 58, 62, 64,
+ 69, 71, 75, 79, 80, 87, 89, 92, 64, 61, 60, 58, 58, 58,
+ 57, 59, 62, 66, 67, 72, 75, 79, 83, 84, 91, 93, 97, 102,
+ 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 75, 79,
+ 84, 85, 92, 94, 98, 103, 105, 71, 68, 67, 65, 64, 64, 63,
+ 65, 68, 72, 73, 78, 80, 84, 89, 90, 97, 100, 103, 109, 111,
+ 117, 74, 71, 69, 68, 67, 67, 65, 67, 70, 74, 75, 80, 83,
+ 86, 91, 93, 100, 102, 106, 112, 114, 120, 123, 80, 76, 74, 72,
+ 71, 71, 69, 71, 74, 78, 79, 84, 86, 90, 95, 96, 104, 106,
+ 110, 116, 118, 125, 128, 134, 82, 78, 76, 74, 73, 73, 71, 73,
+ 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127,
+ 131, 136, 139, 83, 78, 77, 75, 74, 74, 72, 73, 76, 80, 81,
+ 86, 89, 92, 97, 99, 106, 109, 113, 119, 121, 128, 131, 137, 139,
+ 140, 87, 83, 81, 79, 78, 78, 75, 77, 80, 83, 85, 90, 92,
+ 96, 100, 102, 110, 112, 117, 122, 125, 133, 135, 142, 144, 145, 150,
+ 90, 85, 84, 81, 80, 80, 78, 78, 82, 84, 87, 91, 93, 98,
+ 99, 106, 108, 113, 118, 121, 129, 130, 137, 141, 147, 150, 151, 156,
+ 92, 88, 87, 84, 83, 82, 80, 80, 84, 85, 90, 91, 95, 98,
+ 102, 106, 109, 115, 117, 125, 126, 134, 137, 142, 148, 152, 155, 156,
+ 162, 95, 90, 89, 86, 85, 84, 83, 82, 85, 87, 91, 92, 97,
+ 98, 105, 105, 112, 114, 121, 123, 129, 133, 138, 143, 147, 155, 158,
+ 161, 162, 168, 97, 92, 92, 88, 88, 86, 86, 84, 85, 90, 91,
+ 95, 97, 101, 104, 108, 112, 116, 121, 125, 130, 133, 140, 143, 150,
+ 152, 162, 164, 168, 168, 174, 100, 95, 95, 90, 90, 89, 89, 86,
+ 86, 92, 92, 97, 98, 104, 104, 111, 111, 119, 119, 128, 129, 137,
+ 137, 147, 148, 157, 158, 169, 170, 174, 175, 181},
+ {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 33, 34, 34, 34, 37,
+ 33, 34, 35, 35, 38, 39, 36, 38, 39, 40, 42, 43, 47, 38, 40,
+ 40, 41, 43, 44, 47, 47, 41, 42, 42, 42, 44, 45, 47, 48, 48,
+ 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 49, 47, 47, 46, 47,
+ 47, 48, 49, 50, 52, 53, 48, 47, 46, 45, 46, 46, 46, 48, 49,
+ 52, 53, 54, 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53, 55,
+ 55, 49, 47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57, 58,
+ 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61,
+ 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 58, 60, 61,
+ 61, 52, 50, 49, 47, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61,
+ 63, 63, 66, 53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55, 58,
+ 59, 62, 64, 64, 67, 68, 54, 52, 51, 49, 49, 49, 48, 49, 52,
+ 55, 55, 58, 60, 62, 64, 65, 68, 69, 71, 56, 54, 53, 51, 51,
+ 51, 49, 51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73, 75,
+ 57, 54, 53, 52, 51, 51, 50, 51, 53, 56, 56, 60, 61, 63, 66,
+ 67, 70, 71, 73, 76, 76, 60, 57, 56, 54, 53, 53, 52, 53, 55,
+ 58, 58, 61, 63, 65, 68, 68, 72, 73, 75, 78, 79, 82, 61, 58,
+ 57, 55, 55, 54, 53, 54, 56, 58, 59, 62, 64, 66, 69, 69, 73,
+ 74, 76, 79, 80, 83, 84, 63, 60, 59, 57, 56, 56, 54, 55, 57,
+ 60, 60, 63, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89,
+ 64, 61, 60, 58, 57, 57, 55, 56, 58, 60, 61, 64, 66, 68, 70,
+ 71, 75, 77, 79, 82, 82, 86, 87, 90, 91, 65, 61, 60, 58, 57,
+ 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82,
+ 83, 86, 88, 90, 91, 91, 67, 63, 62, 60, 59, 59, 57, 58, 60,
+ 62, 63, 66, 67, 69, 72, 73, 77, 78, 80, 83, 84, 88, 89, 92,
+ 93, 93, 95, 67, 64, 63, 61, 60, 60, 58, 58, 61, 61, 63, 65,
+ 67, 70, 70, 74, 75, 78, 80, 81, 85, 86, 89, 91, 93, 94, 95,
+ 97, 68, 65, 64, 62, 61, 60, 59, 58, 61, 61, 64, 65, 67, 69,
+ 71, 73, 75, 78, 79, 83, 83, 87, 88, 91, 93, 95, 96, 97, 99,
+ 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72,
+ 72, 76, 76, 80, 81, 84, 86, 88, 90, 92, 95, 96, 98, 98, 100,
+ 70, 66, 66, 63, 63, 62, 61, 60, 60, 63, 64, 66, 67, 69, 71,
+ 73, 75, 77, 79, 81, 84, 85, 88, 89, 93, 93, 97, 98, 100, 100,
+ 102, 71, 67, 67, 64, 64, 62, 62, 60, 60, 64, 64, 67, 67, 70,
+ 70, 74, 74, 78, 78, 82, 82, 86, 86, 91, 91, 95, 95, 100, 100,
+ 101, 101, 104}},
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 32, 31, 32, 32, 32, 33, 33, 32, 32, 32, 32, 33, 33, 34,
+ 32, 32, 32, 32, 33, 34, 35, 35, 33, 33, 33, 33, 34, 35,
+ 36, 36, 38, 34, 34, 34, 33, 34, 35, 36, 37, 39, 39, 36,
+ 35, 35, 34, 35, 36, 37, 38, 42, 42, 48, 36, 35, 35, 34,
+ 35, 36, 38, 38, 42, 43, 48, 49, 39, 38, 38, 37, 38, 39,
+ 40, 40, 44, 45, 50, 51, 54, 41, 39, 39, 38, 39, 40, 40,
+ 41, 45, 46, 51, 52, 55, 56, 44, 42, 42, 41, 41, 42, 42,
+ 42, 46, 47, 54, 54, 58, 59, 63, 46, 44, 44, 42, 43, 44,
+ 44, 44, 48, 49, 55, 55, 59, 61, 65, 67, 48, 46, 46, 44,
+ 45, 45, 45, 46, 50, 51, 57, 57, 61, 63, 67, 69, 71, 52,
+ 50, 49, 48, 48, 48, 48, 48, 52, 53, 59, 59, 64, 65, 70,
+ 72, 74, 78, 54, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60,
+ 60, 65, 67, 71, 74, 76, 80, 82, 58, 56, 55, 53, 53, 53,
+ 53, 53, 57, 58, 63, 64, 68, 70, 75, 77, 80, 84, 86, 91,
+ 59, 56, 56, 54, 54, 54, 53, 53, 57, 58, 64, 64, 69, 70,
+ 75, 78, 80, 85, 87, 91, 92, 65, 62, 61, 59, 59, 59, 58,
+ 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98,
+ 105, 66, 63, 63, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74,
+ 76, 80, 83, 86, 91, 93, 98, 99, 106, 107, 71, 68, 67, 65,
+ 65, 64, 63, 63, 67, 68, 73, 73, 78, 80, 84, 87, 90, 95,
+ 97, 103, 103, 111, 112, 117, 74, 71, 70, 68, 67, 67, 66, 65,
+ 69, 70, 75, 75, 80, 82, 86, 89, 93, 97, 100, 105, 106, 114,
+ 115, 120, 123, 80, 76, 75, 72, 72, 71, 70, 69, 73, 74, 79,
+ 79, 84, 86, 90, 93, 96, 101, 104, 110, 110, 118, 119, 125, 128,
+ 134, 81, 77, 77, 74, 73, 73, 71, 71, 74, 75, 80, 80, 85,
+ 87, 91, 94, 98, 103, 105, 111, 112, 120, 121, 127, 130, 136, 137,
+ 83, 78, 78, 75, 74, 74, 72, 72, 75, 76, 81, 81, 86, 88,
+ 92, 95, 99, 104, 106, 112, 113, 121, 122, 128, 131, 137, 139, 140,
+ 86, 82, 81, 78, 77, 77, 75, 74, 78, 79, 84, 84, 89, 91,
+ 95, 98, 101, 106, 109, 115, 116, 124, 125, 131, 135, 140, 142, 144,
+ 147, 89, 84, 84, 80, 80, 79, 78, 77, 79, 81, 85, 86, 91,
+ 92, 97, 98, 104, 106, 112, 114, 119, 123, 128, 132, 135, 142, 145,
+ 148, 149, 153, 91, 86, 86, 82, 82, 81, 80, 79, 80, 84, 85,
+ 88, 91, 94, 97, 100, 104, 107, 112, 115, 120, 123, 129, 132, 138,
+ 140, 148, 150, 153, 154, 159, 93, 88, 88, 84, 84, 83, 83, 80,
+ 81, 86, 86, 91, 91, 96, 97, 103, 103, 110, 110, 118, 119, 126,
+ 126, 135, 136, 144, 144, 155, 155, 159, 159, 164},
+ {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 31, 32, 32, 33, 34, 33, 34,
+ 35, 35, 37, 39, 35, 37, 37, 38, 39, 41, 44, 36, 38, 39, 40, 41, 43,
+ 46, 47, 40, 41, 41, 42, 43, 44, 46, 47, 48, 41, 42, 42, 42, 43, 45,
+ 46, 47, 48, 48, 49, 47, 47, 46, 46, 47, 47, 48, 50, 50, 53, 49, 47,
+ 47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 48, 47, 47, 45, 46, 46, 46,
+ 46, 49, 49, 53, 53, 54, 48, 47, 46, 45, 45, 46, 46, 46, 49, 49, 53,
+ 53, 54, 55, 49, 47, 46, 45, 45, 45, 45, 45, 48, 49, 53, 54, 55, 56,
+ 58, 50, 47, 47, 45, 46, 46, 46, 46, 49, 49, 54, 54, 56, 57, 59, 60,
+ 50, 48, 48, 46, 46, 46, 46, 46, 49, 50, 54, 54, 56, 57, 60, 60, 61,
+ 52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63,
+ 65, 52, 50, 49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62,
+ 63, 65, 66, 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55, 55, 58, 59,
+ 62, 63, 65, 67, 68, 70, 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55,
+ 56, 58, 60, 62, 64, 65, 67, 68, 70, 71, 57, 54, 54, 52, 51, 51, 50,
+ 50, 52, 53, 56, 57, 60, 61, 63, 65, 67, 69, 70, 73, 73, 76, 57, 55,
+ 54, 52, 52, 51, 51, 50, 53, 53, 57, 57, 60, 61, 64, 65, 67, 70, 71,
+ 73, 74, 77, 77, 60, 57, 56, 54, 54, 53, 52, 52, 54, 55, 58, 59, 61,
+ 63, 65, 67, 68, 71, 72, 75, 75, 79, 79, 82, 61, 58, 57, 55, 55, 54,
+ 53, 53, 55, 56, 59, 59, 62, 63, 66, 68, 69, 72, 73, 76, 76, 80, 80,
+ 83, 84, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 63, 65, 67,
+ 69, 71, 73, 75, 78, 78, 82, 82, 85, 86, 89, 64, 61, 60, 58, 57, 57,
+ 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, 71, 74, 75, 78, 78, 82, 83,
+ 86, 87, 89, 90, 65, 61, 61, 58, 58, 57, 56, 55, 58, 58, 61, 62, 64,
+ 65, 68, 70, 71, 74, 75, 78, 79, 83, 83, 86, 88, 90, 91, 91, 66, 63,
+ 62, 60, 59, 58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76,
+ 79, 80, 84, 84, 87, 89, 91, 92, 93, 94, 67, 64, 63, 61, 60, 59, 58,
+ 57, 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87,
+ 89, 92, 93, 94, 94, 96, 68, 64, 64, 61, 61, 60, 59, 58, 59, 61, 62,
+ 64, 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90, 94, 94,
+ 96, 96, 98, 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62, 65, 65, 68,
+ 68, 71, 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96, 96, 97, 97,
+ 99}},
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 32, 31, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 34, 34, 35, 32, 32, 32, 32, 32, 34,
+ 34, 35, 35, 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 34,
+ 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 36, 35, 35, 34,
+ 34, 36, 36, 38, 38, 42, 42, 48, 36, 35, 35, 34, 34, 36,
+ 36, 38, 38, 42, 42, 48, 48, 39, 38, 38, 37, 37, 39, 39,
+ 40, 40, 45, 45, 50, 50, 54, 39, 38, 38, 37, 37, 39, 39,
+ 40, 40, 45, 45, 50, 50, 54, 54, 44, 42, 42, 41, 41, 42,
+ 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 44, 42, 42, 41,
+ 41, 42, 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 63, 48,
+ 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61,
+ 67, 67, 71, 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51,
+ 57, 57, 61, 61, 67, 67, 71, 71, 54, 51, 51, 49, 49, 50,
+ 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82,
+ 54, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65,
+ 65, 71, 71, 76, 76, 82, 82, 59, 56, 56, 54, 54, 54, 54,
+ 53, 53, 58, 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87,
+ 92, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, 64,
+ 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 65, 62, 62, 59,
+ 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85,
+ 85, 92, 92, 98, 98, 105, 65, 62, 62, 59, 59, 59, 59, 58,
+ 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98,
+ 98, 105, 105, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68,
+ 73, 73, 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111,
+ 117, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73,
+ 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111, 117, 117,
+ 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84,
+ 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134,
+ 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84,
+ 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134,
+ 134, 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76, 81, 81,
+ 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121, 128, 128,
+ 137, 137, 140, 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76,
+ 81, 81, 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121,
+ 128, 128, 137, 137, 140, 140, 87, 83, 83, 79, 79, 77, 77, 75,
+ 75, 80, 80, 84, 84, 90, 90, 96, 96, 102, 102, 109, 109, 116,
+ 116, 124, 124, 132, 132, 141, 141, 144, 144, 149},
+ {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 30, 31, 31, 32, 32, 33, 34,
+ 34, 35, 35, 39, 33, 34, 34, 35, 35, 39, 39, 36, 38, 38, 40, 40, 43,
+ 43, 47, 36, 38, 38, 40, 40, 43, 43, 47, 47, 41, 42, 42, 42, 42, 45,
+ 45, 47, 47, 48, 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 49, 47,
+ 47, 46, 46, 47, 47, 48, 48, 50, 50, 53, 49, 47, 47, 46, 46, 47, 47,
+ 48, 48, 50, 50, 53, 53, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49,
+ 53, 53, 54, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54,
+ 54, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58,
+ 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 58,
+ 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60,
+ 61, 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60,
+ 60, 61, 61, 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50, 54, 54, 57,
+ 57, 61, 61, 63, 63, 66, 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50,
+ 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 54, 52, 52, 49, 49, 49, 49,
+ 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 54, 52,
+ 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65,
+ 68, 68, 71, 71, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56,
+ 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, 57, 54, 54, 52, 52, 51,
+ 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73,
+ 76, 76, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61,
+ 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 60, 57, 57, 54, 54, 53,
+ 53, 52, 52, 55, 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75,
+ 79, 79, 82, 82, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60,
+ 63, 63, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 63, 60,
+ 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71,
+ 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 65, 61, 61, 58, 58, 57, 57,
+ 55, 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83,
+ 83, 86, 86, 90, 90, 91, 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58,
+ 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90,
+ 90, 91, 91, 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66,
+ 66, 69, 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93,
+ 95}},
+ {{32, 31, 31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 32, 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 33, 33, 34, 32, 32, 32, 32, 32, 33,
+ 34, 34, 35, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 34,
+ 34, 34, 33, 33, 34, 35, 35, 37, 37, 39, 34, 34, 34, 33,
+ 33, 34, 35, 35, 37, 37, 39, 39, 35, 35, 35, 34, 34, 35,
+ 36, 36, 38, 38, 42, 42, 46, 36, 35, 35, 34, 34, 35, 36,
+ 37, 38, 38, 42, 42, 47, 48, 38, 37, 37, 36, 36, 37, 38,
+ 38, 39, 40, 44, 44, 48, 50, 51, 39, 38, 38, 38, 37, 38,
+ 39, 39, 40, 41, 45, 45, 49, 50, 52, 54, 41, 40, 40, 39,
+ 38, 39, 40, 40, 41, 41, 46, 46, 50, 52, 54, 55, 57, 44,
+ 42, 42, 41, 41, 41, 42, 42, 42, 43, 47, 47, 52, 54, 56,
+ 58, 60, 63, 45, 43, 43, 42, 41, 42, 42, 43, 43, 43, 48,
+ 48, 53, 54, 57, 58, 60, 64, 65, 48, 46, 46, 45, 44, 45,
+ 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71,
+ 48, 46, 46, 45, 44, 45, 45, 45, 46, 46, 51, 51, 55, 57,
+ 59, 61, 63, 67, 68, 71, 71, 53, 51, 51, 49, 49, 49, 49,
+ 49, 49, 49, 54, 54, 58, 59, 62, 64, 67, 71, 72, 75, 75,
+ 81, 54, 52, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59,
+ 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 57, 55, 55, 53,
+ 52, 52, 52, 52, 52, 52, 57, 57, 61, 62, 65, 67, 70, 74,
+ 75, 79, 79, 85, 85, 89, 59, 56, 56, 54, 54, 54, 54, 54,
+ 53, 54, 58, 58, 62, 64, 67, 69, 71, 75, 76, 80, 80, 86,
+ 87, 90, 92, 62, 59, 59, 57, 56, 56, 56, 56, 55, 56, 60,
+ 60, 64, 66, 69, 71, 73, 77, 78, 83, 83, 89, 89, 93, 95,
+ 98, 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63, 67,
+ 68, 71, 73, 75, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105,
+ 67, 64, 64, 62, 61, 61, 60, 60, 59, 60, 64, 64, 68, 69,
+ 72, 74, 77, 81, 82, 87, 87, 93, 94, 98, 99, 103, 106, 108,
+ 71, 68, 68, 66, 65, 64, 64, 64, 63, 63, 68, 68, 72, 73,
+ 76, 78, 80, 84, 85, 90, 90, 97, 97, 102, 103, 107, 111, 113,
+ 117, 72, 69, 69, 66, 65, 65, 65, 64, 63, 64, 68, 68, 72,
+ 73, 76, 78, 81, 85, 86, 91, 91, 97, 98, 102, 104, 108, 111,
+ 113, 118, 119, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74,
+ 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110,
+ 114, 118, 120, 125, 126, 134, 80, 76, 76, 73, 72, 72, 71, 70,
+ 69, 70, 74, 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103,
+ 104, 108, 110, 114, 118, 120, 125, 126, 134, 134},
+ {32, 31, 31, 31, 31, 31, 30, 31, 31, 31, 30, 31, 31, 31, 32, 32, 32,
+ 33, 33, 33, 35, 33, 34, 34, 35, 35, 37, 39, 34, 35, 35, 36, 36, 38,
+ 40, 41, 36, 38, 38, 39, 40, 41, 43, 44, 47, 37, 38, 39, 40, 40, 42,
+ 43, 44, 47, 47, 41, 42, 42, 42, 42, 43, 45, 45, 47, 47, 48, 41, 42,
+ 42, 42, 42, 43, 45, 45, 47, 47, 48, 48, 47, 46, 46, 46, 45, 46, 47,
+ 47, 47, 48, 50, 50, 52, 49, 48, 47, 47, 46, 47, 47, 47, 48, 48, 50,
+ 50, 52, 53, 49, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53,
+ 53, 48, 47, 47, 46, 45, 46, 46, 46, 46, 47, 49, 49, 52, 53, 54, 54,
+ 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 52, 53, 54, 55, 55,
+ 49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53, 55, 55, 57,
+ 58, 49, 47, 47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56,
+ 57, 59, 59, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50, 50, 53, 54,
+ 55, 56, 58, 60, 60, 61, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50,
+ 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 52, 50, 49, 48, 47, 47, 47,
+ 47, 46, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 52, 50,
+ 50, 48, 47, 47, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61,
+ 63, 63, 66, 66, 54, 51, 51, 50, 49, 49, 49, 48, 48, 48, 51, 51, 54,
+ 55, 57, 58, 60, 62, 62, 65, 65, 67, 68, 69, 54, 52, 52, 50, 49, 49,
+ 49, 49, 48, 48, 52, 52, 55, 55, 57, 58, 60, 62, 63, 65, 65, 68, 68,
+ 70, 71, 56, 53, 53, 51, 51, 50, 50, 50, 49, 49, 52, 52, 55, 56, 58,
+ 59, 61, 63, 63, 66, 66, 69, 69, 71, 72, 73, 57, 54, 54, 52, 52, 51,
+ 51, 51, 50, 50, 53, 53, 56, 56, 58, 60, 61, 63, 64, 67, 67, 70, 70,
+ 72, 73, 75, 76, 58, 55, 55, 53, 52, 52, 52, 51, 50, 51, 54, 54, 56,
+ 57, 59, 60, 62, 64, 65, 67, 67, 71, 71, 73, 74, 75, 77, 78, 60, 57,
+ 57, 55, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66,
+ 68, 68, 72, 72, 74, 75, 77, 79, 80, 82, 60, 57, 57, 55, 54, 54, 54,
+ 53, 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75,
+ 76, 77, 79, 80, 82, 82, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57,
+ 57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83,
+ 85, 85, 89, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60,
+ 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89,
+ 89}},
+ {{32, 31, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 33, 31, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, 32,
+ 32, 33, 34, 34, 35, 35, 33, 33, 33, 33, 33, 33, 34, 35, 35,
+ 36, 36, 38, 34, 34, 34, 34, 33, 33, 35, 35, 36, 37, 37, 39,
+ 39, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 42,
+ 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48,
+ 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48,
+ 48, 38, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46,
+ 50, 50, 52, 39, 38, 38, 38, 37, 37, 39, 39, 39, 40, 40, 44,
+ 45, 47, 50, 50, 53, 54, 41, 40, 40, 39, 38, 38, 40, 40, 40,
+ 41, 41, 45, 46, 48, 52, 52, 54, 55, 57, 44, 42, 42, 42, 41,
+ 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63,
+ 44, 42, 42, 42, 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54,
+ 54, 57, 58, 60, 63, 63, 47, 46, 45, 45, 44, 44, 44, 45, 45,
+ 45, 45, 49, 50, 52, 56, 56, 59, 60, 62, 66, 66, 69, 48, 47,
+ 46, 45, 44, 44, 45, 45, 45, 46, 46, 50, 51, 53, 57, 57, 60,
+ 61, 63, 67, 67, 70, 71, 50, 49, 48, 47, 46, 46, 47, 47, 47,
+ 47, 47, 51, 52, 54, 58, 58, 61, 62, 65, 68, 68, 72, 73, 75,
+ 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60,
+ 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 54, 52, 51, 50, 49,
+ 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71,
+ 71, 75, 76, 78, 82, 82, 58, 56, 55, 54, 53, 53, 53, 53, 53,
+ 52, 52, 56, 57, 59, 63, 63, 67, 68, 70, 74, 74, 78, 79, 82,
+ 86, 86, 90, 59, 57, 56, 55, 54, 54, 54, 54, 54, 53, 53, 57,
+ 58, 60, 64, 64, 68, 69, 71, 75, 75, 79, 80, 83, 87, 87, 91,
+ 92, 61, 59, 58, 57, 56, 56, 56, 56, 55, 55, 55, 59, 60, 62,
+ 65, 65, 69, 70, 73, 77, 77, 81, 82, 85, 89, 89, 93, 94, 97,
+ 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68,
+ 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105,
+ 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68,
+ 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105,
+ 105, 70, 67, 67, 65, 64, 64, 63, 63, 63, 62, 62, 66, 67, 69,
+ 72, 72, 76, 77, 79, 83, 83, 88, 89, 92, 96, 96, 101, 102, 105,
+ 109, 109, 114},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 32, 30, 31,
+ 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 37, 33, 34, 34, 35, 35, 35,
+ 38, 39, 34, 36, 36, 36, 37, 37, 40, 40, 42, 36, 38, 38, 39, 40, 40,
+ 42, 43, 45, 47, 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 40, 41,
+ 41, 41, 42, 42, 44, 44, 45, 47, 47, 48, 41, 42, 42, 42, 42, 42, 44,
+ 45, 46, 47, 47, 48, 48, 44, 44, 44, 44, 44, 44, 45, 46, 46, 47, 47,
+ 49, 49, 50, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51,
+ 53, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53,
+ 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54,
+ 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54,
+ 54, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53,
+ 54, 55, 55, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 48, 49, 51,
+ 53, 53, 55, 55, 57, 58, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45,
+ 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 50, 48, 48, 47, 46, 46, 46,
+ 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 57, 59, 59, 61, 50, 49,
+ 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 58,
+ 60, 60, 61, 61, 51, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50,
+ 51, 54, 54, 56, 57, 58, 60, 60, 62, 62, 63, 52, 50, 50, 49, 47, 47,
+ 47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63,
+ 65, 66, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 49, 50, 52, 54,
+ 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66, 54, 52, 51, 50, 49, 49,
+ 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65,
+ 66, 68, 68, 70, 54, 52, 52, 51, 49, 49, 49, 49, 49, 48, 48, 51, 52,
+ 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 71, 55, 53,
+ 53, 52, 50, 50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60,
+ 63, 63, 65, 66, 67, 69, 69, 71, 72, 73, 57, 55, 54, 53, 52, 52, 51,
+ 51, 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68,
+ 70, 70, 73, 73, 74, 76, 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50,
+ 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73,
+ 74, 76, 76, 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51, 54, 55, 56,
+ 58, 58, 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75, 76, 78, 78,
+ 80}},
+ {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 32, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 32, 33, 33, 33, 33, 33, 33,
+ 34, 34, 35, 36, 36, 36, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37,
+ 37, 38, 39, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39,
+ 39, 35, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 39, 41, 41, 43,
+ 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48,
+ 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48,
+ 48, 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47,
+ 50, 50, 51, 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 42, 45,
+ 45, 47, 50, 50, 52, 54, 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40,
+ 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 42, 41, 41, 41, 40, 40, 40,
+ 41, 41, 41, 42, 42, 44, 47, 47, 49, 53, 53, 55, 56, 56, 60, 44, 43,
+ 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 47, 47, 50, 54, 54, 56,
+ 58, 58, 61, 63, 44, 43, 43, 42, 41, 41, 41, 42, 42, 42, 43, 43, 45,
+ 48, 48, 51, 54, 54, 56, 58, 58, 62, 64, 64, 47, 46, 45, 45, 44, 44,
+ 44, 44, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66,
+ 66, 69, 48, 47, 46, 46, 45, 44, 45, 45, 45, 45, 46, 46, 47, 51, 51,
+ 53, 57, 57, 59, 61, 61, 65, 67, 67, 70, 71, 49, 48, 47, 47, 46, 45,
+ 45, 46, 46, 46, 46, 46, 48, 51, 51, 54, 57, 57, 60, 62, 62, 66, 68,
+ 68, 71, 72, 73, 53, 51, 51, 51, 49, 49, 49, 49, 49, 49, 49, 49, 51,
+ 54, 54, 57, 59, 59, 62, 64, 64, 69, 71, 71, 74, 75, 77, 81, 54, 52,
+ 51, 51, 50, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63,
+ 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 55, 53, 53, 52, 51, 50, 50,
+ 51, 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73,
+ 76, 77, 78, 83, 83, 85, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53,
+ 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86,
+ 87, 88, 92, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58,
+ 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92,
+ 92},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 30, 31,
+ 31, 31, 31, 32, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35,
+ 35, 38, 33, 34, 34, 34, 35, 35, 36, 38, 39, 34, 35, 35, 36, 36, 36,
+ 37, 40, 40, 41, 36, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47, 36, 38,
+ 38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 38, 39, 40, 40, 41, 41, 41,
+ 43, 44, 45, 47, 47, 47, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47,
+ 47, 48, 48, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48,
+ 48, 45, 45, 45, 45, 44, 44, 44, 46, 46, 46, 47, 47, 48, 49, 49, 50,
+ 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53,
+ 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53,
+ 53, 49, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51,
+ 53, 53, 53, 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 49,
+ 49, 51, 53, 53, 54, 54, 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46,
+ 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 49, 47, 47, 47, 45, 45, 45,
+ 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 54, 55, 55, 57, 49, 47,
+ 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55,
+ 55, 55, 57, 58, 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47,
+ 49, 49, 51, 53, 53, 55, 56, 56, 58, 58, 59, 50, 49, 48, 48, 46, 46,
+ 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59,
+ 59, 61, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50,
+ 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 61, 51, 49, 48, 48, 47, 46,
+ 46, 47, 47, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60,
+ 60, 61, 62, 62, 52, 50, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 48,
+ 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 52, 50,
+ 50, 49, 48, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56,
+ 57, 57, 60, 61, 61, 63, 63, 64, 66, 66, 53, 51, 50, 50, 48, 48, 48,
+ 48, 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62,
+ 63, 64, 64, 67, 67, 68, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48,
+ 48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68,
+ 68, 69, 71, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52,
+ 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71,
+ 71}},
+ {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34,
+ 35, 35, 35, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36,
+ 36, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39,
+ 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39,
+ 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 38, 40, 40,
+ 41, 35, 35, 35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42,
+ 42, 43, 46, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38,
+ 40, 42, 42, 44, 47, 48, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36,
+ 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 38, 37, 37, 37, 36, 36, 36,
+ 36, 37, 38, 38, 39, 39, 39, 41, 44, 44, 45, 48, 50, 50, 51, 39, 39,
+ 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49,
+ 50, 50, 52, 54, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40,
+ 40, 42, 45, 45, 46, 49, 50, 50, 52, 54, 54, 41, 40, 40, 40, 39, 38,
+ 38, 39, 40, 40, 40, 41, 41, 41, 43, 46, 46, 47, 50, 52, 52, 54, 55,
+ 55, 57, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44,
+ 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 44, 43, 42, 42, 42, 41,
+ 41, 41, 42, 42, 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58,
+ 58, 60, 63, 63, 45, 44, 43, 43, 42, 41, 41, 42, 42, 42, 42, 43, 43,
+ 43, 45, 48, 48, 49, 53, 54, 54, 57, 58, 58, 60, 64, 64, 65, 47, 46,
+ 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55,
+ 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 48, 47, 46, 46, 45, 44, 44,
+ 45, 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61,
+ 63, 67, 67, 68, 70, 71, 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45,
+ 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68,
+ 70, 71, 71, 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48, 48, 48, 48,
+ 50, 53, 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70, 73, 74, 74,
+ 77},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31,
+ 31, 31, 31, 32, 30, 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 37, 33, 34, 34, 34, 35, 35,
+ 35, 36, 38, 39, 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 35, 36,
+ 37, 37, 37, 38, 38, 38, 41, 41, 41, 44, 36, 37, 38, 38, 39, 40, 40,
+ 40, 42, 43, 43, 46, 47, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43,
+ 46, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47,
+ 47, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48,
+ 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 48,
+ 43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47, 48, 49, 49,
+ 49, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 48, 50,
+ 50, 50, 52, 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 48, 48,
+ 49, 50, 50, 51, 52, 53, 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47,
+ 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 49, 48, 47, 47, 46, 46, 46,
+ 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 48, 47,
+ 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52,
+ 53, 53, 54, 54, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46,
+ 46, 48, 49, 49, 50, 52, 53, 53, 54, 54, 54, 49, 47, 47, 47, 46, 45,
+ 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 55,
+ 55, 55, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47,
+ 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 49, 47, 47, 47, 46, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55,
+ 55, 57, 58, 58, 49, 48, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 47, 49, 49, 50, 52, 53, 53, 55, 56, 56, 57, 59, 59, 59, 50, 49,
+ 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53,
+ 54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 50, 49, 48, 48, 47, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56,
+ 58, 60, 60, 60, 61, 61, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60,
+ 61, 61, 61, 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46,
+ 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61, 62, 63, 63,
+ 64}},
+ {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34,
+ 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 36, 36, 36,
+ 37, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37,
+ 37, 38, 39, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36,
+ 37, 37, 37, 38, 39, 39, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35,
+ 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 35, 34, 34, 34, 34, 34, 34,
+ 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 39, 41, 41, 41, 43, 36, 35,
+ 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42,
+ 42, 42, 45, 48, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36,
+ 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 36, 35, 35, 35, 35, 35,
+ 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48,
+ 48, 48, 37, 37, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39,
+ 39, 39, 41, 44, 44, 44, 46, 49, 49, 49, 51, 39, 39, 38, 38, 38, 38,
+ 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50,
+ 50, 50, 52, 54, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39,
+ 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 39, 39,
+ 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45,
+ 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 41, 41, 40, 40, 40, 39, 39,
+ 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52,
+ 52, 54, 56, 56, 56, 58, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42,
+ 42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58,
+ 58, 60, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42,
+ 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63,
+ 63},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 32, 30, 31, 31, 31, 31, 31,
+ 32, 32, 30, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 35, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 33, 34,
+ 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 33, 34, 34, 34, 34, 35, 35,
+ 35, 35, 37, 39, 39, 39, 35, 35, 36, 36, 36, 37, 37, 37, 37, 39, 41,
+ 41, 41, 43, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45,
+ 47, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47,
+ 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47,
+ 39, 39, 40, 40, 40, 41, 41, 41, 41, 42, 44, 44, 44, 45, 47, 47, 47,
+ 47, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47,
+ 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46,
+ 47, 47, 47, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45,
+ 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 45, 45, 45, 45, 45, 44, 44,
+ 44, 44, 45, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 50, 49, 48,
+ 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50,
+ 50, 50, 51, 53, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47,
+ 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, 49, 48, 47, 47, 47, 47,
+ 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53,
+ 53, 53, 49, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47,
+ 47, 47, 48, 50, 50, 50, 51, 53, 53, 53, 53, 48, 48, 47, 47, 47, 46,
+ 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53,
+ 53, 53, 53, 54, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46,
+ 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 48, 48,
+ 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49,
+ 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 49, 48, 47, 47, 47, 46, 45,
+ 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53,
+ 53, 54, 55, 55, 55, 56, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55,
+ 55, 57, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58,
+ 58}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 32, 32, 31, 31,
+ 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34,
+ 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
+ 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 34, 34, 34, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 32, 32, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36,
+ 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35,
+ 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 34, 34, 34, 34, 34, 34,
+ 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39,
+ 39, 39, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35,
+ 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40,
+ 41, 41, 41, 42, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 36,
+ 36, 36, 36, 37, 37, 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 36, 35,
+ 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38,
+ 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 36, 35, 35, 35, 35, 35, 35,
+ 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42,
+ 42, 42, 45, 47, 48, 48, 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34,
+ 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47,
+ 48, 48, 48, 37, 37, 36, 36, 36, 36, 36, 35, 35, 35, 35, 36, 37, 37,
+ 37, 37, 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48, 49, 49, 49,
+ 50},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31,
+ 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 32, 30, 30, 31, 31, 31, 31,
+ 31, 31, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 33, 33, 33, 34, 34, 34, 34,
+ 34, 34, 34, 34, 36, 37, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35,
+ 37, 38, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39,
+ 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39,
+ 34, 35, 36, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, 40, 40, 40, 42,
+ 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42, 42, 42, 44,
+ 46, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43,
+ 45, 46, 47, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43,
+ 43, 43, 45, 46, 47, 47, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40,
+ 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40,
+ 41, 41, 41, 41, 42, 43, 44, 44, 44, 45, 47, 47, 47, 47, 47, 40, 41,
+ 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47,
+ 47, 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44,
+ 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 41, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48,
+ 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45,
+ 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 45, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49,
+ 49, 49, 49, 50, 47, 47, 46, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46,
+ 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 49, 48,
+ 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48,
+ 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 49, 48, 48, 47, 47, 47, 47,
+ 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50,
+ 50, 50, 51, 52, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46,
+ 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52,
+ 53, 53, 53, 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47,
+ 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53,
+ 53}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+ 31, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+ 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35,
+ 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35,
+ 35, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 32, 32,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 34, 35, 35, 35, 36, 36, 36, 36, 36, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36,
+ 36, 36, 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33,
+ 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37,
+ 38, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33,
+ 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39,
+ 39},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 30, 30,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 30, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37,
+ 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38,
+ 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37,
+ 38, 39, 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35,
+ 36, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35,
+ 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 34, 35, 35, 35, 35, 35, 35,
+ 36, 36, 36, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 35, 36,
+ 36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 41, 41, 41,
+ 41, 41, 42, 44, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39,
+ 39, 40, 41, 42, 43, 43, 43, 43, 44, 45, 46, 36, 37, 37, 38, 38, 38,
+ 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46,
+ 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40,
+ 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 36, 37, 37, 38, 38, 38,
+ 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46,
+ 47, 47, 47, 47, 37, 37, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40,
+ 40, 41, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 38, 39,
+ 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44,
+ 44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 40, 40, 40, 41, 41, 41, 41,
+ 41, 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47,
+ 47, 47, 47, 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47,
+ 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+ 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48,
+ 48}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 31, 31, 31, 31, 31, 32, 32, 31, 31, 31, 31, 31, 32,
+ 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 34, 34, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 33, 33, 33, 33, 33, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36,
+ 37, 37, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 33, 33, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37,
+ 37, 38, 39, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 33, 33,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 33, 33, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37,
+ 38, 39, 39, 39, 39, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+ 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39,
+ 39, 39, 39, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36,
+ 36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40,
+ 40}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30,
+ 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32}}};
diff --git a/src/reconstruction.cc b/src/reconstruction.cc
new file mode 100644
index 0000000..1aa1233
--- /dev/null
+++ b/src/reconstruction.cc
@@ -0,0 +1,190 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/reconstruction.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace {
+
+// Maps TransformType to dsp::Transform1D for the row transforms.
+constexpr dsp::Transform1D kRowTransform[kNumTransformTypes] = {
+ dsp::k1DTransformDct, dsp::k1DTransformAdst,
+ dsp::k1DTransformDct, dsp::k1DTransformAdst,
+ dsp::k1DTransformAdst, dsp::k1DTransformDct,
+ dsp::k1DTransformAdst, dsp::k1DTransformAdst,
+ dsp::k1DTransformAdst, dsp::k1DTransformIdentity,
+ dsp::k1DTransformIdentity, dsp::k1DTransformDct,
+ dsp::k1DTransformIdentity, dsp::k1DTransformAdst,
+ dsp::k1DTransformIdentity, dsp::k1DTransformAdst};
+
+// Maps TransformType to dsp::Transform1D for the column transforms.
+constexpr dsp::Transform1D kColumnTransform[kNumTransformTypes] = {
+ dsp::k1DTransformDct, dsp::k1DTransformDct,
+ dsp::k1DTransformAdst, dsp::k1DTransformAdst,
+ dsp::k1DTransformDct, dsp::k1DTransformAdst,
+ dsp::k1DTransformAdst, dsp::k1DTransformAdst,
+ dsp::k1DTransformAdst, dsp::k1DTransformIdentity,
+ dsp::k1DTransformDct, dsp::k1DTransformIdentity,
+ dsp::k1DTransformAdst, dsp::k1DTransformIdentity,
+ dsp::k1DTransformAdst, dsp::k1DTransformIdentity};
+
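+// Maps the log2 of a transform dimension (2 for 4-point, 3 for 8-point, and
+// so on) to the corresponding dsp::TransformSize1D value.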
+dsp::TransformSize1D Get1DTransformSize(int size_log2) {
+ return static_cast<dsp::TransformSize1D>(size_log2 - 2);
+}
+
+// Returns the number of rows to process based on |non_zero_coeff_count|. The
+// transform loops process either 4 rows or a multiple of 8 rows. The
+// TransformClass derived from |tx_type| determines the scan order and hence
+// the thresholds on |non_zero_coeff_count| used below.
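+// For example, an 8x16 block in the 2D transform class with 20 non-zero
+// coefficients only needs its first 8 rows transformed.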
+template <int tx_width>
+int GetNumRows(TransformType tx_type, int tx_height, int non_zero_coeff_count) {
+ const TransformClass tx_class = GetTransformClass(tx_type);
+
+ switch (tx_class) {
+ case kTransformClass2D:
+ if (tx_width == 4) {
+ if (non_zero_coeff_count <= 13) return 4;
+ if (non_zero_coeff_count <= 29) return 8;
+ }
+ if (tx_width == 8) {
+ if (non_zero_coeff_count <= 10) return 4;
+ if ((non_zero_coeff_count <= 14) & (tx_height > 8)) return 4;
+ if (non_zero_coeff_count <= 43) return 8;
+ if ((non_zero_coeff_count <= 107) & (tx_height > 16)) return 16;
+ if ((non_zero_coeff_count <= 171) & (tx_height > 16)) return 24;
+ }
+ if (tx_width == 16) {
+ if (non_zero_coeff_count <= 10) return 4;
+ if ((non_zero_coeff_count <= 14) & (tx_height > 16)) return 4;
+ if (non_zero_coeff_count <= 36) return 8;
+ if ((non_zero_coeff_count <= 44) & (tx_height > 16)) return 8;
+ if ((non_zero_coeff_count <= 151) & (tx_height > 16)) return 16;
+ if ((non_zero_coeff_count <= 279) & (tx_height > 16)) return 24;
+ }
+ if (tx_width == 32) {
+ if (non_zero_coeff_count <= 10) return 4;
+ if (non_zero_coeff_count <= 36) return 8;
+ if ((non_zero_coeff_count <= 136) & (tx_height > 16)) return 16;
+ if ((non_zero_coeff_count <= 300) & (tx_height > 16)) return 24;
+ }
+ break;
+
+ case kTransformClassHorizontal:
+ if (non_zero_coeff_count <= 4) return 4;
+ if (non_zero_coeff_count <= 8) return 8;
+ if ((non_zero_coeff_count <= 16) & (tx_height > 16)) return 16;
+ if ((non_zero_coeff_count <= 24) & (tx_height > 16)) return 24;
+ break;
+
+ default:
+ assert(tx_class == kTransformClassVertical);
+ if (tx_width == 4) {
+ if (non_zero_coeff_count <= 16) return 4;
+ if (non_zero_coeff_count <= 32) return 8;
+ }
+ if (tx_width == 8) {
+ if (non_zero_coeff_count <= 32) return 4;
+ if (non_zero_coeff_count <= 64) return 8;
+ // There's no need to check tx_height since the maximum values for
+ // smaller sizes are: 8x8: 63, 8x16: 127.
+ if (non_zero_coeff_count <= 128) return 16;
+ if (non_zero_coeff_count <= 192) return 24;
+ }
+ if (tx_width == 16) {
+ if (non_zero_coeff_count <= 64) return 4;
+ if (non_zero_coeff_count <= 128) return 8;
+ // There's no need to check tx_height since the maximum values for
+ // smaller sizes are: 16x8: 127, 16x16: 255.
+ if (non_zero_coeff_count <= 256) return 16;
+ if (non_zero_coeff_count <= 384) return 24;
+ }
+ if (tx_width == 32) {
+ if (non_zero_coeff_count <= 128) return 4;
+ if (non_zero_coeff_count <= 256) return 8;
+        // There's no need to check tx_height since the maximum values for
+        // smaller sizes are: 32x8: 255, 32x16: 511.
+        if (non_zero_coeff_count <= 512) return 16;
+        if (non_zero_coeff_count <= 768) return 24;
+ }
+ break;
+ }
+ return (tx_width >= 16) ? std::min(tx_height, 32) : tx_height;
+}
+
+} // namespace
+
+template <typename Residual, typename Pixel>
+void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+ TransformSize tx_size, bool lossless, Residual* const buffer,
+ int start_x, int start_y, Array2DView<Pixel>* frame,
+ int non_zero_coeff_count) {
+ static_assert(sizeof(Residual) == 2 || sizeof(Residual) == 4, "");
+ const int tx_width_log2 = kTransformWidthLog2[tx_size];
+ const int tx_height_log2 = kTransformHeightLog2[tx_size];
+
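+  // If only one coefficient is non-zero, it is the first one in scan order
+  // (the DC position for every scan table), so a single row is enough.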
+ int tx_height = (non_zero_coeff_count == 1) ? 1 : kTransformHeight[tx_size];
+ if (tx_height > 4) {
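+    // Dispatch on the transform width. The last entry covers 64-wide
+    // transforms, which reuse the 32-wide thresholds since AV1 zeroes out all
+    // coefficients beyond the first 32 columns.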
+ static constexpr int (*kGetNumRows[])(TransformType tx_type, int tx_height,
+ int non_zero_coeff_count) = {
+ &GetNumRows<4>, &GetNumRows<8>, &GetNumRows<16>, &GetNumRows<32>,
+ &GetNumRows<32>};
+ tx_height = kGetNumRows[tx_width_log2 - 2](tx_type, tx_height,
+ non_zero_coeff_count);
+ }
+ assert(tx_height <= 32);
+
+ // Row transform.
+ const dsp::TransformSize1D row_transform_size =
+ Get1DTransformSize(tx_width_log2);
+ const dsp::Transform1D row_transform =
+ lossless ? dsp::k1DTransformWht : kRowTransform[tx_type];
+ const dsp::InverseTransformAddFunc row_transform_func =
+ dsp.inverse_transforms[row_transform][row_transform_size][dsp::kRow];
+ assert(row_transform_func != nullptr);
+
+ row_transform_func(tx_type, tx_size, tx_height, buffer, start_x, start_y,
+ frame);
+
+ // Column transform.
+ const dsp::TransformSize1D column_transform_size =
+ Get1DTransformSize(tx_height_log2);
+ const dsp::Transform1D column_transform =
+ lossless ? dsp::k1DTransformWht : kColumnTransform[tx_type];
+ const dsp::InverseTransformAddFunc column_transform_func =
+ dsp.inverse_transforms[column_transform][column_transform_size]
+ [dsp::kColumn];
+ assert(column_transform_func != nullptr);
+
+ column_transform_func(tx_type, tx_size, tx_height, buffer, start_x, start_y,
+ frame);
+}
+
+template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+ TransformSize tx_size, bool lossless, int16_t* buffer,
+ int start_x, int start_y, Array2DView<uint8_t>* frame,
+ int non_zero_coeff_count);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+ TransformSize tx_size, bool lossless, int32_t* buffer,
+ int start_x, int start_y,
+ Array2DView<uint16_t>* frame,
+ int non_zero_coeff_count);
+#endif
+
+} // namespace libgav1
diff --git a/src/reconstruction.h b/src/reconstruction.h
new file mode 100644
index 0000000..6d5b115
--- /dev/null
+++ b/src/reconstruction.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_RECONSTRUCTION_H_
+#define LIBGAV1_SRC_RECONSTRUCTION_H_
+
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// Steps 2 and 3 of section 7.12.3 (contains the implementation of section
+// 7.13.3).
+// Applies the inverse transforms and adds the residual to the frame for the
+// transform block of size |tx_size| starting at position
+// (|start_x|, |start_y|).
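+//
+// A hedged call sketch for the 8-bit path (|residual| and |frame| are
+// hypothetical int16_t* and Array2DView<uint8_t>* variables):
+//   Reconstruct(dsp, kTransformTypeDctDct, kTransformSize8x8,
+//               /*lossless=*/false, residual, /*start_x=*/0, /*start_y=*/0,
+//               frame, /*non_zero_coeff_count=*/5);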
+template <typename Residual, typename Pixel>
+void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+ TransformSize tx_size, bool lossless, Residual* buffer,
+ int start_x, int start_y, Array2DView<Pixel>* frame,
+ int non_zero_coeff_count);
+
+extern template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+ TransformSize tx_size, bool lossless,
+ int16_t* buffer, int start_x, int start_y,
+ Array2DView<uint8_t>* frame,
+ int non_zero_coeff_count);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+extern template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+ TransformSize tx_size, bool lossless,
+ int32_t* buffer, int start_x, int start_y,
+ Array2DView<uint16_t>* frame,
+ int non_zero_coeff_count);
+#endif
+
+} // namespace libgav1
+#endif // LIBGAV1_SRC_RECONSTRUCTION_H_
diff --git a/src/residual_buffer_pool.cc b/src/residual_buffer_pool.cc
new file mode 100644
index 0000000..e166392
--- /dev/null
+++ b/src/residual_buffer_pool.cc
@@ -0,0 +1,142 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/residual_buffer_pool.h"
+
+#include <mutex> // NOLINT (unapproved c++11 header)
+#include <utility>
+
+namespace libgav1 {
+namespace {
+
+// The maximum queue size is derived using the following formula:
+// ((sb_size * sb_size) / 16) + (2 * (((sb_size / x) * (sb_size / y)) / 16)).
+// Where:
+// sb_size is the superblock size (64 or 128).
+// 16 is 4*4 which is kMinTransformWidth * kMinTransformHeight.
+// x is subsampling_x + 1.
+// y is subsampling_y + 1.
+// The first component is for the Y plane and the second component is for the U
+// and V planes.
+// For example, for 128x128 superblocks with 422 subsampling the size is:
+// ((128 * 128) / 16) + (2 * (((128 / 2) * (128 / 1)) / 16)) = 2048.
+//
+// First dimension: use_128x128_superblock.
+// Second dimension: subsampling_x.
+// Third dimension: subsampling_y.
+constexpr int kMaxQueueSize[2][2][2] = {
+ // 64x64 superblocks.
+ {
+ {768, 512},
+ {512, 384},
+ },
+ // 128x128 superblocks.
+ {
+ {3072, 2048},
+ {2048, 1536},
+ },
+};
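+
+// A hedged, illustrative check of the formula above (not part of the original
+// source): recompute two entries and compare them against the table. For the
+// valid subsampling values 0 and 1, sb_size >> subsampling equals
+// sb_size / (subsampling + 1).
+constexpr int MaxQueueSize(int sb_size, int subsampling_x, int subsampling_y) {
+  return (sb_size * sb_size) / 16 +
+         2 * (((sb_size >> subsampling_x) * (sb_size >> subsampling_y)) / 16);
+}
+static_assert(MaxQueueSize(64, 1, 1) == 384, "64x64 superblocks, 4:2:0");
+static_assert(MaxQueueSize(128, 1, 0) == 2048, "128x128 superblocks, 4:2:2");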
+
+} // namespace
+
+ResidualBufferStack::~ResidualBufferStack() {
+ while (top_ != nullptr) {
+ ResidualBuffer* top = top_;
+ top_ = top_->next_;
+ delete top;
+ }
+}
+
+void ResidualBufferStack::Push(std::unique_ptr<ResidualBuffer> buffer) {
+ buffer->next_ = top_;
+ top_ = buffer.release();
+ ++num_buffers_;
+}
+
+std::unique_ptr<ResidualBuffer> ResidualBufferStack::Pop() {
+ std::unique_ptr<ResidualBuffer> top;
+ if (top_ != nullptr) {
+ top.reset(top_);
+ top_ = top_->next_;
+ top->next_ = nullptr;
+ --num_buffers_;
+ }
+ return top;
+}
+
+void ResidualBufferStack::Swap(ResidualBufferStack* other) {
+ std::swap(top_, other->top_);
+ std::swap(num_buffers_, other->num_buffers_);
+}
+
+ResidualBufferPool::ResidualBufferPool(bool use_128x128_superblock,
+ int subsampling_x, int subsampling_y,
+ size_t residual_size)
+ : buffer_size_(GetResidualBufferSize(
+ use_128x128_superblock ? 128 : 64, use_128x128_superblock ? 128 : 64,
+ subsampling_x, subsampling_y, residual_size)),
+ queue_size_(kMaxQueueSize[static_cast<int>(use_128x128_superblock)]
+ [subsampling_x][subsampling_y]) {}
+
+void ResidualBufferPool::Reset(bool use_128x128_superblock, int subsampling_x,
+ int subsampling_y, size_t residual_size) {
+ const size_t buffer_size = GetResidualBufferSize(
+ use_128x128_superblock ? 128 : 64, use_128x128_superblock ? 128 : 64,
+ subsampling_x, subsampling_y, residual_size);
+ const int queue_size = kMaxQueueSize[static_cast<int>(use_128x128_superblock)]
+ [subsampling_x][subsampling_y];
+ if (buffer_size == buffer_size_ && queue_size == queue_size_) {
+ // The existing buffers (if any) are still valid, so don't do anything.
+ return;
+ }
+ buffer_size_ = buffer_size;
+ queue_size_ = queue_size;
+ // The existing buffers (if any) are no longer valid since the buffer size or
+ // the queue size has changed. Clear the stack.
+ ResidualBufferStack buffers;
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ // Move the buffers in the stack to the local variable |buffers| and clear
+ // the stack.
+ buffers.Swap(&buffers_);
+ // Release mutex_ before freeing the buffers.
+ }
+ // As the local variable |buffers| goes out of scope, its destructor frees
+ // the buffers that were in the stack.
+}
+
+std::unique_ptr<ResidualBuffer> ResidualBufferPool::Get() {
+ std::unique_ptr<ResidualBuffer> buffer = nullptr;
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ buffer = buffers_.Pop();
+ }
+ if (buffer == nullptr) {
+ buffer = ResidualBuffer::Create(buffer_size_, queue_size_);
+ }
+ return buffer;
+}
+
+void ResidualBufferPool::Release(std::unique_ptr<ResidualBuffer> buffer) {
+ buffer->transform_parameters()->Reset();
+ std::lock_guard<std::mutex> lock(mutex_);
+ buffers_.Push(std::move(buffer));
+}
+
+size_t ResidualBufferPool::Size() const {
+ std::lock_guard<std::mutex> lock(mutex_);
+ return buffers_.Size();
+}
+
+} // namespace libgav1
diff --git a/src/residual_buffer_pool.h b/src/residual_buffer_pool.h
new file mode 100644
index 0000000..f7bc75d
--- /dev/null
+++ b/src/residual_buffer_pool.h
@@ -0,0 +1,203 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_RESIDUAL_BUFFER_POOL_H_
+#define LIBGAV1_SRC_RESIDUAL_BUFFER_POOL_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex> // NOLINT (unapproved c++11 header)
+#include <new>
+
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// A simple fixed-size queue implementation to hold the transform parameters
+// when |Tile::split_parse_and_decode_| is true. No boundary checks are needed
+// since data is always pushed into the queue before it is accessed.
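+//
+// A hedged usage sketch (hypothetical values; the parse step pushes and the
+// decode step pops in the same order):
+//   TransformParameterQueue queue;
+//   if (queue.Init(/*max_size=*/16)) {
+//     queue.Push(/*non_zero_coeff_count=*/7, kTransformTypeDctDct);
+//     assert(queue.NonZeroCoeffCount() == 7);
+//     assert(queue.Type() == kTransformTypeDctDct);
+//     queue.Pop();  // The queue is empty again.
+//   }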
+class TransformParameterQueue {
+ public:
+ TransformParameterQueue() = default;
+
+ // Move only.
+ TransformParameterQueue(TransformParameterQueue&& other) = default;
+ TransformParameterQueue& operator=(TransformParameterQueue&& other) = default;
+
+ LIBGAV1_MUST_USE_RESULT bool Init(int max_size) {
+ max_size_ = max_size;
+    // No initialization is necessary since the data will always be written to
+    // before being read.
+ non_zero_coeff_count_.reset(new (std::nothrow) int16_t[max_size_]);
+ tx_type_.reset(new (std::nothrow) TransformType[max_size_]);
+ return non_zero_coeff_count_ != nullptr && tx_type_ != nullptr;
+ }
+
+ // Adds the |non_zero_coeff_count| and the |tx_type| to the back of the queue.
+ void Push(int non_zero_coeff_count, TransformType tx_type) {
+ assert(back_ < max_size_);
+ non_zero_coeff_count_[back_] = non_zero_coeff_count;
+ tx_type_[back_++] = tx_type;
+ }
+
+ // Returns the non_zero_coeff_count at the front of the queue.
+ int16_t NonZeroCoeffCount() const {
+ assert(front_ != back_);
+ return non_zero_coeff_count_[front_];
+ }
+
+ // Returns the tx_type at the front of the queue.
+ TransformType Type() const {
+ assert(front_ != back_);
+ return tx_type_[front_];
+ }
+
+ // Removes the |non_zero_coeff_count| and the |tx_type| from the front of the
+ // queue.
+ void Pop() {
+ assert(front_ != back_);
+ ++front_;
+ }
+
+ // Clears the queue.
+ void Reset() {
+ front_ = 0;
+ back_ = 0;
+ }
+
+ // Used only in the tests. Returns the number of elements in the queue.
+ int Size() const { return back_ - front_; }
+
+ private:
+ int max_size_ = 0;
+ std::unique_ptr<int16_t[]> non_zero_coeff_count_;
+ std::unique_ptr<TransformType[]> tx_type_;
+ int front_ = 0;
+ int back_ = 0;
+};
+
+// This class is used for parsing and decoding a superblock. Members of this
+// class are populated in the "parse" step and consumed in the "decode" step.
+class ResidualBuffer : public Allocable {
+ public:
+ static std::unique_ptr<ResidualBuffer> Create(size_t buffer_size,
+ int queue_size) {
+ std::unique_ptr<ResidualBuffer> buffer(new (std::nothrow) ResidualBuffer);
+ if (buffer != nullptr) {
+ buffer->buffer_ = MakeAlignedUniquePtr<uint8_t>(32, buffer_size);
+ if (buffer->buffer_ == nullptr ||
+ !buffer->transform_parameters_.Init(queue_size)) {
+ buffer = nullptr;
+ }
+ }
+ return buffer;
+ }
+
+ // Move only.
+ ResidualBuffer(ResidualBuffer&& other) = default;
+ ResidualBuffer& operator=(ResidualBuffer&& other) = default;
+
+ // Buffer used to store the residual values.
+ uint8_t* buffer() { return buffer_.get(); }
+ // Queue used to store the transform parameters.
+ TransformParameterQueue* transform_parameters() {
+ return &transform_parameters_;
+ }
+
+ private:
+ friend class ResidualBufferStack;
+
+ ResidualBuffer() = default;
+
+ AlignedUniquePtr<uint8_t> buffer_;
+ TransformParameterQueue transform_parameters_;
+ // Used by ResidualBufferStack to form a chain of ResidualBuffers.
+ ResidualBuffer* next_ = nullptr;
+};
+
+// A LIFO stack of ResidualBuffers. Owns the buffers in the stack.
+class ResidualBufferStack {
+ public:
+ ResidualBufferStack() = default;
+
+ // Not copyable or movable
+ ResidualBufferStack(const ResidualBufferStack&) = delete;
+ ResidualBufferStack& operator=(const ResidualBufferStack&) = delete;
+
+ ~ResidualBufferStack();
+
+ // Pushes |buffer| to the top of the stack.
+ void Push(std::unique_ptr<ResidualBuffer> buffer);
+
+ // If the stack is non-empty, returns the buffer at the top of the stack and
+ // removes it from the stack. If the stack is empty, returns nullptr.
+ std::unique_ptr<ResidualBuffer> Pop();
+
+ // Swaps the contents of this stack and |other|.
+ void Swap(ResidualBufferStack* other);
+
+ // Returns the number of buffers in the stack.
+ size_t Size() const { return num_buffers_; }
+
+ private:
+ // A singly-linked list of ResidualBuffers, chained together using the next_
+ // field of ResidualBuffer.
+ ResidualBuffer* top_ = nullptr;
+ size_t num_buffers_ = 0;
+};
+
+// Utility class used to manage the residual buffers (and the transform
+// parameters) used for multi-threaded decoding. This class uses a stack to
+// store the buffers for better cache locality, since buffers used more
+// recently are more likely to be in the cache. All functions in this class are
+// thread-safe.
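+//
+// A hedged usage sketch (hypothetical 4:2:0 stream with 64x64 superblocks and
+// int16_t residuals):
+//   ResidualBufferPool pool(/*use_128x128_superblock=*/false,
+//                           /*subsampling_x=*/1, /*subsampling_y=*/1,
+//                           /*residual_size=*/sizeof(int16_t));
+//   std::unique_ptr<ResidualBuffer> buffer = pool.Get();
+//   // Parse residuals into buffer->buffer() and push the transform
+//   // parameters into buffer->transform_parameters(), then decode.
+//   pool.Release(std::move(buffer));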
+class ResidualBufferPool : public Allocable {
+ public:
+ ResidualBufferPool(bool use_128x128_superblock, int subsampling_x,
+ int subsampling_y, size_t residual_size);
+
+ // Recomputes |buffer_size_| and invalidates the existing buffers if
+ // necessary.
+ void Reset(bool use_128x128_superblock, int subsampling_x, int subsampling_y,
+ size_t residual_size);
+  // Gets a residual buffer. The buffer is guaranteed to be large enough to
+  // store the residual values for one superblock whose parameters match those
+  // passed to the constructor or to the last call to Reset(). If there are
+  // free buffers in the stack, one of them is returned; otherwise a new buffer
+  // is allocated.
+ std::unique_ptr<ResidualBuffer> Get();
+ // Returns the |buffer| back to the pool (by appending it to the stack).
+ // Subsequent calls to Get() may re-use this buffer.
+ void Release(std::unique_ptr<ResidualBuffer> buffer);
+
+ // Used only in the tests. Returns the number of buffers in the stack.
+ size_t Size() const;
+
+ private:
+ mutable std::mutex mutex_;
+ ResidualBufferStack buffers_ LIBGAV1_GUARDED_BY(mutex_);
+ size_t buffer_size_;
+ int queue_size_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_RESIDUAL_BUFFER_POOL_H_
diff --git a/src/scan_tables.inc b/src/scan_tables.inc
new file mode 100644
index 0000000..f7c9231
--- /dev/null
+++ b/src/scan_tables.inc
@@ -0,0 +1,440 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file contains all the scan order tables.
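+//
+// Each table maps a position in scan (coding) order to a position in raster
+// order within the transform block. For example, kDefaultScan4x4[2] == 4, so
+// the third coefficient in scan order lands at row 1, column 0 of the 4x4
+// block.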
+
+constexpr uint16_t kDefaultScan4x4[16] = {0, 1, 4, 8, 5, 2, 3, 6,
+ 9, 12, 13, 10, 7, 11, 14, 15};
+
+constexpr uint16_t kColumnScan4x4[16] = {0, 4, 8, 12, 1, 5, 9, 13,
+ 2, 6, 10, 14, 3, 7, 11, 15};
+
+constexpr uint16_t kRowScan4x4[16] = {0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15};
+
+constexpr uint16_t kDefaultScan4x8[32] = {
+ 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
+ 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 27, 30, 31};
+
+constexpr uint16_t kColumnScan4x8[32] = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29,
+ 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31};
+
+constexpr uint16_t kRowScan4x8[32] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+
+constexpr uint16_t kDefaultScan8x4[32] = {
+ 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 4, 26, 19,
+ 12, 5, 27, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31};
+
+constexpr uint16_t kColumnScan8x4[32] = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31};
+
+constexpr uint16_t kRowScan8x4[32] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+
+constexpr uint16_t kDefaultScan8x8[64] = {
+ 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5,
+ 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28,
+ 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+ 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63};
+
+constexpr uint16_t kColumnScan8x8[64] = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57,
+ 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59,
+ 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61,
+ 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63};
+
+constexpr uint16_t kRowScan8x8[64] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+
+constexpr uint16_t kDefaultScan8x16[128] = {
+ 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32,
+ 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14,
+ 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23,
+ 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80,
+ 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89,
+ 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98,
+ 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107,
+ 114, 121, 87, 94, 101, 108, 115, 122, 95, 102, 109, 116, 123, 103, 110,
+ 117, 124, 111, 118, 125, 119, 126, 127};
+
+constexpr uint16_t kColumnScan8x16[128] = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
+ 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
+ 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
+ 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123,
+ 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124,
+ 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125,
+ 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126,
+ 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127};
+
+constexpr uint16_t kRowScan8x16[128] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127};
+
+constexpr uint16_t kDefaultScan16x8[128] = {
+ 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, 34, 19, 4, 80,
+ 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, 82, 67,
+ 52, 37, 22, 7, 113, 98, 83, 68, 53, 38, 23, 8, 114, 99, 84, 69,
+ 54, 39, 24, 9, 115, 100, 85, 70, 55, 40, 25, 10, 116, 101, 86, 71,
+ 56, 41, 26, 11, 117, 102, 87, 72, 57, 42, 27, 12, 118, 103, 88, 73,
+ 58, 43, 28, 13, 119, 104, 89, 74, 59, 44, 29, 14, 120, 105, 90, 75,
+ 60, 45, 30, 15, 121, 106, 91, 76, 61, 46, 31, 122, 107, 92, 77, 62,
+ 47, 123, 108, 93, 78, 63, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127};
+
+constexpr uint16_t kColumnScan16x8[128] = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113,
+ 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115,
+ 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117,
+ 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119,
+ 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121,
+ 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123,
+ 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125,
+ 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127};
+
+constexpr uint16_t kRowScan16x8[128] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127};
+
+constexpr uint16_t kDefaultScan16x16[256] = {
+ 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4,
+ 5, 20, 35, 50, 65, 80, 96, 81, 66, 51, 36, 21, 6, 7, 22,
+ 37, 52, 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, 8,
+ 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100,
+ 85, 70, 55, 40, 25, 10, 11, 26, 41, 56, 71, 86, 101, 116, 131,
+ 146, 161, 176, 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 42, 27,
+ 12, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208,
+ 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, 59, 44, 29, 14,
+ 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225,
+ 240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91, 76, 61, 46,
+ 31, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242,
+ 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93, 78, 63, 79, 94,
+ 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185,
+ 170, 155, 140, 125, 110, 95, 111, 126, 141, 156, 171, 186, 201, 216, 231,
+ 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188, 203,
+ 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235,
+ 250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254,
+ 255};
+
+constexpr uint16_t kColumnScan16x16[256] = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+ 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
+ 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
+ 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
+ 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
+ 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
+ 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
+ 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
+ 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
+ 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
+ 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
+ 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
+ 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
+ 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255};
+
+constexpr uint16_t kRowScan16x16[256] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255};
+
+constexpr uint16_t kDefaultScan16x32[512] = {
+ 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64,
+ 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22,
+ 37, 52, 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128,
+ 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70,
+ 85, 100, 115, 130, 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131,
+ 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177,
+ 192, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208,
+ 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224,
+ 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225,
+ 240, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226,
+ 241, 256, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227,
+ 242, 257, 272, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228,
+ 243, 258, 273, 288, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229,
+ 244, 259, 274, 289, 304, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230,
+ 245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216, 231,
+ 246, 261, 276, 291, 306, 321, 336, 127, 142, 157, 172, 187, 202, 217, 232,
+ 247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233,
+ 248, 263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234,
+ 249, 264, 279, 294, 309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235,
+ 250, 265, 280, 295, 310, 325, 340, 355, 370, 385, 400, 191, 206, 221, 236,
+ 251, 266, 281, 296, 311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237,
+ 252, 267, 282, 297, 312, 327, 342, 357, 372, 387, 402, 417, 432, 223, 238,
+ 253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239,
+ 254, 269, 284, 299, 314, 329, 344, 359, 374, 389, 404, 419, 434, 449, 464,
+ 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465,
+ 480, 271, 286, 301, 316, 331, 346, 361, 376, 391, 406, 421, 436, 451, 466,
+ 481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467,
+ 482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423, 438, 453, 468, 483,
+ 498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335,
+ 350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366, 381, 396,
+ 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472,
+ 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444,
+ 459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476,
+ 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495,
+ 510, 511};
+
+constexpr uint16_t kDefaultScan32x16[512] = {
+ 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4,
+ 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193,
+ 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8,
+ 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 320, 289, 258, 227, 196,
+ 165, 134, 103, 72, 41, 10, 352, 321, 290, 259, 228, 197, 166, 135, 104,
+ 73, 42, 11, 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43,
+ 12, 416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13,
+ 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, 45, 14,
+ 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, 77, 46,
+ 15, 481, 450, 419, 388, 357, 326, 295, 264, 233, 202, 171, 140, 109, 78,
+ 47, 16, 482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110,
+ 79, 48, 17, 483, 452, 421, 390, 359, 328, 297, 266, 235, 204, 173, 142,
+ 111, 80, 49, 18, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174,
+ 143, 112, 81, 50, 19, 485, 454, 423, 392, 361, 330, 299, 268, 237, 206,
+ 175, 144, 113, 82, 51, 20, 486, 455, 424, 393, 362, 331, 300, 269, 238,
+ 207, 176, 145, 114, 83, 52, 21, 487, 456, 425, 394, 363, 332, 301, 270,
+ 239, 208, 177, 146, 115, 84, 53, 22, 488, 457, 426, 395, 364, 333, 302,
+ 271, 240, 209, 178, 147, 116, 85, 54, 23, 489, 458, 427, 396, 365, 334,
+ 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, 490, 459, 428, 397, 366,
+ 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 491, 460, 429, 398,
+ 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, 492, 461, 430,
+ 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 493, 462,
+ 431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, 90, 59, 28, 494,
+ 463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29,
+ 495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, 61,
+ 30, 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93,
+ 62, 31, 497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125,
+ 94, 63, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126,
+ 95, 499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 500,
+ 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 501, 470, 439, 408,
+ 377, 346, 315, 284, 253, 222, 191, 502, 471, 440, 409, 378, 347, 316, 285,
+ 254, 223, 503, 472, 441, 410, 379, 348, 317, 286, 255, 504, 473, 442, 411,
+ 380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413,
+ 382, 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510,
+ 479, 511};
+
+constexpr uint16_t kDefaultScan32x32[1024] = {
+ 0, 1, 32, 64, 33, 2, 3, 34, 65, 96, 128, 97, 66,
+ 35, 4, 5, 36, 67, 98, 129, 160, 192, 161, 130, 99, 68,
+ 37, 6, 7, 38, 69, 100, 131, 162, 193, 224, 256, 225, 194,
+ 163, 132, 101, 70, 39, 8, 9, 40, 71, 102, 133, 164, 195,
+ 226, 257, 288, 320, 289, 258, 227, 196, 165, 134, 103, 72, 41,
+ 10, 11, 42, 73, 104, 135, 166, 197, 228, 259, 290, 321, 352,
+ 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, 12,
+ 13, 44, 75, 106, 137, 168, 199, 230, 261, 292, 323, 354, 385,
+ 416, 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107,
+ 76, 45, 14, 15, 46, 77, 108, 139, 170, 201, 232, 263, 294,
+ 325, 356, 387, 418, 449, 480, 512, 481, 450, 419, 388, 357, 326,
+ 295, 264, 233, 202, 171, 140, 109, 78, 47, 16, 17, 48, 79,
+ 110, 141, 172, 203, 234, 265, 296, 327, 358, 389, 420, 451, 482,
+ 513, 544, 576, 545, 514, 483, 452, 421, 390, 359, 328, 297, 266,
+ 235, 204, 173, 142, 111, 80, 49, 18, 19, 50, 81, 112, 143,
+ 174, 205, 236, 267, 298, 329, 360, 391, 422, 453, 484, 515, 546,
+ 577, 608, 640, 609, 578, 547, 516, 485, 454, 423, 392, 361, 330,
+ 299, 268, 237, 206, 175, 144, 113, 82, 51, 20, 21, 52, 83,
+ 114, 145, 176, 207, 238, 269, 300, 331, 362, 393, 424, 455, 486,
+ 517, 548, 579, 610, 641, 672, 704, 673, 642, 611, 580, 549, 518,
+ 487, 456, 425, 394, 363, 332, 301, 270, 239, 208, 177, 146, 115,
+ 84, 53, 22, 23, 54, 85, 116, 147, 178, 209, 240, 271, 302,
+ 333, 364, 395, 426, 457, 488, 519, 550, 581, 612, 643, 674, 705,
+ 736, 768, 737, 706, 675, 644, 613, 582, 551, 520, 489, 458, 427,
+ 396, 365, 334, 303, 272, 241, 210, 179, 148, 117, 86, 55, 24,
+ 25, 56, 87, 118, 149, 180, 211, 242, 273, 304, 335, 366, 397,
+ 428, 459, 490, 521, 552, 583, 614, 645, 676, 707, 738, 769, 800,
+ 832, 801, 770, 739, 708, 677, 646, 615, 584, 553, 522, 491, 460,
+ 429, 398, 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57,
+ 26, 27, 58, 89, 120, 151, 182, 213, 244, 275, 306, 337, 368,
+ 399, 430, 461, 492, 523, 554, 585, 616, 647, 678, 709, 740, 771,
+ 802, 833, 864, 896, 865, 834, 803, 772, 741, 710, 679, 648, 617,
+ 586, 555, 524, 493, 462, 431, 400, 369, 338, 307, 276, 245, 214,
+ 183, 152, 121, 90, 59, 28, 29, 60, 91, 122, 153, 184, 215,
+ 246, 277, 308, 339, 370, 401, 432, 463, 494, 525, 556, 587, 618,
+ 649, 680, 711, 742, 773, 804, 835, 866, 897, 928, 960, 929, 898,
+ 867, 836, 805, 774, 743, 712, 681, 650, 619, 588, 557, 526, 495,
+ 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92,
+ 61, 30, 31, 62, 93, 124, 155, 186, 217, 248, 279, 310, 341,
+ 372, 403, 434, 465, 496, 527, 558, 589, 620, 651, 682, 713, 744,
+ 775, 806, 837, 868, 899, 930, 961, 992, 993, 962, 931, 900, 869,
+ 838, 807, 776, 745, 714, 683, 652, 621, 590, 559, 528, 497, 466,
+ 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, 94, 63,
+ 95, 126, 157, 188, 219, 250, 281, 312, 343, 374, 405, 436, 467,
+ 498, 529, 560, 591, 622, 653, 684, 715, 746, 777, 808, 839, 870,
+ 901, 932, 963, 994, 995, 964, 933, 902, 871, 840, 809, 778, 747,
+ 716, 685, 654, 623, 592, 561, 530, 499, 468, 437, 406, 375, 344,
+ 313, 282, 251, 220, 189, 158, 127, 159, 190, 221, 252, 283, 314,
+ 345, 376, 407, 438, 469, 500, 531, 562, 593, 624, 655, 686, 717,
+ 748, 779, 810, 841, 872, 903, 934, 965, 996, 997, 966, 935, 904,
+ 873, 842, 811, 780, 749, 718, 687, 656, 625, 594, 563, 532, 501,
+ 470, 439, 408, 377, 346, 315, 284, 253, 222, 191, 223, 254, 285,
+ 316, 347, 378, 409, 440, 471, 502, 533, 564, 595, 626, 657, 688,
+ 719, 750, 781, 812, 843, 874, 905, 936, 967, 998, 999, 968, 937,
+ 906, 875, 844, 813, 782, 751, 720, 689, 658, 627, 596, 565, 534,
+ 503, 472, 441, 410, 379, 348, 317, 286, 255, 287, 318, 349, 380,
+ 411, 442, 473, 504, 535, 566, 597, 628, 659, 690, 721, 752, 783,
+ 814, 845, 876, 907, 938, 969, 1000, 1001, 970, 939, 908, 877, 846,
+ 815, 784, 753, 722, 691, 660, 629, 598, 567, 536, 505, 474, 443,
+ 412, 381, 350, 319, 351, 382, 413, 444, 475, 506, 537, 568, 599,
+ 630, 661, 692, 723, 754, 785, 816, 847, 878, 909, 940, 971, 1002,
+ 1003, 972, 941, 910, 879, 848, 817, 786, 755, 724, 693, 662, 631,
+ 600, 569, 538, 507, 476, 445, 414, 383, 415, 446, 477, 508, 539,
+ 570, 601, 632, 663, 694, 725, 756, 787, 818, 849, 880, 911, 942,
+ 973, 1004, 1005, 974, 943, 912, 881, 850, 819, 788, 757, 726, 695,
+ 664, 633, 602, 571, 540, 509, 478, 447, 479, 510, 541, 572, 603,
+ 634, 665, 696, 727, 758, 789, 820, 851, 882, 913, 944, 975, 1006,
+ 1007, 976, 945, 914, 883, 852, 821, 790, 759, 728, 697, 666, 635,
+ 604, 573, 542, 511, 543, 574, 605, 636, 667, 698, 729, 760, 791,
+ 822, 853, 884, 915, 946, 977, 1008, 1009, 978, 947, 916, 885, 854,
+ 823, 792, 761, 730, 699, 668, 637, 606, 575, 607, 638, 669, 700,
+ 731, 762, 793, 824, 855, 886, 917, 948, 979, 1010, 1011, 980, 949,
+ 918, 887, 856, 825, 794, 763, 732, 701, 670, 639, 671, 702, 733,
+ 764, 795, 826, 857, 888, 919, 950, 981, 1012, 1013, 982, 951, 920,
+ 889, 858, 827, 796, 765, 734, 703, 735, 766, 797, 828, 859, 890,
+ 921, 952, 983, 1014, 1015, 984, 953, 922, 891, 860, 829, 798, 767,
+ 799, 830, 861, 892, 923, 954, 985, 1016, 1017, 986, 955, 924, 893,
+ 862, 831, 863, 894, 925, 956, 987, 1018, 1019, 988, 957, 926, 895,
+ 927, 958, 989, 1020, 1021, 990, 959, 991, 1022, 1023};
+
+constexpr uint16_t kDefaultScan4x16[64] = {
+ 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
+ 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30,
+ 33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46,
+ 49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63};
+
+constexpr uint16_t kColumnScan4x16[64] = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
+ 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
+ 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
+ 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63};
+
+constexpr uint16_t kRowScan4x16[64] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+
+constexpr uint16_t kDefaultScan16x4[64] = {
+ 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 4, 50, 35,
+ 20, 5, 51, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 8, 54, 39,
+ 24, 9, 55, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 12, 58, 43,
+ 28, 13, 59, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63};
+
+constexpr uint16_t kColumnScan16x4[64] = {
+ 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51,
+ 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55,
+ 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59,
+ 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63};
+
+constexpr uint16_t kRowScan16x4[64] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+
+constexpr uint16_t kDefaultScan8x32[256] = {
+ 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32,
+ 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14,
+ 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23,
+ 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80,
+ 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89,
+ 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98,
+ 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107,
+ 114, 121, 128, 87, 94, 101, 108, 115, 122, 129, 136, 95, 102, 109, 116,
+ 123, 130, 137, 144, 103, 110, 117, 124, 131, 138, 145, 152, 111, 118, 125,
+ 132, 139, 146, 153, 160, 119, 126, 133, 140, 147, 154, 161, 168, 127, 134,
+ 141, 148, 155, 162, 169, 176, 135, 142, 149, 156, 163, 170, 177, 184, 143,
+ 150, 157, 164, 171, 178, 185, 192, 151, 158, 165, 172, 179, 186, 193, 200,
+ 159, 166, 173, 180, 187, 194, 201, 208, 167, 174, 181, 188, 195, 202, 209,
+ 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218,
+ 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220, 227,
+ 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243,
+ 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254,
+ 255};
+
+constexpr uint16_t kDefaultScan32x8[256] = {
+ 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4,
+ 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193,
+ 162, 131, 100, 69, 38, 7, 225, 194, 163, 132, 101, 70, 39, 8, 226,
+ 195, 164, 133, 102, 71, 40, 9, 227, 196, 165, 134, 103, 72, 41, 10,
+ 228, 197, 166, 135, 104, 73, 42, 11, 229, 198, 167, 136, 105, 74, 43,
+ 12, 230, 199, 168, 137, 106, 75, 44, 13, 231, 200, 169, 138, 107, 76,
+ 45, 14, 232, 201, 170, 139, 108, 77, 46, 15, 233, 202, 171, 140, 109,
+ 78, 47, 16, 234, 203, 172, 141, 110, 79, 48, 17, 235, 204, 173, 142,
+ 111, 80, 49, 18, 236, 205, 174, 143, 112, 81, 50, 19, 237, 206, 175,
+ 144, 113, 82, 51, 20, 238, 207, 176, 145, 114, 83, 52, 21, 239, 208,
+ 177, 146, 115, 84, 53, 22, 240, 209, 178, 147, 116, 85, 54, 23, 241,
+ 210, 179, 148, 117, 86, 55, 24, 242, 211, 180, 149, 118, 87, 56, 25,
+ 243, 212, 181, 150, 119, 88, 57, 26, 244, 213, 182, 151, 120, 89, 58,
+ 27, 245, 214, 183, 152, 121, 90, 59, 28, 246, 215, 184, 153, 122, 91,
+ 60, 29, 247, 216, 185, 154, 123, 92, 61, 30, 248, 217, 186, 155, 124,
+ 93, 62, 31, 249, 218, 187, 156, 125, 94, 63, 250, 219, 188, 157, 126,
+ 95, 251, 220, 189, 158, 127, 252, 221, 190, 159, 253, 222, 191, 254, 223,
+ 255};
+
+// 5.11.41 (implemented as a simple lookup by transform class and transform
+// size).
+const uint16_t* kScan[3][kNumTransformSizes] = {
+ // kTransformClass2D
+ {kDefaultScan4x4, kDefaultScan4x8, kDefaultScan4x16, kDefaultScan8x4,
+ kDefaultScan8x8, kDefaultScan8x16, kDefaultScan8x32, kDefaultScan16x4,
+ kDefaultScan16x8, kDefaultScan16x16, kDefaultScan16x32, kDefaultScan16x32,
+ kDefaultScan32x8, kDefaultScan32x16, kDefaultScan32x32, kDefaultScan32x32,
+ kDefaultScan32x16, kDefaultScan32x32, kDefaultScan32x32},
+ // kTransformClassHorizontal
+ {kColumnScan4x4, kColumnScan4x8, kColumnScan4x16, kColumnScan8x4,
+ kColumnScan8x8, kColumnScan8x16, kColumnScan16x4, kColumnScan16x4,
+ kColumnScan16x8, kColumnScan16x16, kColumnScan16x4, kDefaultScan16x32,
+ kColumnScan16x4, kColumnScan16x4, kColumnScan16x4, kDefaultScan32x32,
+ kDefaultScan32x16, kDefaultScan32x32, kDefaultScan32x32},
+ // kTransformClassVertical
+ {kRowScan4x4, kRowScan4x8, kRowScan4x16, kRowScan8x4, kRowScan8x8,
+ kRowScan8x16, kRowScan16x4, kRowScan16x4, kRowScan16x8, kRowScan16x16,
+ kRowScan16x4, kDefaultScan16x32, kRowScan16x4, kRowScan16x4, kRowScan16x4,
+ kDefaultScan32x32, kDefaultScan32x16, kDefaultScan32x32,
+ kDefaultScan32x32}};
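+
+// Usage sketch, for illustration only (not part of the upstream source): the
+// scan order for a block is obtained with a plain table lookup, e.g.
+//
+//   const uint16_t* const scan = kScan[tx_class][tx_size];
+//   const uint16_t position = scan[i];  // position of the i-th coefficient
+//                                       // in scan order.
+//
+// where |tx_class| selects the first index (kTransformClass2D,
+// kTransformClassHorizontal or kTransformClassVertical) and |tx_size| is one
+// of the kNumTransformSizes transform sizes. Both variable names are
+// assumptions about the caller, not identifiers defined here.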
diff --git a/src/status_code.cc b/src/status_code.cc
new file mode 100644
index 0000000..34def08
--- /dev/null
+++ b/src/status_code.cc
@@ -0,0 +1,57 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/status_code.h"
+
+extern "C" {
+
+const char* Libgav1GetErrorString(Libgav1StatusCode status) {
+ switch (status) {
+ case kLibgav1StatusOk:
+ return "Success.";
+ case kLibgav1StatusUnknownError:
+ return "Unknown error.";
+ case kLibgav1StatusInvalidArgument:
+ return "Invalid function argument.";
+ case kLibgav1StatusOutOfMemory:
+ return "Memory allocation failure.";
+ case kLibgav1StatusResourceExhausted:
+ return "Ran out of a resource (other than memory).";
+ case kLibgav1StatusNotInitialized:
+ return "The object is not initialized.";
+ case kLibgav1StatusAlready:
+ return "An operation that can only be performed once has already been "
+ "performed.";
+ case kLibgav1StatusUnimplemented:
+ return "Not implemented.";
+ case kLibgav1StatusInternalError:
+ return "Internal error in libgav1.";
+ case kLibgav1StatusBitstreamError:
+ return "The bitstream is not encoded correctly or violates a bitstream "
+ "conformance requirement.";
+ case kLibgav1StatusTryAgain:
+ return "The operation is not allowed at the moment. Try again later.";
+ case kLibgav1StatusNothingToDequeue:
+ return "There are no enqueued frames, so there is nothing to dequeue. "
+ "Try enqueuing a frame before trying to dequeue again.";
+ // This switch statement does not have a default case. This way the compiler
+ // will warn if we neglect to update this function after adding a new value
+ // to the Libgav1StatusCode enum type.
+ case kLibgav1StatusReservedForFutureExpansionUseDefaultInSwitchInstead_:
+ break;
+ }
+ return "Unrecognized status code.";
+}
+
+} // extern "C"
diff --git a/src/symbol_decoder_context.cc b/src/symbol_decoder_context.cc
new file mode 100644
index 0000000..26a281e
--- /dev/null
+++ b/src/symbol_decoder_context.cc
@@ -0,0 +1,322 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/symbol_decoder_context.h"
+
+#include <cassert>
+#include <cstring>
+#include <type_traits>
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants into the anonymous namespace.
+#include "src/symbol_decoder_context_cdfs.inc"
+
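+// Maps the frame's base quantizer index to one of the
+// kCoefficientQuantizerContexts (four) buckets used below to select the
+// default coefficient CDF tables.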
+uint8_t GetQuantizerContext(int base_quantizer_index) {
+ if (base_quantizer_index <= 20) return 0;
+ if (base_quantizer_index <= 60) return 1;
+ if (base_quantizer_index <= 120) return 2;
+ return 3;
+}
+
+// Reset*Counters() are helper functions to reset the CDF arrays where the
+// counters are not in the last element of the innermost dimension.
+
+void ResetPartitionCounters(SymbolDecoderContext* const context) {
+ int block_size_log2 = k4x4WidthLog2[kBlock8x8];
+ for (auto& d1 : context->partition_cdf) {
+ const int cdf_size =
+ SymbolDecoderContext::PartitionCdfSize(block_size_log2++);
+ for (auto& d2 : d1) {
+ d2[cdf_size] = 0;
+ }
+ }
+}
+
+void ResetPaletteColorIndexCounters(SymbolDecoderContext* const context) {
+ for (auto& d1 : context->palette_color_index_cdf) {
+ int cdf_size = kMinPaletteSize;
+ for (auto& d2 : d1) {
+ for (auto& d3 : d2) {
+ d3[cdf_size] = 0;
+ }
+ ++cdf_size;
+ }
+ }
+}
+
+void ResetTxTypeCounters(SymbolDecoderContext* const context) {
+ int set_index = kTransformSetIntra1;
+ for (auto& d1 : context->intra_tx_type_cdf) {
+ const int cdf_size = kNumTransformTypesInSet[set_index++];
+ for (auto& d2 : d1) {
+ for (auto& d3 : d2) {
+ d3[cdf_size] = 0;
+ }
+ }
+ }
+ for (auto& d1 : context->inter_tx_type_cdf) {
+ const int cdf_size = kNumTransformTypesInSet[set_index++];
+ for (auto& d2 : d1) {
+ d2[cdf_size] = 0;
+ }
+ }
+}
+
+void ResetTxDepthCounters(SymbolDecoderContext* const context) {
+ int delta = 1;
+ for (auto& d1 : context->tx_depth_cdf) {
+ const int cdf_size = kMaxTxDepthSymbolCount - delta;
+ delta = 0;
+ for (auto& d2 : d1) {
+ d2[cdf_size] = 0;
+ }
+ }
+}
+
+void ResetUVModeCounters(SymbolDecoderContext* const context) {
+ int cdf_size = kIntraPredictionModesUV - 1;
+ for (auto& d1 : context->uv_mode_cdf) {
+ for (auto& d2 : d1) {
+ d2[cdf_size] = 0;
+ }
+ ++cdf_size;
+ }
+}
+
+} // namespace
+
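+// Copies a default CDF table into the corresponding context member after a
+// compile-time check that the two arrays have the same size.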
+#define CDF_COPY(source, destination) \
+ static_assert(sizeof(source) == sizeof(destination), ""); \
+ memcpy(destination, source, sizeof(source))
+
+void SymbolDecoderContext::Initialize(int base_quantizer_index) {
+ CDF_COPY(kDefaultPartitionCdf, partition_cdf);
+ CDF_COPY(kDefaultSkipCdf, skip_cdf);
+ CDF_COPY(kDefaultSkipModeCdf, skip_mode_cdf);
+ CDF_COPY(kDefaultSegmentIdCdf, segment_id_cdf);
+ CDF_COPY(kDefaultUsePredictedSegmentIdCdf, use_predicted_segment_id_cdf);
+ CDF_COPY(kDefaultDeltaQCdf, delta_q_cdf);
+ CDF_COPY(kDefaultDeltaQCdf, delta_lf_cdf);
+ for (auto& delta_lf_multi_cdf_entry : delta_lf_multi_cdf) {
+ CDF_COPY(kDefaultDeltaQCdf, delta_lf_multi_cdf_entry);
+ }
+ CDF_COPY(kDefaultIntraBlockCopyCdf, intra_block_copy_cdf);
+ CDF_COPY(kDefaultIntraFrameYModeCdf, intra_frame_y_mode_cdf);
+ CDF_COPY(kDefaultYModeCdf, y_mode_cdf);
+ CDF_COPY(kDefaultAngleDeltaCdf, angle_delta_cdf);
+ CDF_COPY(kDefaultUVModeCdf, uv_mode_cdf);
+ CDF_COPY(kDefaultCflAlphaSignsCdf, cfl_alpha_signs_cdf);
+ CDF_COPY(kDefaultCflAlphaCdf, cfl_alpha_cdf);
+ CDF_COPY(kDefaultUseFilterIntraCdf, use_filter_intra_cdf);
+ CDF_COPY(kDefaultFilterIntraModeCdf, filter_intra_mode_cdf);
+ CDF_COPY(kDefaultTxDepthCdf, tx_depth_cdf);
+ CDF_COPY(kDefaultTxSplitCdf, tx_split_cdf);
+ CDF_COPY(kDefaultInterTxTypeCdf, inter_tx_type_cdf);
+ CDF_COPY(kDefaultIntraTxTypeCdf, intra_tx_type_cdf);
+ CDF_COPY(kDefaultRestorationTypeCdf, restoration_type_cdf);
+ CDF_COPY(kDefaultUseWienerCdf, use_wiener_cdf);
+ CDF_COPY(kDefaultUseSgrProjCdf, use_sgrproj_cdf);
+ CDF_COPY(kDefaultHasPaletteYCdf, has_palette_y_cdf);
+ CDF_COPY(kDefaultPaletteYSizeCdf, palette_y_size_cdf);
+ CDF_COPY(kDefaultHasPaletteUVCdf, has_palette_uv_cdf);
+ CDF_COPY(kDefaultPaletteUVSizeCdf, palette_uv_size_cdf);
+ CDF_COPY(kDefaultPaletteColorIndexCdf, palette_color_index_cdf);
+ CDF_COPY(kDefaultIsInterCdf, is_inter_cdf);
+ CDF_COPY(kDefaultUseCompoundReferenceCdf, use_compound_reference_cdf);
+ CDF_COPY(kDefaultCompoundReferenceTypeCdf, compound_reference_type_cdf);
+ CDF_COPY(kDefaultCompoundReferenceCdf, compound_reference_cdf);
+ CDF_COPY(kDefaultCompoundBackwardReferenceCdf,
+ compound_backward_reference_cdf);
+ CDF_COPY(kDefaultSingleReferenceCdf, single_reference_cdf);
+ CDF_COPY(kDefaultCompoundPredictionModeCdf, compound_prediction_mode_cdf);
+ CDF_COPY(kDefaultNewMvCdf, new_mv_cdf);
+ CDF_COPY(kDefaultZeroMvCdf, zero_mv_cdf);
+ CDF_COPY(kDefaultReferenceMvCdf, reference_mv_cdf);
+ CDF_COPY(kDefaultRefMvIndexCdf, ref_mv_index_cdf);
+ CDF_COPY(kDefaultIsInterIntraCdf, is_inter_intra_cdf);
+ CDF_COPY(kDefaultInterIntraModeCdf, inter_intra_mode_cdf);
+ CDF_COPY(kDefaultIsWedgeInterIntraCdf, is_wedge_inter_intra_cdf);
+ CDF_COPY(kDefaultWedgeIndexCdf, wedge_index_cdf);
+ CDF_COPY(kDefaultUseObmcCdf, use_obmc_cdf);
+ CDF_COPY(kDefaultMotionModeCdf, motion_mode_cdf);
+ CDF_COPY(kDefaultIsExplicitCompoundTypeCdf, is_explicit_compound_type_cdf);
+ CDF_COPY(kDefaultIsCompoundTypeAverageCdf, is_compound_type_average_cdf);
+ CDF_COPY(kDefaultCompoundTypeCdf, compound_type_cdf);
+ CDF_COPY(kDefaultInterpolationFilterCdf, interpolation_filter_cdf);
+ for (int i = 0; i < kMvContexts; ++i) {
+ CDF_COPY(kDefaultMvJointCdf, mv_joint_cdf[i]);
+ for (int j = 0; j < kNumMvComponents; ++j) {
+ CDF_COPY(kDefaultMvSignCdf, mv_sign_cdf[i][j]);
+ CDF_COPY(kDefaultMvClassCdf, mv_class_cdf[i][j]);
+ CDF_COPY(kDefaultMvClass0BitCdf, mv_class0_bit_cdf[i][j]);
+ CDF_COPY(kDefaultMvClass0FractionCdf, mv_class0_fraction_cdf[i][j]);
+ CDF_COPY(kDefaultMvClass0HighPrecisionCdf,
+ mv_class0_high_precision_cdf[i][j]);
+ CDF_COPY(kDefaultMvBitCdf, mv_bit_cdf[i][j]);
+ CDF_COPY(kDefaultMvFractionCdf, mv_fraction_cdf[i][j]);
+ CDF_COPY(kDefaultMvHighPrecisionCdf, mv_high_precision_cdf[i][j]);
+ }
+ }
+ const int quantizer_context = GetQuantizerContext(base_quantizer_index);
+ CDF_COPY(kDefaultAllZeroCdf[quantizer_context], all_zero_cdf);
+ CDF_COPY(kDefaultEobPt16Cdf[quantizer_context], eob_pt_16_cdf);
+ CDF_COPY(kDefaultEobPt32Cdf[quantizer_context], eob_pt_32_cdf);
+ CDF_COPY(kDefaultEobPt64Cdf[quantizer_context], eob_pt_64_cdf);
+ CDF_COPY(kDefaultEobPt128Cdf[quantizer_context], eob_pt_128_cdf);
+ CDF_COPY(kDefaultEobPt256Cdf[quantizer_context], eob_pt_256_cdf);
+ CDF_COPY(kDefaultEobPt512Cdf[quantizer_context], eob_pt_512_cdf);
+ CDF_COPY(kDefaultEobPt1024Cdf[quantizer_context], eob_pt_1024_cdf);
+ CDF_COPY(kDefaultEobExtraCdf[quantizer_context], eob_extra_cdf);
+ CDF_COPY(kDefaultCoeffBaseEobCdf[quantizer_context], coeff_base_eob_cdf);
+ CDF_COPY(kDefaultCoeffBaseCdf[quantizer_context], coeff_base_cdf);
+ CDF_COPY(kDefaultCoeffBaseRangeCdf[quantizer_context], coeff_base_range_cdf);
+ CDF_COPY(kDefaultDcSignCdf[quantizer_context], dc_sign_cdf);
+}
+
+void SymbolDecoderContext::ResetIntraFrameYModeCdf() {
+ CDF_COPY(kDefaultIntraFrameYModeCdf, intra_frame_y_mode_cdf);
+}
+
+#undef CDF_COPY
+
+// These macros set the last element in the innermost dimension of the array
+// to zero.
+#define RESET_COUNTER_1D(array) \
+ do { \
+ (array)[std::extent<decltype(array), 0>::value - 1] = 0; \
+ } while (false)
+
+#define RESET_COUNTER_2D(array) \
+ do { \
+ for (auto& d1 : (array)) { \
+ d1[std::extent<decltype(array), 1>::value - 1] = 0; \
+ } \
+ } while (false)
+
+#define RESET_COUNTER_3D(array) \
+ do { \
+ for (auto& d1 : (array)) { \
+ for (auto& d2 : d1) { \
+ d2[std::extent<decltype(array), 2>::value - 1] = 0; \
+ } \
+ } \
+ } while (false)
+
+#define RESET_COUNTER_4D(array) \
+ do { \
+ for (auto& d1 : (array)) { \
+ for (auto& d2 : d1) { \
+ for (auto& d3 : d2) { \
+ d3[std::extent<decltype(array), 3>::value - 1] = 0; \
+ } \
+ } \
+ } \
+ } while (false)
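+
+// For example (illustration only), RESET_COUNTER_2D(skip_cdf) zeroes
+// skip_cdf[i][kBooleanFieldCdfSize - 1] for every skip context i.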
+
+void SymbolDecoderContext::ResetCounters() {
+ ResetPartitionCounters(this);
+ RESET_COUNTER_2D(segment_id_cdf);
+ RESET_COUNTER_2D(use_predicted_segment_id_cdf);
+ RESET_COUNTER_2D(skip_cdf);
+ RESET_COUNTER_2D(skip_mode_cdf);
+ RESET_COUNTER_1D(delta_q_cdf);
+ RESET_COUNTER_1D(delta_lf_cdf);
+ RESET_COUNTER_2D(delta_lf_multi_cdf);
+ RESET_COUNTER_1D(intra_block_copy_cdf);
+ RESET_COUNTER_3D(intra_frame_y_mode_cdf);
+ RESET_COUNTER_2D(y_mode_cdf);
+ RESET_COUNTER_2D(angle_delta_cdf);
+ ResetUVModeCounters(this);
+ RESET_COUNTER_1D(cfl_alpha_signs_cdf);
+ RESET_COUNTER_2D(cfl_alpha_cdf);
+ RESET_COUNTER_2D(use_filter_intra_cdf);
+ RESET_COUNTER_1D(filter_intra_mode_cdf);
+ ResetTxDepthCounters(this);
+ RESET_COUNTER_2D(tx_split_cdf);
+ RESET_COUNTER_3D(all_zero_cdf);
+ ResetTxTypeCounters(this);
+ RESET_COUNTER_3D(eob_pt_16_cdf);
+ RESET_COUNTER_3D(eob_pt_32_cdf);
+ RESET_COUNTER_3D(eob_pt_64_cdf);
+ RESET_COUNTER_3D(eob_pt_128_cdf);
+ RESET_COUNTER_3D(eob_pt_256_cdf);
+ RESET_COUNTER_2D(eob_pt_512_cdf);
+ RESET_COUNTER_2D(eob_pt_1024_cdf);
+ RESET_COUNTER_4D(eob_extra_cdf);
+ RESET_COUNTER_4D(coeff_base_eob_cdf);
+ RESET_COUNTER_4D(coeff_base_cdf);
+ RESET_COUNTER_4D(coeff_base_range_cdf);
+ RESET_COUNTER_3D(dc_sign_cdf);
+ RESET_COUNTER_1D(restoration_type_cdf);
+ RESET_COUNTER_1D(use_wiener_cdf);
+ RESET_COUNTER_1D(use_sgrproj_cdf);
+ RESET_COUNTER_3D(has_palette_y_cdf);
+ RESET_COUNTER_2D(palette_y_size_cdf);
+ RESET_COUNTER_2D(has_palette_uv_cdf);
+ RESET_COUNTER_2D(palette_uv_size_cdf);
+ ResetPaletteColorIndexCounters(this);
+ RESET_COUNTER_2D(is_inter_cdf);
+ RESET_COUNTER_2D(use_compound_reference_cdf);
+ RESET_COUNTER_2D(compound_reference_type_cdf);
+ RESET_COUNTER_4D(compound_reference_cdf);
+ RESET_COUNTER_3D(compound_backward_reference_cdf);
+ RESET_COUNTER_3D(single_reference_cdf);
+ RESET_COUNTER_2D(compound_prediction_mode_cdf);
+ RESET_COUNTER_2D(new_mv_cdf);
+ RESET_COUNTER_2D(zero_mv_cdf);
+ RESET_COUNTER_2D(reference_mv_cdf);
+ RESET_COUNTER_2D(ref_mv_index_cdf);
+ RESET_COUNTER_2D(is_inter_intra_cdf);
+ RESET_COUNTER_2D(inter_intra_mode_cdf);
+ RESET_COUNTER_2D(is_wedge_inter_intra_cdf);
+ RESET_COUNTER_2D(wedge_index_cdf);
+ RESET_COUNTER_2D(use_obmc_cdf);
+ RESET_COUNTER_2D(motion_mode_cdf);
+ RESET_COUNTER_2D(is_explicit_compound_type_cdf);
+ RESET_COUNTER_2D(is_compound_type_average_cdf);
+ RESET_COUNTER_2D(compound_type_cdf);
+ RESET_COUNTER_2D(interpolation_filter_cdf);
+ RESET_COUNTER_2D(mv_joint_cdf);
+ RESET_COUNTER_3D(mv_sign_cdf);
+ RESET_COUNTER_3D(mv_class_cdf);
+ RESET_COUNTER_3D(mv_class0_bit_cdf);
+ RESET_COUNTER_4D(mv_class0_fraction_cdf);
+ RESET_COUNTER_3D(mv_class0_high_precision_cdf);
+ RESET_COUNTER_4D(mv_bit_cdf);
+ RESET_COUNTER_3D(mv_fraction_cdf);
+ RESET_COUNTER_3D(mv_high_precision_cdf);
+}
+
+#undef RESET_COUNTER_1D
+#undef RESET_COUNTER_2D
+#undef RESET_COUNTER_3D
+#undef RESET_COUNTER_4D
+
+int SymbolDecoderContext::PartitionCdfSize(int block_size_log2) {
+ assert(block_size_log2 > 0);
+ assert(block_size_log2 < 6);
+
+ switch (block_size_log2) {
+ case 1:
+ return kPartitionSplit + 1;
+ case 5:
+ return kPartitionVerticalWithRightSplit + 1;
+ default:
+ return kMaxPartitionTypes;
+ }
+}
+
+} // namespace libgav1
diff --git a/src/symbol_decoder_context.h b/src/symbol_decoder_context.h
new file mode 100644
index 0000000..1bea76c
--- /dev/null
+++ b/src/symbol_decoder_context.h
@@ -0,0 +1,301 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_
+#define LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+enum {
+ kPartitionContexts = 4,
+ kSegmentIdContexts = 3,
+ kUsePredictedSegmentIdContexts = 3,
+ kSkipContexts = 3,
+ kSkipModeContexts = 3,
+ kBooleanFieldCdfSize = 3,
+ kDeltaSymbolCount = 4, // Used for both delta_q and delta_lf.
+ kIntraModeContexts = 5,
+ kYModeContexts = 4,
+ kAngleDeltaSymbolCount = 2 * kMaxAngleDelta + 1,
+ kCflAlphaSignsSymbolCount = 8,
+ kCflAlphaContexts = 6,
+ kCflAlphaSymbolCount = 16,
+ kTxDepthContexts = 3,
+ kMaxTxDepthSymbolCount = 3,
+ kTxSplitContexts = 21,
+ kCoefficientQuantizerContexts = 4,
+ kNumSquareTransformSizes = 5,
+ kAllZeroContexts = 13,
+ kNumExtendedTransformSizes = 4,
+ kEobPtContexts = 2,
+ kEobPt16SymbolCount = 5,
+ kEobPt32SymbolCount = 6,
+ kEobPt64SymbolCount = 7,
+ kEobPt128SymbolCount = 8,
+ kEobPt256SymbolCount = 9,
+ kEobPt512SymbolCount = 10,
+ kEobPt1024SymbolCount = 11,
+ kEobExtraContexts = 9,
+ kCoeffBaseEobContexts = 4,
+ kCoeffBaseEobSymbolCount = 3,
+ kCoeffBaseContexts = 42,
+ kCoeffBaseSymbolCount = 4,
+ kCoeffBaseRangeContexts = 21,
+ kCoeffBaseRangeSymbolCount = 4,
+ kDcSignContexts = 3,
+ kPaletteBlockSizeContexts = 7,
+ kPaletteYModeContexts = 3,
+ kPaletteUVModeContexts = 2,
+ kPaletteSizeSymbolCount = 7,
+ kPaletteColorIndexContexts = 5,
+ kPaletteColorIndexSymbolCount = 8,
+ kIsInterContexts = 4,
+ kUseCompoundReferenceContexts = 5,
+ kCompoundReferenceTypeContexts = 5,
+ kReferenceContexts = 3,
+ kCompoundPredictionModeContexts = 8,
+ kNewMvContexts = 6,
+ kZeroMvContexts = 2,
+ kReferenceMvContexts = 6,
+ kRefMvIndexContexts = 3,
+ kInterIntraContexts = 3,
+ kWedgeIndexSymbolCount = 16,
+ kIsExplicitCompoundTypeContexts = 6,
+ kIsCompoundTypeAverageContexts = 6,
+ kInterpolationFilterContexts = 16,
+ kMvContexts = 2,
+ kMvClassSymbolCount = 11,
+ kMvFractionSymbolCount = 4,
+ kMvBitSymbolCount = 10,
+ kNumMvComponents = 2,
+}; // anonymous enum
+
+struct SymbolDecoderContext {
+ SymbolDecoderContext() = default;
+ explicit SymbolDecoderContext(int base_quantizer_index) {
+ Initialize(base_quantizer_index);
+ }
+
+ void Initialize(int base_quantizer_index);
+
+ // Partition related variables and functions.
+ static int PartitionCdfSize(int block_size_log2);
+
+ // Returns the cdf array index for inter_tx_type or intra_tx_type based on
+ // |tx_set|.
+ static int TxTypeIndex(TransformSet tx_set) {
+ assert(tx_set != kTransformSetDctOnly);
+ switch (tx_set) {
+ case kTransformSetInter1:
+ case kTransformSetIntra1:
+ return 0;
+ case kTransformSetInter2:
+ case kTransformSetIntra2:
+ return 1;
+ case kTransformSetInter3:
+ return 2;
+ default:
+ return -1;
+ }
+ }
+
+ // Resets the intra_frame_y_mode_cdf array to the default.
+ void ResetIntraFrameYModeCdf();
+
+  // Resets the symbol counters of all the CDF arrays to zero. The symbol
+  // counter is the last used element in the innermost dimension of each CDF
+  // array.
+ void ResetCounters();
+
+ // Note kMaxAlignment allows for aligned instructions to be used in the
+ // copies done in Initialize().
+ alignas(kMaxAlignment) uint16_t
+ partition_cdf[kBlockWidthCount][kPartitionContexts]
+ [kMaxPartitionTypes + 1];
+ alignas(kMaxAlignment) uint16_t
+ segment_id_cdf[kSegmentIdContexts][kMaxSegments + 1];
+ alignas(kMaxAlignment) uint16_t
+ use_predicted_segment_id_cdf[kUsePredictedSegmentIdContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t skip_cdf[kSkipContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ skip_mode_cdf[kSkipModeContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t delta_q_cdf[kDeltaSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t delta_lf_cdf[kDeltaSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ delta_lf_multi_cdf[kFrameLfCount][kDeltaSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t intra_block_copy_cdf[kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ intra_frame_y_mode_cdf[kIntraModeContexts][kIntraModeContexts]
+ [kIntraPredictionModesY + 1];
+ alignas(kMaxAlignment) uint16_t
+ y_mode_cdf[kYModeContexts][kIntraPredictionModesY + 1];
+ alignas(kMaxAlignment) uint16_t
+ angle_delta_cdf[kDirectionalIntraModes][kAngleDeltaSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ uv_mode_cdf[kBooleanSymbolCount][kIntraPredictionModesY]
+ [kIntraPredictionModesUV + 1];
+ alignas(kMaxAlignment) uint16_t
+ cfl_alpha_signs_cdf[kCflAlphaSignsSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ cfl_alpha_cdf[kCflAlphaContexts][kCflAlphaSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ use_filter_intra_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ filter_intra_mode_cdf[kNumFilterIntraPredictors + 1];
+ alignas(kMaxAlignment) uint16_t
+ tx_depth_cdf[4][kTxDepthContexts][kMaxTxDepthSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ tx_split_cdf[kTxSplitContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ all_zero_cdf[kNumSquareTransformSizes][kAllZeroContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ inter_tx_type_cdf[3][kNumExtendedTransformSizes][kNumTransformTypes + 1];
+ alignas(kMaxAlignment) uint16_t
+ intra_tx_type_cdf[2][kNumExtendedTransformSizes][kIntraPredictionModesY]
+ [kNumTransformTypes + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_16_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt16SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_32_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt32SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_64_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt64SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_128_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt128SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_256_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt256SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_512_cdf[kNumPlaneTypes][kEobPt512SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_1024_cdf[kNumPlaneTypes][kEobPt1024SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_extra_cdf[kNumSquareTransformSizes][kNumPlaneTypes][kEobExtraContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ coeff_base_eob_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
+ [kCoeffBaseEobContexts][kCoeffBaseEobSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ coeff_base_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
+ [kCoeffBaseContexts][kCoeffBaseSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ coeff_base_range_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
+ [kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ dc_sign_cdf[kNumPlaneTypes][kDcSignContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ restoration_type_cdf[kRestorationTypeSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t use_wiener_cdf[kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t use_sgrproj_cdf[kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ has_palette_y_cdf[kPaletteBlockSizeContexts][kPaletteYModeContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ palette_y_size_cdf[kPaletteBlockSizeContexts]
+ [kPaletteSizeSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ has_palette_uv_cdf[kPaletteUVModeContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ palette_uv_size_cdf[kPaletteBlockSizeContexts]
+ [kPaletteSizeSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ palette_color_index_cdf[kNumPlaneTypes][kPaletteSizeSymbolCount]
+ [kPaletteColorIndexContexts]
+ [kPaletteColorIndexSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ is_inter_cdf[kIsInterContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ use_compound_reference_cdf[kUseCompoundReferenceContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ compound_reference_type_cdf[kCompoundReferenceTypeContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ compound_reference_cdf[kNumCompoundReferenceTypes][kReferenceContexts][3]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ compound_backward_reference_cdf[kReferenceContexts][2]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ single_reference_cdf[kReferenceContexts][6][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ compound_prediction_mode_cdf[kCompoundPredictionModeContexts]
+ [kNumCompoundInterPredictionModes + 1];
+ alignas(kMaxAlignment) uint16_t
+ new_mv_cdf[kNewMvContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ zero_mv_cdf[kZeroMvContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ reference_mv_cdf[kReferenceMvContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ ref_mv_index_cdf[kRefMvIndexContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ is_inter_intra_cdf[kInterIntraContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ inter_intra_mode_cdf[kInterIntraContexts][kNumInterIntraModes + 1];
+ alignas(kMaxAlignment) uint16_t
+ is_wedge_inter_intra_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ wedge_index_cdf[kMaxBlockSizes][kWedgeIndexSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ use_obmc_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ motion_mode_cdf[kMaxBlockSizes][kNumMotionModes + 1];
+ alignas(kMaxAlignment) uint16_t
+ is_explicit_compound_type_cdf[kIsExplicitCompoundTypeContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ is_compound_type_average_cdf[kIsCompoundTypeAverageContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ compound_type_cdf[kMaxBlockSizes]
+ [kNumExplicitCompoundPredictionTypes + 1];
+ alignas(kMaxAlignment) uint16_t
+ interpolation_filter_cdf[kInterpolationFilterContexts]
+ [kNumExplicitInterpolationFilters + 1];
+ alignas(kMaxAlignment) uint16_t
+ mv_joint_cdf[kMvContexts][kNumMvJointTypes + 1];
+ alignas(kMaxAlignment) uint16_t
+ mv_sign_cdf[kMvContexts][kNumMvComponents][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ mv_class_cdf[kMvContexts][kNumMvComponents][kMvClassSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ mv_class0_bit_cdf[kMvContexts][kNumMvComponents][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ mv_class0_fraction_cdf[kMvContexts][kNumMvComponents][kBooleanSymbolCount]
+ [kMvFractionSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ mv_class0_high_precision_cdf[kMvContexts][kNumMvComponents]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ mv_bit_cdf[kMvContexts][kNumMvComponents][kMvBitSymbolCount]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t mv_fraction_cdf[kMvContexts][kNumMvComponents]
+ [kMvFractionSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ mv_high_precision_cdf[kMvContexts][kNumMvComponents]
+ [kBooleanFieldCdfSize];
+};
+
+} // namespace libgav1
+#endif // LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_
diff --git a/src/symbol_decoder_context_cdfs.inc b/src/symbol_decoder_context_cdfs.inc
new file mode 100644
index 0000000..509286f
--- /dev/null
+++ b/src/symbol_decoder_context_cdfs.inc
@@ -0,0 +1,2509 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file is just a convenience to separate out all the CDF constant
+// definitions from the symbol decoder context functions.
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultPartitionCdf
+ [kBlockWidthCount][kPartitionContexts][kMaxPartitionTypes + 1] = {
+ // width 8
+ {{13636, 7258, 2376, 0, 0},
+ {18840, 12913, 4228, 0, 0},
+ {20246, 9089, 4139, 0, 0},
+ {22872, 13985, 6915, 0, 0}},
+ // width 16
+ {{17171, 11839, 8197, 6062, 5104, 3947, 3167, 2197, 866, 0, 0},
+ {24843, 21725, 15983, 10298, 8797, 7725, 6117, 4067, 2934, 0, 0},
+ {27354, 19499, 17657, 12280, 10408, 8268, 7231, 6432, 651, 0, 0},
+ {30106, 26406, 24154, 11908, 9715, 7990, 6332, 4939, 1597, 0, 0}},
+ // width 32
+ {{14306, 11848, 9644, 5121, 4541, 3719, 3249, 2590, 1224, 0, 0},
+ {25079, 23708, 20712, 7776, 7108, 6586, 5817, 4727, 3716, 0, 0},
+ {26753, 23759, 22706, 8224, 7359, 6223, 5697, 5242, 721, 0, 0},
+ {31374, 30560, 29972, 4154, 3707, 3302, 2928, 2583, 869, 0, 0}},
+ // width 64
+ {{12631, 11221, 9690, 3202, 2931, 2507, 2244, 1876, 1044, 0, 0},
+ {26036, 25278, 23271, 4824, 4518, 4253, 3799, 3138, 2664, 0, 0},
+ {26823, 25105, 24420, 4085, 3651, 3019, 2704, 2470, 530, 0, 0},
+ {31898, 31556, 31281, 1570, 1374, 1194, 1025, 887, 436, 0, 0}},
+ // width 128
+ {{4869, 4549, 4239, 284, 229, 149, 129, 0, 0},
+ {26161, 25778, 24500, 708, 549, 430, 397, 0, 0},
+ {27339, 26092, 25646, 741, 541, 237, 186, 0, 0},
+ {32057, 31802, 31596, 320, 230, 151, 104, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultSegmentIdCdf[kSegmentIdContexts][kMaxSegments + 1] = {
+ {27146, 24875, 16675, 14535, 4959, 4395, 235, 0, 0},
+ {18494, 14538, 10211, 7833, 2788, 1917, 424, 0, 0},
+ {5241, 4281, 4045, 3878, 371, 121, 89, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUsePredictedSegmentIdCdf[kUsePredictedSegmentIdContexts]
+ [kBooleanFieldCdfSize] = {{16384, 0, 0},
+ {16384, 0, 0},
+ {16384, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultSkipCdf[kSkipContexts][kBooleanFieldCdfSize] = {
+ {1097, 0, 0}, {16253, 0, 0}, {28192, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultSkipModeCdf[kSkipModeContexts][kBooleanFieldCdfSize] = {
+ {147, 0, 0}, {12060, 0, 0}, {24641, 0, 0}};
+
+// This constant is also used for DeltaLf and DeltaLfMulti.
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultDeltaQCdf[kDeltaSymbolCount + 1] = {4608, 648, 91, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIntraBlockCopyCdf[kBooleanFieldCdfSize] = {2237, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIntraFrameYModeCdf[kIntraModeContexts][kIntraModeContexts]
+ [kIntraPredictionModesY + 1] = {
+ {{17180, 15741, 13430, 12550, 12086, 11658,
+ 10943, 9524, 8579, 4603, 3675, 2302, 0, 0},
+ {20752, 14702, 13252, 12465, 12049, 11324,
+ 10880, 9736, 8334, 4110, 2596, 1359, 0, 0},
+ {22716, 21997, 10472, 9980, 9713, 9529, 8635,
+ 7148, 6608, 3432, 2839, 1201, 0, 0},
+ {18677, 17362, 16326, 13960, 13632, 13222,
+ 12770, 10672, 8022, 3183, 1810, 306, 0, 0},
+ {20646, 19503, 17165, 16267, 14159, 12735,
+ 10377, 7185, 6331, 2507, 1695, 293, 0, 0}},
+ {{22745, 13183, 11920, 11328, 10936, 10008,
+ 9679, 8745, 7387, 3754, 2286, 1332, 0, 0},
+ {26785, 8669, 8208, 7882, 7702, 6973, 6855,
+ 6345, 5158, 2863, 1492, 974, 0, 0},
+ {25324, 19987, 12591, 12040, 11691, 11161,
+ 10598, 9363, 8299, 4853, 3678, 2276, 0, 0},
+ {24231, 18079, 17336, 15681, 15360, 14596,
+ 14360, 12943, 8119, 3615, 1672, 558, 0, 0},
+ {25225, 18537, 17272, 16573, 14863, 12051,
+ 10784, 8252, 6767, 3093, 1787, 774, 0, 0}},
+ {{20155, 19177, 11385, 10764, 10456, 10191,
+ 9367, 7713, 7039, 3230, 2463, 691, 0, 0},
+ {23081, 19298, 14262, 13538, 13164, 12621,
+ 12073, 10706, 9549, 5025, 3557, 1861, 0, 0},
+ {26585, 26263, 6744, 6516, 6402, 6334, 5686,
+ 4414, 4213, 2301, 1974, 682, 0, 0},
+ {22050, 21034, 17814, 15544, 15203, 14844,
+ 14207, 11245, 8890, 3793, 2481, 516, 0, 0},
+ {23574, 22910, 16267, 15505, 14344, 13597,
+ 11205, 6807, 6207, 2696, 2031, 305, 0, 0}},
+ {{20166, 18369, 17280, 14387, 13990, 13453,
+ 13044, 11349, 7708, 3072, 1851, 359, 0, 0},
+ {24565, 18947, 18244, 15663, 15329, 14637,
+ 14364, 13300, 7543, 3283, 1610, 426, 0, 0},
+ {24317, 23037, 17764, 15125, 14756, 14343,
+ 13698, 11230, 8163, 3650, 2690, 750, 0, 0},
+ {25054, 23720, 23252, 16101, 15951, 15774,
+ 15615, 14001, 6025, 2379, 1232, 240, 0, 0},
+ {23925, 22488, 21272, 17451, 16116, 14825,
+ 13660, 10050, 6999, 2815, 1785, 283, 0, 0}},
+ {{20190, 19097, 16789, 15934, 13693, 11855,
+ 9779, 7319, 6549, 2554, 1618, 291, 0, 0},
+ {23205, 19142, 17688, 16876, 15012, 11905,
+ 10561, 8532, 7388, 3115, 1625, 491, 0, 0},
+ {24412, 23867, 15152, 14512, 13418, 12662,
+ 10170, 6821, 6302, 2868, 2245, 507, 0, 0},
+ {21933, 20953, 19644, 16726, 15750, 14729,
+ 13821, 10015, 8153, 3279, 1885, 286, 0, 0},
+ {25150, 24480, 22909, 22259, 17382, 14111,
+ 9865, 3992, 3588, 1413, 966, 175, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultYModeCdf[kYModeContexts][kIntraPredictionModesY + 1] = {
+ {9967, 9279, 8475, 8012, 7167, 6645, 6162, 5350, 4823, 3540, 3083, 2419,
+ 0, 0},
+ {14095, 12923, 10137, 9450, 8818, 8119, 7241, 5404, 4616, 3067, 2784,
+ 1916, 0, 0},
+ {12998, 11789, 9372, 8829, 8527, 8114, 7632, 5695, 4938, 3408, 3038,
+ 2109, 0, 0},
+ {12613, 11467, 9930, 9590, 9507, 9235, 9065, 7964, 7416, 6193, 5752,
+ 4719, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultAngleDeltaCdf[kDirectionalIntraModes][kAngleDeltaSymbolCount + 1] =
+ {{30588, 27736, 25201, 9992, 5779, 2551, 0, 0},
+ {30467, 27160, 23967, 9281, 5794, 2438, 0, 0},
+ {28988, 21750, 19069, 13414, 9685, 1482, 0, 0},
+ {28187, 21542, 17621, 15630, 10934, 4371, 0, 0},
+ {31031, 21841, 18259, 13180, 10023, 3945, 0, 0},
+ {30104, 22592, 20283, 15118, 11168, 2273, 0, 0},
+ {30528, 21672, 17315, 12427, 10207, 3851, 0, 0},
+ {29163, 22340, 20309, 15092, 11524, 2113, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUVModeCdf[kBooleanSymbolCount][kIntraPredictionModesY]
+ [kIntraPredictionModesUV + 1] = {
+ // CFL not allowed.
+ {{10137, 8616, 7390, 7107, 6782, 6248, 5713, 4845,
+ 4524, 2709, 1827, 807, 0, 0},
+ {23255, 5887, 5795, 5722, 5650, 5104, 5029, 4944,
+ 4409, 3263, 2968, 972, 0, 0},
+ {22923, 22853, 4105, 4064, 4011, 3988, 3570, 2946,
+ 2914, 2004, 991, 739, 0, 0},
+ {19129, 18871, 18597, 7437, 7162, 7041, 6815, 5620,
+ 4191, 2156, 1413, 275, 0, 0},
+ {23004, 22933, 22838, 22814, 7382, 5715, 4810, 4620,
+ 4525, 1667, 1024, 405, 0, 0},
+ {20943, 19179, 19091, 19048, 17720, 3555, 3467, 3310,
+ 3057, 1607, 1327, 218, 0, 0},
+ {18593, 18369, 16160, 15947, 15050, 14993, 4217, 2568,
+ 2523, 931, 426, 101, 0, 0},
+ {19883, 19730, 17790, 17178, 17095, 17020, 16592,
+ 3640, 3501, 2125, 807, 307, 0, 0},
+ {20742, 19107, 18894, 17463, 17278, 17042, 16773,
+ 16495, 4325, 2380, 2001, 352, 0, 0},
+ {13716, 12928, 12189, 11852, 11618, 11301, 10883,
+ 10049, 9594, 3907, 2389, 593, 0, 0},
+ {14141, 13119, 11794, 11549, 11276, 10952, 10569,
+ 9649, 9241, 5715, 1371, 620, 0, 0},
+ {15742, 13764, 12771, 12429, 12182, 11665, 11419,
+ 10861, 10286, 6872, 6227, 949, 0, 0},
+ {20644, 19009, 17809, 17776, 17761, 17717, 17690,
+ 17602, 17513, 17015, 16729, 16162, 0, 0}},
+ // CFL allowed.
+ {{22361, 21560, 19868, 19587, 18945, 18593, 17869,
+ 17112, 16782, 12682, 11773, 10313, 8556, 0, 0},
+ {28236, 12988, 12711, 12553, 12340, 11697, 11569,
+ 11317, 10669, 8540, 8075, 5736, 3296, 0, 0},
+ {27495, 27389, 12591, 12498, 12383, 12329, 11819,
+ 11073, 10994, 9630, 8512, 8065, 6089, 0, 0},
+ {26028, 25601, 25106, 18616, 18232, 17983, 17734,
+ 16027, 14397, 11248, 10562, 9379, 8586, 0, 0},
+ {27781, 27400, 26840, 26700, 13654, 12453, 10911,
+ 10515, 10357, 7857, 7388, 6741, 6392, 0, 0},
+ {27398, 25879, 25521, 25375, 23270, 11654, 11366,
+ 11015, 10787, 7988, 7382, 6251, 5592, 0, 0},
+ {27952, 27807, 25564, 25442, 24003, 23838, 12599,
+ 12086, 11965, 9580, 9005, 8313, 7828, 0, 0},
+ {26160, 26028, 24239, 23719, 23511, 23412, 23033,
+ 13941, 13709, 10432, 9564, 8804, 7975, 0, 0},
+ {26770, 25349, 24987, 23835, 23513, 23219, 23015,
+ 22351, 13870, 10274, 9629, 8004, 6779, 0, 0},
+ {22108, 21470, 20218, 19811, 19446, 19144, 18728,
+ 17764, 17234, 12054, 10979, 9325, 7907, 0, 0},
+ {22246, 21238, 20216, 19805, 19390, 18989, 18523,
+ 17533, 16866, 12666, 10072, 8994, 6930, 0, 0},
+ {22669, 22077, 20129, 19719, 19382, 19103, 18643,
+ 17605, 17132, 13092, 12294, 9249, 7560, 0, 0},
+ {29624, 27681, 25386, 25264, 25175, 25078, 24967,
+ 24704, 24536, 23520, 22893, 22247, 3720, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultCflAlphaSignsCdf[kCflAlphaSignsSymbolCount + 1] = {
+ 31350, 30645, 19428, 14363, 5796, 4425, 474, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultCflAlphaCdf[kCflAlphaContexts][kCflAlphaSymbolCount + 1] = {
+ {25131, 12049, 1367, 287, 111, 80, 76, 72, 68, 64, 60, 56, 52, 48, 44,
+ 0, 0},
+ {18403, 9165, 4633, 1600, 601, 373, 281, 195, 148, 121, 100, 96, 92, 88,
+ 84, 0, 0},
+ {21236, 10388, 4323, 1408, 419, 245, 184, 119, 95, 91, 87, 83, 79, 75,
+ 71, 0, 0},
+ {5778, 1366, 486, 197, 76, 72, 68, 64, 60, 56, 52, 48, 44, 40, 36, 0,
+ 0},
+ {15520, 6710, 3864, 2160, 1463, 891, 642, 447, 374, 304, 252, 208, 192,
+ 175, 146, 0, 0},
+ {18030, 11090, 6989, 4867, 3744, 2466, 1788, 925, 624, 355, 248, 174,
+ 146, 112, 108, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUseFilterIntraCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = {
+ {28147, 0, 0}, {26025, 0, 0}, {19998, 0, 0}, {26875, 0, 0},
+ {24902, 0, 0}, {20217, 0, 0}, {12539, 0, 0}, {22400, 0, 0},
+ {23374, 0, 0}, {20360, 0, 0}, {18467, 0, 0}, {16384, 0, 0},
+ {14667, 0, 0}, {20012, 0, 0}, {10425, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultFilterIntraModeCdf[kNumFilterIntraPredictors + 1] = {
+ 23819, 19992, 15557, 3210, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultTxDepthCdf[4][kTxDepthContexts][kMaxTxDepthSymbolCount + 1] = {
+ {{12800, 0, 0}, {12800, 0, 0}, {8448, 0, 0}},
+ {{20496, 2596, 0, 0}, {20496, 2596, 0, 0}, {14091, 1920, 0, 0}},
+ {{19782, 17588, 0, 0}, {19782, 17588, 0, 0}, {8466, 7166, 0, 0}},
+ {{26986, 21293, 0, 0}, {26986, 21293, 0, 0}, {15965, 10009, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultTxSplitCdf[kTxSplitContexts][kBooleanFieldCdfSize] = {
+ {4187, 0, 0}, {8922, 0, 0}, {11921, 0, 0}, {8453, 0, 0},
+ {14572, 0, 0}, {20635, 0, 0}, {13977, 0, 0}, {21881, 0, 0},
+ {21763, 0, 0}, {5589, 0, 0}, {12764, 0, 0}, {21487, 0, 0},
+ {6219, 0, 0}, {13460, 0, 0}, {18544, 0, 0}, {4753, 0, 0},
+ {11222, 0, 0}, {18368, 0, 0}, {4603, 0, 0}, {10367, 0, 0},
+ {16680, 0, 0}};
+
+/* clang-format off */
+alignas(kMaxAlignment) constexpr uint16_t kDefaultAllZeroCdf[kCoefficientQuantizerContexts]
+ [kNumSquareTransformSizes][kAllZeroContexts]
+ [kBooleanFieldCdfSize] = {
+ {
+ {{919, 0, 0}, {26876, 0, 0}, {20656, 0, 0}, {10833, 0, 0}, {12479, 0, 0},
+ {5295, 0, 0}, {281, 0, 0}, {25114, 0, 0}, {13295, 0, 0}, {2784, 0, 0},
+ {22807, 0, 0}, {2526, 0, 0}, {651, 0, 0}},
+ {{1220, 0, 0}, {31219, 0, 0}, {22638, 0, 0}, {16112, 0, 0}, {14177, 0, 0},
+ {6460, 0, 0}, {231, 0, 0}, {27365, 0, 0}, {14672, 0, 0}, {2765, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{2811, 0, 0}, {27377, 0, 0}, {14729, 0, 0}, {9202, 0, 0}, {10337, 0, 0},
+ {6946, 0, 0}, {571, 0, 0}, {28990, 0, 0}, {17432, 0, 0}, {3787, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{14848, 0, 0}, {30950, 0, 0}, {25486, 0, 0}, {7495, 0, 0}, {21845, 0, 0},
+ {1214, 0, 0}, {144, 0, 0}, {31402, 0, 0}, {17140, 0, 0}, {2306, 0, 0},
+ {32622, 0, 0}, {27636, 0, 0}, {1111, 0, 0}},
+ {{26460, 0, 0}, {32651, 0, 0}, {31130, 0, 0}, {30607, 0, 0}, {16384, 0, 0},
+ {21845, 0, 0}, {2521, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}
+ },
+ {
+ {{2397, 0, 0}, {25198, 0, 0}, {19613, 0, 0}, {12017, 0, 0}, {11799, 0, 0},
+ {5701, 0, 0}, {755, 0, 0}, {27273, 0, 0}, {14826, 0, 0}, {4488, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{986, 0, 0}, {30932, 0, 0}, {22079, 0, 0}, {15164, 0, 0}, {11146, 0, 0},
+ {5250, 0, 0}, {369, 0, 0}, {28349, 0, 0}, {16474, 0, 0}, {4423, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{867, 0, 0}, {22457, 0, 0}, {14721, 0, 0}, {7962, 0, 0}, {9480, 0, 0},
+ {4854, 0, 0}, {472, 0, 0}, {28553, 0, 0}, {17012, 0, 0}, {4427, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{6042, 0, 0}, {31723, 0, 0}, {21065, 0, 0}, {12178, 0, 0}, {14214, 0, 0},
+ {6798, 0, 0}, {830, 0, 0}, {27185, 0, 0}, {11455, 0, 0}, {3378, 0, 0},
+ {32127, 0, 0}, {10503, 0, 0}, {1316, 0, 0}},
+ {{6184, 0, 0}, {32580, 0, 0}, {23921, 0, 0}, {8249, 0, 0}, {9830, 0, 0},
+ {2185, 0, 0}, {160, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}
+ },
+ {
+ {{3154, 0, 0}, {23700, 0, 0}, {19844, 0, 0}, {13230, 0, 0}, {15031, 0, 0},
+ {8149, 0, 0}, {2126, 0, 0}, {28649, 0, 0}, {16742, 0, 0}, {7111, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{811, 0, 0}, {29538, 0, 0}, {21615, 0, 0}, {14645, 0, 0}, {12625, 0, 0},
+ {6232, 0, 0}, {782, 0, 0}, {29718, 0, 0}, {18165, 0, 0}, {7613, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{405, 0, 0}, {22076, 0, 0}, {13678, 0, 0}, {8411, 0, 0}, {8326, 0, 0},
+ {4456, 0, 0}, {599, 0, 0}, {29120, 0, 0}, {17078, 0, 0}, {5953, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{2099, 0, 0}, {28936, 0, 0}, {21105, 0, 0}, {13879, 0, 0}, {12986, 0, 0},
+ {9455, 0, 0}, {1438, 0, 0}, {27644, 0, 0}, {14049, 0, 0}, {4300, 0, 0},
+ {29686, 0, 0}, {11786, 0, 0}, {3325, 0, 0}},
+ {{4195, 0, 0}, {29585, 0, 0}, {14966, 0, 0}, {6791, 0, 0}, {6091, 0, 0},
+ {4936, 0, 0}, {381, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}
+ },
+ {
+ {{5881, 0, 0}, {26039, 0, 0}, {22407, 0, 0}, {15326, 0, 0}, {17723, 0, 0},
+ {10290, 0, 0}, {3696, 0, 0}, {30055, 0, 0}, {20907, 0, 0}, {11995, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{865, 0, 0}, {30724, 0, 0}, {25240, 0, 0}, {18150, 0, 0}, {16586, 0, 0},
+ {8600, 0, 0}, {1731, 0, 0}, {29982, 0, 0}, {21574, 0, 0}, {12613, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{258, 0, 0}, {24338, 0, 0}, {15450, 0, 0}, {8614, 0, 0}, {9094, 0, 0},
+ {3979, 0, 0}, {629, 0, 0}, {29328, 0, 0}, {19651, 0, 0}, {10066, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{1097, 0, 0}, {30712, 0, 0}, {21022, 0, 0}, {15916, 0, 0}, {14133, 0, 0},
+ {8053, 0, 0}, {1284, 0, 0}, {28112, 0, 0}, {16694, 0, 0}, {8064, 0, 0},
+ {30962, 0, 0}, {18123, 0, 0}, {7432, 0, 0}},
+ {{1229, 0, 0}, {24335, 0, 0}, {12192, 0, 0}, {4864, 0, 0}, {4916, 0, 0},
+ {2742, 0, 0}, {327, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}
+ }
+};
+/* clang-format on */
+
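+// Default transform type CDFs for inter blocks. The three outer groups
+// correspond to the 16-, 12- and 2-symbol transform sets.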
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultInterTxTypeCdf[3][kNumExtendedTransformSizes][kNumTransformTypes +
+ 1] = {
+ {{28310, 27208, 25073, 23059, 19438, 17979, 15231, 12502, 11264, 9920,
+ 8834, 7294, 5041, 3853, 2137, 0, 0},
+ {31123, 30195, 27990, 27057, 24961, 24146, 22246, 17411, 15094, 12360,
+ 10251, 7758, 5652, 3912, 2019, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0}},
+ {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ // Only 16x16 is used in this case.
+ {31998, 30347, 27543, 19861, 16949, 13841, 11207, 8679, 6173, 4242,
+ 2239, 0},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}},
+ {{16384, 0, 0}, {28601, 0, 0}, {30770, 0, 0}, {32020, 0, 0}}};
+
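+// Default transform type CDFs for intra blocks, indexed by transform set,
+// transform size and intra prediction mode.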
+alignas(kMaxAlignment) constexpr uint16_t kDefaultIntraTxTypeCdf
+ [2][kNumExtendedTransformSizes][kIntraPredictionModesY]
+ [kNumTransformTypes + 1] = {
+ {{{31233, 24733, 23307, 20017, 9301, 4943, 0, 0},
+ {32204, 29433, 23059, 21898, 14625, 4674, 0, 0},
+ {32096, 29521, 29092, 20786, 13353, 9641, 0, 0},
+ {27489, 18883, 17281, 14724, 9241, 2516, 0, 0},
+ {28345, 26694, 24783, 22352, 7075, 3470, 0, 0},
+ {31282, 28527, 23308, 22106, 16312, 5074, 0, 0},
+ {32329, 29930, 29246, 26031, 14710, 9014, 0, 0},
+ {31578, 28535, 27913, 21098, 12487, 8391, 0, 0},
+ {31723, 28456, 24121, 22609, 14124, 3433, 0, 0},
+ {32566, 29034, 28021, 25470, 15641, 8752, 0, 0},
+ {32321, 28456, 25949, 23884, 16758, 8910, 0, 0},
+ {32491, 28399, 27513, 23863, 16303, 10497, 0, 0},
+ {29359, 27332, 22169, 17169, 13081, 8728, 0, 0}},
+ {{30898, 19026, 18238, 16270, 8998, 5070, 0, 0},
+ {32442, 23972, 18136, 17689, 13496, 5282, 0, 0},
+ {32284, 25192, 25056, 18325, 13609, 10177, 0, 0},
+ {31642, 17428, 16873, 15745, 11872, 2489, 0, 0},
+ {32113, 27914, 27519, 26855, 10669, 5630, 0, 0},
+ {31469, 26310, 23883, 23478, 17917, 7271, 0, 0},
+ {32457, 27473, 27216, 25883, 16661, 10096, 0, 0},
+ {31885, 24709, 24498, 21510, 15479, 11219, 0, 0},
+ {32027, 25188, 23450, 22423, 16080, 3722, 0, 0},
+ {32658, 25362, 24853, 23573, 16727, 9439, 0, 0},
+ {32405, 24794, 23411, 22095, 17139, 8294, 0, 0},
+ {32615, 25121, 24656, 22832, 17461, 12772, 0, 0},
+ {29257, 26436, 21603, 17433, 13445, 9174, 0, 0}}},
+ {{{26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0}},
+ {{26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0}},
+ {{31641, 19954, 9996, 5285, 0, 0},
+ {32623, 26007, 20788, 6101, 0, 0},
+ {32406, 26881, 21090, 16043, 0, 0},
+ {32383, 17555, 14181, 2075, 0, 0},
+ {32743, 29854, 9634, 4865, 0, 0},
+ {32708, 28298, 21019, 8777, 0, 0},
+ {32731, 29436, 18257, 11320, 0, 0},
+ {32611, 26448, 19732, 15329, 0, 0},
+ {32649, 26049, 19862, 3372, 0, 0},
+ {32721, 27231, 20192, 11269, 0, 0},
+ {32499, 26692, 21510, 9653, 0, 0},
+ {32685, 27153, 20767, 15540, 0, 0},
+ {30800, 27212, 20745, 14221, 0, 0}}}};
+
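+// The kDefaultEobPt*Cdf tables below hold the default CDFs for the
+// end-of-block position class of transforms with up to 16, 32, 64, 128, 256,
+// 512 and 1024 coefficients, indexed by quantizer context, plane type and
+// (for the smaller sizes) EOB context.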
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultEobPt16Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+ [kEobPtContexts][kEobPt16SymbolCount + 1] = {
+ {{{31928, 31729, 30788, 27873, 0, 0},
+ {32398, 32097, 30885, 28297, 0, 0}},
+ {{29521, 27818, 23080, 18205, 0, 0},
+ {30864, 29414, 25005, 18121, 0, 0}}},
+ {{{30643, 30217, 27603, 23822, 0, 0},
+ {32255, 32003, 30909, 26429, 0, 0}},
+ {{25131, 23270, 18509, 13660, 0, 0},
+ {30271, 28672, 23902, 15775, 0, 0}}},
+ {{{28752, 27871, 23887, 17800, 0, 0},
+ {32052, 31663, 30122, 22712, 0, 0}},
+ {{21629, 19498, 14527, 9202, 0, 0},
+ {29576, 27736, 22471, 13013, 0, 0}}},
+ {{{26060, 23810, 18022, 10635, 0, 0},
+ {31546, 30694, 27985, 17358, 0, 0}},
+ {{13193, 11002, 6724, 3059, 0, 0},
+ {25471, 22001, 13495, 4574, 0, 0}}}};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultEobPt32Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+ [kEobPtContexts][kEobPt32SymbolCount + 1] = {
+ {{{32368, 32248, 31791, 30666, 26226, 0, 0},
+ {32558, 32363, 31453, 29442, 25231, 0, 0}},
+ {{30132, 28495, 25180, 20974, 12367, 0, 0},
+ {30982, 29589, 25866, 21411, 13714, 0, 0}}},
+ {{{31779, 31519, 30749, 28617, 21983, 0, 0},
+ {32455, 32327, 31669, 29851, 24206, 0, 0}},
+ {{24374, 22416, 18836, 13913, 6754, 0, 0},
+ {30190, 28644, 24587, 19098, 8534, 0, 0}}},
+ {{{30253, 29765, 28316, 24606, 16727, 0, 0},
+ {32194, 31947, 30932, 27679, 19640, 0, 0}},
+ {{19300, 16465, 12407, 7663, 3487, 0, 0},
+ {29226, 27266, 22353, 16008, 7124, 0, 0}}},
+ {{{28151, 27059, 24322, 19184, 9633, 0, 0},
+ {31612, 31066, 29093, 23494, 12229, 0, 0}},
+ {{10682, 8486, 5758, 2998, 1025, 0, 0},
+ {25069, 21871, 11877, 5842, 1140, 0, 0}}}};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultEobPt64Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+ [kEobPtContexts][kEobPt64SymbolCount + 1] = {
+ {{{32439, 32270, 31667, 30984, 29503, 25010, 0, 0},
+ {32433, 32038, 31309, 27274, 24013, 19771, 0, 0}},
+ {{29263, 27464, 22682, 18954, 15084, 9398, 0, 0},
+ {31205, 30068, 27892, 21857, 18062, 10288, 0, 0}}},
+ {{{31508, 31322, 30515, 29056, 26116, 19399, 0, 0},
+ {32367, 32163, 31739, 30205, 26923, 20142, 0, 0}},
+ {{24159, 22156, 18144, 14054, 10154, 3744, 0, 0},
+ {30845, 29641, 26901, 23065, 18491, 5668, 0, 0}}},
+ {{{30394, 29996, 28185, 25492, 20480, 13062, 0, 0},
+ {32271, 31958, 31453, 29768, 25764, 17127, 0, 0}},
+ {{17718, 15642, 11358, 7882, 4612, 2042, 0, 0},
+ {28734, 26478, 22533, 17786, 11554, 4277, 0, 0}}},
+ {{{26461, 25227, 20708, 16410, 10215, 4903, 0, 0},
+ {31479, 30448, 28797, 24842, 18615, 8477, 0, 0}},
+ {{8556, 7060, 4500, 2733, 1461, 719, 0, 0},
+ {24042, 20390, 13359, 6318, 2730, 306, 0, 0}}}};
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt128Cdf
+ [kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPtContexts]
+ [kEobPt128SymbolCount + 1] = {
+ {{{32549, 32286, 31628, 30677, 29088, 26740, 20182, 0, 0},
+ {32397, 32069, 31514, 27938, 23289, 20206, 15271, 0, 0}},
+ {{27523, 25312, 19888, 16916, 12735, 8836, 5160, 0, 0},
+ {30714, 29296, 26899, 18536, 14526, 12178, 6016, 0, 0}}},
+ {{{32083, 31835, 31280, 30054, 28002, 24206, 13514, 0, 0},
+ {32551, 32416, 32150, 30465, 27507, 22799, 15296, 0, 0}},
+ {{24723, 21568, 17271, 13173, 8820, 5360, 1830, 0, 0},
+ {30458, 28608, 25297, 17771, 14837, 12000, 2528, 0, 0}}},
+ {{{31402, 31030, 30241, 27752, 23413, 16971, 8125, 0, 0},
+ {32414, 32210, 31824, 30008, 25481, 18731, 10989, 0, 0}},
+ {{19141, 16522, 12595, 8339, 4820, 2353, 905, 0, 0},
+ {26493, 22879, 17999, 9604, 4780, 2275, 496, 0, 0}}},
+ {{{29296, 27883, 25279, 20287, 14251, 8232, 3133, 0, 0},
+ {31882, 31037, 29497, 24299, 17199, 10642, 4385, 0, 0}},
+ {{8455, 6706, 4383, 2661, 1551, 870, 423, 0, 0},
+ {23603, 19486, 11618, 2482, 874, 197, 56, 0, 0}}}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt256Cdf
+ [kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPtContexts]
+ [kEobPt256SymbolCount + 1] = {
+ {{{32458, 32184, 30881, 29179, 26600, 24157, 21416, 17116, 0, 0},
+ {31770, 30918, 29770, 27164, 15427, 12880, 9869, 7185, 0, 0}},
+ {{30248, 29528, 26816, 23898, 20191, 15210, 12814, 8600, 0, 0},
+ {30565, 28638, 25333, 22029, 12116, 9087, 7159, 5507, 0, 0}}},
+ {{{31320, 30659, 28617, 26505, 23439, 19508, 14824, 9468, 0, 0},
+ {32369, 31749, 31019, 29730, 22324, 17222, 10029, 5474, 0, 0}},
+ {{26366, 24620, 20145, 17696, 14040, 9921, 6321, 3391, 0, 0},
+ {31094, 29516, 27034, 22609, 10371, 8966, 7947, 1828, 0, 0}}},
+ {{{29679, 28848, 26730, 23308, 18502, 12887, 7002, 3592, 0, 0},
+ {31684, 30410, 29280, 27646, 21285, 14665, 6745, 2969, 0, 0}},
+ {{21254, 18974, 15288, 12014, 8407, 5390, 3276, 1491, 0, 0},
+ {26197, 23158, 17252, 10942, 3676, 1939, 926, 60, 0, 0}}},
+ {{{27420, 25655, 20948, 16844, 10662, 5991, 2434, 1011, 0, 0},
+ {30315, 28294, 26461, 23991, 16294, 9793, 3768, 1221, 0, 0}},
+ {{9658, 8171, 5628, 3874, 2601, 1841, 1376, 674, 0, 0},
+ {22770, 15107, 7590, 4671, 1460, 730, 365, 73, 0, 0}}}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt512Cdf
+ [kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPt512SymbolCount + 1] =
+ {{{32127, 31785, 29061, 27338, 22534, 17810, 13980, 9356, 6707, 0, 0},
+ {27673, 26322, 22772, 19414, 16751, 14782, 11849, 6639, 3628, 0, 0}},
+ {{31538, 30490, 27733, 24992, 20897, 17422, 13178, 8184, 4019, 0, 0},
+ {25503, 22789, 16949, 13518, 10988, 8922, 6290, 4372, 957, 0, 0}},
+ {{30144, 28832, 26288, 23082, 18789, 15042, 9501, 4358, 1690, 0, 0},
+ {20753, 17999, 13180, 10716, 8546, 6956, 5468, 3549, 654, 0, 0}},
+ {{26841, 24959, 21845, 18171, 13329, 8633, 4312, 1626, 708, 0, 0},
+ {11675, 9725, 7026, 5110, 3671, 3052, 2695, 1948, 812, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultEobPt1024Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+ [kEobPt1024SymbolCount + 1] = {
+ {{32375, 32347, 32017, 31145, 29608, 26416, 19423,
+ 14721, 10197, 6938, 0, 0},
+ {30903, 30780, 29838, 28526, 22235, 16230, 11414,
+ 5513, 4222, 984, 0, 0}},
+ {{32072, 31820, 29623, 27066, 23062, 19551, 14917,
+ 10912, 7076, 4734, 0, 0},
+ {30096, 29177, 23438, 15684, 10043, 8484, 6241,
+ 4741, 4391, 1892, 0, 0}},
+ {{29984, 28937, 25727, 22247, 17921, 13924, 9613,
+ 6086, 3539, 1723, 0, 0},
+ {23191, 20302, 15029, 12018, 10707, 9553, 8167,
+ 7285, 6925, 712, 0, 0}},
+ {{26070, 24434, 20807, 17006, 12582, 8906, 5334,
+ 3442, 1686, 718, 0, 0},
+ {12199, 10342, 7199, 5909, 4715, 3855, 3282, 3044,
+ 2961, 198, 0, 0}}};
+
+/* clang-format off */
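+// Default CDFs for the end-of-block extra bit, indexed by quantizer context,
+// square transform size, plane type and context.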
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobExtraCdf[kCoefficientQuantizerContexts]
+ [kNumSquareTransformSizes][kNumPlaneTypes]
+ [kEobExtraContexts][kBooleanFieldCdfSize] = {
+ {
+ {
+ {{15807, 0, 0}, {15545, 0, 0}, {25147, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{13699, 0, 0}, {10243, 0, 0}, {19391, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{12367, 0, 0}, {15743, 0, 0}, {19923, 0, 0}, {19895, 0, 0},
+ {18674, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{12087, 0, 0}, {12067, 0, 0}, {17518, 0, 0}, {17751, 0, 0},
+ {17840, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{8863, 0, 0}, {15574, 0, 0}, {16598, 0, 0}, {15073, 0, 0},
+ {18942, 0, 0}, {16958, 0, 0}, {20732, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{8809, 0, 0}, {11969, 0, 0}, {13747, 0, 0}, {16565, 0, 0},
+ {14882, 0, 0}, {18624, 0, 0}, {20758, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{5369, 0, 0}, {16441, 0, 0}, {14697, 0, 0}, {13184, 0, 0},
+ {12047, 0, 0}, {14336, 0, 0}, {13208, 0, 0}, {22618, 0, 0},
+ {23963, 0, 0}},
+ {{7836, 0, 0}, {11935, 0, 0}, {20741, 0, 0}, {16098, 0, 0},
+ {12854, 0, 0}, {17662, 0, 0}, {15106, 0, 0}, {18985, 0, 0},
+ {4012, 0, 0}}
+ },
+ {
+ {{9362, 0, 0}, {10923, 0, 0}, {14336, 0, 0}, {16384, 0, 0},
+ {15672, 0, 0}, {20207, 0, 0}, {15448, 0, 0}, {10373, 0, 0},
+ {11398, 0, 0}},
+ {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ }
+ },
+ {
+ {
+ {{15297, 0, 0}, {12545, 0, 0}, {21411, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{12433, 0, 0}, {11101, 0, 0}, {17950, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{12338, 0, 0}, {12106, 0, 0}, {17401, 0, 0}, {15798, 0, 0},
+ {18111, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{10651, 0, 0}, {10740, 0, 0}, {14118, 0, 0}, {16726, 0, 0},
+ {16883, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{10359, 0, 0}, {11756, 0, 0}, {17118, 0, 0}, {15373, 0, 0},
+ {17299, 0, 0}, {12563, 0, 0}, {13257, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{8548, 0, 0}, {10288, 0, 0}, {15031, 0, 0}, {13852, 0, 0},
+ {13500, 0, 0}, {14356, 0, 0}, {13924, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{6777, 0, 0}, {12454, 0, 0}, {15037, 0, 0}, {13090, 0, 0},
+ {14119, 0, 0}, {15461, 0, 0}, {10970, 0, 0}, {15219, 0, 0},
+ {17138, 0, 0}},
+ {{6183, 0, 0}, {11299, 0, 0}, {12336, 0, 0}, {15033, 0, 0},
+ {13488, 0, 0}, {17533, 0, 0}, {12471, 0, 0}, {10297, 0, 0},
+ {3771, 0, 0}}
+ },
+ {
+ {{6163, 0, 0}, {21464, 0, 0}, {16042, 0, 0}, {16208, 0, 0},
+ {11902, 0, 0}, {9244, 0, 0}, {12890, 0, 0}, {19299, 0, 0},
+ {9684, 0, 0}},
+ {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ }
+ },
+ {
+ {
+ {{13785, 0, 0}, {12256, 0, 0}, {17883, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{12678, 0, 0}, {13324, 0, 0}, {15482, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{13629, 0, 0}, {11281, 0, 0}, {13809, 0, 0}, {11858, 0, 0},
+ {13679, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{12232, 0, 0}, {12104, 0, 0}, {12143, 0, 0}, {13645, 0, 0},
+ {17906, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{12935, 0, 0}, {11266, 0, 0}, {15283, 0, 0}, {12501, 0, 0},
+ {14415, 0, 0}, {9439, 0, 0}, {11290, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{10727, 0, 0}, {9334, 0, 0}, {12767, 0, 0}, {12214, 0, 0},
+ {11817, 0, 0}, {12623, 0, 0}, {17206, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{9456, 0, 0}, {11161, 0, 0}, {16242, 0, 0}, {13811, 0, 0},
+ {14734, 0, 0}, {13834, 0, 0}, {8521, 0, 0}, {15847, 0, 0},
+ {15688, 0, 0}},
+ {{6189, 0, 0}, {7858, 0, 0}, {14131, 0, 0}, {12968, 0, 0},
+ {12380, 0, 0}, {22881, 0, 0}, {17126, 0, 0}, {2570, 0, 0},
+ {8047, 0, 0}}
+ },
+ {
+ {{5770, 0, 0}, {16031, 0, 0}, {14930, 0, 0}, {13846, 0, 0},
+ {13253, 0, 0}, {14132, 0, 0}, {15435, 0, 0}, {16992, 0, 0},
+ {10110, 0, 0}},
+ {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ }
+ },
+ {
+ {
+ {{12591, 0, 0}, {11979, 0, 0}, {12506, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{11352, 0, 0}, {11913, 0, 0}, {9358, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{12530, 0, 0}, {11711, 0, 0}, {13609, 0, 0}, {10431, 0, 0},
+ {12609, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{12643, 0, 0}, {12209, 0, 0}, {11061, 0, 0}, {10472, 0, 0},
+ {15435, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{12827, 0, 0}, {12241, 0, 0}, {11298, 0, 0}, {10281, 0, 0},
+ {13210, 0, 0}, {10414, 0, 0}, {12437, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{10016, 0, 0}, {7762, 0, 0}, {10693, 0, 0}, {11192, 0, 0},
+ {15028, 0, 0}, {11078, 0, 0}, {13557, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{11326, 0, 0}, {10410, 0, 0}, {14265, 0, 0}, {12477, 0, 0},
+ {12823, 0, 0}, {11474, 0, 0}, {11590, 0, 0}, {13368, 0, 0},
+ {22212, 0, 0}},
+ {{8120, 0, 0}, {7819, 0, 0}, {12060, 0, 0}, {8863, 0, 0},
+ {12267, 0, 0}, {23210, 0, 0}, {23345, 0, 0}, {2403, 0, 0},
+ {13515, 0, 0}}
+ },
+ {
+ {{6704, 0, 0}, {10670, 0, 0}, {13155, 0, 0}, {12243, 0, 0},
+ {15173, 0, 0}, {16150, 0, 0}, {12271, 0, 0}, {13779, 0, 0},
+ {17255, 0, 0}},
+ {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ }
+ }
+};
+
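+// Default CDFs for the base level of the coefficient at the end-of-block
+// position.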
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseEobCdf[kCoefficientQuantizerContexts]
+ [kNumSquareTransformSizes][kNumPlaneTypes]
+ [kCoeffBaseEobContexts]
+ [kCoeffBaseEobSymbolCount + 1] = {
+ {
+ {
+ {{14931, 3713, 0, 0}, {3168, 1322, 0, 0}, {1924, 890, 0, 0},
+ {7842, 3820, 0, 0}},
+ {{11403, 2742, 0, 0}, {2256, 345, 0, 0}, {1110, 147, 0, 0},
+ {3138, 887, 0, 0}}
+ },
+ {
+ {{27051, 6291, 0, 0}, {2277, 1065, 0, 0}, {1218, 610, 0, 0},
+ {3120, 1277, 0, 0}},
+ {{20160, 4948, 0, 0}, {2088, 543, 0, 0}, {1959, 433, 0, 0},
+ {1469, 345, 0, 0}}
+ },
+ {
+ {{30982, 20156, 0, 0}, {2105, 1143, 0, 0}, {429, 300, 0, 0},
+ {1620, 935, 0, 0}},
+ {{13911, 8903, 0, 0}, {1340, 340, 0, 0}, {1024, 395, 0, 0},
+ {993, 242, 0, 0}}
+ },
+ {
+ {{30981, 30236, 0, 0}, {1936, 1106, 0, 0}, {944, 86, 0, 0},
+ {635, 199, 0, 0}},
+ {{19017, 10533, 0, 0}, {679, 359, 0, 0}, {5684, 4848, 0, 0},
+ {3477, 174, 0, 0}}
+ },
+ {
+ {{31043, 29319, 0, 0}, {1666, 833, 0, 0}, {311, 155, 0, 0},
+ {356, 119, 0, 0}},
+ {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+ {21845, 10923, 0, 0}}
+ }
+ },
+ {
+ {
+ {{15208, 2880, 0, 0}, {3097, 1219, 0, 0}, {1761, 712, 0, 0},
+ {5482, 2762, 0, 0}},
+ {{6174, 1556, 0, 0}, {1560, 186, 0, 0}, {933, 131, 0, 0},
+ {2173, 562, 0, 0}}
+ },
+ {
+ {{17529, 2836, 0, 0}, {1453, 673, 0, 0}, {638, 334, 0, 0},
+ {1904, 772, 0, 0}},
+ {{6489, 1800, 0, 0}, {1626, 273, 0, 0}, {1055, 228, 0, 0},
+ {839, 174, 0, 0}}
+ },
+ {
+ {{30124, 7570, 0, 0}, {730, 317, 0, 0}, {129, 73, 0, 0},
+ {602, 250, 0, 0}},
+ {{15581, 5100, 0, 0}, {1054, 218, 0, 0}, {485, 90, 0, 0},
+ {838, 205, 0, 0}}
+ },
+ {
+ {{31724, 30511, 0, 0}, {2013, 845, 0, 0}, {560, 75, 0, 0},
+ {524, 153, 0, 0}},
+ {{11451, 6561, 0, 0}, {3635, 1900, 0, 0}, {3457, 1537, 0, 0},
+ {3111, 1681, 0, 0}}
+ },
+ {
+ {{32290, 30934, 0, 0}, {1763, 781, 0, 0}, {451, 44, 0, 0},
+ {1903, 120, 0, 0}},
+ {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+ {21845, 10923, 0, 0}}
+ }
+ },
+ {
+ {
+ {{12676, 1994, 0, 0}, {2073, 748, 0, 0}, {1637, 665, 0, 0},
+ {4102, 1898, 0, 0}},
+ {{5510, 1673, 0, 0}, {964, 145, 0, 0}, {1005, 240, 0, 0},
+ {1330, 262, 0, 0}}
+ },
+ {
+ {{14719, 2279, 0, 0}, {1062, 482, 0, 0}, {605, 295, 0, 0},
+ {1218, 584, 0, 0}},
+ {{5652, 1926, 0, 0}, {797, 170, 0, 0}, {680, 192, 0, 0},
+ {701, 104, 0, 0}}
+ },
+ {
+ {{19914, 3675, 0, 0}, {496, 210, 0, 0}, {101, 39, 0, 0},
+ {462, 183, 0, 0}},
+ {{7292, 2402, 0, 0}, {599, 81, 0, 0}, {289, 79, 0, 0},
+ {1095, 134, 0, 0}}
+ },
+ {
+ {{29959, 13467, 0, 0}, {563, 146, 0, 0}, {430, 38, 0, 0},
+ {982, 152, 0, 0}},
+ {{10031, 3663, 0, 0}, {1958, 406, 0, 0}, {2754, 141, 0, 0},
+ {2240, 194, 0, 0}}
+ },
+ {
+ {{31833, 29386, 0, 0}, {1979, 859, 0, 0}, {302, 12, 0, 0},
+ {1908, 255, 0, 0}},
+ {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+ {21845, 10923, 0, 0}}
+ }
+ },
+ {
+ {
+ {{10271, 1570, 0, 0}, {1053, 273, 0, 0}, {1162, 431, 0, 0},
+ {2380, 778, 0, 0}},
+ {{4891, 1184, 0, 0}, {598, 40, 0, 0}, {613, 80, 0, 0},
+ {549, 66, 0, 0}}
+ },
+ {
+ {{11311, 1725, 0, 0}, {817, 285, 0, 0}, {615, 206, 0, 0},
+ {1295, 553, 0, 0}},
+ {{5210, 1617, 0, 0}, {748, 128, 0, 0}, {671, 193, 0, 0},
+ {526, 49, 0, 0}}
+ },
+ {
+ {{12788, 2177, 0, 0}, {549, 171, 0, 0}, {187, 62, 0, 0},
+ {965, 481, 0, 0}},
+ {{6295, 2261, 0, 0}, {337, 45, 0, 0}, {572, 157, 0, 0},
+ {1180, 240, 0, 0}}
+ },
+ {
+ {{8121, 2305, 0, 0}, {356, 73, 0, 0}, {300, 48, 0, 0},
+ {1499, 245, 0, 0}},
+ {{4286, 1263, 0, 0}, {616, 67, 0, 0}, {1036, 170, 0, 0},
+ {1001, 56, 0, 0}}
+ },
+ {
+ {{20410, 7791, 0, 0}, {1437, 383, 0, 0}, {134, 12, 0, 0},
+ {2357, 220, 0, 0}},
+ {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+ {21845, 10923, 0, 0}}
+ }
+ }
+};
+/* clang-format on */
+
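+// Default CDFs for the base level of the remaining coefficients, indexed by
+// quantizer context, square transform size, plane type and context.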
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseCdf
+ [kCoefficientQuantizerContexts][kNumSquareTransformSizes][kNumPlaneTypes]
+ [kCoeffBaseContexts][kCoeffBaseSymbolCount + 1] = {
+ {{{{28734, 23838, 20041, 0, 0}, {14686, 3027, 891, 0, 0},
+ {20172, 6644, 2275, 0, 0}, {23322, 11650, 5763, 0, 0},
+ {26460, 17627, 11489, 0, 0}, {30305, 26411, 22985, 0, 0},
+ {12101, 2222, 839, 0, 0}, {19725, 6645, 2634, 0, 0},
+ {24617, 14011, 7990, 0, 0}, {27513, 19929, 14136, 0, 0},
+ {29948, 25562, 21607, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {17032, 5215, 2164, 0, 0},
+ {21558, 8974, 3981, 0, 0}, {26821, 18894, 13067, 0, 0},
+ {28553, 23445, 18877, 0, 0}, {29935, 26306, 22709, 0, 0},
+ {13163, 2375, 1186, 0, 0}, {19245, 6516, 2520, 0, 0},
+ {24322, 14146, 8256, 0, 0}, {28950, 22425, 16794, 0, 0},
+ {31287, 28651, 25972, 0, 0}, {10119, 1466, 578, 0, 0},
+ {17939, 5641, 2319, 0, 0}, {24455, 15066, 9464, 0, 0},
+ {29746, 24467, 19982, 0, 0}, {31232, 28356, 25584, 0, 0},
+ {10414, 2994, 1396, 0, 0}, {18045, 7296, 3554, 0, 0},
+ {26095, 19023, 14106, 0, 0}, {30700, 27002, 23446, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{26466, 16324, 11007, 0, 0}, {9728, 1230, 293, 0, 0},
+ {17572, 4316, 1272, 0, 0}, {22748, 9822, 4254, 0, 0},
+ {26235, 15906, 9267, 0, 0}, {29230, 22952, 17692, 0, 0},
+ {8324, 893, 243, 0, 0}, {16887, 3844, 1133, 0, 0},
+ {22846, 9895, 4302, 0, 0}, {26241, 15802, 9077, 0, 0},
+ {28654, 21465, 15548, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {12567, 1998, 559, 0, 0},
+ {18014, 4697, 1510, 0, 0}, {24390, 12582, 6251, 0, 0},
+ {26852, 17469, 10790, 0, 0}, {28500, 21185, 14867, 0, 0},
+ {8407, 743, 187, 0, 0}, {14095, 2663, 825, 0, 0},
+ {22572, 10524, 5192, 0, 0}, {27273, 18419, 12351, 0, 0},
+ {30092, 25353, 21270, 0, 0}, {8090, 810, 183, 0, 0},
+ {14139, 2862, 937, 0, 0}, {23404, 12044, 6453, 0, 0},
+ {28127, 20450, 14674, 0, 0}, {30010, 25381, 21189, 0, 0},
+ {7335, 926, 299, 0, 0}, {13973, 3479, 1357, 0, 0},
+ {25124, 15184, 9176, 0, 0}, {29360, 23754, 17721, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{28232, 22696, 18767, 0, 0}, {7309, 1352, 562, 0, 0},
+ {16163, 4720, 1950, 0, 0}, {21760, 9911, 5049, 0, 0},
+ {25853, 16500, 10453, 0, 0}, {30143, 25956, 22231, 0, 0},
+ {8511, 980, 269, 0, 0}, {15888, 3314, 889, 0, 0},
+ {20810, 7714, 2990, 0, 0}, {24852, 14050, 7684, 0, 0},
+ {29385, 23991, 19322, 0, 0}, {10048, 1165, 375, 0, 0},
+ {17808, 4643, 1433, 0, 0}, {23037, 10558, 4840, 0, 0},
+ {26464, 16936, 10491, 0, 0}, {29858, 24950, 20602, 0, 0},
+ {12393, 2141, 637, 0, 0}, {18864, 5484, 1881, 0, 0},
+ {23400, 11210, 5624, 0, 0}, {26831, 17802, 11649, 0, 0},
+ {30101, 25543, 21449, 0, 0}, {8798, 1298, 390, 0, 0},
+ {15595, 3034, 750, 0, 0}, {19973, 7327, 2803, 0, 0},
+ {23787, 13088, 6875, 0, 0}, {28040, 21396, 15866, 0, 0},
+ {8481, 971, 329, 0, 0}, {16065, 3623, 1072, 0, 0},
+ {21935, 9214, 4043, 0, 0}, {26300, 16202, 9711, 0, 0},
+ {30353, 26206, 22490, 0, 0}, {6158, 373, 109, 0, 0},
+ {14178, 2270, 651, 0, 0}, {20348, 7012, 2818, 0, 0},
+ {25129, 14022, 8058, 0, 0}, {29767, 24682, 20421, 0, 0},
+ {7692, 704, 188, 0, 0}, {14822, 2640, 740, 0, 0},
+ {20744, 7783, 3390, 0, 0}, {25251, 14378, 8464, 0, 0},
+ {29525, 23987, 19437, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{26731, 15997, 10811, 0, 0}, {7994, 1064, 342, 0, 0},
+ {15938, 4179, 1712, 0, 0}, {22166, 9940, 5008, 0, 0},
+ {26035, 15939, 9697, 0, 0}, {29518, 23854, 19212, 0, 0},
+ {7186, 548, 100, 0, 0}, {14109, 2426, 545, 0, 0},
+ {20222, 6619, 2253, 0, 0}, {24348, 12317, 5967, 0, 0},
+ {28132, 20348, 14424, 0, 0}, {5187, 406, 129, 0, 0},
+ {13781, 2685, 790, 0, 0}, {21441, 8520, 3684, 0, 0},
+ {25504, 15049, 8648, 0, 0}, {28773, 22000, 16599, 0, 0},
+ {6875, 937, 281, 0, 0}, {16191, 4181, 1389, 0, 0},
+ {22579, 10020, 4586, 0, 0}, {25936, 15674, 9212, 0, 0},
+ {29060, 22658, 17434, 0, 0}, {6864, 486, 112, 0, 0},
+ {13047, 1976, 492, 0, 0}, {19949, 6525, 2357, 0, 0},
+ {24196, 12154, 5877, 0, 0}, {27404, 18709, 12301, 0, 0},
+ {6188, 330, 91, 0, 0}, {11916, 1543, 428, 0, 0},
+ {20333, 7068, 2801, 0, 0}, {24077, 11943, 5792, 0, 0},
+ {28322, 20559, 15499, 0, 0}, {5418, 339, 72, 0, 0},
+ {11396, 1791, 496, 0, 0}, {20095, 7498, 2915, 0, 0},
+ {23560, 11843, 6128, 0, 0}, {27750, 19417, 14036, 0, 0},
+ {5417, 289, 55, 0, 0}, {11370, 1559, 381, 0, 0},
+ {20606, 7721, 2926, 0, 0}, {24872, 14077, 7449, 0, 0},
+ {28098, 19886, 13887, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{27281, 22308, 19060, 0, 0}, {11171, 4465, 2094, 0, 0},
+ {21731, 10815, 6292, 0, 0}, {24621, 14806, 9816, 0, 0},
+ {27526, 19707, 14236, 0, 0}, {30879, 27560, 24586, 0, 0},
+ {5994, 635, 178, 0, 0}, {14924, 3204, 1001, 0, 0},
+ {21078, 8330, 3597, 0, 0}, {25226, 14553, 8309, 0, 0},
+ {29775, 24718, 20449, 0, 0}, {4745, 440, 177, 0, 0},
+ {14117, 2642, 814, 0, 0}, {20604, 7622, 3179, 0, 0},
+ {25006, 14238, 7997, 0, 0}, {29276, 23585, 18848, 0, 0},
+ {5177, 760, 277, 0, 0}, {15619, 3915, 1258, 0, 0},
+ {21283, 8765, 3908, 0, 0}, {25071, 14682, 8558, 0, 0},
+ {29693, 24769, 20550, 0, 0}, {4500, 286, 114, 0, 0},
+ {13137, 1717, 364, 0, 0}, {18908, 5508, 1748, 0, 0},
+ {23163, 11155, 5174, 0, 0}, {27892, 20606, 14860, 0, 0},
+ {5520, 452, 192, 0, 0}, {13813, 2311, 693, 0, 0},
+ {20944, 8771, 3973, 0, 0}, {25422, 14572, 8121, 0, 0},
+ {29365, 23521, 18657, 0, 0}, {3057, 113, 33, 0, 0},
+ {11599, 1374, 351, 0, 0}, {19281, 5570, 1811, 0, 0},
+ {23940, 11085, 5154, 0, 0}, {28498, 21317, 15730, 0, 0},
+ {4060, 190, 37, 0, 0}, {12648, 1527, 286, 0, 0},
+ {19076, 5218, 1447, 0, 0}, {23350, 10254, 4329, 0, 0},
+ {27769, 19485, 13306, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{27095, 18466, 13057, 0, 0}, {6517, 2067, 934, 0, 0},
+ {19986, 8985, 4965, 0, 0}, {23641, 12111, 6960, 0, 0},
+ {26400, 16560, 11306, 0, 0}, {30303, 25591, 21946, 0, 0},
+ {2807, 205, 49, 0, 0}, {14450, 2877, 819, 0, 0},
+ {21407, 8254, 3411, 0, 0}, {24868, 13165, 7161, 0, 0},
+ {28766, 22178, 17222, 0, 0}, {3131, 458, 173, 0, 0},
+ {14472, 2855, 959, 0, 0}, {22624, 11253, 5897, 0, 0},
+ {27410, 18446, 12374, 0, 0}, {29701, 24406, 19422, 0, 0},
+ {4116, 298, 92, 0, 0}, {15230, 1997, 559, 0, 0},
+ {18844, 5886, 2274, 0, 0}, {22272, 9931, 4899, 0, 0},
+ {25532, 16372, 11147, 0, 0}, {2025, 81, 22, 0, 0},
+ {9762, 1092, 279, 0, 0}, {18274, 4940, 1648, 0, 0},
+ {22594, 9967, 4416, 0, 0}, {26526, 17487, 11725, 0, 0},
+ {6951, 525, 48, 0, 0}, {14150, 1401, 443, 0, 0},
+ {18771, 4450, 890, 0, 0}, {20513, 6234, 1385, 0, 0},
+ {23207, 11180, 4318, 0, 0}, {4580, 133, 44, 0, 0},
+ {10708, 403, 40, 0, 0}, {14666, 2078, 240, 0, 0},
+ {18572, 3904, 769, 0, 0}, {20506, 6976, 1903, 0, 0},
+ {8592, 659, 140, 0, 0}, {14488, 3087, 805, 0, 0},
+ {22563, 9065, 3104, 0, 0}, {24879, 12743, 5092, 0, 0},
+ {26708, 16025, 8798, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{27627, 25672, 24508, 0, 0}, {5582, 3746, 2979, 0, 0},
+ {26100, 20200, 17086, 0, 0}, {30596, 26587, 24130, 0, 0},
+ {31642, 29389, 28237, 0, 0}, {32325, 31407, 30514, 0, 0},
+ {6685, 1615, 332, 0, 0}, {19282, 8165, 4285, 0, 0},
+ {26260, 17928, 12858, 0, 0}, {29382, 23968, 19482, 0, 0},
+ {31238, 28446, 25714, 0, 0}, {3129, 688, 220, 0, 0},
+ {16871, 5216, 2478, 0, 0}, {24180, 12721, 7385, 0, 0},
+ {27879, 19429, 13499, 0, 0}, {30528, 25897, 22270, 0, 0},
+ {4603, 571, 251, 0, 0}, {12033, 2341, 1200, 0, 0},
+ {18443, 8097, 5076, 0, 0}, {27649, 20214, 14963, 0, 0},
+ {30958, 27327, 24507, 0, 0}, {1556, 44, 20, 0, 0},
+ {9416, 1002, 223, 0, 0}, {18099, 5198, 1709, 0, 0},
+ {24276, 11874, 5496, 0, 0}, {29124, 22574, 17564, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{30307, 25755, 23397, 0, 0}, {8019, 3168, 1782, 0, 0},
+ {23302, 13731, 10351, 0, 0}, {29184, 23488, 18368, 0, 0},
+ {31263, 28839, 27335, 0, 0}, {32091, 31268, 30032, 0, 0},
+ {8781, 2066, 651, 0, 0}, {19214, 8197, 3505, 0, 0},
+ {26557, 18212, 11613, 0, 0}, {29633, 21796, 17143, 0, 0},
+ {30333, 25641, 21341, 0, 0}, {1468, 236, 218, 0, 0},
+ {18011, 2403, 814, 0, 0}, {28363, 21156, 14215, 0, 0},
+ {32188, 28636, 25446, 0, 0}, {31073, 22599, 18644, 0, 0},
+ {2760, 486, 177, 0, 0}, {13524, 2660, 1020, 0, 0},
+ {21588, 8610, 3213, 0, 0}, {27118, 17796, 13559, 0, 0},
+ {30654, 27659, 24312, 0, 0}, {912, 52, 20, 0, 0},
+ {9756, 1104, 196, 0, 0}, {19074, 6112, 2132, 0, 0},
+ {24626, 13260, 6675, 0, 0}, {28515, 21813, 16044, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{32167, 31785, 31457, 0, 0}, {14043, 9362, 4681, 0, 0},
+ {27307, 24576, 21845, 0, 0}, {28987, 17644, 11343, 0, 0},
+ {30181, 25007, 20696, 0, 0}, {32662, 32310, 31958, 0, 0},
+ {10486, 3058, 874, 0, 0}, {24260, 11842, 6784, 0, 0},
+ {29042, 20055, 14685, 0, 0}, {31148, 25656, 21875, 0, 0},
+ {32039, 30532, 29273, 0, 0}, {2605, 294, 84, 0, 0},
+ {14464, 2304, 768, 0, 0}, {21325, 6242, 3121, 0, 0},
+ {26761, 17476, 11469, 0, 0}, {30534, 26065, 23831, 0, 0},
+ {1814, 591, 197, 0, 0}, {15405, 3206, 1692, 0, 0},
+ {23082, 10304, 5358, 0, 0}, {24576, 16384, 11378, 0, 0},
+ {31013, 24722, 21504, 0, 0}, {1600, 34, 20, 0, 0},
+ {10282, 1327, 297, 0, 0}, {19935, 7141, 3030, 0, 0},
+ {25788, 15389, 9646, 0, 0}, {29657, 23881, 19289, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}},
+ {{{{26727, 20914, 16841, 0, 0}, {12442, 1863, 517, 0, 0},
+ {18604, 5937, 2043, 0, 0}, {23008, 12121, 6183, 0, 0},
+ {26352, 17815, 11549, 0, 0}, {29802, 25617, 21877, 0, 0},
+ {9201, 1394, 514, 0, 0}, {17790, 5352, 1822, 0, 0},
+ {23334, 12543, 6514, 0, 0}, {26110, 18210, 12233, 0, 0},
+ {28852, 24091, 19779, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {14680, 3223, 1181, 0, 0},
+ {19706, 6925, 2695, 0, 0}, {23828, 15941, 10517, 0, 0},
+ {25114, 19548, 14795, 0, 0}, {27035, 22452, 18312, 0, 0},
+ {9889, 1380, 654, 0, 0}, {17553, 4775, 1813, 0, 0},
+ {23371, 13323, 7790, 0, 0}, {29326, 22955, 17424, 0, 0},
+ {31400, 28832, 26236, 0, 0}, {7274, 735, 362, 0, 0},
+ {15996, 4805, 2050, 0, 0}, {23349, 14603, 9508, 0, 0},
+ {30091, 25267, 20971, 0, 0}, {31252, 28424, 25598, 0, 0},
+ {6212, 1314, 667, 0, 0}, {15640, 5733, 2660, 0, 0},
+ {24444, 17424, 12519, 0, 0}, {30865, 27072, 23299, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24313, 13765, 8400, 0, 0}, {9205, 747, 164, 0, 0},
+ {16531, 3322, 833, 0, 0}, {22044, 8769, 3410, 0, 0},
+ {26043, 15240, 8352, 0, 0}, {28841, 21841, 15943, 0, 0},
+ {6455, 480, 134, 0, 0}, {15338, 2673, 673, 0, 0},
+ {21652, 8162, 3089, 0, 0}, {25573, 14384, 7499, 0, 0},
+ {28042, 19916, 13453, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {9946, 1120, 285, 0, 0},
+ {16044, 3135, 839, 0, 0}, {22507, 9735, 4043, 0, 0},
+ {25739, 14928, 8240, 0, 0}, {27901, 18882, 11266, 0, 0},
+ {7470, 876, 277, 0, 0}, {14959, 3438, 1256, 0, 0},
+ {23100, 11439, 6189, 0, 0}, {27994, 19812, 13792, 0, 0},
+ {30446, 25738, 21228, 0, 0}, {7296, 848, 225, 0, 0},
+ {14811, 3381, 1136, 0, 0}, {23572, 12175, 6368, 0, 0},
+ {28088, 20063, 13566, 0, 0}, {29851, 24312, 19332, 0, 0},
+ {6297, 709, 194, 0, 0}, {14310, 2985, 859, 0, 0},
+ {24368, 13304, 6812, 0, 0}, {28956, 21795, 15562, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{25989, 19025, 15090, 0, 0}, {7962, 971, 311, 0, 0},
+ {15152, 3721, 1396, 0, 0}, {21705, 9593, 4765, 0, 0},
+ {26247, 16658, 10444, 0, 0}, {30004, 25264, 21114, 0, 0},
+ {7502, 401, 131, 0, 0}, {13714, 2215, 593, 0, 0},
+ {20629, 7556, 2961, 0, 0}, {25457, 14606, 8064, 0, 0},
+ {29371, 23604, 18694, 0, 0}, {6780, 560, 246, 0, 0},
+ {16515, 3856, 1242, 0, 0}, {23617, 11381, 5396, 0, 0},
+ {27080, 17853, 11272, 0, 0}, {30051, 25141, 20764, 0, 0},
+ {9624, 913, 325, 0, 0}, {16698, 4277, 1443, 0, 0},
+ {24066, 12301, 6251, 0, 0}, {27525, 18812, 12401, 0, 0},
+ {30147, 25433, 21201, 0, 0}, {6132, 428, 138, 0, 0},
+ {12778, 1718, 427, 0, 0}, {19525, 6663, 2453, 0, 0},
+ {24180, 13247, 6850, 0, 0}, {28051, 21183, 15464, 0, 0},
+ {6924, 476, 186, 0, 0}, {13678, 2133, 671, 0, 0},
+ {20805, 8222, 3829, 0, 0}, {26550, 16681, 10414, 0, 0},
+ {30428, 26160, 22342, 0, 0}, {4722, 192, 74, 0, 0},
+ {11590, 1455, 472, 0, 0}, {19282, 6584, 2898, 0, 0},
+ {25619, 14897, 9045, 0, 0}, {29935, 24810, 20509, 0, 0},
+ {5058, 240, 82, 0, 0}, {12094, 1692, 500, 0, 0},
+ {20355, 7813, 3525, 0, 0}, {26092, 15841, 9671, 0, 0},
+ {29802, 24435, 19849, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24129, 13429, 8339, 0, 0}, {8364, 931, 243, 0, 0},
+ {15771, 3343, 984, 0, 0}, {21515, 8534, 3619, 0, 0},
+ {26017, 15374, 8740, 0, 0}, {29278, 22938, 17577, 0, 0},
+ {6485, 297, 54, 0, 0}, {13169, 1600, 326, 0, 0},
+ {19622, 5814, 1875, 0, 0}, {24554, 12180, 5878, 0, 0},
+ {28069, 19687, 13468, 0, 0}, {4556, 310, 99, 0, 0},
+ {14174, 2452, 668, 0, 0}, {21549, 8360, 3534, 0, 0},
+ {25903, 15112, 8619, 0, 0}, {29090, 22406, 16762, 0, 0},
+ {6943, 632, 152, 0, 0}, {15455, 2915, 747, 0, 0},
+ {21571, 8297, 3296, 0, 0}, {25821, 14987, 8363, 0, 0},
+ {29000, 22108, 16507, 0, 0}, {5416, 268, 62, 0, 0},
+ {11918, 1300, 299, 0, 0}, {18747, 5061, 1635, 0, 0},
+ {23804, 11020, 4930, 0, 0}, {27331, 18103, 11581, 0, 0},
+ {6464, 276, 70, 0, 0}, {12359, 1388, 383, 0, 0},
+ {19086, 5546, 2136, 0, 0}, {23794, 11532, 6083, 0, 0},
+ {28534, 21103, 15834, 0, 0}, {6495, 411, 57, 0, 0},
+ {12096, 1526, 327, 0, 0}, {18596, 5514, 1866, 0, 0},
+ {22898, 10870, 5493, 0, 0}, {27604, 19262, 13498, 0, 0},
+ {6043, 309, 40, 0, 0}, {11777, 1326, 241, 0, 0},
+ {19697, 6334, 1957, 0, 0}, {24584, 12678, 6026, 0, 0},
+ {27965, 19513, 12873, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{25213, 17826, 14267, 0, 0}, {8358, 1590, 481, 0, 0},
+ {18374, 6030, 2515, 0, 0}, {24355, 13214, 7573, 0, 0},
+ {28002, 19844, 13983, 0, 0}, {30739, 26962, 23561, 0, 0},
+ {5992, 404, 105, 0, 0}, {14036, 2801, 837, 0, 0},
+ {21763, 8982, 3916, 0, 0}, {26302, 15859, 9258, 0, 0},
+ {29724, 24130, 19349, 0, 0}, {3560, 186, 64, 0, 0},
+ {12700, 1911, 560, 0, 0}, {20765, 7683, 3173, 0, 0},
+ {25821, 15018, 8579, 0, 0}, {29523, 23665, 18761, 0, 0},
+ {5409, 303, 99, 0, 0}, {13347, 2154, 594, 0, 0},
+ {20853, 7758, 3189, 0, 0}, {25818, 15092, 8694, 0, 0},
+ {29761, 24295, 19672, 0, 0}, {3766, 92, 33, 0, 0},
+ {10666, 919, 192, 0, 0}, {18360, 4759, 1363, 0, 0},
+ {23741, 11089, 4837, 0, 0}, {28074, 20090, 14020, 0, 0},
+ {4552, 240, 86, 0, 0}, {11919, 1504, 450, 0, 0},
+ {20012, 6953, 3017, 0, 0}, {25203, 13967, 7845, 0, 0},
+ {29259, 23235, 18291, 0, 0}, {2635, 81, 29, 0, 0},
+ {9705, 858, 253, 0, 0}, {18180, 4717, 1636, 0, 0},
+ {23683, 11119, 5311, 0, 0}, {28507, 21114, 15504, 0, 0},
+ {3250, 77, 20, 0, 0}, {10317, 809, 155, 0, 0},
+ {17904, 4046, 1068, 0, 0}, {23073, 9804, 4052, 0, 0},
+ {27836, 19410, 13266, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{26303, 15810, 11080, 0, 0}, {7569, 1254, 408, 0, 0},
+ {17994, 5619, 2161, 0, 0}, {23511, 11330, 5796, 0, 0},
+ {27045, 17585, 10886, 0, 0}, {29618, 23889, 19037, 0, 0},
+ {5779, 506, 86, 0, 0}, {15372, 2831, 683, 0, 0},
+ {21381, 7867, 2984, 0, 0}, {25479, 13947, 7220, 0, 0},
+ {29034, 22191, 16682, 0, 0}, {3040, 267, 73, 0, 0},
+ {15337, 3067, 865, 0, 0}, {22847, 9942, 4468, 0, 0},
+ {26872, 17334, 10700, 0, 0}, {29338, 23122, 18011, 0, 0},
+ {4154, 257, 63, 0, 0}, {13404, 2130, 505, 0, 0},
+ {19639, 6514, 2366, 0, 0}, {24014, 12284, 6328, 0, 0},
+ {28390, 21161, 15658, 0, 0}, {2476, 97, 24, 0, 0},
+ {10988, 1165, 267, 0, 0}, {18454, 4939, 1477, 0, 0},
+ {23157, 10441, 4505, 0, 0}, {27878, 19681, 13703, 0, 0},
+ {6906, 201, 35, 0, 0}, {11974, 718, 201, 0, 0},
+ {15525, 2143, 514, 0, 0}, {19485, 5140, 1294, 0, 0},
+ {23099, 10236, 3850, 0, 0}, {5333, 71, 20, 0, 0},
+ {7846, 378, 54, 0, 0}, {11319, 1264, 232, 0, 0},
+ {16376, 3039, 936, 0, 0}, {21076, 7884, 3692, 0, 0},
+ {8575, 478, 33, 0, 0}, {13859, 1664, 205, 0, 0},
+ {20532, 5927, 1365, 0, 0}, {24597, 10928, 3686, 0, 0},
+ {25544, 15488, 7493, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{29690, 25929, 22878, 0, 0}, {18931, 12318, 8289, 0, 0},
+ {26854, 18546, 13440, 0, 0}, {28902, 22501, 18006, 0, 0},
+ {30156, 25560, 21726, 0, 0}, {31701, 29777, 27992, 0, 0},
+ {6951, 1122, 239, 0, 0}, {19060, 6430, 2383, 0, 0},
+ {25440, 14183, 7898, 0, 0}, {28077, 19688, 13492, 0, 0},
+ {30943, 27515, 24416, 0, 0}, {3382, 453, 144, 0, 0},
+ {15608, 3767, 1408, 0, 0}, {23166, 10906, 5372, 0, 0},
+ {26853, 16996, 10620, 0, 0}, {29982, 24989, 20721, 0, 0},
+ {3522, 318, 105, 0, 0}, {14072, 2839, 950, 0, 0},
+ {22258, 9399, 4208, 0, 0}, {26539, 16269, 9643, 0, 0},
+ {30160, 25320, 21063, 0, 0}, {2015, 58, 20, 0, 0},
+ {11130, 1281, 265, 0, 0}, {19831, 5914, 1898, 0, 0},
+ {24586, 12172, 5798, 0, 0}, {29131, 22499, 17271, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{27524, 20618, 15862, 0, 0}, {12282, 5910, 3067, 0, 0},
+ {25012, 14451, 9033, 0, 0}, {29316, 23512, 19622, 0, 0},
+ {30748, 27562, 24539, 0, 0}, {30967, 27775, 24865, 0, 0},
+ {5717, 910, 237, 0, 0}, {16780, 5237, 2149, 0, 0},
+ {23580, 11284, 6049, 0, 0}, {26495, 15582, 8968, 0, 0},
+ {29660, 23413, 18004, 0, 0}, {1692, 248, 88, 0, 0},
+ {14649, 2731, 918, 0, 0}, {22524, 9799, 5296, 0, 0},
+ {28076, 18691, 13495, 0, 0}, {29074, 21091, 15212, 0, 0},
+ {2708, 187, 48, 0, 0}, {11757, 1993, 648, 0, 0},
+ {20837, 7948, 3479, 0, 0}, {25649, 15106, 8412, 0, 0},
+ {28935, 22062, 16464, 0, 0}, {814, 37, 20, 0, 0},
+ {8855, 1044, 279, 0, 0}, {17248, 4708, 1482, 0, 0},
+ {21251, 9760, 4197, 0, 0}, {26575, 18260, 12139, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{31733, 29961, 28612, 0, 0}, {19606, 14630, 11829, 0, 0},
+ {30072, 26135, 24013, 0, 0}, {31395, 28607, 25915, 0, 0},
+ {31669, 30022, 28052, 0, 0}, {32428, 31747, 31169, 0, 0},
+ {9942, 2349, 633, 0, 0}, {22373, 11006, 5826, 0, 0},
+ {28042, 20361, 15407, 0, 0}, {30321, 25688, 22175, 0, 0},
+ {31541, 29051, 26757, 0, 0}, {4612, 1344, 834, 0, 0},
+ {15853, 5014, 2395, 0, 0}, {23620, 11778, 6337, 0, 0},
+ {26818, 17253, 11620, 0, 0}, {30276, 25441, 21242, 0, 0},
+ {2166, 291, 98, 0, 0}, {12742, 2813, 1200, 0, 0},
+ {21548, 9140, 4663, 0, 0}, {26116, 15749, 9795, 0, 0},
+ {29704, 24232, 19725, 0, 0}, {999, 44, 20, 0, 0},
+ {10538, 1881, 395, 0, 0}, {20534, 7689, 3037, 0, 0},
+ {25442, 13952, 7415, 0, 0}, {28835, 21861, 16152, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}},
+ {{{{23872, 16541, 12138, 0, 0}, {9139, 986, 241, 0, 0},
+ {17595, 5013, 1447, 0, 0}, {22610, 11535, 5386, 0, 0},
+ {26348, 17911, 11210, 0, 0}, {29499, 24613, 20122, 0, 0},
+ {7933, 759, 272, 0, 0}, {16259, 4347, 1189, 0, 0},
+ {21811, 11254, 5350, 0, 0}, {24887, 16838, 10672, 0, 0},
+ {27380, 21808, 16850, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {12023, 1995, 675, 0, 0},
+ {17568, 5547, 1907, 0, 0}, {19736, 11895, 7101, 0, 0},
+ {20483, 14105, 9274, 0, 0}, {21205, 15287, 11279, 0, 0},
+ {6508, 786, 448, 0, 0}, {17371, 4685, 1668, 0, 0},
+ {23026, 13551, 7944, 0, 0}, {29507, 23139, 17406, 0, 0},
+ {31288, 28446, 25269, 0, 0}, {5169, 512, 308, 0, 0},
+ {15911, 5109, 1994, 0, 0}, {23217, 14478, 9020, 0, 0},
+ {29716, 23835, 18665, 0, 0}, {30747, 26858, 22981, 0, 0},
+ {3763, 753, 376, 0, 0}, {15091, 5074, 1905, 0, 0},
+ {23564, 15412, 9549, 0, 0}, {30365, 25252, 19954, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{21960, 10712, 5872, 0, 0}, {7029, 455, 92, 0, 0},
+ {15480, 2565, 547, 0, 0}, {21409, 7890, 2872, 0, 0},
+ {25819, 15001, 7875, 0, 0}, {28481, 20972, 14697, 0, 0},
+ {4888, 247, 63, 0, 0}, {13730, 1764, 354, 0, 0},
+ {20204, 6423, 2000, 0, 0}, {24499, 12821, 5989, 0, 0},
+ {27094, 18111, 11094, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {7026, 449, 97, 0, 0},
+ {13211, 1604, 314, 0, 0}, {19387, 6387, 2013, 0, 0},
+ {22667, 11302, 6046, 0, 0}, {23559, 13118, 5943, 0, 0},
+ {5661, 851, 336, 0, 0}, {14712, 3875, 1565, 0, 0},
+ {22568, 11334, 6004, 0, 0}, {28108, 19855, 13266, 0, 0},
+ {30400, 25838, 20264, 0, 0}, {5808, 610, 155, 0, 0},
+ {14140, 2763, 737, 0, 0}, {22535, 10326, 4536, 0, 0},
+ {27297, 18138, 11252, 0, 0}, {29533, 22001, 15659, 0, 0},
+ {5072, 328, 76, 0, 0}, {12736, 1601, 330, 0, 0},
+ {24068, 11427, 4326, 0, 0}, {27106, 17937, 10973, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{23064, 15474, 11636, 0, 0}, {6006, 490, 135, 0, 0},
+ {14386, 3148, 949, 0, 0}, {21877, 9293, 4045, 0, 0},
+ {26410, 16185, 9459, 0, 0}, {29520, 23650, 18627, 0, 0},
+ {5564, 195, 69, 0, 0}, {12950, 1944, 439, 0, 0},
+ {20996, 7648, 2727, 0, 0}, {25773, 14735, 7729, 0, 0},
+ {29016, 22326, 16670, 0, 0}, {5546, 512, 209, 0, 0},
+ {17412, 4369, 1293, 0, 0}, {23947, 12133, 5711, 0, 0},
+ {27257, 18364, 11529, 0, 0}, {29833, 24546, 19717, 0, 0},
+ {7893, 648, 239, 0, 0}, {17535, 4503, 1323, 0, 0},
+ {24163, 12198, 5836, 0, 0}, {27337, 18355, 11572, 0, 0},
+ {29774, 24427, 19545, 0, 0}, {4567, 164, 68, 0, 0},
+ {11727, 1322, 312, 0, 0}, {19547, 6555, 2293, 0, 0},
+ {24513, 13383, 6731, 0, 0}, {27838, 20183, 13938, 0, 0},
+ {4000, 320, 141, 0, 0}, {13063, 2207, 747, 0, 0},
+ {21196, 9179, 4548, 0, 0}, {27236, 17734, 11322, 0, 0},
+ {30308, 25618, 21312, 0, 0}, {2894, 149, 69, 0, 0},
+ {11147, 1697, 567, 0, 0}, {20257, 8021, 3776, 0, 0},
+ {26487, 16373, 10020, 0, 0}, {29522, 23490, 18271, 0, 0},
+ {3053, 143, 56, 0, 0}, {11810, 1757, 485, 0, 0},
+ {21535, 9097, 3962, 0, 0}, {26756, 16640, 9900, 0, 0},
+ {29341, 22917, 17354, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{21752, 10657, 5974, 0, 0}, {6822, 411, 91, 0, 0},
+ {14878, 2316, 516, 0, 0}, {21090, 7626, 2952, 0, 0},
+ {26048, 15234, 8184, 0, 0}, {28538, 21103, 14948, 0, 0},
+ {4368, 145, 21, 0, 0}, {11604, 1100, 193, 0, 0},
+ {19196, 5380, 1586, 0, 0}, {24534, 12018, 5410, 0, 0},
+ {27703, 18713, 11871, 0, 0}, {3787, 221, 63, 0, 0},
+ {14087, 2225, 529, 0, 0}, {21849, 8693, 3482, 0, 0},
+ {26337, 15569, 8691, 0, 0}, {28949, 22304, 16150, 0, 0},
+ {5898, 301, 75, 0, 0}, {13727, 1937, 421, 0, 0},
+ {20974, 7557, 2752, 0, 0}, {25880, 14749, 7798, 0, 0},
+ {28398, 20405, 13776, 0, 0}, {3190, 98, 24, 0, 0},
+ {9609, 761, 155, 0, 0}, {17453, 4099, 1092, 0, 0},
+ {23470, 10161, 3986, 0, 0}, {26624, 16855, 9800, 0, 0},
+ {4658, 269, 99, 0, 0}, {11194, 1831, 753, 0, 0},
+ {20009, 7950, 4041, 0, 0}, {26223, 16007, 9726, 0, 0},
+ {29119, 22171, 15935, 0, 0}, {4605, 216, 40, 0, 0},
+ {10667, 1299, 304, 0, 0}, {19608, 7296, 2625, 0, 0},
+ {25465, 14084, 7300, 0, 0}, {27527, 18793, 11813, 0, 0},
+ {4368, 137, 24, 0, 0}, {10664, 975, 165, 0, 0},
+ {19211, 6197, 1922, 0, 0}, {25019, 12907, 6093, 0, 0},
+ {27895, 18738, 11534, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{22968, 15133, 11695, 0, 0}, {6615, 883, 241, 0, 0},
+ {17730, 4916, 1762, 0, 0}, {24050, 12204, 6282, 0, 0},
+ {27640, 18692, 12254, 0, 0}, {30132, 25202, 20843, 0, 0},
+ {5217, 264, 67, 0, 0}, {14458, 2714, 668, 0, 0},
+ {22557, 9348, 3686, 0, 0}, {26546, 15892, 8852, 0, 0},
+ {29306, 22814, 17270, 0, 0}, {2777, 135, 47, 0, 0},
+ {12885, 2017, 567, 0, 0}, {21627, 8584, 3483, 0, 0},
+ {26348, 15828, 8994, 0, 0}, {29376, 23015, 17650, 0, 0},
+ {4303, 152, 56, 0, 0}, {12918, 2066, 524, 0, 0},
+ {21785, 8744, 3545, 0, 0}, {26474, 15998, 9186, 0, 0},
+ {29524, 23485, 18259, 0, 0}, {2745, 51, 20, 0, 0},
+ {9828, 736, 142, 0, 0}, {18486, 4840, 1295, 0, 0},
+ {24206, 11441, 4854, 0, 0}, {27922, 19375, 12849, 0, 0},
+ {2787, 178, 73, 0, 0}, {12303, 1805, 602, 0, 0},
+ {21289, 9189, 4573, 0, 0}, {26852, 17120, 10695, 0, 0},
+ {29737, 24163, 19370, 0, 0}, {1622, 77, 29, 0, 0},
+ {9662, 1044, 324, 0, 0}, {18985, 6030, 2329, 0, 0},
+ {24916, 13300, 6961, 0, 0}, {28908, 21644, 15915, 0, 0},
+ {1754, 44, 20, 0, 0}, {9139, 659, 140, 0, 0},
+ {18021, 4653, 1365, 0, 0}, {24223, 11526, 5290, 0, 0},
+ {28194, 19987, 13701, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{23583, 13074, 8080, 0, 0}, {6687, 783, 147, 0, 0},
+ {16753, 3768, 981, 0, 0}, {22226, 9078, 3562, 0, 0},
+ {26036, 14823, 8091, 0, 0}, {28852, 21729, 16046, 0, 0},
+ {4544, 202, 24, 0, 0}, {13668, 1630, 283, 0, 0},
+ {20240, 6148, 1889, 0, 0}, {25027, 12491, 5883, 0, 0},
+ {28202, 19923, 13778, 0, 0}, {2835, 175, 50, 0, 0},
+ {15098, 2435, 613, 0, 0}, {22383, 9168, 3859, 0, 0},
+ {26525, 16532, 10361, 0, 0}, {28792, 22379, 16751, 0, 0},
+ {4391, 207, 30, 0, 0}, {13402, 1593, 286, 0, 0},
+ {19441, 5593, 1674, 0, 0}, {24510, 11999, 5625, 0, 0},
+ {28065, 19570, 13241, 0, 0}, {1682, 62, 20, 0, 0},
+ {9915, 866, 185, 0, 0}, {18009, 4582, 1349, 0, 0},
+ {23484, 10386, 4420, 0, 0}, {27183, 17576, 10900, 0, 0},
+ {4477, 116, 22, 0, 0}, {12919, 661, 197, 0, 0},
+ {17934, 5950, 3554, 0, 0}, {22462, 10174, 4096, 0, 0},
+ {26153, 15384, 9384, 0, 0}, {3821, 164, 23, 0, 0},
+ {7143, 479, 122, 0, 0}, {14010, 4096, 1365, 0, 0},
+ {22751, 9338, 4245, 0, 0}, {25906, 17499, 10637, 0, 0},
+ {8835, 259, 29, 0, 0}, {12841, 1273, 137, 0, 0},
+ {20865, 6745, 2147, 0, 0}, {25742, 12674, 5516, 0, 0},
+ {26770, 14662, 8331, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{28312, 21494, 17235, 0, 0}, {11549, 3689, 1152, 0, 0},
+ {21595, 8994, 4201, 0, 0}, {25486, 14475, 8505, 0, 0},
+ {27878, 19482, 13653, 0, 0}, {30878, 27260, 24109, 0, 0},
+ {6117, 632, 121, 0, 0}, {18138, 4514, 1313, 0, 0},
+ {24052, 11481, 5373, 0, 0}, {27153, 17437, 10760, 0, 0},
+ {30093, 25068, 20618, 0, 0}, {2814, 242, 78, 0, 0},
+ {16642, 3786, 1135, 0, 0}, {23738, 11407, 5416, 0, 0},
+ {27357, 17975, 11497, 0, 0}, {29825, 24346, 19605, 0, 0},
+ {3229, 167, 38, 0, 0}, {14643, 2383, 567, 0, 0},
+ {22346, 8678, 3300, 0, 0}, {26300, 15281, 8330, 0, 0},
+ {29798, 24115, 19237, 0, 0}, {1856, 53, 20, 0, 0},
+ {12102, 1395, 271, 0, 0}, {20259, 6128, 1851, 0, 0},
+ {24710, 12139, 5478, 0, 0}, {28537, 20762, 14716, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{22566, 12135, 7284, 0, 0}, {5432, 1323, 416, 0, 0},
+ {20348, 8384, 4216, 0, 0}, {25120, 14653, 8912, 0, 0},
+ {27106, 18427, 12866, 0, 0}, {29157, 22440, 17378, 0, 0},
+ {1823, 152, 32, 0, 0}, {14086, 2263, 515, 0, 0},
+ {21255, 7432, 2565, 0, 0}, {25319, 13316, 6620, 0, 0},
+ {28286, 19717, 13882, 0, 0}, {746, 78, 21, 0, 0},
+ {14190, 2267, 622, 0, 0}, {21519, 9400, 4137, 0, 0},
+ {27123, 15810, 10610, 0, 0}, {27759, 21324, 16131, 0, 0},
+ {1411, 58, 20, 0, 0}, {11216, 1274, 264, 0, 0},
+ {18877, 5091, 1428, 0, 0}, {23717, 10670, 4596, 0, 0},
+ {27578, 19391, 13282, 0, 0}, {404, 28, 20, 0, 0},
+ {7929, 861, 217, 0, 0}, {15608, 3989, 1072, 0, 0},
+ {20316, 8631, 3166, 0, 0}, {26603, 17379, 10291, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{30193, 25487, 21691, 0, 0}, {18766, 11902, 7366, 0, 0},
+ {26425, 17712, 13110, 0, 0}, {28294, 20910, 15727, 0, 0},
+ {29903, 24469, 20234, 0, 0}, {31424, 28819, 26377, 0, 0},
+ {8048, 1529, 309, 0, 0}, {20183, 7412, 2800, 0, 0},
+ {25587, 14522, 8324, 0, 0}, {27743, 19101, 12883, 0, 0},
+ {30247, 25464, 21163, 0, 0}, {2860, 516, 184, 0, 0},
+ {15347, 3612, 1193, 0, 0}, {22879, 10580, 4986, 0, 0},
+ {26890, 17121, 10645, 0, 0}, {29954, 24103, 19445, 0, 0},
+ {2585, 200, 55, 0, 0}, {14240, 2573, 719, 0, 0},
+ {21786, 8162, 3111, 0, 0}, {25811, 14603, 7537, 0, 0},
+ {29260, 22650, 17300, 0, 0}, {1007, 32, 20, 0, 0},
+ {11727, 1440, 222, 0, 0}, {20200, 6036, 1602, 0, 0},
+ {24716, 12048, 5035, 0, 0}, {28432, 20576, 14372, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}},
+ {{{{25706, 16296, 10449, 0, 0}, {8230, 507, 94, 0, 0},
+ {19093, 4727, 989, 0, 0}, {24178, 12094, 5137, 0, 0},
+ {27083, 18093, 10755, 0, 0}, {29113, 22870, 17037, 0, 0},
+ {6275, 350, 110, 0, 0}, {16392, 3426, 678, 0, 0},
+ {22174, 10119, 3798, 0, 0}, {24592, 15598, 8465, 0, 0},
+ {27163, 20074, 13629, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {8880, 866, 226, 0, 0},
+ {14156, 3081, 781, 0, 0}, {16523, 7916, 3519, 0, 0},
+ {17003, 10160, 5209, 0, 0}, {12873, 8069, 5258, 0, 0},
+ {4367, 556, 311, 0, 0}, {17494, 4943, 1788, 0, 0},
+ {23404, 14640, 8436, 0, 0}, {30485, 24575, 17686, 0, 0},
+ {31540, 28796, 24887, 0, 0}, {3313, 299, 148, 0, 0},
+ {14787, 4523, 1380, 0, 0}, {21847, 12670, 6528, 0, 0},
+ {29025, 20939, 14111, 0, 0}, {30394, 23175, 17053, 0, 0},
+ {1700, 302, 133, 0, 0}, {12447, 3196, 797, 0, 0},
+ {21997, 12513, 5649, 0, 0}, {29973, 22358, 15407, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{23448, 10666, 4928, 0, 0}, {5711, 304, 44, 0, 0},
+ {16437, 2500, 459, 0, 0}, {22449, 8833, 3048, 0, 0},
+ {26579, 16320, 8662, 0, 0}, {29179, 21884, 13960, 0, 0},
+ {3742, 144, 20, 0, 0}, {13542, 1261, 181, 0, 0},
+ {20076, 5847, 1565, 0, 0}, {25719, 13236, 5133, 0, 0},
+ {25041, 17099, 9516, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {4712, 143, 20, 0, 0},
+ {10385, 693, 99, 0, 0}, {17351, 5670, 1019, 0, 0},
+ {14641, 6275, 5578, 0, 0}, {27307, 16384, 10923, 0, 0},
+ {4786, 677, 184, 0, 0}, {13723, 2900, 796, 0, 0},
+ {22371, 10502, 4836, 0, 0}, {26778, 19071, 11268, 0, 0},
+ {30976, 25856, 17664, 0, 0}, {4570, 267, 50, 0, 0},
+ {11234, 1247, 199, 0, 0}, {21659, 7551, 2751, 0, 0},
+ {27097, 17644, 6617, 0, 0}, {28087, 18725, 14043, 0, 0},
+ {4080, 188, 27, 0, 0}, {10192, 689, 107, 0, 0},
+ {22141, 10627, 4428, 0, 0}, {23406, 18725, 4681, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{25014, 15820, 10626, 0, 0}, {7098, 438, 77, 0, 0},
+ {17105, 3543, 774, 0, 0}, {22890, 9480, 3610, 0, 0},
+ {26349, 15680, 8432, 0, 0}, {28909, 21765, 15729, 0, 0},
+ {5206, 173, 43, 0, 0}, {15193, 2180, 369, 0, 0},
+ {21949, 7930, 2459, 0, 0}, {25644, 14082, 6852, 0, 0},
+ {28289, 20080, 13428, 0, 0}, {4383, 292, 95, 0, 0},
+ {17462, 3763, 830, 0, 0}, {23831, 11153, 4446, 0, 0},
+ {26786, 17165, 9982, 0, 0}, {29148, 22501, 16632, 0, 0},
+ {5488, 304, 101, 0, 0}, {17161, 3608, 764, 0, 0},
+ {23677, 10633, 4028, 0, 0}, {26536, 16136, 8748, 0, 0},
+ {28721, 21391, 15096, 0, 0}, {3548, 138, 50, 0, 0},
+ {13118, 1548, 306, 0, 0}, {19718, 6456, 1941, 0, 0},
+ {23540, 11898, 5300, 0, 0}, {26622, 17619, 10797, 0, 0},
+ {2599, 287, 145, 0, 0}, {15556, 3457, 1214, 0, 0},
+ {22857, 11457, 5886, 0, 0}, {28281, 19454, 12396, 0, 0},
+ {30198, 24996, 19879, 0, 0}, {1844, 155, 60, 0, 0},
+ {13278, 2562, 661, 0, 0}, {21536, 8770, 3492, 0, 0},
+ {25999, 14813, 7733, 0, 0}, {28370, 20145, 13554, 0, 0},
+ {2159, 141, 46, 0, 0}, {13398, 2186, 481, 0, 0},
+ {22311, 9149, 3359, 0, 0}, {26325, 15131, 7934, 0, 0},
+ {28123, 19532, 12662, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24142, 12497, 6552, 0, 0}, {6061, 362, 57, 0, 0},
+ {15769, 2439, 482, 0, 0}, {21323, 7645, 2482, 0, 0},
+ {26357, 13940, 7167, 0, 0}, {25967, 20310, 12520, 0, 0},
+ {2850, 86, 20, 0, 0}, {12119, 1029, 150, 0, 0},
+ {19889, 4995, 1187, 0, 0}, {24872, 11017, 4524, 0, 0},
+ {27508, 17898, 9070, 0, 0}, {3516, 175, 37, 0, 0},
+ {15696, 2308, 474, 0, 0}, {22115, 8625, 3403, 0, 0},
+ {26232, 15278, 8785, 0, 0}, {27839, 19598, 12683, 0, 0},
+ {4631, 250, 53, 0, 0}, {14597, 1984, 361, 0, 0},
+ {21331, 7332, 2309, 0, 0}, {25516, 14234, 6592, 0, 0},
+ {28642, 19415, 11790, 0, 0}, {1606, 42, 20, 0, 0},
+ {9751, 546, 67, 0, 0}, {17139, 3535, 722, 0, 0},
+ {23381, 10147, 3288, 0, 0}, {25846, 15152, 7758, 0, 0},
+ {3930, 503, 154, 0, 0}, {13067, 2562, 848, 0, 0},
+ {21554, 10358, 4835, 0, 0}, {27448, 18591, 9734, 0, 0},
+ {27719, 19887, 14941, 0, 0}, {5284, 297, 34, 0, 0},
+ {11692, 1242, 207, 0, 0}, {20061, 6465, 1557, 0, 0},
+ {24599, 11046, 4549, 0, 0}, {26723, 13362, 5726, 0, 0},
+ {5015, 196, 23, 0, 0}, {11936, 890, 115, 0, 0},
+ {19518, 5412, 1094, 0, 0}, {25050, 11260, 2910, 0, 0},
+ {25559, 14418, 7209, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{24892, 15867, 11027, 0, 0}, {8767, 870, 143, 0, 0},
+ {18239, 4809, 1317, 0, 0}, {24495, 11950, 5510, 0, 0},
+ {27490, 18095, 11258, 0, 0}, {29785, 23925, 18729, 0, 0},
+ {4752, 194, 36, 0, 0}, {15297, 2462, 467, 0, 0},
+ {22544, 8705, 3040, 0, 0}, {26166, 14814, 7716, 0, 0},
+ {28766, 21183, 15009, 0, 0}, {2578, 134, 29, 0, 0},
+ {15271, 2486, 498, 0, 0}, {22539, 9039, 3230, 0, 0},
+ {26424, 15557, 8328, 0, 0}, {28919, 21579, 15660, 0, 0},
+ {4198, 185, 42, 0, 0}, {15247, 2607, 530, 0, 0},
+ {22615, 9203, 3390, 0, 0}, {26313, 15427, 8325, 0, 0},
+ {28861, 21726, 15744, 0, 0}, {2079, 53, 20, 0, 0},
+ {11222, 928, 158, 0, 0}, {19221, 5187, 1309, 0, 0},
+ {23856, 11011, 4459, 0, 0}, {27220, 17688, 10722, 0, 0},
+ {1985, 228, 83, 0, 0}, {15228, 3240, 1100, 0, 0},
+ {22608, 11300, 5985, 0, 0}, {28044, 19375, 12714, 0, 0},
+ {30066, 24594, 19666, 0, 0}, {1120, 82, 26, 0, 0},
+ {11814, 1674, 431, 0, 0}, {20348, 7070, 2589, 0, 0},
+ {25464, 13448, 6520, 0, 0}, {28402, 20507, 13904, 0, 0},
+ {1187, 45, 20, 0, 0}, {11395, 1182, 243, 0, 0},
+ {20024, 6143, 1883, 0, 0}, {25337, 12446, 5818, 0, 0},
+ {28076, 19445, 12657, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24935, 14399, 8673, 0, 0}, {6118, 495, 66, 0, 0},
+ {16397, 2807, 577, 0, 0}, {21713, 8686, 3139, 0, 0},
+ {25876, 14124, 7368, 0, 0}, {27762, 19711, 13528, 0, 0},
+ {2934, 102, 20, 0, 0}, {13191, 1433, 198, 0, 0},
+ {20515, 6259, 1646, 0, 0}, {24777, 11996, 5057, 0, 0},
+ {27091, 16858, 9709, 0, 0}, {2659, 236, 48, 0, 0},
+ {16021, 2602, 516, 0, 0}, {22634, 9226, 3584, 0, 0},
+ {26977, 16592, 9212, 0, 0}, {28406, 22354, 15484, 0, 0},
+ {3276, 142, 20, 0, 0}, {12874, 1366, 243, 0, 0},
+ {19826, 5697, 1899, 0, 0}, {24422, 11552, 5363, 0, 0},
+ {26196, 15681, 8909, 0, 0}, {733, 33, 20, 0, 0},
+ {9811, 930, 150, 0, 0}, {18044, 4196, 996, 0, 0},
+ {22404, 8769, 3215, 0, 0}, {25764, 14335, 7113, 0, 0},
+ {5240, 491, 87, 0, 0}, {15809, 1597, 672, 0, 0},
+ {22282, 9175, 4806, 0, 0}, {24576, 16384, 9557, 0, 0},
+ {23831, 14895, 11916, 0, 0}, {5053, 766, 153, 0, 0},
+ {17695, 3277, 1092, 0, 0}, {21504, 8192, 4096, 0, 0},
+ {30427, 14043, 9362, 0, 0}, {25486, 14564, 7282, 0, 0},
+ {4221, 555, 111, 0, 0}, {11980, 2995, 529, 0, 0},
+ {25988, 11299, 2260, 0, 0}, {26810, 17873, 8937, 0, 0},
+ {16384, 10923, 5461, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{26776, 18464, 13003, 0, 0}, {10156, 1530, 312, 0, 0},
+ {19312, 5606, 1681, 0, 0}, {24767, 12706, 6264, 0, 0},
+ {27600, 18663, 12004, 0, 0}, {30136, 24997, 20383, 0, 0},
+ {5734, 424, 59, 0, 0}, {16918, 3353, 771, 0, 0},
+ {23274, 9992, 3927, 0, 0}, {26617, 15938, 8799, 0, 0},
+ {29307, 22729, 17046, 0, 0}, {2634, 199, 37, 0, 0},
+ {17130, 3346, 823, 0, 0}, {23618, 10903, 4550, 0, 0},
+ {27121, 17049, 10092, 0, 0}, {29366, 22996, 17291, 0, 0},
+ {4238, 182, 33, 0, 0}, {15629, 2470, 476, 0, 0},
+ {22568, 8729, 3083, 0, 0}, {26349, 15094, 7982, 0, 0},
+ {29224, 22543, 16944, 0, 0}, {1435, 42, 20, 0, 0},
+ {12150, 1281, 224, 0, 0}, {19867, 5551, 1536, 0, 0},
+ {24144, 11034, 4597, 0, 0}, {27664, 18577, 12020, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{21562, 11678, 6207, 0, 0}, {4009, 489, 97, 0, 0},
+ {18597, 4816, 1199, 0, 0}, {23025, 9861, 3627, 0, 0},
+ {25897, 14882, 7900, 0, 0}, {27808, 19616, 13453, 0, 0},
+ {1691, 107, 20, 0, 0}, {13368, 1573, 253, 0, 0},
+ {20016, 5910, 1728, 0, 0}, {24398, 10670, 4177, 0, 0},
+ {27311, 17395, 10470, 0, 0}, {1071, 62, 20, 0, 0},
+ {14908, 2111, 435, 0, 0}, {20258, 7956, 3507, 0, 0},
+ {26588, 13644, 8046, 0, 0}, {27727, 19220, 14809, 0, 0},
+ {1216, 52, 20, 0, 0}, {10860, 999, 145, 0, 0},
+ {18298, 4567, 1203, 0, 0}, {23275, 9786, 4160, 0, 0},
+ {25910, 15528, 8631, 0, 0}, {225, 16, 12, 0, 0},
+ {8482, 671, 102, 0, 0}, {16810, 3551, 744, 0, 0},
+ {22561, 8534, 2810, 0, 0}, {25839, 14463, 7116, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{28631, 21921, 17086, 0, 0}, {14944, 5767, 2710, 0, 0},
+ {22564, 9972, 4477, 0, 0}, {26692, 16833, 10643, 0, 0},
+ {28916, 21831, 15952, 0, 0}, {30516, 26444, 22637, 0, 0},
+ {6928, 752, 106, 0, 0}, {17659, 4500, 1237, 0, 0},
+ {23383, 10537, 4428, 0, 0}, {26686, 16096, 9289, 0, 0},
+ {29450, 23341, 18087, 0, 0}, {2174, 194, 50, 0, 0},
+ {15932, 3216, 909, 0, 0}, {23212, 10226, 4412, 0, 0},
+ {26463, 16043, 9228, 0, 0}, {29392, 22873, 17584, 0, 0},
+ {3385, 151, 23, 0, 0}, {13877, 1959, 367, 0, 0},
+ {21080, 6826, 2081, 0, 0}, {25300, 13299, 6117, 0, 0},
+ {28859, 21410, 15756, 0, 0}, {1204, 32, 20, 0, 0},
+ {11862, 1157, 168, 0, 0}, {19577, 5147, 1231, 0, 0},
+ {24000, 10739, 4092, 0, 0}, {27689, 18659, 11862, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}}};
+
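+// Reader's note (not part of the upstream source): every CDF entry in these
+// tables is stored as 32768 minus the cumulative probability, so the entry for
+// the last symbol is always 0, and one extra trailing slot holds the
+// adaptation counter. For example, {24576, 16384, 8192, 0, 0} describes a
+// uniform 4-symbol distribution (each symbol has probability 8192/32768 =
+// 0.25) with a counter of 0; it appears throughout these tables wherever a
+// uniform default is used.
+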
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseRangeCdf
+ [kCoefficientQuantizerContexts][kNumSquareTransformSizes][kNumPlaneTypes]
+ [kCoeffBaseRangeContexts][kCoeffBaseRangeSymbolCount + 1] = {
+ {{{{18470, 12050, 8594, 0, 0}, {20232, 13167, 8979, 0, 0},
+ {24056, 17717, 13265, 0, 0}, {26598, 21441, 17334, 0, 0},
+ {28026, 23842, 20230, 0, 0}, {28965, 25451, 22222, 0, 0},
+ {31072, 29451, 27897, 0, 0}, {18376, 12817, 10012, 0, 0},
+ {16790, 9550, 5950, 0, 0}, {20581, 13294, 8879, 0, 0},
+ {23592, 17128, 12509, 0, 0}, {25700, 20113, 15740, 0, 0},
+ {27112, 22326, 18296, 0, 0}, {30188, 27776, 25524, 0, 0},
+ {20632, 14719, 11342, 0, 0}, {18984, 12047, 8287, 0, 0},
+ {21932, 15147, 10868, 0, 0}, {24396, 18324, 13921, 0, 0},
+ {26245, 20989, 16768, 0, 0}, {27431, 22870, 19008, 0, 0},
+ {29734, 26908, 24306, 0, 0}},
+ {{16801, 9863, 6482, 0, 0}, {19234, 12114, 8189, 0, 0},
+ {23264, 16676, 12233, 0, 0}, {25793, 20200, 15865, 0, 0},
+ {27404, 22677, 18748, 0, 0}, {28411, 24398, 20911, 0, 0},
+ {30262, 27834, 25550, 0, 0}, {9736, 3953, 1832, 0, 0},
+ {13228, 6064, 3049, 0, 0}, {17610, 9799, 5671, 0, 0},
+ {21360, 13903, 9118, 0, 0}, {23883, 17320, 12518, 0, 0},
+ {25660, 19915, 15352, 0, 0}, {28537, 24727, 21288, 0, 0},
+ {12945, 6278, 3612, 0, 0}, {13878, 6839, 3836, 0, 0},
+ {17108, 9277, 5335, 0, 0}, {20621, 12992, 8280, 0, 0},
+ {23040, 15994, 11119, 0, 0}, {24849, 18491, 13702, 0, 0},
+ {27328, 22598, 18583, 0, 0}}},
+ {{{18362, 11906, 8354, 0, 0}, {20944, 13861, 9659, 0, 0},
+ {24511, 18375, 13965, 0, 0}, {26908, 22021, 17990, 0, 0},
+ {28293, 24282, 20784, 0, 0}, {29162, 25814, 22725, 0, 0},
+ {31032, 29358, 27720, 0, 0}, {18338, 12722, 9886, 0, 0},
+ {17175, 9869, 6059, 0, 0}, {20666, 13400, 8957, 0, 0},
+ {23709, 17184, 12506, 0, 0}, {25769, 20165, 15720, 0, 0},
+ {27084, 22271, 18215, 0, 0}, {29946, 27330, 24906, 0, 0},
+ {16983, 11183, 8409, 0, 0}, {14421, 7539, 4502, 0, 0},
+ {17794, 10281, 6379, 0, 0}, {21345, 14087, 9497, 0, 0},
+ {23905, 17418, 12760, 0, 0}, {25615, 19916, 15490, 0, 0},
+ {29061, 25732, 22786, 0, 0}},
+ {{17308, 11072, 7299, 0, 0}, {20598, 13519, 9577, 0, 0},
+ {24045, 17741, 13436, 0, 0}, {26340, 21064, 16894, 0, 0},
+ {27846, 23476, 19716, 0, 0}, {28629, 25073, 21758, 0, 0},
+ {30477, 28260, 26170, 0, 0}, {12912, 5848, 2940, 0, 0},
+ {14845, 7479, 3976, 0, 0}, {18490, 10800, 6471, 0, 0},
+ {21858, 14632, 9818, 0, 0}, {24345, 17953, 13141, 0, 0},
+ {25997, 20485, 15994, 0, 0}, {28694, 25018, 21687, 0, 0},
+ {12916, 6694, 4096, 0, 0}, {13397, 6658, 3779, 0, 0},
+ {16503, 8895, 5105, 0, 0}, {20010, 12390, 7816, 0, 0},
+ {22673, 15670, 10807, 0, 0}, {24518, 18140, 13317, 0, 0},
+ {27563, 23023, 19146, 0, 0}}},
+ {{{22205, 16535, 13005, 0, 0}, {22974, 16746, 12964, 0, 0},
+ {26018, 20823, 17009, 0, 0}, {27805, 23582, 20016, 0, 0},
+ {28923, 25333, 22141, 0, 0}, {29717, 26683, 23934, 0, 0},
+ {31457, 30172, 28938, 0, 0}, {21522, 16364, 13079, 0, 0},
+ {20453, 13857, 10037, 0, 0}, {22211, 15673, 11479, 0, 0},
+ {24632, 18762, 14519, 0, 0}, {26420, 21294, 17203, 0, 0},
+ {27572, 23113, 19368, 0, 0}, {30419, 28242, 26181, 0, 0},
+ {19431, 14038, 11199, 0, 0}, {13462, 6697, 3886, 0, 0},
+ {16816, 9228, 5514, 0, 0}, {20359, 12834, 8338, 0, 0},
+ {23008, 16062, 11379, 0, 0}, {24764, 18548, 13950, 0, 0},
+ {28630, 24974, 21807, 0, 0}},
+ {{21898, 16084, 11819, 0, 0}, {23104, 17538, 14088, 0, 0},
+ {25882, 20659, 17360, 0, 0}, {27943, 23868, 20463, 0, 0},
+ {29138, 25606, 22454, 0, 0}, {29732, 26339, 23381, 0, 0},
+ {31097, 29472, 27828, 0, 0}, {18949, 13609, 9742, 0, 0},
+ {20784, 13660, 9648, 0, 0}, {22078, 15558, 11105, 0, 0},
+ {24784, 18614, 14435, 0, 0}, {25900, 20474, 16644, 0, 0},
+ {27494, 23774, 19900, 0, 0}, {29780, 26997, 24344, 0, 0},
+ {13032, 6121, 3627, 0, 0}, {13835, 6698, 3784, 0, 0},
+ {16989, 9720, 5568, 0, 0}, {20130, 12707, 8236, 0, 0},
+ {22076, 15223, 10548, 0, 0}, {23551, 17517, 12714, 0, 0},
+ {27690, 23484, 20174, 0, 0}}},
+ {{{30437, 29106, 27524, 0, 0}, {29877, 27997, 26623, 0, 0},
+ {28170, 25145, 23039, 0, 0}, {29248, 25923, 23569, 0, 0},
+ {29351, 26649, 23444, 0, 0}, {30167, 27356, 25383, 0, 0},
+ {32168, 31595, 31024, 0, 0}, {25096, 19482, 15299, 0, 0},
+ {28536, 24976, 21975, 0, 0}, {29853, 27451, 25371, 0, 0},
+ {30450, 28412, 26616, 0, 0}, {30641, 28768, 27214, 0, 0},
+ {30918, 29290, 27493, 0, 0}, {31791, 30835, 29925, 0, 0},
+ {14488, 8381, 4779, 0, 0}, {16916, 10097, 6583, 0, 0},
+ {18923, 11817, 7979, 0, 0}, {21713, 14802, 10639, 0, 0},
+ {23630, 17346, 12967, 0, 0}, {25314, 19623, 15312, 0, 0},
+ {29398, 26375, 23755, 0, 0}},
+ {{26926, 23539, 21930, 0, 0}, {30455, 29277, 28492, 0, 0},
+ {29770, 26664, 25272, 0, 0}, {30348, 25321, 22900, 0, 0},
+ {29734, 24273, 21845, 0, 0}, {28692, 23831, 21793, 0, 0},
+ {31682, 30398, 29469, 0, 0}, {23054, 15514, 12324, 0, 0},
+ {24225, 19070, 15645, 0, 0}, {27850, 23761, 20858, 0, 0},
+ {28639, 25236, 22215, 0, 0}, {30404, 27235, 24710, 0, 0},
+ {30934, 29222, 27205, 0, 0}, {31295, 29860, 28635, 0, 0},
+ {17363, 11575, 7149, 0, 0}, {17077, 10816, 6207, 0, 0},
+ {19806, 13574, 8603, 0, 0}, {22496, 14913, 10639, 0, 0},
+ {24180, 17498, 12050, 0, 0}, {24086, 18099, 13268, 0, 0},
+ {27898, 23132, 19563, 0, 0}}},
+ {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}}}},
+ {{{{17773, 11427, 8019, 0, 0}, {19610, 12479, 8167, 0, 0},
+ {23827, 17442, 12892, 0, 0}, {26471, 21227, 16961, 0, 0},
+ {27951, 23739, 19992, 0, 0}, {29037, 25495, 22141, 0, 0},
+ {30921, 29151, 27414, 0, 0}, {18296, 13109, 10425, 0, 0},
+ {15962, 8606, 5235, 0, 0}, {19868, 12364, 8055, 0, 0},
+ {23357, 16656, 11971, 0, 0}, {25712, 20071, 15620, 0, 0},
+ {27224, 22429, 18308, 0, 0}, {29814, 27064, 24449, 0, 0},
+ {20304, 14697, 11414, 0, 0}, {17286, 10240, 6734, 0, 0},
+ {20698, 13499, 9144, 0, 0}, {23815, 17362, 12662, 0, 0},
+ {25741, 20038, 15548, 0, 0}, {26881, 21855, 17628, 0, 0},
+ {28975, 25490, 22321, 0, 0}},
+ {{17197, 10536, 7019, 0, 0}, {18262, 11193, 7394, 0, 0},
+ {22579, 15679, 11199, 0, 0}, {25452, 19467, 14853, 0, 0},
+ {26985, 21856, 17578, 0, 0}, {28008, 23613, 19680, 0, 0},
+ {29775, 26802, 23994, 0, 0}, {9344, 3865, 1990, 0, 0},
+ {11993, 5102, 2478, 0, 0}, {16294, 8358, 4469, 0, 0},
+ {20297, 12588, 7781, 0, 0}, {23358, 16281, 11329, 0, 0},
+ {25232, 19154, 14239, 0, 0}, {27720, 23182, 19219, 0, 0},
+ {11678, 5478, 3012, 0, 0}, {11972, 5366, 2742, 0, 0},
+ {14949, 7283, 3799, 0, 0}, {18908, 10859, 6306, 0, 0},
+ {21766, 14274, 9239, 0, 0}, {23815, 16839, 11871, 0, 0},
+ {26320, 20850, 16314, 0, 0}}},
+ {{{16769, 10560, 7319, 0, 0}, {19718, 12780, 8646, 0, 0},
+ {24174, 17904, 13390, 0, 0}, {26735, 21689, 17530, 0, 0},
+ {28214, 24085, 20421, 0, 0}, {29096, 25629, 22431, 0, 0},
+ {30868, 28997, 27192, 0, 0}, {16980, 11428, 8819, 0, 0},
+ {15943, 8533, 5010, 0, 0}, {19895, 12366, 7958, 0, 0},
+ {23178, 16405, 11674, 0, 0}, {25416, 19559, 15035, 0, 0},
+ {26808, 21779, 17584, 0, 0}, {29536, 26534, 23761, 0, 0},
+ {17007, 12052, 9544, 0, 0}, {13450, 6779, 4009, 0, 0},
+ {17239, 9674, 5839, 0, 0}, {21106, 13779, 9127, 0, 0},
+ {23813, 17200, 12402, 0, 0}, {25487, 19662, 15060, 0, 0},
+ {28520, 24709, 21328, 0, 0}},
+ {{17869, 11551, 8265, 0, 0}, {19249, 12485, 8721, 0, 0},
+ {23339, 16802, 12403, 0, 0}, {26068, 20413, 16116, 0, 0},
+ {27680, 23064, 19052, 0, 0}, {28525, 24614, 21037, 0, 0},
+ {30066, 27404, 24907, 0, 0}, {10023, 4380, 2314, 0, 0},
+ {12533, 5622, 2846, 0, 0}, {16872, 9053, 5131, 0, 0},
+ {20928, 13418, 8637, 0, 0}, {23646, 16836, 11888, 0, 0},
+ {25280, 19187, 14406, 0, 0}, {27654, 23200, 19398, 0, 0},
+ {11923, 6215, 3836, 0, 0}, {11787, 5396, 2884, 0, 0},
+ {14987, 7433, 3983, 0, 0}, {19008, 11060, 6471, 0, 0},
+ {21793, 14353, 9403, 0, 0}, {23723, 16979, 12082, 0, 0},
+ {26638, 21569, 17345, 0, 0}}},
+ {{{19219, 13044, 9610, 0, 0}, {20924, 14386, 10522, 0, 0},
+ {24849, 19149, 14995, 0, 0}, {27282, 22625, 18822, 0, 0},
+ {28602, 24785, 21444, 0, 0}, {29404, 26262, 23341, 0, 0},
+ {31170, 29608, 28094, 0, 0}, {17487, 11789, 8987, 0, 0},
+ {17829, 10649, 6816, 0, 0}, {21405, 14361, 9956, 0, 0},
+ {24159, 17911, 13398, 0, 0}, {26031, 20584, 16288, 0, 0},
+ {27262, 22505, 18506, 0, 0}, {29778, 26982, 24388, 0, 0},
+ {12519, 7515, 5351, 0, 0}, {11698, 5250, 2767, 0, 0},
+ {15914, 8299, 4694, 0, 0}, {19904, 12282, 7768, 0, 0},
+ {22806, 15790, 10990, 0, 0}, {24694, 18430, 13720, 0, 0},
+ {28274, 24289, 20862, 0, 0}},
+ {{18808, 13151, 9939, 0, 0}, {21618, 15427, 11540, 0, 0},
+ {25618, 19804, 15578, 0, 0}, {27437, 22766, 18901, 0, 0},
+ {28601, 25024, 21711, 0, 0}, {29288, 26139, 23122, 0, 0},
+ {30885, 28984, 27082, 0, 0}, {14016, 7108, 3856, 0, 0},
+ {15800, 8182, 4738, 0, 0}, {19248, 11713, 7455, 0, 0},
+ {22315, 15142, 10488, 0, 0}, {24382, 18263, 13652, 0, 0},
+ {26026, 20173, 15760, 0, 0}, {28495, 24628, 21269, 0, 0},
+ {10648, 4941, 2535, 0, 0}, {12205, 5410, 2873, 0, 0},
+ {15692, 8124, 4615, 0, 0}, {19406, 11826, 7459, 0, 0},
+ {21974, 14803, 10073, 0, 0}, {23754, 17116, 12449, 0, 0},
+ {27060, 22256, 18271, 0, 0}}},
+ {{{27063, 21838, 17043, 0, 0}, {24822, 20003, 16653, 0, 0},
+ {25967, 20645, 16542, 0, 0}, {27306, 22633, 18568, 0, 0},
+ {28579, 24757, 21261, 0, 0}, {29577, 26539, 23360, 0, 0},
+ {31711, 30631, 29556, 0, 0}, {22750, 15701, 11277, 0, 0},
+ {25388, 20186, 16315, 0, 0}, {26700, 21923, 18429, 0, 0},
+ {27670, 23570, 20213, 0, 0}, {28456, 24758, 21649, 0, 0},
+ {29068, 25802, 22987, 0, 0}, {31075, 29442, 27881, 0, 0},
+ {14011, 7838, 4994, 0, 0}, {15120, 8172, 4951, 0, 0},
+ {18061, 10716, 6742, 0, 0}, {21048, 13916, 9476, 0, 0},
+ {23411, 16816, 12243, 0, 0}, {24958, 19015, 14558, 0, 0},
+ {28889, 25435, 22440, 0, 0}},
+ {{24490, 19526, 16846, 0, 0}, {22221, 16901, 13849, 0, 0},
+ {23662, 16926, 12159, 0, 0}, {25935, 19761, 15550, 0, 0},
+ {27957, 23056, 18845, 0, 0}, {28783, 25416, 21640, 0, 0},
+ {31080, 29310, 27506, 0, 0}, {19817, 10907, 6258, 0, 0},
+ {22980, 16724, 12492, 0, 0}, {26459, 21524, 17898, 0, 0},
+ {27585, 23419, 20202, 0, 0}, {28379, 24539, 21276, 0, 0},
+ {29135, 25823, 22148, 0, 0}, {29168, 25921, 22861, 0, 0},
+ {11020, 4631, 2513, 0, 0}, {13332, 6187, 3208, 0, 0},
+ {16409, 8567, 4815, 0, 0}, {18807, 11075, 6897, 0, 0},
+ {21224, 14082, 9446, 0, 0}, {23396, 16306, 11816, 0, 0},
+ {26630, 21558, 17378, 0, 0}}},
+ {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}}}},
+ {{{{16630, 10545, 7259, 0, 0}, {17421, 10338, 6436, 0, 0},
+ {23154, 16032, 11436, 0, 0}, {26168, 20493, 15861, 0, 0},
+ {27957, 23344, 19221, 0, 0}, {29020, 24959, 21348, 0, 0},
+ {30514, 28181, 25878, 0, 0}, {17572, 12484, 9591, 0, 0},
+ {14451, 7299, 4317, 0, 0}, {18850, 11117, 6926, 0, 0},
+ {22716, 15618, 10773, 0, 0}, {25269, 19138, 14181, 0, 0},
+ {26610, 21351, 16765, 0, 0}, {28754, 24983, 21516, 0, 0},
+ {17720, 11701, 8384, 0, 0}, {14566, 7422, 4215, 0, 0},
+ {18466, 10749, 6412, 0, 0}, {21929, 14629, 9602, 0, 0},
+ {24053, 17024, 11962, 0, 0}, {25232, 19192, 14224, 0, 0},
+ {27355, 22433, 18270, 0, 0}},
+ {{15374, 8267, 4873, 0, 0}, {16879, 9348, 5583, 0, 0},
+ {21207, 13635, 8898, 0, 0}, {24483, 17956, 12924, 0, 0},
+ {26272, 20725, 16218, 0, 0}, {27997, 23194, 19091, 0, 0},
+ {29165, 25938, 22624, 0, 0}, {11112, 5064, 2568, 0, 0},
+ {11444, 4853, 2257, 0, 0}, {15441, 7432, 3771, 0, 0},
+ {19351, 11387, 6735, 0, 0}, {22636, 15343, 10430, 0, 0},
+ {24188, 17752, 13135, 0, 0}, {27074, 21291, 16357, 0, 0},
+ {8652, 2988, 1318, 0, 0}, {8915, 3073, 1177, 0, 0},
+ {12683, 5154, 2340, 0, 0}, {17442, 8433, 4193, 0, 0},
+ {20954, 13296, 7958, 0, 0}, {22547, 14157, 8001, 0, 0},
+ {25079, 18210, 12447, 0, 0}}},
+ {{{16554, 10388, 6998, 0, 0}, {18555, 11464, 7473, 0, 0},
+ {23555, 16945, 12313, 0, 0}, {26373, 21010, 16629, 0, 0},
+ {27989, 23581, 19702, 0, 0}, {28947, 25267, 21815, 0, 0},
+ {30475, 28201, 25973, 0, 0}, {16909, 11485, 8948, 0, 0},
+ {14364, 7166, 4042, 0, 0}, {18443, 10788, 6562, 0, 0},
+ {22099, 14831, 10048, 0, 0}, {24471, 18126, 13321, 0, 0},
+ {26022, 20379, 15875, 0, 0}, {28444, 24517, 20998, 0, 0},
+ {16236, 11137, 8293, 0, 0}, {12101, 5618, 3100, 0, 0},
+ {16040, 8258, 4593, 0, 0}, {19907, 12123, 7436, 0, 0},
+ {22692, 15407, 10351, 0, 0}, {24373, 17828, 12805, 0, 0},
+ {27037, 22085, 17856, 0, 0}},
+ {{18335, 11613, 7830, 0, 0}, {18110, 11052, 7223, 0, 0},
+ {22845, 15944, 11211, 0, 0}, {25786, 19716, 15047, 0, 0},
+ {27349, 22265, 17718, 0, 0}, {27916, 23606, 19754, 0, 0},
+ {29497, 26373, 23138, 0, 0}, {10558, 4935, 2659, 0, 0},
+ {12018, 5400, 2947, 0, 0}, {15874, 7940, 4195, 0, 0},
+ {19521, 11492, 7011, 0, 0}, {22730, 15503, 10205, 0, 0},
+ {24181, 17821, 12441, 0, 0}, {27123, 21397, 17516, 0, 0},
+ {10741, 5242, 3054, 0, 0}, {9670, 3622, 1547, 0, 0},
+ {12882, 5427, 2496, 0, 0}, {17159, 9021, 4722, 0, 0},
+ {20775, 12703, 7829, 0, 0}, {23131, 14501, 9097, 0, 0},
+ {25143, 18967, 13624, 0, 0}}},
+ {{{18330, 11970, 8679, 0, 0}, {20147, 13565, 9671, 0, 0},
+ {24591, 18643, 14366, 0, 0}, {27094, 22267, 18312, 0, 0},
+ {28532, 24529, 21035, 0, 0}, {29321, 26018, 22962, 0, 0},
+ {30782, 28818, 26904, 0, 0}, {16560, 10669, 7838, 0, 0},
+ {16231, 8743, 5183, 0, 0}, {19988, 12387, 7901, 0, 0},
+ {23001, 16156, 11352, 0, 0}, {25082, 19030, 14370, 0, 0},
+ {26435, 21154, 16804, 0, 0}, {28827, 25197, 21932, 0, 0},
+ {9949, 5346, 3566, 0, 0}, {10544, 4254, 2047, 0, 0},
+ {15108, 7335, 3855, 0, 0}, {19194, 11286, 6766, 0, 0},
+ {22139, 14791, 9830, 0, 0}, {24156, 17470, 12503, 0, 0},
+ {27161, 22277, 18172, 0, 0}},
+ {{19199, 12968, 9562, 0, 0}, {19640, 12844, 8899, 0, 0},
+ {24439, 17927, 13365, 0, 0}, {26638, 21792, 17711, 0, 0},
+ {28086, 23929, 20250, 0, 0}, {29112, 25359, 22180, 0, 0},
+ {30191, 27669, 25356, 0, 0}, {10341, 4084, 2183, 0, 0},
+ {11855, 5018, 2629, 0, 0}, {16928, 8659, 4934, 0, 0},
+ {20460, 12739, 8199, 0, 0}, {22552, 15983, 11310, 0, 0},
+ {24459, 18565, 13655, 0, 0}, {26725, 21600, 17461, 0, 0},
+ {9602, 3867, 1770, 0, 0}, {10869, 4363, 2017, 0, 0},
+ {14355, 6677, 3325, 0, 0}, {17535, 9654, 5416, 0, 0},
+ {20085, 12296, 7480, 0, 0}, {22066, 14509, 9359, 0, 0},
+ {24643, 18304, 13542, 0, 0}}},
+ {{{23728, 17982, 14408, 0, 0}, {22789, 17050, 13353, 0, 0},
+ {24855, 18850, 14457, 0, 0}, {26909, 21879, 17584, 0, 0},
+ {28175, 24091, 20258, 0, 0}, {28948, 25372, 21977, 0, 0},
+ {31038, 29297, 27576, 0, 0}, {20965, 14403, 10059, 0, 0},
+ {21349, 14710, 10543, 0, 0}, {23350, 16994, 12525, 0, 0},
+ {25229, 19443, 15111, 0, 0}, {26535, 21451, 17384, 0, 0},
+ {27631, 23112, 19223, 0, 0}, {29791, 26994, 24419, 0, 0},
+ {11561, 5522, 3128, 0, 0}, {13221, 6190, 3271, 0, 0},
+ {16599, 8897, 5078, 0, 0}, {19948, 12310, 7750, 0, 0},
+ {22544, 15436, 10554, 0, 0}, {24242, 17720, 12884, 0, 0},
+ {27731, 23358, 19650, 0, 0}},
+ {{20429, 15439, 12628, 0, 0}, {19263, 12873, 9543, 0, 0},
+ {22921, 15824, 11204, 0, 0}, {25488, 19512, 14420, 0, 0},
+ {28056, 22759, 18314, 0, 0}, {28407, 24854, 20291, 0, 0},
+ {29898, 27140, 24773, 0, 0}, {12707, 7264, 4242, 0, 0},
+ {17533, 9890, 6623, 0, 0}, {19783, 12810, 8613, 0, 0},
+ {22986, 16127, 11365, 0, 0}, {23312, 16408, 12008, 0, 0},
+ {25913, 19828, 14211, 0, 0}, {27107, 22204, 17766, 0, 0},
+ {7112, 2166, 874, 0, 0}, {10198, 3661, 1676, 0, 0},
+ {13851, 6345, 3227, 0, 0}, {16828, 9119, 5014, 0, 0},
+ {19965, 12187, 7549, 0, 0}, {21686, 14073, 9392, 0, 0},
+ {24829, 18395, 13763, 0, 0}}},
+ {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}}}},
+ {{{{14453, 8479, 5217, 0, 0}, {15914, 8700, 4933, 0, 0},
+ {22628, 14841, 9595, 0, 0}, {26046, 19786, 14501, 0, 0},
+ {28107, 22942, 18062, 0, 0}, {28936, 24603, 20474, 0, 0},
+ {29973, 26670, 23523, 0, 0}, {15623, 9442, 6096, 0, 0},
+ {12035, 5088, 2460, 0, 0}, {16736, 8307, 4222, 0, 0},
+ {21115, 12675, 7687, 0, 0}, {23478, 16339, 10682, 0, 0},
+ {24972, 18170, 12786, 0, 0}, {26266, 20390, 15327, 0, 0},
+ {11087, 5036, 2448, 0, 0}, {10379, 3724, 1507, 0, 0},
+ {13741, 6037, 2681, 0, 0}, {18029, 9013, 4144, 0, 0},
+ {21410, 11990, 7257, 0, 0}, {21773, 14695, 8578, 0, 0},
+ {23606, 17778, 12151, 0, 0}},
+ {{11343, 4816, 2380, 0, 0}, {14706, 6930, 3734, 0, 0},
+ {20812, 12887, 7960, 0, 0}, {25050, 17768, 11788, 0, 0},
+ {27066, 21514, 16625, 0, 0}, {27870, 23680, 15904, 0, 0},
+ {29089, 25992, 20861, 0, 0}, {9474, 2608, 1105, 0, 0},
+ {8371, 2872, 932, 0, 0}, {13523, 5640, 2175, 0, 0},
+ {19566, 12943, 6364, 0, 0}, {21190, 13471, 8811, 0, 0},
+ {24695, 19471, 11398, 0, 0}, {27307, 21845, 13023, 0, 0},
+ {5401, 2247, 834, 0, 0}, {7864, 2097, 828, 0, 0},
+ {9693, 4308, 1469, 0, 0}, {18368, 9110, 2351, 0, 0},
+ {18883, 8886, 4443, 0, 0}, {18022, 9830, 4915, 0, 0},
+ {27307, 16384, 5461, 0, 0}}},
+ {{{14494, 7955, 4878, 0, 0}, {17231, 9619, 5765, 0, 0},
+ {23319, 16028, 10941, 0, 0}, {26068, 20270, 15507, 0, 0},
+ {27780, 22902, 18570, 0, 0}, {28532, 24621, 20866, 0, 0},
+ {29901, 26908, 24114, 0, 0}, {15644, 9597, 6667, 0, 0},
+ {12372, 5291, 2620, 0, 0}, {16195, 8139, 4276, 0, 0},
+ {20019, 11922, 7094, 0, 0}, {22535, 14890, 9950, 0, 0},
+ {24243, 17436, 12405, 0, 0}, {26485, 21136, 16513, 0, 0},
+ {12302, 6257, 3482, 0, 0}, {9709, 3594, 1577, 0, 0},
+ {13287, 5505, 2527, 0, 0}, {17310, 9137, 4631, 0, 0},
+ {20352, 12160, 7075, 0, 0}, {22507, 14757, 9507, 0, 0},
+ {24752, 18113, 13102, 0, 0}},
+ {{15152, 8182, 4656, 0, 0}, {16959, 9469, 5613, 0, 0},
+ {22001, 13878, 8975, 0, 0}, {25041, 18513, 13903, 0, 0},
+ {26639, 20842, 15886, 0, 0}, {28286, 23064, 17907, 0, 0},
+ {29491, 25316, 21246, 0, 0}, {9812, 4217, 2038, 0, 0},
+ {10044, 3831, 1807, 0, 0}, {14301, 6444, 3188, 0, 0},
+ {19534, 12055, 7119, 0, 0}, {21587, 15176, 10287, 0, 0},
+ {24477, 14410, 8192, 0, 0}, {25200, 20887, 17784, 0, 0},
+ {7820, 3767, 1621, 0, 0}, {7094, 2149, 617, 0, 0},
+ {11927, 5975, 3165, 0, 0}, {18099, 8412, 4102, 0, 0},
+ {21434, 9175, 4549, 0, 0}, {23846, 18006, 9895, 0, 0},
+ {24467, 19224, 12233, 0, 0}}},
+ {{{15655, 9035, 5687, 0, 0}, {18629, 11362, 7316, 0, 0},
+ {24216, 17766, 12992, 0, 0}, {26897, 21648, 17390, 0, 0},
+ {28313, 24152, 20515, 0, 0}, {29299, 25858, 22382, 0, 0},
+ {30513, 28215, 25986, 0, 0}, {14544, 8392, 5715, 0, 0},
+ {13478, 6058, 3154, 0, 0}, {17832, 9777, 5584, 0, 0},
+ {21530, 13817, 9006, 0, 0}, {23982, 17151, 12180, 0, 0},
+ {25451, 19540, 14765, 0, 0}, {27667, 23256, 19275, 0, 0},
+ {10129, 4546, 2558, 0, 0}, {9552, 3437, 1461, 0, 0},
+ {13693, 6006, 2873, 0, 0}, {17754, 9655, 5311, 0, 0},
+ {20830, 12911, 8016, 0, 0}, {22826, 15488, 10486, 0, 0},
+ {25601, 19624, 15016, 0, 0}},
+ {{16948, 10030, 6280, 0, 0}, {19238, 11883, 7552, 0, 0},
+ {24373, 17238, 12316, 0, 0}, {26194, 20447, 16388, 0, 0},
+ {27415, 22349, 18200, 0, 0}, {28155, 24322, 20387, 0, 0},
+ {29328, 25610, 22865, 0, 0}, {8521, 3717, 1544, 0, 0},
+ {10650, 4710, 2399, 0, 0}, {16270, 8000, 4379, 0, 0},
+ {19848, 11593, 6631, 0, 0}, {22038, 14149, 7416, 0, 0},
+ {22581, 16489, 9977, 0, 0}, {23458, 18137, 10641, 0, 0},
+ {7798, 2210, 711, 0, 0}, {7967, 2826, 1070, 0, 0},
+ {10336, 4315, 1913, 0, 0}, {13714, 7088, 3188, 0, 0},
+ {18376, 9732, 4659, 0, 0}, {20273, 11821, 6118, 0, 0},
+ {20326, 12442, 6554, 0, 0}}},
+ {{{20606, 13983, 10120, 0, 0}, {20019, 13071, 8962, 0, 0},
+ {24188, 17471, 12422, 0, 0}, {26599, 21019, 16225, 0, 0},
+ {27932, 23377, 19320, 0, 0}, {28947, 25057, 21155, 0, 0},
+ {30540, 28167, 25698, 0, 0}, {16449, 8043, 4488, 0, 0},
+ {17070, 9491, 5600, 0, 0}, {20042, 12400, 7721, 0, 0},
+ {22856, 15753, 10792, 0, 0}, {24880, 18548, 13589, 0, 0},
+ {25991, 20484, 15750, 0, 0}, {28276, 24178, 20516, 0, 0},
+ {9519, 3864, 1821, 0, 0}, {11718, 4860, 2256, 0, 0},
+ {15328, 7428, 3819, 0, 0}, {18709, 10750, 6227, 0, 0},
+ {21480, 13865, 8870, 0, 0}, {23357, 16426, 11340, 0, 0},
+ {26490, 21180, 16824, 0, 0}},
+ {{18787, 12701, 9542, 0, 0}, {15846, 9188, 5985, 0, 0},
+ {21763, 13729, 8281, 0, 0}, {25379, 18550, 12970, 0, 0},
+ {27170, 21263, 15562, 0, 0}, {26678, 21555, 17109, 0, 0},
+ {28948, 25397, 22649, 0, 0}, {11686, 5843, 3093, 0, 0},
+ {11506, 4141, 1640, 0, 0}, {14376, 6314, 2331, 0, 0},
+ {17898, 9858, 5672, 0, 0}, {20148, 13284, 7860, 0, 0},
+ {23478, 16215, 9966, 0, 0}, {26100, 18480, 12764, 0, 0},
+ {5064, 1713, 819, 0, 0}, {8059, 2790, 980, 0, 0},
+ {11100, 3504, 1111, 0, 0}, {14473, 5800, 2694, 0, 0},
+ {16369, 8346, 3455, 0, 0}, {18421, 9742, 4664, 0, 0},
+ {20398, 12962, 8291, 0, 0}}},
+ {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}}}}};
+
+/* clang-format off */
+alignas(kMaxAlignment) constexpr uint16_t kDefaultDcSignCdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+ [kDcSignContexts][kBooleanFieldCdfSize] = {
+ {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0},
+ {15488, 0, 0}}},
+ {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0},
+ {15488, 0, 0}}},
+ {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0},
+ {15488, 0, 0}}},
+ {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0},
+ {15488, 0, 0}}}
+};
+/* clang-format on */
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultRestorationTypeCdf[kRestorationTypeSymbolCount + 1] = {23355, 10187,
+ 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUseWienerCdf[kBooleanFieldCdfSize] = {21198, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUseSgrProjCdf[kBooleanFieldCdfSize] = {15913, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultHasPaletteYCdf[kPaletteBlockSizeContexts][kPaletteYModeContexts]
+ [kBooleanFieldCdfSize] = {
+ {{1092, 0, 0}, {29349, 0, 0}, {31507, 0, 0}},
+ {{856, 0, 0}, {29909, 0, 0}, {31788, 0, 0}},
+ {{945, 0, 0}, {29368, 0, 0}, {31987, 0, 0}},
+ {{738, 0, 0}, {29207, 0, 0}, {31864, 0, 0}},
+ {{459, 0, 0}, {25431, 0, 0}, {31306, 0, 0}},
+ {{503, 0, 0}, {28753, 0, 0}, {31247, 0, 0}},
+ {{318, 0, 0}, {24822, 0, 0}, {32639, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultPaletteYSizeCdf[kPaletteBlockSizeContexts]
+ [kPaletteSizeSymbolCount + 1] = {
+ {24816, 19768, 14619, 11290, 7241, 3527, 0, 0},
+ {25629, 21347, 16573, 13224, 9102, 4695, 0, 0},
+ {24980, 20027, 15443, 12268, 8453, 4238, 0, 0},
+ {24497, 18704, 14522, 11204, 7697, 4235, 0, 0},
+ {20043, 13588, 10905, 7929, 5233, 2648, 0, 0},
+ {23057, 17880, 15845, 11716, 7107, 4893, 0, 0},
+ {17828, 11971, 11090, 8582, 5735, 3769, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultHasPaletteUVCdf[kPaletteUVModeContexts][kBooleanFieldCdfSize] = {
+ {307, 0, 0}, {11280, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultPaletteUVSizeCdf[kPaletteBlockSizeContexts]
+ [kPaletteSizeSymbolCount + 1] = {
+ {24055, 12789, 5640, 3159, 1437, 496, 0, 0},
+ {26929, 17195, 9187, 5821, 2920, 1068, 0, 0},
+ {28342, 21508, 14769, 11285, 6905, 3338, 0, 0},
+ {29540, 23304, 17775, 14679, 10245, 5348, 0, 0},
+ {29000, 23882, 19677, 14916, 10273, 5561, 0, 0},
+ {30304, 24317, 19907, 11136, 7243, 4213, 0, 0},
+ {31499, 27333, 22335, 13805, 11068, 6903, 0,
+ 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultPaletteColorIndexCdf
+ [kNumPlaneTypes][kPaletteSizeSymbolCount][kPaletteColorIndexContexts]
+ [kPaletteColorIndexSymbolCount + 1] = {
+ {{{4058, 0, 0},
+ {16384, 0, 0},
+ {22215, 0, 0},
+ {5732, 0, 0},
+ {1165, 0, 0}},
+ {{4891, 2278, 0, 0},
+ {21236, 7071, 0, 0},
+ {26224, 2534, 0, 0},
+ {9750, 4696, 0, 0},
+ {853, 383, 0, 0}},
+ {{7196, 4722, 2723, 0, 0},
+ {23290, 11178, 5512, 0, 0},
+ {25520, 5931, 2944, 0, 0},
+ {13601, 8282, 4419, 0, 0},
+ {1368, 943, 518, 0, 0}},
+ {{7989, 5813, 4192, 2486, 0, 0},
+ {24099, 12404, 8695, 4675, 0, 0},
+ {28513, 5203, 3391, 1701, 0, 0},
+ {12904, 9094, 6052, 3238, 0, 0},
+ {1122, 875, 621, 342, 0, 0}},
+ {{9636, 7361, 5798, 4333, 2695, 0, 0},
+ {25325, 15526, 12051, 8006, 4786, 0, 0},
+ {26468, 7906, 5824, 3984, 2097, 0, 0},
+ {13852, 9873, 7501, 5333, 3116, 0, 0},
+ {1498, 1218, 960, 709, 415, 0, 0}},
+ {{9663, 7569, 6304, 5084, 3837, 2450, 0, 0},
+ {25818, 17321, 13816, 10087, 7201, 4205, 0, 0},
+ {25208, 9294, 7278, 5565, 3847, 2060, 0, 0},
+ {14224, 10395, 8311, 6573, 4649, 2723, 0, 0},
+ {1570, 1317, 1098, 886, 645, 377, 0, 0}},
+ {{11079, 8885, 7605, 6416, 5262, 3941, 2573, 0, 0},
+ {25876, 17383, 14928, 11162, 8481, 6015, 3564, 0, 0},
+ {27117, 9586, 7726, 6250, 4786, 3376, 1868, 0, 0},
+ {13419, 10190, 8350, 6774, 5244, 3737, 2320, 0, 0},
+ {1740, 1498, 1264, 1063, 841, 615, 376, 0, 0}}},
+ {{{3679, 0, 0},
+ {16384, 0, 0},
+ {24055, 0, 0},
+ {3511, 0, 0},
+ {1158, 0, 0}},
+ {{7511, 3623, 0, 0},
+ {20481, 5475, 0, 0},
+ {25735, 4808, 0, 0},
+ {12623, 7363, 0, 0},
+ {2160, 1129, 0, 0}},
+ {{8558, 5593, 2865, 0, 0},
+ {22880, 10382, 5554, 0, 0},
+ {26867, 6715, 3475, 0, 0},
+ {14450, 10616, 4435, 0, 0},
+ {2309, 1632, 842, 0, 0}},
+ {{9788, 7289, 4987, 2782, 0, 0},
+ {24355, 11360, 7909, 3894, 0, 0},
+ {30511, 3319, 2174, 1170, 0, 0},
+ {13579, 11566, 6853, 4148, 0, 0},
+ {924, 724, 487, 250, 0, 0}},
+ {{10551, 8201, 6131, 4085, 2220, 0, 0},
+ {25461, 16362, 13132, 8136, 4344, 0, 0},
+ {28327, 7704, 5889, 3826, 1849, 0, 0},
+ {15558, 12240, 9449, 6018, 3186, 0, 0},
+ {2094, 1815, 1372, 1033, 561, 0, 0}},
+ {{11529, 9600, 7724, 5806, 4063, 2262, 0, 0},
+ {26223, 17756, 14764, 10951, 7265, 4067, 0, 0},
+ {29320, 6473, 5331, 4064, 2642, 1326, 0, 0},
+ {16879, 14445, 11064, 8070, 5792, 3078, 0, 0},
+ {1780, 1564, 1289, 1034, 785, 443, 0, 0}},
+ {{11326, 9480, 8010, 6522, 5119, 3788, 2205, 0, 0},
+ {26905, 17835, 15216, 12100, 9085, 6357, 3495, 0, 0},
+ {29353, 6958, 5891, 4778, 3545, 2374, 1150, 0, 0},
+ {14803, 12684, 10536, 8794, 6494, 4366, 2378, 0, 0},
+ {1578, 1439, 1252, 1089, 943, 742, 446, 0, 0}}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIsInterCdf[kIsInterContexts][kBooleanFieldCdfSize] = {
+ {31962, 0, 0}, {16106, 0, 0}, {12582, 0, 0}, {6230, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUseCompoundReferenceCdf[kUseCompoundReferenceContexts]
+ [kBooleanFieldCdfSize] = {{5940, 0, 0},
+ {8733, 0, 0},
+ {20737, 0, 0},
+ {22128, 0, 0},
+ {29867, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultCompoundReferenceTypeCdf[kCompoundReferenceTypeContexts]
+ [kBooleanFieldCdfSize] = {{31570, 0, 0},
+ {30698, 0, 0},
+ {23602, 0, 0},
+ {25269, 0, 0},
+ {10293, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCompoundReferenceCdf
+ [kNumCompoundReferenceTypes][kReferenceContexts][3][kBooleanFieldCdfSize] =
+ {{{{27484, 0, 0}, {28903, 0, 0}, {29640, 0, 0}},
+ {{9616, 0, 0}, {18595, 0, 0}, {17498, 0, 0}},
+ {{994, 0, 0}, {7648, 0, 0}, {6058, 0, 0}}},
+ {{{27822, 0, 0}, {23300, 0, 0}, {31265, 0, 0}},
+ {{12877, 0, 0}, {10327, 0, 0}, {17608, 0, 0}},
+ {{2037, 0, 0}, {1709, 0, 0}, {5224, 0, 0}}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultCompoundBackwardReferenceCdf[kReferenceContexts][2]
+ [kBooleanFieldCdfSize] = {
+ {{30533, 0, 0}, {31345, 0, 0}},
+ {{15586, 0, 0}, {17593, 0, 0}},
+ {{2162, 0, 0}, {2279, 0, 0}}};
+
+/* clang-format off */
+alignas(kMaxAlignment) constexpr uint16_t kDefaultSingleReferenceCdf[kReferenceContexts][6]
+ [kBooleanFieldCdfSize] = {
+ {{27871, 0, 0}, {31213, 0, 0}, {28532, 0, 0}, {24118, 0, 0}, {31864, 0, 0},
+ {31324, 0, 0}},
+ {{15795, 0, 0}, {16017, 0, 0}, {13121, 0, 0}, {7995, 0, 0}, {21754, 0, 0},
+ {17681, 0, 0}},
+ {{3024, 0, 0}, {2489, 0, 0}, {1574, 0, 0}, {873, 0, 0}, {5893, 0, 0},
+ {2464, 0, 0}}};
+/* clang-format on */
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCompoundPredictionModeCdf
+ [kCompoundPredictionModeContexts][kNumCompoundInterPredictionModes + 1] = {
+ {25008, 18945, 16960, 15127, 13612, 12102, 5877, 0, 0},
+ {22038, 13316, 11623, 10019, 8729, 7637, 4044, 0, 0},
+ {22104, 12547, 11180, 9862, 8473, 7381, 4332, 0, 0},
+ {19470, 15784, 12297, 8586, 7701, 7032, 6346, 0, 0},
+ {13864, 9443, 7526, 5336, 4870, 4510, 2010, 0, 0},
+ {22043, 15314, 12644, 9948, 8573, 7600, 6722, 0, 0},
+ {15643, 8495, 6954, 5276, 4554, 4064, 2176, 0, 0},
+ {19722, 9554, 8263, 6826, 5333, 4326, 3438, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultNewMvCdf[kNewMvContexts][kBooleanFieldCdfSize] = {
+ {8733, 0, 0}, {16138, 0, 0}, {17429, 0, 0},
+ {24382, 0, 0}, {20546, 0, 0}, {28092, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultZeroMvCdf[kZeroMvContexts][kBooleanFieldCdfSize] = {{30593, 0, 0},
+ {31714, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultReferenceMvCdf[kReferenceMvContexts][kBooleanFieldCdfSize] = {
+ {8794, 0, 0}, {8580, 0, 0}, {14920, 0, 0},
+ {4146, 0, 0}, {8456, 0, 0}, {12845, 0, 0}};
+
+// This is called drl_mode in the spec where DRL stands for Dynamic Reference
+// List.
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultRefMvIndexCdf[kRefMvIndexContexts][kBooleanFieldCdfSize] = {
+ {19664, 0, 0}, {8208, 0, 0}, {13823, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIsInterIntraCdf[kInterIntraContexts][kBooleanFieldCdfSize] = {
+ {5881, 0, 0}, {5171, 0, 0}, {2531, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultInterIntraModeCdf[kInterIntraContexts][kNumInterIntraModes + 1] = {
+ {30893, 21686, 5436, 0, 0},
+ {30295, 22772, 6380, 0, 0},
+ {28530, 21231, 6842, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIsWedgeInterIntraCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = {
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {12732, 0, 0}, {7811, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {6064, 0, 0}, {5238, 0, 0}, {3204, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {3324, 0, 0}, {5896, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultWedgeIndexCdf[kMaxBlockSizes][kWedgeIndexSymbolCount + 1] = {
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30330, 28328, 26169, 24105, 21763, 19894, 17017, 14674, 12409, 10406,
+ 8641, 7066, 5016, 3318, 1597, 0, 0},
+ {31962, 29502, 26763, 26030, 25550, 25401, 24997, 18180, 16445, 15401,
+ 14316, 13346, 9929, 6641, 3139, 0, 0},
+ {32614, 31781, 30843, 30717, 30680, 30657, 30617, 9735, 9065, 8484,
+ 7783, 7084, 5509, 3885, 1857, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {29989, 29030, 28085, 25555, 24993, 24751, 24113, 18411, 14829, 11436,
+ 8248, 5298, 3312, 2239, 1112, 0, 0},
+ {31084, 29143, 27093, 25660, 23466, 21494, 18339, 15624, 13605, 11807,
+ 9884, 8297, 6049, 4054, 1891, 0, 0},
+ {31626, 29277, 26491, 25454, 24679, 24413, 23745, 19144, 17399, 16038,
+ 14654, 13455, 10247, 6756, 3218, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {31633, 31446, 31275, 30133, 30072, 30031, 29998, 11752, 9833, 7711,
+ 5517, 3595, 2679, 1808, 835, 0, 0},
+ {30026, 28573, 27041, 24733, 23788, 23432, 22622, 18644, 15498, 12235,
+ 9334, 6796, 4824, 3198, 1352, 0, 0},
+ {31041, 28820, 26667, 24972, 22927, 20424, 17002, 13824, 12130, 10730,
+ 8805, 7457, 5780, 4002, 1756, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUseObmcCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = {
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {22331, 0, 0}, {23397, 0, 0}, {9104, 0, 0}, {16384, 0, 0},
+ {23467, 0, 0}, {15336, 0, 0}, {18345, 0, 0}, {8760, 0, 0},
+ {11867, 0, 0}, {17626, 0, 0}, {6951, 0, 0}, {9945, 0, 0},
+ {5889, 0, 0}, {10685, 0, 0}, {2640, 0, 0}, {1754, 0, 0},
+ {1208, 0, 0}, {130, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMotionModeCdf[kMaxBlockSizes][kNumMotionModes + 1] = {
+ {21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+ {21845, 10923, 0, 0}, {25117, 8008, 0, 0}, {28030, 8003, 0, 0},
+ {3969, 1378, 0, 0}, {21845, 10923, 0, 0}, {27377, 7240, 0, 0},
+ {13349, 5958, 0, 0}, {27645, 9162, 0, 0}, {3795, 1174, 0, 0},
+ {6337, 1994, 0, 0}, {21162, 8460, 0, 0}, {6508, 3652, 0, 0},
+ {12408, 4706, 0, 0}, {3026, 1565, 0, 0}, {11089, 5938, 0, 0},
+ {3252, 2067, 0, 0}, {3870, 2371, 0, 0}, {1890, 1433, 0, 0},
+ {261, 210, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIsExplicitCompoundTypeCdf[kIsExplicitCompoundTypeContexts]
+ [kBooleanFieldCdfSize] = {
+ {6161, 0, 0}, {9877, 0, 0},
+ {13928, 0, 0}, {8174, 0, 0},
+ {12834, 0, 0}, {10094, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIsCompoundTypeAverageCdf[kIsCompoundTypeAverageContexts]
+ [kBooleanFieldCdfSize] = {
+ {14524, 0, 0}, {19903, 0, 0},
+ {25715, 0, 0}, {19509, 0, 0},
+ {23434, 0, 0}, {28124, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultCompoundTypeCdf[kMaxBlockSizes]
+ [kNumExplicitCompoundPredictionTypes + 1] = {
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {9337, 0, 0}, {19597, 0, 0},
+ {20948, 0, 0}, {16384, 0, 0}, {21298, 0, 0},
+ {22998, 0, 0}, {23668, 0, 0}, {16384, 0, 0},
+ {25067, 0, 0}, {24535, 0, 0}, {26596, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultInterpolationFilterCdf
+ [kInterpolationFilterContexts][kNumExplicitInterpolationFilters + 1] = {
+ {833, 48, 0, 0}, {27200, 49, 0, 0}, {32346, 29830, 0, 0},
+ {4524, 160, 0, 0}, {1562, 815, 0, 0}, {27906, 647, 0, 0},
+ {31998, 31616, 0, 0}, {11879, 7131, 0, 0}, {858, 44, 0, 0},
+ {28648, 56, 0, 0}, {32463, 30521, 0, 0}, {5365, 132, 0, 0},
+ {1746, 759, 0, 0}, {29805, 675, 0, 0}, {32167, 31825, 0, 0},
+ {17799, 11370, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvJointCdf[kNumMvJointTypes + 1] = {28672, 21504, 13440, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvSignCdf[kBooleanFieldCdfSize] = {16384, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvClassCdf[kMvClassSymbolCount + 1] = {
+ 4096, 1792, 910, 448, 217, 112, 28, 11, 6, 1, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvClass0BitCdf[kBooleanFieldCdfSize] = {5120, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvClass0FractionCdf[kBooleanSymbolCount][kMvFractionSymbolCount +
+ 1] = {
+ {16384, 8192, 6144, 0, 0}, {20480, 11520, 8640, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvClass0HighPrecisionCdf[kBooleanFieldCdfSize] = {12288, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvBitCdf[kMvBitSymbolCount][kBooleanFieldCdfSize] = {
+ {15360, 0, 0}, {14848, 0, 0}, {13824, 0, 0}, {12288, 0, 0},
+ {10240, 0, 0}, {8192, 0, 0}, {4096, 0, 0}, {2816, 0, 0},
+ {2816, 0, 0}, {2048, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvFractionCdf[kMvFractionSymbolCount + 1] = {24576, 15360, 11520, 0,
+ 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvHighPrecisionCdf[kBooleanFieldCdfSize] = {16384, 0, 0};
diff --git a/src/threading_strategy.cc b/src/threading_strategy.cc
new file mode 100644
index 0000000..cd4d576
--- /dev/null
+++ b/src/threading_strategy.cc
@@ -0,0 +1,222 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/threading_strategy.h"
+
+#include <algorithm>
+#include <cassert>
+#include <memory>
+
+#include "src/frame_scratch_buffer.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/vector.h"
+
+namespace libgav1 {
+namespace {
+
+#if !defined(LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER)
+constexpr int kFrameParallelThresholdMultiplier = 3;
+#else
+constexpr int kFrameParallelThresholdMultiplier =
+ LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER;
+#endif
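+
+// (Reader's note, not upstream text: the default of 3 above can be overridden
+// at build time by defining LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER, e.g.
+// by passing -DLIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER=4 to the compiler.)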
+
+// Computes the number of frame threads to be used based on the following
+// heuristic:
+// * If |thread_count| == 1, return 0.
+// * If |thread_count| <= |tile_count| * kFrameParallelThresholdMultiplier, return 0.
+// * Otherwise, return the largest value of i which satisfies the following
+// condition: i + i * tile_columns <= thread_count. This ensures that there
+// are at least |tile_columns| worker threads for each frame thread.
+// * This function will never return 1 or a value > |thread_count|.
+//
+// This heuristic is based on empirical performance data. The in-frame threading
+// model (combination of tile multithreading, superblock row multithreading and
+// post filter multithreading) performs better than the frame parallel model
+// until we reach the threshold of |thread_count| > |tile_count| *
+// kFrameParallelThresholdMultiplier.
+//
+// It is a function of |tile_count| since tile threading and superblock row
+// multithreading will scale only as a factor of |tile_count|. The default
+// value of kFrameParallelThresholdMultiplier was arrived at empirically. The
+// general idea is that superblock row multithreading plateaus at a small
+// multiple of |tile_count| because in most practical cases there aren't more
+// than that many superblock rows and columns available
+// to work on in parallel.
+int ComputeFrameThreadCount(int thread_count, int tile_count,
+ int tile_columns) {
+ assert(thread_count > 0);
+ if (thread_count == 1) return 0;
+ return (thread_count <= tile_count * kFrameParallelThresholdMultiplier)
+ ? 0
+ : std::max(2, thread_count / (1 + tile_columns));
+}
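+
+// Worked example of the heuristic above (reader's note with assumed numbers,
+// not upstream text): with thread_count = 16, tile_count = 2 and
+// tile_columns = 2, the threshold is 2 * kFrameParallelThresholdMultiplier = 6,
+// so 16 > 6 and the function returns std::max(2, 16 / (1 + 2)) = 5 frame
+// threads (5 + 5 * 2 = 15 <= 16). With thread_count = 6 and the same tile
+// layout, 6 <= 6 and the function returns 0, i.e. the in-frame threading model
+// is used instead.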
+
+} // namespace
+
+bool ThreadingStrategy::Reset(const ObuFrameHeader& frame_header,
+ int thread_count) {
+ assert(thread_count > 0);
+ frame_parallel_ = false;
+
+ if (thread_count == 1) {
+ thread_pool_.reset(nullptr);
+ tile_thread_count_ = 0;
+ max_tile_index_for_row_threads_ = 0;
+ return true;
+ }
+
+ // We do work in the current thread, so it is sufficient to create
+ // |thread_count|-1 threads in the threadpool.
+ thread_count = std::min(thread_count, static_cast<int>(kMaxThreads)) - 1;
+
+ if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) {
+ thread_pool_ = ThreadPool::Create("libgav1", thread_count);
+ if (thread_pool_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.",
+ thread_count);
+ tile_thread_count_ = 0;
+ max_tile_index_for_row_threads_ = 0;
+ return false;
+ }
+ }
+
+ // Prefer tile threads first (but only if there is more than one tile).
+ const int tile_count = frame_header.tile_info.tile_count;
+ if (tile_count > 1) {
+ // We want 1 + tile_thread_count_ <= tile_count because the current thread
+ // is also used to decode tiles. This is equivalent to
+ // tile_thread_count_ <= tile_count - 1.
+ tile_thread_count_ = std::min(thread_count, tile_count - 1);
+ thread_count -= tile_thread_count_;
+ if (thread_count == 0) {
+ max_tile_index_for_row_threads_ = 0;
+ return true;
+ }
+ } else {
+ tile_thread_count_ = 0;
+ }
+
+#if defined(__ANDROID__)
+ // Assign the remaining threads to each Tile. The heuristic used here is to
+ // assign two threads to each Tile. So, for example, if |thread_count| is 2,
+ // for a stream with 2 tiles the first tile would get both threads and the
+ // second tile would have row multi-threading turned off. This
+ // heuristic is based on the fact that row multi-threading is fast enough only
+ // when there are at least two threads to do the decoding (since one thread
+ // always does the parsing).
+ //
+ // This heuristic might stop working when SIMD optimizations make the decoding
+ // much faster and the parsing thread is only as fast as the decoding threads.
+ // So we will have to revisit this later to make sure that this is still
+ // optimal.
+ //
+ // Note that while this heuristic significantly improves performance on high
+ // end devices (like the Pixel 3), there are some performance regressions on
+ // some lower end devices in some cases, and those need to be revisited as we
+ // bring in more optimizations. Overall, the gains from this heuristic seem
+ // to be much larger than the regressions.
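+ //
+ // For example (illustrative): if 4 threads remain at this point for a stream
+ // with 3 tiles, the loop below budgets two threads each for tiles 0 and 1
+ // (so |max_tile_index_for_row_threads_| becomes 2) and leaves row
+ // multi-threading off for tile 2.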
+ for (int i = 0; i < tile_count; ++i) {
+ max_tile_index_for_row_threads_ = i + 1;
+ thread_count -= 2;
+ if (thread_count <= 0) break;
+ }
+#else // !defined(__ANDROID__)
+ // Assign the remaining threads to each Tile.
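+ // For example (illustrative): if 2 threads remain here for a stream with 3
+ // tiles, the round-robin split below assigns one thread each to tiles 0 and
+ // 1 and none to tile 2, so |max_tile_index_for_row_threads_| becomes 2.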
+ for (int i = 0; i < tile_count; ++i) {
+ const int count = thread_count / tile_count +
+ static_cast<int>(i < thread_count % tile_count);
+ if (count == 0) {
+ // Once we see a 0 value, all subsequent values will also be 0 since the
+ // threads are assigned in a round-robin fashion.
+ break;
+ }
+ max_tile_index_for_row_threads_ = i + 1;
+ }
+#endif // defined(__ANDROID__)
+ return true;
+}
+
+bool ThreadingStrategy::Reset(int thread_count) {
+ assert(thread_count > 0);
+ frame_parallel_ = true;
+
+ // In frame parallel mode, we simply access the underlying |thread_pool_|
+ // directly. So ensure all the other threadpool getter functions return
+ // nullptr. Also, superblock row multithreading is always disabled in frame
+ // parallel mode.
+ tile_thread_count_ = 0;
+ max_tile_index_for_row_threads_ = 0;
+
+ if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) {
+ thread_pool_ = ThreadPool::Create("libgav1-fp", thread_count);
+ if (thread_pool_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.",
+ thread_count);
+ return false;
+ }
+ }
+ return true;
+}
+
+bool InitializeThreadPoolsForFrameParallel(
+ int thread_count, int tile_count, int tile_columns,
+ std::unique_ptr<ThreadPool>* const frame_thread_pool,
+ FrameScratchBufferPool* const frame_scratch_buffer_pool) {
+ assert(*frame_thread_pool == nullptr);
+ thread_count = std::min(thread_count, static_cast<int>(kMaxThreads));
+ const int frame_threads =
+ ComputeFrameThreadCount(thread_count, tile_count, tile_columns);
+ if (frame_threads == 0) return true;
+ *frame_thread_pool = ThreadPool::Create(frame_threads);
+ if (*frame_thread_pool == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to create frame thread pool with %d threads.",
+ frame_threads);
+ return false;
+ }
+ int remaining_threads = thread_count - frame_threads;
+ if (remaining_threads == 0) return true;
+ int threads_per_frame = remaining_threads / frame_threads;
+ const int extra_threads = remaining_threads % frame_threads;
+ Vector<std::unique_ptr<FrameScratchBuffer>> frame_scratch_buffers;
+ if (!frame_scratch_buffers.reserve(frame_threads)) return false;
+ // Create the tile thread pools.
+ for (int i = 0; i < frame_threads && remaining_threads > 0; ++i) {
+ std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+ frame_scratch_buffer_pool->Get();
+ if (frame_scratch_buffer == nullptr) {
+ return false;
+ }
+ // If the number of tile threads cannot be divided equally amongst all the
+ // frame threads, assign one extra thread to the first |extra_threads| frame
+ // threads.
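+ // For example (illustrative): with 11 remaining threads and 5 frame threads,
+ // |threads_per_frame| is 2 and |extra_threads| is 1, so the frame threads
+ // are assigned 3, 2, 2, 2 and 2 threads respectively.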
+ const int current_frame_thread_count =
+ threads_per_frame + static_cast<int>(i < extra_threads);
+ if (!frame_scratch_buffer->threading_strategy.Reset(
+ current_frame_thread_count)) {
+ return false;
+ }
+ remaining_threads -= current_frame_thread_count;
+ frame_scratch_buffers.push_back_unchecked(std::move(frame_scratch_buffer));
+ }
+ // We release the frame scratch buffers in reverse order so that the extra
+ // threads are allocated to the buffers at the top of the stack.
+ for (int i = static_cast<int>(frame_scratch_buffers.size()) - 1; i >= 0;
+ --i) {
+ frame_scratch_buffer_pool->Release(std::move(frame_scratch_buffers[i]));
+ }
+ return true;
+}
+
+} // namespace libgav1
diff --git a/src/threading_strategy.h b/src/threading_strategy.h
new file mode 100644
index 0000000..84b3589
--- /dev/null
+++ b/src/threading_strategy.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_THREADING_STRATEGY_H_
+#define LIBGAV1_SRC_THREADING_STRATEGY_H_
+
+#include <memory>
+
+#include "src/obu_parser.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/threadpool.h"
+
+namespace libgav1 {
+
+class FrameScratchBufferPool;
+
+// This class allocates and manages the worker threads among thread pools used
+// for multi-threaded decoding.
+class ThreadingStrategy {
+ public:
+ ThreadingStrategy() = default;
+
+ // Not copyable or movable.
+ ThreadingStrategy(const ThreadingStrategy&) = delete;
+ ThreadingStrategy& operator=(const ThreadingStrategy&) = delete;
+
+ // Creates or re-allocates the thread pools based on the |frame_header| and
+ // |thread_count|. This function is used only in non frame-parallel mode. This
+ // function is idempotent if the |frame_header| and |thread_count| don't
+ // change between calls (it will only create new threads on the first call and
+ // do nothing on the subsequent calls). This function also starts the worker
+ // threads whenever it creates new thread pools.
+ // The following strategy is used to allocate threads:
+ // * One thread is allocated for decoding each Tile.
+ // * Any remaining threads are allocated for superblock row multi-threading
+ // within each of the tiles in a round-robin fashion.
+ // Note: During the lifetime of a ThreadingStrategy object, only one of the
+ // Reset() variants will be used.
+ LIBGAV1_MUST_USE_RESULT bool Reset(const ObuFrameHeader& frame_header,
+ int thread_count);
+
+ // Creates or re-allocates a thread pool with |thread_count| threads. This
+ // function is used only in frame parallel mode. This function is idempotent
+ // if the |thread_count| doesn't change between calls (it will only create new
+ // threads on the first call and do nothing on the subsequent calls).
+ // Note: During the lifetime of a ThreadingStrategy object, only one of the
+ // Reset() variants will be used.
+ LIBGAV1_MUST_USE_RESULT bool Reset(int thread_count);
+
+ // Returns a pointer to the ThreadPool that is to be used for Tile
+ // multi-threading.
+ ThreadPool* tile_thread_pool() const {
+ return (tile_thread_count_ != 0) ? thread_pool_.get() : nullptr;
+ }
+
+ int tile_thread_count() const { return tile_thread_count_; }
+
+ // Returns a pointer to the underlying ThreadPool.
+ // Note: Valid only when |frame_parallel_| is true. This is used for
+ // facilitating in-frame multi-threading in that case.
+ ThreadPool* thread_pool() const { return thread_pool_.get(); }
+
+ // Returns a pointer to the ThreadPool that is to be used within the Tile at
+ // index |tile_index| for superblock row multi-threading.
+ // Note: Valid only when |frame_parallel_| is false.
+ ThreadPool* row_thread_pool(int tile_index) const {
+ return tile_index < max_tile_index_for_row_threads_ ? thread_pool_.get()
+ : nullptr;
+ }
+
+ // Returns a pointer to the ThreadPool that is to be used for post filter
+ // multi-threading.
+ // Note: Valid only when |frame_parallel_| is false.
+ ThreadPool* post_filter_thread_pool() const {
+ return frame_parallel_ ? nullptr : thread_pool_.get();
+ }
+
+ // Returns a pointer to the ThreadPool that is to be used for film grain
+ // synthesis and blending.
+ // Note: Valid only when |frame_parallel_| is false.
+ ThreadPool* film_grain_thread_pool() const { return thread_pool_.get(); }
+
+ private:
+ std::unique_ptr<ThreadPool> thread_pool_;
+ int tile_thread_count_ = 0;
+ int max_tile_index_for_row_threads_ = 0;
+ bool frame_parallel_ = false;
+};
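+
+// A minimal usage sketch for non frame parallel mode (the surrounding
+// variable names are illustrative, not part of this API):
+//   ThreadingStrategy strategy;
+//   if (!strategy.Reset(frame_header, /*thread_count=*/4)) return false;
+//   ThreadPool* const tile_pool = strategy.tile_thread_pool();
+//   ThreadPool* const row_pool = strategy.row_thread_pool(/*tile_index=*/0);
+//   ThreadPool* const post_filter_pool = strategy.post_filter_thread_pool();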
+
+// Initializes the |frame_thread_pool| and the necessary worker threadpools (the
+// threading_strategy objects in each of the frame scratch buffers in
+// |frame_scratch_buffer_pool|) as follows:
+// * frame_threads = ComputeFrameThreadCount();
+// * For more details on how frame_threads is computed, see the function
+// comment in ComputeFrameThreadCount().
+// * |frame_thread_pool| is created with |frame_threads| threads.
+// * divide the remaining number of threads into each frame thread and
+// initialize a frame_scratch_buffer.threading_strategy for each frame
+// thread.
+// When this function is called, |frame_scratch_buffer_pool| must be empty. If
+// this function returns true, it means the initialization was successful and
+// one of the following is true:
+// * |frame_thread_pool| has been successfully initialized and
+// |frame_scratch_buffer_pool| has been successfully populated with
+// |frame_threads| buffers to be used by each frame thread. The total
+// number of threads that this function creates will always be equal to
+// |thread_count|.
+// * |frame_thread_pool| is nullptr. |frame_scratch_buffer_pool| is not
+// modified. This means that frame threading will not be used and the
+// decoder will continue to operate normally in non frame parallel mode.
+LIBGAV1_MUST_USE_RESULT bool InitializeThreadPoolsForFrameParallel(
+ int thread_count, int tile_count, int tile_columns,
+ std::unique_ptr<ThreadPool>* frame_thread_pool,
+ FrameScratchBufferPool* frame_scratch_buffer_pool);
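+
+// A minimal usage sketch (the caller-side names below are illustrative and
+// assume a FrameScratchBufferPool named frame_scratch_buffer_pool already
+// exists; they are not part of this API):
+//   std::unique_ptr<ThreadPool> frame_thread_pool;
+//   if (!InitializeThreadPoolsForFrameParallel(
+//           /*thread_count=*/16, /*tile_count=*/2, /*tile_columns=*/2,
+//           &frame_thread_pool, &frame_scratch_buffer_pool)) {
+//     // Handle the allocation failure.
+//   }
+//   const bool use_frame_parallel = (frame_thread_pool != nullptr);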
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_THREADING_STRATEGY_H_
diff --git a/src/tile.h b/src/tile.h
new file mode 100644
index 0000000..73bb5fd
--- /dev/null
+++ b/src/tile.h
@@ -0,0 +1,914 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_TILE_H_
+#define LIBGAV1_SRC_TILE_H_
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <condition_variable> // NOLINT (unapproved c++11 header)
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex> // NOLINT (unapproved c++11 header)
+#include <vector>
+
+#include "src/buffer_pool.h"
+#include "src/decoder_state.h"
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/loop_restoration_info.h"
+#include "src/obu_parser.h"
+#include "src/post_filter.h"
+#include "src/quantizer.h"
+#include "src/residual_buffer_pool.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile_scratch_buffer.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/memory.h"
+#include "src/utils/parameter_tree.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/threadpool.h"
+#include "src/utils/types.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+// Indicates what the ProcessSuperBlock() and TransformBlock() functions should
+// do. "Parse" refers to consuming the bitstream, reading the transform
+// coefficients and performing the dequantization. "Decode" refers to computing
+// the prediction, applying the inverse transforms and adding the residual.
+enum ProcessingMode {
+ kProcessingModeParseOnly,
+ kProcessingModeDecodeOnly,
+ kProcessingModeParseAndDecode,
+};
+
+class Tile : public Allocable {
+ public:
+ static std::unique_ptr<Tile> Create(
+ int tile_number, const uint8_t* const data, size_t size,
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header, RefCountedBuffer* const current_frame,
+ const DecoderState& state, FrameScratchBuffer* const frame_scratch_buffer,
+ const WedgeMaskArray& wedge_masks,
+ const QuantizerMatrix& quantizer_matrix,
+ SymbolDecoderContext* const saved_symbol_decoder_context,
+ const SegmentationMap* prev_segment_ids, PostFilter* const post_filter,
+ const dsp::Dsp* const dsp, ThreadPool* const thread_pool,
+ BlockingCounterWithStatus* const pending_tiles, bool frame_parallel,
+ bool use_intra_prediction_buffer) {
+ std::unique_ptr<Tile> tile(new (std::nothrow) Tile(
+ tile_number, data, size, sequence_header, frame_header, current_frame,
+ state, frame_scratch_buffer, wedge_masks, quantizer_matrix,
+ saved_symbol_decoder_context, prev_segment_ids, post_filter, dsp,
+ thread_pool, pending_tiles, frame_parallel,
+ use_intra_prediction_buffer));
+ return (tile != nullptr && tile->Init()) ? std::move(tile) : nullptr;
+ }
+
+ // Move only.
+ Tile(Tile&& tile) noexcept;
+ Tile& operator=(Tile&& tile) noexcept;
+ Tile(const Tile&) = delete;
+ Tile& operator=(const Tile&) = delete;
+
+ struct Block; // Defined after this class.
+
+ // Parses the entire tile.
+ bool Parse();
+ // Decodes the entire tile. |superblock_row_progress| and
+ // |superblock_row_progress_condvar| are arrays of size equal to the number of
+ // superblock rows in the frame. Increments |superblock_row_progress[i]| after
+ // each superblock row at index |i| is decoded. If the count reaches the
+ // number of tile columns, then it notifies
+ // |superblock_row_progress_condvar[i]|.
+ bool Decode(std::mutex* mutex, int* superblock_row_progress,
+ std::condition_variable* superblock_row_progress_condvar);
+ // Parses and decodes the entire tile. Depending on the configuration of this
+ // Tile, this function may do multithreaded decoding.
+ bool ParseAndDecode(); // 5.11.2.
+ // Processes all the columns of the superblock row at |row4x4| that are within
+ // this Tile. If |save_symbol_decoder_context| is true, then
+ // SaveSymbolDecoderContext() is invoked for the last superblock row.
+ template <ProcessingMode processing_mode, bool save_symbol_decoder_context>
+ bool ProcessSuperBlockRow(int row4x4, TileScratchBuffer* scratch_buffer);
+
+ const ObuSequenceHeader& sequence_header() const { return sequence_header_; }
+ const ObuFrameHeader& frame_header() const { return frame_header_; }
+ const RefCountedBuffer& current_frame() const { return current_frame_; }
+ const TemporalMotionField& motion_field() const { return motion_field_; }
+ const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias()
+ const {
+ return reference_frame_sign_bias_;
+ }
+
+ bool IsRow4x4Inside(int row4x4) const {
+ return row4x4 >= row4x4_start_ && row4x4 < row4x4_end_;
+ }
+
+ // 5.11.51.
+ bool IsInside(int row4x4, int column4x4) const {
+ return IsRow4x4Inside(row4x4) && column4x4 >= column4x4_start_ &&
+ column4x4 < column4x4_end_;
+ }
+
+ bool IsLeftInside(int column4x4) const {
+ // We use "larger than" as the condition. Don't pass in the left column
+ // offset column4x4 - 1.
+ assert(column4x4 <= column4x4_end_);
+ return column4x4 > column4x4_start_;
+ }
+
+ bool IsTopInside(int row4x4) const {
+ // We use "larger than" as the condition. Don't pass in the top row offset
+ // row4x4 - 1.
+ assert(row4x4 <= row4x4_end_);
+ return row4x4 > row4x4_start_;
+ }
+
+ bool IsTopLeftInside(int row4x4, int column4x4) const {
+ // We use "larger than" as the condition. Don't pass in the top row offset
+ // row4x4 - 1 or the left column offset column4x4 - 1.
+ assert(row4x4 <= row4x4_end_);
+ assert(column4x4 <= column4x4_end_);
+ return row4x4 > row4x4_start_ && column4x4 > column4x4_start_;
+ }
+
+ bool IsBottomRightInside(int row4x4, int column4x4) const {
+ assert(row4x4 >= row4x4_start_);
+ assert(column4x4 >= column4x4_start_);
+ return row4x4 < row4x4_end_ && column4x4 < column4x4_end_;
+ }
+
+ BlockParameters** BlockParametersAddress(int row4x4, int column4x4) const {
+ return block_parameters_holder_.Address(row4x4, column4x4);
+ }
+
+ int BlockParametersStride() const {
+ return block_parameters_holder_.columns4x4();
+ }
+
+ // Returns true if Parameters() can be called with |row| and |column| as
+ // inputs, false otherwise.
+ bool HasParameters(int row, int column) const {
+ return block_parameters_holder_.Find(row, column) != nullptr;
+ }
+ const BlockParameters& Parameters(int row, int column) const {
+ return *block_parameters_holder_.Find(row, column);
+ }
+
+ int number() const { return number_; }
+ int superblock_rows() const { return superblock_rows_; }
+ int superblock_columns() const { return superblock_columns_; }
+ int row4x4_start() const { return row4x4_start_; }
+ int column4x4_start() const { return column4x4_start_; }
+ int column4x4_end() const { return column4x4_end_; }
+
+ private:
+ // Stores the transform tree state when reading variable size transform trees
+ // and when applying the transform tree. When applying the transform tree,
+ // |depth| is not used.
+ struct TransformTreeNode {
+ // The default constructor is invoked by the Stack<TransformTreeNode, n>
+ // constructor. Stack<> does not use the default-constructed elements, so it
+ // is safe for the default constructor to not initialize the members.
+ TransformTreeNode() = default;
+ TransformTreeNode(int x, int y, TransformSize tx_size, int depth = -1)
+ : x(x), y(y), tx_size(tx_size), depth(depth) {}
+
+ int x;
+ int y;
+ TransformSize tx_size;
+ int depth;
+ };
+
+ // Enum to track the processing state of a superblock.
+ enum SuperBlockState : uint8_t {
+ kSuperBlockStateNone, // Not yet parsed or decoded.
+ kSuperBlockStateParsed, // Parsed but not yet decoded.
+ kSuperBlockStateScheduled, // Scheduled for decoding.
+ kSuperBlockStateDecoded // Parsed and decoded.
+ };
+
+ // Parameters used to facilitate multi-threading within the Tile.
+ struct ThreadingParameters {
+ std::mutex mutex;
+ // 2d array of size |superblock_rows_| by |superblock_columns_| containing
+ // the processing state of each superblock.
+ Array2D<SuperBlockState> sb_state LIBGAV1_GUARDED_BY(mutex);
+ // Variable used to indicate either parse or decode failure.
+ bool abort LIBGAV1_GUARDED_BY(mutex) = false;
+ int pending_jobs LIBGAV1_GUARDED_BY(mutex) = 0;
+ std::condition_variable pending_jobs_zero_condvar;
+ };
+
+ // The residual pointer is used to traverse the |residual_buffer_|. It is
+ // used in two different ways.
+ // If |split_parse_and_decode_| is true:
+ // The pointer points to the beginning of the |residual_buffer_| when the
+ // "parse" and "decode" steps begin. It is then moved forward tx_size in
+ // each iteration of the "parse" and the "decode" steps. In this case, the
+ // ResidualPtr variable passed into various functions starting from
+ // ProcessSuperBlock is used as an in/out parameter to keep track of the
+ // residual pointer.
+ // If |split_parse_and_decode_| is false:
+ // The pointer is reset to the beginning of the |residual_buffer_| for
+ // every transform block.
+ using ResidualPtr = uint8_t*;
+
+ Tile(int tile_number, const uint8_t* data, size_t size,
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header, RefCountedBuffer* current_frame,
+ const DecoderState& state, FrameScratchBuffer* frame_scratch_buffer,
+ const WedgeMaskArray& wedge_masks,
+ const QuantizerMatrix& quantizer_matrix,
+ SymbolDecoderContext* saved_symbol_decoder_context,
+ const SegmentationMap* prev_segment_ids, PostFilter* post_filter,
+ const dsp::Dsp* dsp, ThreadPool* thread_pool,
+ BlockingCounterWithStatus* pending_tiles, bool frame_parallel,
+ bool use_intra_prediction_buffer);
+
+ // Performs member initializations that may fail. Helper function used by
+ // Create().
+ LIBGAV1_MUST_USE_RESULT bool Init();
+
+ // Saves the symbol decoder context of this tile into
+ // |saved_symbol_decoder_context_| if necessary.
+ void SaveSymbolDecoderContext();
+
+ // Entry point for multi-threaded decoding. This function performs the same
+ // functionality as ParseAndDecode(). The current thread does the "parse" step
+ // while the worker threads do the "decode" step.
+ bool ThreadedParseAndDecode();
+
+ // Returns whether or not the prerequisites for decoding the superblock at
+ // |row_index| and |column_index| are satisfied. |threading_.mutex| must be
+ // held when calling this function.
+ bool CanDecode(int row_index, int column_index) const;
+
+ // This function is run by the worker threads when multi-threaded decoding is
+ // enabled. Once a superblock is decoded, this function will set the
+ // corresponding |threading_.sb_state| entry to kSuperBlockStateDecoded. On
+ // failure, |threading_.abort| will be set to true. If at any point
+ // |threading_.abort| becomes true, this function will return as early as it
+ // can. If the decoding succeeds, this function will also schedule the
+ // decoding jobs for the superblock to the bottom-left and the superblock to
+ // the right of this superblock (if it is allowed).
+ void DecodeSuperBlock(int row_index, int column_index, int block_width4x4);
+
+ // If |use_intra_prediction_buffer_| is true, then this function copies the
+ // last row of the superblock row starting at |row4x4| into the
+ // |intra_prediction_buffer_| (which may be used by the intra prediction
+ // process for the next superblock row).
+ void PopulateIntraPredictionBuffer(int row4x4);
+
+ uint16_t* GetPartitionCdf(int row4x4, int column4x4, BlockSize block_size);
+ bool ReadPartition(int row4x4, int column4x4, BlockSize block_size,
+ bool has_rows, bool has_columns, Partition* partition);
+ // Processes the Partition starting at |row4x4_start|, |column4x4_start|
+ // iteratively. It performs a DFS traversal over the partition tree to process
+ // the blocks in the right order.
+ bool ProcessPartition(
+ int row4x4_start, int column4x4_start, ParameterTree* root,
+ TileScratchBuffer* scratch_buffer,
+ ResidualPtr* residual); // Iterative implementation of 5.11.4.
+ bool ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
+ ParameterTree* tree, TileScratchBuffer* scratch_buffer,
+ ResidualPtr* residual); // 5.11.5.
+ void ResetCdef(int row4x4, int column4x4); // 5.11.55.
+
+ // This function is used to decode a superblock when the parsing has already
+ // been done for that superblock.
+ bool DecodeSuperBlock(ParameterTree* tree, TileScratchBuffer* scratch_buffer,
+ ResidualPtr* residual);
+ // Helper function used by DecodeSuperBlock(). Note that the decode_block()
+ // function in the spec is equivalent to ProcessBlock() in the code.
+ bool DecodeBlock(ParameterTree* tree, TileScratchBuffer* scratch_buffer,
+ ResidualPtr* residual);
+
+ void ClearBlockDecoded(TileScratchBuffer* scratch_buffer, int row4x4,
+ int column4x4); // 5.11.3.
+ bool ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4,
+ TileScratchBuffer* scratch_buffer,
+ ProcessingMode mode);
+ void ResetLoopRestorationParams();
+ void ReadLoopRestorationCoefficients(int row4x4, int column4x4,
+ BlockSize block_size); // 5.11.57.
+
+ // Helper functions for DecodeBlock.
+ bool ReadSegmentId(const Block& block); // 5.11.9.
+ bool ReadIntraSegmentId(const Block& block); // 5.11.8.
+ void ReadSkip(const Block& block); // 5.11.11.
+ void ReadSkipMode(const Block& block); // 5.11.10.
+ void ReadCdef(const Block& block); // 5.11.56.
+ // Returns the new value. |cdf| is an array of size kDeltaSymbolCount + 1.
+ int ReadAndClipDelta(uint16_t* cdf, int delta_small, int scale, int min_value,
+ int max_value, int value);
+ void ReadQuantizerIndexDelta(const Block& block); // 5.11.12.
+ void ReadLoopFilterDelta(const Block& block); // 5.11.13.
+ // Populates |BlockParameters::deblock_filter_level| for the given |block|
+ // using |deblock_filter_levels_|.
+ void PopulateDeblockFilterLevel(const Block& block);
+ void ReadPredictionModeY(const Block& block, bool intra_y_mode);
+ void ReadIntraAngleInfo(const Block& block,
+ PlaneType plane_type); // 5.11.42 and 5.11.43.
+ void ReadPredictionModeUV(const Block& block);
+ void ReadCflAlpha(const Block& block); // 5.11.45.
+ int GetPaletteCache(const Block& block, PlaneType plane_type,
+ uint16_t* cache);
+ void ReadPaletteColors(const Block& block, Plane plane);
+ void ReadPaletteModeInfo(const Block& block); // 5.11.46.
+ void ReadFilterIntraModeInfo(const Block& block); // 5.11.24.
+ int ReadMotionVectorComponent(const Block& block,
+ int component); // 5.11.32.
+ void ReadMotionVector(const Block& block, int index); // 5.11.31.
+ bool DecodeIntraModeInfo(const Block& block); // 5.11.7.
+ int8_t ComputePredictedSegmentId(const Block& block) const; // 5.11.21.
+ bool ReadInterSegmentId(const Block& block, bool pre_skip); // 5.11.19.
+ void ReadIsInter(const Block& block); // 5.11.20.
+ bool ReadIntraBlockModeInfo(const Block& block,
+ bool intra_y_mode); // 5.11.22.
+ CompoundReferenceType ReadCompoundReferenceType(const Block& block);
+ template <bool is_single, bool is_backward, int index>
+ uint16_t* GetReferenceCdf(const Block& block, CompoundReferenceType type =
+ kNumCompoundReferenceTypes);
+ void ReadReferenceFrames(const Block& block); // 5.11.25.
+ void ReadInterPredictionModeY(const Block& block,
+ const MvContexts& mode_contexts);
+ void ReadRefMvIndex(const Block& block);
+ void ReadInterIntraMode(const Block& block, bool is_compound); // 5.11.28.
+ bool IsScaled(ReferenceFrameType type) const { // Part of 5.11.27.
+ const int index =
+ frame_header_.reference_frame_index[type - kReferenceFrameLast];
+ return reference_frames_[index]->upscaled_width() != frame_header_.width ||
+ reference_frames_[index]->frame_height() != frame_header_.height;
+ }
+ void ReadMotionMode(const Block& block, bool is_compound); // 5.11.27.
+ uint16_t* GetIsExplicitCompoundTypeCdf(const Block& block);
+ uint16_t* GetIsCompoundTypeAverageCdf(const Block& block);
+ void ReadCompoundType(const Block& block, bool is_compound); // 5.11.29.
+ uint16_t* GetInterpolationFilterCdf(const Block& block, int direction);
+ void ReadInterpolationFilter(const Block& block);
+ bool ReadInterBlockModeInfo(const Block& block); // 5.11.23.
+ bool DecodeInterModeInfo(const Block& block); // 5.11.18.
+ bool DecodeModeInfo(const Block& block); // 5.11.6.
+ bool IsMvValid(const Block& block, bool is_compound) const; // 6.10.25.
+ bool AssignInterMv(const Block& block, bool is_compound); // 5.11.26.
+ bool AssignIntraMv(const Block& block); // 5.11.26.
+ int GetTopTransformWidth(const Block& block, int row4x4, int column4x4,
+ bool ignore_skip);
+ int GetLeftTransformHeight(const Block& block, int row4x4, int column4x4,
+ bool ignore_skip);
+ TransformSize ReadFixedTransformSize(const Block& block); // 5.11.15.
+ // Iterative implementation of 5.11.17.
+ void ReadVariableTransformTree(const Block& block, int row4x4, int column4x4,
+ TransformSize tx_size);
+ void DecodeTransformSize(const Block& block); // 5.11.16.
+ bool ComputePrediction(const Block& block); // 5.11.33.
+ // |x4| and |y4| are the column and row positions of the 4x4 block. |w4| and
+ // |h4| are the width and height in 4x4 units of |tx_size|.
+ int GetTransformAllZeroContext(const Block& block, Plane plane,
+ TransformSize tx_size, int x4, int y4, int w4,
+ int h4);
+ TransformSet GetTransformSet(TransformSize tx_size,
+ bool is_inter) const; // 5.11.48.
+ TransformType ComputeTransformType(const Block& block, Plane plane,
+ TransformSize tx_size, int block_x,
+ int block_y); // 5.11.40.
+ void ReadTransformType(const Block& block, int x4, int y4,
+ TransformSize tx_size); // 5.11.47.
+ template <typename ResidualType>
+ void ReadCoeffBase2D(
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* quantized_buffer, uint8_t* level_buffer);
+ template <typename ResidualType>
+ void ReadCoeffBaseHorizontal(
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* quantized_buffer, uint8_t* level_buffer);
+ template <typename ResidualType>
+ void ReadCoeffBaseVertical(
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* quantized_buffer, uint8_t* level_buffer);
+ int GetDcSignContext(int x4, int y4, int w4, int h4, Plane plane);
+ void SetEntropyContexts(int x4, int y4, int w4, int h4, Plane plane,
+ uint8_t coefficient_level, int8_t dc_category);
+ void InterIntraPrediction(
+ uint16_t* prediction_0, const uint8_t* prediction_mask,
+ ptrdiff_t prediction_mask_stride,
+ const PredictionParameters& prediction_parameters, int prediction_width,
+ int prediction_height, int subsampling_x, int subsampling_y,
+ uint8_t* dest,
+ ptrdiff_t dest_stride); // Part of section 7.11.3.1 in the spec.
+ void CompoundInterPrediction(
+ const Block& block, const uint8_t* prediction_mask,
+ ptrdiff_t prediction_mask_stride, int prediction_width,
+ int prediction_height, int subsampling_x, int subsampling_y,
+ int candidate_row, int candidate_column, uint8_t* dest,
+ ptrdiff_t dest_stride); // Part of section 7.11.3.1 in the spec.
+ GlobalMotion* GetWarpParams(const Block& block, Plane plane,
+ int prediction_width, int prediction_height,
+ const PredictionParameters& prediction_parameters,
+ ReferenceFrameType reference_type,
+ bool* is_local_valid,
+ GlobalMotion* global_motion_params,
+ GlobalMotion* local_warp_params)
+ const; // Part of section 7.11.3.1 in the spec.
+ bool InterPrediction(const Block& block, Plane plane, int x, int y,
+ int prediction_width, int prediction_height,
+ int candidate_row, int candidate_column,
+ bool* is_local_valid,
+ GlobalMotion* local_warp_params); // 7.11.3.1.
+ void ScaleMotionVector(const MotionVector& mv, Plane plane,
+ int reference_frame_index, int x, int y, int* start_x,
+ int* start_y, int* step_x, int* step_y); // 7.11.3.3.
+ // If the method returns false, the caller only uses the output parameters
+ // *ref_block_start_x and *ref_block_start_y. If the method returns true, the
+ // caller uses all three output parameters.
+ static bool GetReferenceBlockPosition(
+ int reference_frame_index, bool is_scaled, int width, int height,
+ int ref_start_x, int ref_last_x, int ref_start_y, int ref_last_y,
+ int start_x, int start_y, int step_x, int step_y, int left_border,
+ int right_border, int top_border, int bottom_border,
+ int* ref_block_start_x, int* ref_block_start_y, int* ref_block_end_x);
+
+ template <typename Pixel>
+ void BuildConvolveBlock(Plane plane, int reference_frame_index,
+ bool is_scaled, int height, int ref_start_x,
+ int ref_last_x, int ref_start_y, int ref_last_y,
+ int step_y, int ref_block_start_x,
+ int ref_block_end_x, int ref_block_start_y,
+ uint8_t* block_buffer,
+ ptrdiff_t convolve_buffer_stride,
+ ptrdiff_t block_extended_width);
+ bool BlockInterPrediction(const Block& block, Plane plane,
+ int reference_frame_index, const MotionVector& mv,
+ int x, int y, int width, int height,
+ int candidate_row, int candidate_column,
+ uint16_t* prediction, bool is_compound,
+ bool is_inter_intra, uint8_t* dest,
+ ptrdiff_t dest_stride); // 7.11.3.4.
+ bool BlockWarpProcess(const Block& block, Plane plane, int index,
+ int block_start_x, int block_start_y, int width,
+ int height, GlobalMotion* warp_params, bool is_compound,
+ bool is_inter_intra, uint8_t* dest,
+ ptrdiff_t dest_stride); // 7.11.3.5.
+ bool ObmcBlockPrediction(const Block& block, const MotionVector& mv,
+ Plane plane, int reference_frame_index, int width,
+ int height, int x, int y, int candidate_row,
+ int candidate_column,
+ ObmcDirection blending_direction);
+ bool ObmcPrediction(const Block& block, Plane plane, int width,
+ int height); // 7.11.3.9.
+ void DistanceWeightedPrediction(void* prediction_0, void* prediction_1,
+ int width, int height, int candidate_row,
+ int candidate_column, uint8_t* dest,
+ ptrdiff_t dest_stride); // 7.11.3.15.
+ // This function specializes the parsing of the DC coefficient by removing some of
+ // the branches when i == 0 (since scan[0] is always 0 and scan[i] is always
+ // non-zero for all other possible values of i). |dc_category| is an output
+ // parameter that is populated when |is_dc_coefficient| is true.
+ // |coefficient_level| is an output parameter which accumulates the
+ // coefficient level.
+ template <typename ResidualType, bool is_dc_coefficient>
+ LIBGAV1_ALWAYS_INLINE bool ReadSignAndApplyDequantization(
+ const uint16_t* scan, int i, int q_value, const uint8_t* quantizer_matrix,
+ int shift, int max_value, uint16_t* dc_sign_cdf, int8_t* dc_category,
+ int* coefficient_level,
+ ResidualType* residual_buffer); // Part of 5.11.39.
+ int ReadCoeffBaseRange(uint16_t* cdf); // Part of 5.11.39.
+ // Returns the number of non-zero coefficients that were read. |tx_type| is an
+ // output parameter that stores the computed transform type for the plane
+ // whose coefficients were read. Returns -1 on failure.
+ template <typename ResidualType>
+ int ReadTransformCoefficients(const Block& block, Plane plane, int start_x,
+ int start_y, TransformSize tx_size,
+ TransformType* tx_type); // 5.11.39.
+ bool TransformBlock(const Block& block, Plane plane, int base_x, int base_y,
+ TransformSize tx_size, int x, int y,
+ ProcessingMode mode); // 5.11.35.
+ // Iterative implementation of 5.11.36.
+ bool TransformTree(const Block& block, int start_x, int start_y,
+ BlockSize plane_size, ProcessingMode mode);
+ void ReconstructBlock(const Block& block, Plane plane, int start_x,
+ int start_y, TransformSize tx_size,
+ TransformType tx_type,
+ int non_zero_coeff_count); // Part of 7.12.3.
+ bool Residual(const Block& block, ProcessingMode mode); // 5.11.34.
+ // part of 5.11.5 (reset_block_context() in the spec).
+ void ResetEntropyContext(const Block& block);
+ // Populates the |color_context| and |color_order| for the |i|th iteration
+ // with entries counting down from |start| to |end| (|start| > |end|).
+ void PopulatePaletteColorContexts(
+ const Block& block, PlaneType plane_type, int i, int start, int end,
+ uint8_t color_order[kMaxPaletteSquare][kMaxPaletteSize],
+ uint8_t color_context[kMaxPaletteSquare]); // 5.11.50.
+ bool ReadPaletteTokens(const Block& block); // 5.11.49.
+ template <typename Pixel>
+ void IntraPrediction(const Block& block, Plane plane, int x, int y,
+ bool has_left, bool has_top, bool has_top_right,
+ bool has_bottom_left, PredictionMode mode,
+ TransformSize tx_size);
+ bool IsSmoothPrediction(int row, int column, Plane plane) const;
+ int GetIntraEdgeFilterType(const Block& block,
+ Plane plane) const; // 7.11.2.8.
+ template <typename Pixel>
+ void DirectionalPrediction(const Block& block, Plane plane, int x, int y,
+ bool has_left, bool has_top, bool needs_left,
+ bool needs_top, int prediction_angle, int width,
+ int height, int max_x, int max_y,
+ TransformSize tx_size, Pixel* top_row,
+ Pixel* left_column); // 7.11.2.4.
+ template <typename Pixel>
+ void PalettePrediction(const Block& block, Plane plane, int start_x,
+ int start_y, int x, int y,
+ TransformSize tx_size); // 7.11.4.
+ template <typename Pixel>
+ void ChromaFromLumaPrediction(const Block& block, Plane plane, int start_x,
+ int start_y,
+ TransformSize tx_size); // 7.11.5.
+ // Section 7.19. Applies some filtering and reordering to the motion vectors
+ // for the given |block| and stores them into |current_frame_|.
+ void StoreMotionFieldMvsIntoCurrentFrame(const Block& block);
+
+ // Returns the zero-based index of the super block that contains |row4x4|
+ // relative to the start of this tile.
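+ // A 128x128 superblock spans 32 rows of 4x4 blocks and a 64x64 superblock
+ // spans 16, hence the shift by 5 or 4. For example (illustrative), with
+ // 64x64 superblocks and |row4x4_start_| equal to 0, |row4x4| = 70 maps to
+ // superblock row index 70 >> 4 = 4.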
+ int SuperBlockRowIndex(int row4x4) const {
+ return (row4x4 - row4x4_start_) >>
+ (sequence_header_.use_128x128_superblock ? 5 : 4);
+ }
+
+ // Returns the zero-based index of the super block that contains |column4x4|
+ // relative to the start of this tile.
+ int SuperBlockColumnIndex(int column4x4) const {
+ return (column4x4 - column4x4_start_) >>
+ (sequence_header_.use_128x128_superblock ? 5 : 4);
+ }
+
+ BlockSize SuperBlockSize() const {
+ return sequence_header_.use_128x128_superblock ? kBlock128x128
+ : kBlock64x64;
+ }
+ int PlaneCount() const {
+ return sequence_header_.color_config.is_monochrome ? kMaxPlanesMonochrome
+ : kMaxPlanes;
+ }
+
+ const int number_;
+ const int row_;
+ const int column_;
+ const uint8_t* const data_;
+ size_t size_;
+ int row4x4_start_;
+ int row4x4_end_;
+ int column4x4_start_;
+ int column4x4_end_;
+ int superblock_rows_;
+ int superblock_columns_;
+ bool read_deltas_;
+ const int8_t subsampling_x_[kMaxPlanes];
+ const int8_t subsampling_y_[kMaxPlanes];
+ int deblock_row_limit_[kMaxPlanes];
+ int deblock_column_limit_[kMaxPlanes];
+
+ // The dimensions (in order) are: segment_id, level_index (based on plane and
+ // direction), reference_frame and mode_id.
+ uint8_t deblock_filter_levels_[kMaxSegments][kFrameLfCount]
+ [kNumReferenceFrameTypes][2];
+
+ // current_quantizer_index_ is in the range [0, 255].
+ uint8_t current_quantizer_index_;
+ // These two arrays (|coefficient_levels_| and |dc_categories_|) are used to
+ // store the entropy context. Their dimensions are as follows: First -
+ // left/top; Second - plane; Third - row4x4 (if first dimension is
+ // left)/column4x4 (if first dimension is top).
+ //
+ // This is equivalent to the LeftLevelContext and AboveLevelContext arrays in
+ // the spec. In the spec, it stores values from 0 through 63 (inclusive). The
+ // stored values are used to compute the left and top contexts in
+ // GetTransformAllZeroContext. In that function, we only care about the
+ // following values: 0, 1, 2, 3 and >= 4. So instead of clamping to 63, we
+ // clamp to 4 (i.e., all values greater than 4 are stored as 4).
+ std::array<Array2D<uint8_t>, 2> coefficient_levels_;
+ // This is equivalent to the LeftDcContext and AboveDcContext arrays in the
+ // spec. In the spec, it can store 3 possible values: 0, 1 and 2 (where 1
+ // means the value is < 0, 2 means the value is > 0 and 0 means the value is
+ // equal to 0).
+ //
+ // The stored values are used in two places:
+ // * GetTransformAllZeroContext: Here, we only care about whether the
+ // value is 0 or not (whether it is 1 or 2 is irrelevant).
+ // * GetDcSignContext: Here, we do the following computation: if the
+ // stored value is 1, we decrement a counter. If the stored value is 2
+ // we increment a counter.
+ //
+ // Based on this usage, we can simply replace 1 with -1 and 2 with 1 and
+ // use that value to compute the counter.
+ //
+ // The usage in GetTransformAllZeroContext is unaffected since there we
+ // only care about whether the value is 0 or not.
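+ //
+ // For example (illustrative): in GetDcSignContext, summing the stored
+ // -1/0/+1 values over the relevant top and left entries directly yields the
+ // count of positive DC neighbors minus the count of negative ones, which is
+ // the quantity the spec's counter tracks.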
+ std::array<Array2D<int8_t>, 2> dc_categories_;
+ const ObuSequenceHeader& sequence_header_;
+ const ObuFrameHeader& frame_header_;
+ const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias_;
+ const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+ reference_frames_;
+ TemporalMotionField& motion_field_;
+ const std::array<uint8_t, kNumReferenceFrameTypes>& reference_order_hint_;
+ const WedgeMaskArray& wedge_masks_;
+ const QuantizerMatrix& quantizer_matrix_;
+ DaalaBitReader reader_;
+ SymbolDecoderContext symbol_decoder_context_;
+ SymbolDecoderContext* const saved_symbol_decoder_context_;
+ const SegmentationMap* prev_segment_ids_;
+ const dsp::Dsp& dsp_;
+ PostFilter& post_filter_;
+ BlockParametersHolder& block_parameters_holder_;
+ Quantizer quantizer_;
+ // When there is no multi-threading within the Tile, |residual_buffer_| is
+ // used. When there is multi-threading within the Tile,
+ // |residual_buffer_threaded_| is used. In the following comment,
+ // |residual_buffer| refers to either |residual_buffer_| or
+ // |residual_buffer_threaded_| depending on whether multi-threading is enabled
+ // within the Tile or not.
+ // The |residual_buffer| is used to help with the dequantization and the
+ // inverse transform processes. It is declared as a uint8_t, but is always
+ // accessed either as an int16_t or int32_t depending on |bitdepth|. Here is
+ // what it stores at various stages of the decoding process (in the order
+ // which they happen):
+ // 1) In ReadTransformCoefficients(), this buffer is used to store the
+ // dequantized values.
+ // 2) In Reconstruct(), this buffer is used as the input to the row
+ // transform process.
+ // The size of this buffer would be:
+ // For |residual_buffer_|: (4096 + 32 * |kResidualPaddingVertical|) *
+ // |residual_size_|, where 4096 = 64x64 is the maximum transform
+ // size and 32 * |kResidualPaddingVertical| is the padding to avoid
+ // bottom boundary checks when parsing quantized coefficients. This
+ // memory is allocated and owned by the Tile class.
+ // For |residual_buffer_threaded_|: See the comment below. This memory is
+ // not allocated or owned by the Tile class.
+ AlignedUniquePtr<uint8_t> residual_buffer_;
+ // This is a 2d array of pointers of size |superblock_rows_| by
+ // |superblock_columns_| where each pointer points to a ResidualBuffer for a
+ // single super block. The array is populated when the parsing process begins
+ // by calling |residual_buffer_pool_->Get()| and the memory is released back
+ // to the pool by calling |residual_buffer_pool_->Release()| when the decoding
+ // process is complete.
+ Array2D<std::unique_ptr<ResidualBuffer>> residual_buffer_threaded_;
+ // sizeof(int16_t or int32_t) depending on |bitdepth|.
+ const size_t residual_size_;
+ // Number of superblocks on the top-right that will have to be decoded before
+ // the current superblock can be decoded. This will be 1 if allow_intrabc is
+ // false. If allow_intrabc is true, then this value will be
+ // use_128x128_superblock ? 3 : 5. This is the allowed range of reference for
+ // the top rows for intrabc.
+ const int intra_block_copy_lag_;
+
+ // In the Tile class, we use the "current_frame" in two ways:
+ // 1) To write the decoded output into (using the |buffer_| view).
+ // 2) To read the pixels for intra block copy (using the |current_frame_|
+ // reference).
+ //
+ // When intra block copy is off, |buffer_| and |current_frame_| may or may not
+ // point to the same plane pointers. But it is okay since |current_frame_| is
+ // never used in this case.
+ //
+ // When intra block copy is on, |buffer_| and |current_frame_| always point to
+ // the same plane pointers (since post filtering is disabled). So the usage in
+ // both case 1 and case 2 remain valid.
+ Array2DView<uint8_t> buffer_[kMaxPlanes];
+ RefCountedBuffer& current_frame_;
+
+ Array2D<int16_t>& cdef_index_;
+ Array2D<TransformSize>& inter_transform_sizes_;
+ std::array<RestorationUnitInfo, kMaxPlanes> reference_unit_info_;
+ // If |thread_pool_| is nullptr, the calling thread will do the parsing and
+ // the decoding in one pass. If |thread_pool_| is not nullptr, then the main
+ // thread will do the parsing while the thread pool workers will do the
+ // decoding.
+ ThreadPool* const thread_pool_;
+ ThreadingParameters threading_;
+ ResidualBufferPool* const residual_buffer_pool_;
+ TileScratchBufferPool* const tile_scratch_buffer_pool_;
+ BlockingCounterWithStatus* const pending_tiles_;
+ bool split_parse_and_decode_;
+ // This is used only when |split_parse_and_decode_| is false.
+ std::unique_ptr<PredictionParameters> prediction_parameters_ = nullptr;
+ // Stores the |transform_type| for the super block being decoded at a 4x4
+ // granularity. The spec uses absolute indices for this array but it is
+ // sufficient to use indices relative to the super block being decoded.
+ TransformType transform_types_[32][32];
+ // delta_lf_[i] is in the range [-63, 63].
+ int8_t delta_lf_[kFrameLfCount];
+ // True if all the values in |delta_lf_| are zero. False otherwise.
+ bool delta_lf_all_zero_;
+ const bool frame_parallel_;
+ const bool use_intra_prediction_buffer_;
+ // Buffer used to store the unfiltered pixels that are necessary for decoding
+ // the next superblock row (for the intra prediction process). Used only if
+ // |use_intra_prediction_buffer_| is true. The |frame_scratch_buffer| contains
+ // one row buffer for each tile row. This tile will have to use the buffer
+ // corresponding to this tile's row.
+ IntraPredictionBuffer* const intra_prediction_buffer_;
+ // Stores the progress of the reference frames. This will be used to avoid
+ // unnecessary calls into RefCountedBuffer::WaitUntil().
+ std::array<int, kNumReferenceFrameTypes> reference_frame_progress_cache_;
+};
+
+struct Tile::Block {
+ Block(const Tile& tile, BlockSize size, int row4x4, int column4x4,
+ TileScratchBuffer* const scratch_buffer, ResidualPtr* residual)
+ : tile(tile),
+ size(size),
+ row4x4(row4x4),
+ column4x4(column4x4),
+ width(kBlockWidthPixels[size]),
+ height(kBlockHeightPixels[size]),
+ width4x4(width >> 2),
+ height4x4(height >> 2),
+ scratch_buffer(scratch_buffer),
+ residual(residual) {
+ assert(size != kBlockInvalid);
+ residual_size[kPlaneY] = kPlaneResidualSize[size][0][0];
+ residual_size[kPlaneU] = residual_size[kPlaneV] =
+ kPlaneResidualSize[size][tile.subsampling_x_[kPlaneU]]
+ [tile.subsampling_y_[kPlaneU]];
+ assert(residual_size[kPlaneY] != kBlockInvalid);
+ if (tile.PlaneCount() > 1) {
+ assert(residual_size[kPlaneU] != kBlockInvalid);
+ }
+ if ((row4x4 & 1) == 0 &&
+ (tile.sequence_header_.color_config.subsampling_y & height4x4) == 1) {
+ has_chroma = false;
+ } else if ((column4x4 & 1) == 0 &&
+ (tile.sequence_header_.color_config.subsampling_x & width4x4) ==
+ 1) {
+ has_chroma = false;
+ } else {
+ has_chroma = !tile.sequence_header_.color_config.is_monochrome;
+ }
+ top_available[kPlaneY] = tile.IsTopInside(row4x4);
+ left_available[kPlaneY] = tile.IsLeftInside(column4x4);
+ if (has_chroma) {
+ // top_available[kPlaneU] and top_available[kPlaneV] are valid only if
+ // has_chroma is true.
+ // The next 3 lines are equivalent to:
+ // top_available[kPlaneU] = top_available[kPlaneV] =
+ // top_available[kPlaneY] &&
+ // ((tile.sequence_header_.color_config.subsampling_y & height4x4) ==
+ // 0 || tile.IsTopInside(row4x4 - 1));
+ top_available[kPlaneU] = top_available[kPlaneV] = tile.IsTopInside(
+ row4x4 -
+ (tile.sequence_header_.color_config.subsampling_y & height4x4));
+ // left_available[kPlaneU] and left_available[kPlaneV] are valid only if
+ // has_chroma is true.
+ // The next 3 lines are equivalent to:
+ // left_available[kPlaneU] = left_available[kPlaneV] =
+ // left_available[kPlaneY] &&
+ // ((tile.sequence_header_.color_config.subsampling_x & width4x4) == 0
+ // || tile.IsLeftInside(column4x4 - 1));
+ left_available[kPlaneU] = left_available[kPlaneV] = tile.IsLeftInside(
+ column4x4 -
+ (tile.sequence_header_.color_config.subsampling_x & width4x4));
+ }
+ const ptrdiff_t stride = tile.BlockParametersStride();
+ BlockParameters** const bps =
+ tile.BlockParametersAddress(row4x4, column4x4);
+ bp = *bps;
+ // bp_top is valid only if top_available[kPlaneY] is true.
+ if (top_available[kPlaneY]) {
+ bp_top = *(bps - stride);
+ }
+ // bp_left is valid only if left_available[kPlaneY] is true.
+ if (left_available[kPlaneY]) {
+ bp_left = *(bps - 1);
+ }
+ }
+
+ bool HasChroma() const { return has_chroma; }
+
+ // The return values of this group of functions are valid only if the
+ // corresponding top_available or left_available is true.
+ ReferenceFrameType TopReference(int index) const {
+ return bp_top->reference_frame[index];
+ }
+
+ ReferenceFrameType LeftReference(int index) const {
+ return bp_left->reference_frame[index];
+ }
+
+ bool IsTopIntra() const { return TopReference(0) <= kReferenceFrameIntra; }
+ bool IsLeftIntra() const { return LeftReference(0) <= kReferenceFrameIntra; }
+
+ bool IsTopSingle() const { return TopReference(1) <= kReferenceFrameIntra; }
+ bool IsLeftSingle() const { return LeftReference(1) <= kReferenceFrameIntra; }
+
+ int CountReferences(ReferenceFrameType type) const {
+ return static_cast<int>(top_available[kPlaneY] &&
+ bp_top->reference_frame[0] == type) +
+ static_cast<int>(top_available[kPlaneY] &&
+ bp_top->reference_frame[1] == type) +
+ static_cast<int>(left_available[kPlaneY] &&
+ bp_left->reference_frame[0] == type) +
+ static_cast<int>(left_available[kPlaneY] &&
+ bp_left->reference_frame[1] == type);
+ }
+
+ // 7.10.3.
+ // Checks if there are any inter blocks to the left or above. If so, it
+ // returns true indicating that the block has neighbors that are suitable for
+ // use by overlapped motion compensation.
+ bool HasOverlappableCandidates() const {
+ const ptrdiff_t stride = tile.BlockParametersStride();
+ BlockParameters** const bps = tile.BlockParametersAddress(0, 0);
+ if (top_available[kPlaneY]) {
+ BlockParameters** bps_top = bps + (row4x4 - 1) * stride + (column4x4 | 1);
+ const int columns = std::min(tile.frame_header_.columns4x4 - column4x4,
+ static_cast<int>(width4x4));
+ BlockParameters** const bps_top_end = bps_top + columns;
+ do {
+ if ((*bps_top)->reference_frame[0] > kReferenceFrameIntra) {
+ return true;
+ }
+ bps_top += 2;
+ } while (bps_top < bps_top_end);
+ }
+ if (left_available[kPlaneY]) {
+ BlockParameters** bps_left = bps + (row4x4 | 1) * stride + column4x4 - 1;
+ const int rows = std::min(tile.frame_header_.rows4x4 - row4x4,
+ static_cast<int>(height4x4));
+ BlockParameters** const bps_left_end = bps_left + rows * stride;
+ do {
+ if ((*bps_left)->reference_frame[0] > kReferenceFrameIntra) {
+ return true;
+ }
+ bps_left += 2 * stride;
+ } while (bps_left < bps_left_end);
+ }
+ return false;
+ }
+
+ const Tile& tile;
+ bool has_chroma;
+ const BlockSize size;
+ bool top_available[kMaxPlanes];
+ bool left_available[kMaxPlanes];
+ BlockSize residual_size[kMaxPlanes];
+ const int row4x4;
+ const int column4x4;
+ const int width;
+ const int height;
+ const int width4x4;
+ const int height4x4;
+ const BlockParameters* bp_top;
+ const BlockParameters* bp_left;
+ BlockParameters* bp;
+ TileScratchBuffer* const scratch_buffer;
+ ResidualPtr* const residual;
+};
+
+extern template bool
+Tile::ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+ int row4x4, TileScratchBuffer* scratch_buffer);
+extern template bool
+Tile::ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+ int row4x4, TileScratchBuffer* scratch_buffer);
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_TILE_H_
diff --git a/src/tile/bitstream/mode_info.cc b/src/tile/bitstream/mode_info.cc
new file mode 100644
index 0000000..0b22eb0
--- /dev/null
+++ b/src/tile/bitstream/mode_info.cc
@@ -0,0 +1,1303 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#include "src/buffer_pool.h"
+#include "src/dsp/constants.h"
+#include "src/motion_vector.h"
+#include "src/obu_parser.h"
+#include "src/prediction_mask.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/logging.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kDeltaQSmall = 3;
+constexpr int kDeltaLfSmall = 3;
+
+constexpr uint8_t kIntraYModeContext[kIntraPredictionModesY] = {
+ 0, 1, 2, 3, 4, 4, 4, 4, 3, 0, 1, 2, 0};
+
+constexpr uint8_t kSizeGroup[kMaxBlockSizes] = {
+ 0, 0, 0, 0, 1, 1, 1, 0, 1, 2, 2, 2, 1, 2, 3, 3, 2, 3, 3, 3, 3, 3};
+
+constexpr int kCompoundModeNewMvContexts = 5;
+constexpr uint8_t kCompoundModeContextMap[3][kCompoundModeNewMvContexts] = {
+ {0, 1, 1, 1, 1}, {1, 2, 3, 4, 4}, {4, 4, 5, 6, 7}};
+
+enum CflSign : uint8_t {
+ kCflSignZero = 0,
+ kCflSignNegative = 1,
+ kCflSignPositive = 2
+};
+
+// For each possible value of the combined signs (which is read from the
+// bitstream), this array stores the following: sign_u, sign_v, alpha_u_context,
+// alpha_v_context. Only the non-negative context entries are used. Entry at
+// index i is computed
+// as follows:
+// sign_u = (i + 1) / 3
+// sign_v = (i + 1) % 3
+// alpha_u_context = i - 2
+// alpha_v_context = (sign_v - 1) * 3 + sign_u
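+//
+// For example (illustrative): index i = 4 gives sign_u = 5 / 3 = 1,
+// sign_v = 5 % 3 = 2, alpha_u_context = 4 - 2 = 2 and
+// alpha_v_context = (2 - 1) * 3 + 1 = 4, which matches the entry
+// {1, 2, 2, 4} below.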
+constexpr int8_t kCflAlphaLookup[kCflAlphaSignsSymbolCount][4] = {
+ {0, 1, -2, 0}, {0, 2, -1, 3}, {1, 0, 0, -2}, {1, 1, 1, 1},
+ {1, 2, 2, 4}, {2, 0, 3, -1}, {2, 1, 4, 2}, {2, 2, 5, 5},
+};
+
+constexpr BitMaskSet kPredictionModeHasNearMvMask(kPredictionModeNearMv,
+ kPredictionModeNearNearMv,
+ kPredictionModeNearNewMv,
+ kPredictionModeNewNearMv);
+
+constexpr BitMaskSet kIsInterIntraModeAllowedMask(kBlock8x8, kBlock8x16,
+ kBlock16x8, kBlock16x16,
+ kBlock16x32, kBlock32x16,
+ kBlock32x32);
+
+bool IsBackwardReference(ReferenceFrameType type) {
+ return type >= kReferenceFrameBackward && type <= kReferenceFrameAlternate;
+}
+
+bool IsSameDirectionReferencePair(ReferenceFrameType type1,
+ ReferenceFrameType type2) {
+ return (type1 >= kReferenceFrameBackward) ==
+ (type2 >= kReferenceFrameBackward);
+}
+
+// This is called neg_deinterleave() in the spec.
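+// It maps the coded |diff| to a segment id that zig-zags around |reference|.
+// For example (illustrative): with reference = 2 and max = 8, diff values
+// 0, 1, 2, 3 and 4 decode to 2, 3, 1, 4 and 0 respectively, and any larger
+// diff decodes to itself.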
+int DecodeSegmentId(int diff, int reference, int max) {
+ if (reference == 0) return diff;
+ if (reference >= max - 1) return max - diff - 1;
+ const int value = ((diff & 1) != 0) ? reference + ((diff + 1) >> 1)
+ : reference - (diff >> 1);
+ const int reference2 = (reference << 1);
+ if (reference2 < max) {
+ return (diff <= reference2) ? value : diff;
+ }
+ return (diff <= ((max - reference - 1) << 1)) ? value : max - (diff + 1);
+}
+
+// This is called DrlCtxStack in section 7.10.2.14 of the spec.
+// In the spec, the weights of all the nearest mvs are incremented by a bonus
+// weight which is larger than any natural weight, and the weights of the mvs
+// are compared with this bonus weight to determine their contexts. We replace
+// this procedure by introducing |nearest_mv_count| in PredictionParameters,
+// which records the count of the nearest mvs. Since all the nearest mvs are in
+// the beginning of the mv stack, the |index| of a mv in the mv stack can be
+// compared with |nearest_mv_count| to get that mv's context.
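+//
+// For example (illustrative): with |nearest_mv_count| = 2, the stack indices
+// 0, 1, 2 and 3 map to contexts 0, 1, 2 and 2 respectively.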
+int GetRefMvIndexContext(int nearest_mv_count, int index) {
+ if (index + 1 < nearest_mv_count) {
+ return 0;
+ }
+ if (index + 1 == nearest_mv_count) {
+ return 1;
+ }
+ return 2;
+}
+
+// Returns true if both the width and height of the block are less than 64.
+bool IsBlockDimensionLessThan64(BlockSize size) {
+ return size <= kBlock32x32 && size != kBlock16x64;
+}
+
+int GetUseCompoundReferenceContext(const Tile::Block& block) {
+ if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) {
+ if (block.IsTopSingle() && block.IsLeftSingle()) {
+ return static_cast<int>(IsBackwardReference(block.TopReference(0))) ^
+ static_cast<int>(IsBackwardReference(block.LeftReference(0)));
+ }
+ if (block.IsTopSingle()) {
+ return 2 + static_cast<int>(IsBackwardReference(block.TopReference(0)) ||
+ block.IsTopIntra());
+ }
+ if (block.IsLeftSingle()) {
+ return 2 + static_cast<int>(IsBackwardReference(block.LeftReference(0)) ||
+ block.IsLeftIntra());
+ }
+ return 4;
+ }
+ if (block.top_available[kPlaneY]) {
+ return block.IsTopSingle()
+ ? static_cast<int>(IsBackwardReference(block.TopReference(0)))
+ : 3;
+ }
+ if (block.left_available[kPlaneY]) {
+ return block.IsLeftSingle()
+ ? static_cast<int>(IsBackwardReference(block.LeftReference(0)))
+ : 3;
+ }
+ return 1;
+}
+
+// Calculates count0 by calling block.CountReferences() on the frame types from
+// type0_start to type0_end, inclusive, and summing the results.
+// Calculates count1 by calling block.CountReferences() on the frame types from
+// type1_start to type1_end, inclusive, and summing the results.
+// Compares count0 with count1 and returns 0, 1 or 2.
+//
+// See count_refs and ref_count_ctx in 8.3.2.
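+// For example, for the single_ref_p1 context (forward references
+// kReferenceFrameLast..kReferenceFrameGolden versus backward references
+// kReferenceFrameBackward..kReferenceFrameAlternate), if the neighboring
+// blocks contribute three references in the forward range and one in the
+// backward range, then count0 = 3 > count1 = 1 and the returned context is 2.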
+int GetReferenceContext(const Tile::Block& block,
+ ReferenceFrameType type0_start,
+ ReferenceFrameType type0_end,
+ ReferenceFrameType type1_start,
+ ReferenceFrameType type1_end) {
+ int count0 = 0;
+ int count1 = 0;
+ for (int type = type0_start; type <= type0_end; ++type) {
+ count0 += block.CountReferences(static_cast<ReferenceFrameType>(type));
+ }
+ for (int type = type1_start; type <= type1_end; ++type) {
+ count1 += block.CountReferences(static_cast<ReferenceFrameType>(type));
+ }
+ return (count0 < count1) ? 0 : (count0 == count1 ? 1 : 2);
+}
+
+} // namespace
+
+bool Tile::ReadSegmentId(const Block& block) {
+ int top_left = -1;
+ if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) {
+ top_left =
+ block_parameters_holder_.Find(block.row4x4 - 1, block.column4x4 - 1)
+ ->segment_id;
+ }
+ int top = -1;
+ if (block.top_available[kPlaneY]) {
+ top = block.bp_top->segment_id;
+ }
+ int left = -1;
+ if (block.left_available[kPlaneY]) {
+ left = block.bp_left->segment_id;
+ }
+ int pred;
+ if (top == -1) {
+ pred = (left == -1) ? 0 : left;
+ } else if (left == -1) {
+ pred = top;
+ } else {
+ pred = (top_left == top) ? top : left;
+ }
+ BlockParameters& bp = *block.bp;
+ if (bp.skip) {
+ bp.segment_id = pred;
+ return true;
+ }
+ int context = 0;
+ if (top_left < 0) {
+ context = 0;
+ } else if (top_left == top && top_left == left) {
+ context = 2;
+ } else if (top_left == top || top_left == left || top == left) {
+ context = 1;
+ }
+ uint16_t* const segment_id_cdf =
+ symbol_decoder_context_.segment_id_cdf[context];
+ const int encoded_segment_id =
+ reader_.ReadSymbol<kMaxSegments>(segment_id_cdf);
+ bp.segment_id =
+ DecodeSegmentId(encoded_segment_id, pred,
+ frame_header_.segmentation.last_active_segment_id + 1);
+ // Check the bitstream conformance requirement in Section 6.10.8 of the spec.
+ if (bp.segment_id < 0 ||
+ bp.segment_id > frame_header_.segmentation.last_active_segment_id) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Corrupted segment_ids: encoded %d, last active %d, postprocessed %d",
+ encoded_segment_id, frame_header_.segmentation.last_active_segment_id,
+ bp.segment_id);
+ return false;
+ }
+ return true;
+}
+
+bool Tile::ReadIntraSegmentId(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (!frame_header_.segmentation.enabled) {
+ bp.segment_id = 0;
+ return true;
+ }
+ return ReadSegmentId(block);
+}
+
+void Tile::ReadSkip(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (frame_header_.segmentation.segment_id_pre_skip &&
+ frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureSkip)) {
+ bp.skip = true;
+ return;
+ }
+ int context = 0;
+ if (block.top_available[kPlaneY] && block.bp_top->skip) {
+ ++context;
+ }
+ if (block.left_available[kPlaneY] && block.bp_left->skip) {
+ ++context;
+ }
+ uint16_t* const skip_cdf = symbol_decoder_context_.skip_cdf[context];
+ bp.skip = reader_.ReadSymbol(skip_cdf);
+}
+
+void Tile::ReadSkipMode(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (!frame_header_.skip_mode_present ||
+ frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureSkip) ||
+ frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureReferenceFrame) ||
+ frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureGlobalMv) ||
+ IsBlockDimension4(block.size)) {
+ bp.skip_mode = false;
+ return;
+ }
+ const int context =
+ (block.left_available[kPlaneY]
+ ? static_cast<int>(block.bp_left->skip_mode)
+ : 0) +
+ (block.top_available[kPlaneY] ? static_cast<int>(block.bp_top->skip_mode)
+ : 0);
+ bp.skip_mode =
+ reader_.ReadSymbol(symbol_decoder_context_.skip_mode_cdf[context]);
+}
+
+void Tile::ReadCdef(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (bp.skip || frame_header_.coded_lossless ||
+ !sequence_header_.enable_cdef || frame_header_.allow_intrabc) {
+ return;
+ }
+ const int cdef_size4x4 = kNum4x4BlocksWide[kBlock64x64];
+ const int cdef_mask4x4 = ~(cdef_size4x4 - 1);
+ const int row4x4 = block.row4x4 & cdef_mask4x4;
+ const int column4x4 = block.column4x4 & cdef_mask4x4;
+ const int row = DivideBy16(row4x4);
+ const int column = DivideBy16(column4x4);
+ if (cdef_index_[row][column] == -1) {
+ cdef_index_[row][column] =
+ frame_header_.cdef.bits > 0
+ ? static_cast<int16_t>(reader_.ReadLiteral(frame_header_.cdef.bits))
+ : 0;
+ for (int i = row4x4; i < row4x4 + block.height4x4; i += cdef_size4x4) {
+ for (int j = column4x4; j < column4x4 + block.width4x4;
+ j += cdef_size4x4) {
+ cdef_index_[DivideBy16(i)][DivideBy16(j)] = cdef_index_[row][column];
+ }
+ }
+ }
+}
+
+int Tile::ReadAndClipDelta(uint16_t* const cdf, int delta_small, int scale,
+ int min_value, int max_value, int value) {
+ int abs = reader_.ReadSymbol<kDeltaSymbolCount>(cdf);
+ if (abs == delta_small) {
+ const int remaining_bit_count =
+ static_cast<int>(reader_.ReadLiteral(3)) + 1;
+ const int abs_remaining_bits =
+ static_cast<int>(reader_.ReadLiteral(remaining_bit_count));
+ abs = abs_remaining_bits + (1 << remaining_bit_count) + 1;
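+    // With remaining_bit_count == 1 this encodes |abs| values 3..4, with 2 it
+    // encodes 5..8, and so on.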
+ }
+ if (abs != 0) {
+ const bool sign = static_cast<bool>(reader_.ReadBit());
+ const int scaled_abs = abs << scale;
+ const int reduced_delta = sign ? -scaled_abs : scaled_abs;
+ value += reduced_delta;
+ value = Clip3(value, min_value, max_value);
+ }
+ return value;
+}
+
+void Tile::ReadQuantizerIndexDelta(const Block& block) {
+ assert(read_deltas_);
+ BlockParameters& bp = *block.bp;
+  if (block.size == SuperBlockSize() && bp.skip) {
+ return;
+ }
+ current_quantizer_index_ =
+ ReadAndClipDelta(symbol_decoder_context_.delta_q_cdf, kDeltaQSmall,
+ frame_header_.delta_q.scale, kMinLossyQuantizer,
+ kMaxQuantizer, current_quantizer_index_);
+}
+
+void Tile::ReadLoopFilterDelta(const Block& block) {
+ assert(read_deltas_);
+ BlockParameters& bp = *block.bp;
+ if (!frame_header_.delta_lf.present ||
+ (block.size == SuperBlockSize() && bp.skip)) {
+ return;
+ }
+ int frame_lf_count = 1;
+ if (frame_header_.delta_lf.multi) {
+ frame_lf_count = kFrameLfCount - (PlaneCount() > 1 ? 0 : 2);
+ }
+ bool recompute_deblock_filter_levels = false;
+ for (int i = 0; i < frame_lf_count; ++i) {
+ uint16_t* const delta_lf_abs_cdf =
+ frame_header_.delta_lf.multi
+ ? symbol_decoder_context_.delta_lf_multi_cdf[i]
+ : symbol_decoder_context_.delta_lf_cdf;
+ const int8_t old_delta_lf = delta_lf_[i];
+ delta_lf_[i] = ReadAndClipDelta(
+ delta_lf_abs_cdf, kDeltaLfSmall, frame_header_.delta_lf.scale,
+ -kMaxLoopFilterValue, kMaxLoopFilterValue, delta_lf_[i]);
+ recompute_deblock_filter_levels =
+ recompute_deblock_filter_levels || (old_delta_lf != delta_lf_[i]);
+ }
+ delta_lf_all_zero_ =
+ (delta_lf_[0] | delta_lf_[1] | delta_lf_[2] | delta_lf_[3]) == 0;
+ if (!delta_lf_all_zero_ && recompute_deblock_filter_levels) {
+ post_filter_.ComputeDeblockFilterLevels(delta_lf_, deblock_filter_levels_);
+ }
+}
+
+void Tile::ReadPredictionModeY(const Block& block, bool intra_y_mode) {
+ uint16_t* cdf;
+ if (intra_y_mode) {
+ const PredictionMode top_mode =
+ block.top_available[kPlaneY] ? block.bp_top->y_mode : kPredictionModeDc;
+ const PredictionMode left_mode = block.left_available[kPlaneY]
+ ? block.bp_left->y_mode
+ : kPredictionModeDc;
+ const int top_context = kIntraYModeContext[top_mode];
+ const int left_context = kIntraYModeContext[left_mode];
+ cdf = symbol_decoder_context_
+ .intra_frame_y_mode_cdf[top_context][left_context];
+ } else {
+ cdf = symbol_decoder_context_.y_mode_cdf[kSizeGroup[block.size]];
+ }
+ block.bp->y_mode = static_cast<PredictionMode>(
+ reader_.ReadSymbol<kIntraPredictionModesY>(cdf));
+}
+
+void Tile::ReadIntraAngleInfo(const Block& block, PlaneType plane_type) {
+ BlockParameters& bp = *block.bp;
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ prediction_parameters.angle_delta[plane_type] = 0;
+ const PredictionMode mode =
+ (plane_type == kPlaneTypeY) ? bp.y_mode : bp.uv_mode;
+ if (IsBlockSmallerThan8x8(block.size) || !IsDirectionalMode(mode)) return;
+ uint16_t* const cdf =
+ symbol_decoder_context_.angle_delta_cdf[mode - kPredictionModeVertical];
+ prediction_parameters.angle_delta[plane_type] =
+ reader_.ReadSymbol<kAngleDeltaSymbolCount>(cdf);
+ prediction_parameters.angle_delta[plane_type] -= kMaxAngleDelta;
+}
+
+void Tile::ReadCflAlpha(const Block& block) {
+ const int signs = reader_.ReadSymbol<kCflAlphaSignsSymbolCount>(
+ symbol_decoder_context_.cfl_alpha_signs_cdf);
+ const int8_t* const cfl_lookup = kCflAlphaLookup[signs];
+ const auto sign_u = static_cast<CflSign>(cfl_lookup[0]);
+ const auto sign_v = static_cast<CflSign>(cfl_lookup[1]);
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ prediction_parameters.cfl_alpha_u = 0;
+ if (sign_u != kCflSignZero) {
+ assert(cfl_lookup[2] >= 0);
+ prediction_parameters.cfl_alpha_u =
+ reader_.ReadSymbol<kCflAlphaSymbolCount>(
+ symbol_decoder_context_.cfl_alpha_cdf[cfl_lookup[2]]) +
+ 1;
+ if (sign_u == kCflSignNegative) prediction_parameters.cfl_alpha_u *= -1;
+ }
+ prediction_parameters.cfl_alpha_v = 0;
+ if (sign_v != kCflSignZero) {
+ assert(cfl_lookup[3] >= 0);
+ prediction_parameters.cfl_alpha_v =
+ reader_.ReadSymbol<kCflAlphaSymbolCount>(
+ symbol_decoder_context_.cfl_alpha_cdf[cfl_lookup[3]]) +
+ 1;
+ if (sign_v == kCflSignNegative) prediction_parameters.cfl_alpha_v *= -1;
+ }
+}
+
+void Tile::ReadPredictionModeUV(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ bool chroma_from_luma_allowed;
+ if (frame_header_.segmentation.lossless[bp.segment_id]) {
+ chroma_from_luma_allowed = block.residual_size[kPlaneU] == kBlock4x4;
+ } else {
+ chroma_from_luma_allowed = IsBlockDimensionLessThan64(block.size);
+ }
+ uint16_t* const cdf =
+ symbol_decoder_context_
+ .uv_mode_cdf[static_cast<int>(chroma_from_luma_allowed)][bp.y_mode];
+ if (chroma_from_luma_allowed) {
+ bp.uv_mode = static_cast<PredictionMode>(
+ reader_.ReadSymbol<kIntraPredictionModesUV>(cdf));
+ } else {
+ bp.uv_mode = static_cast<PredictionMode>(
+ reader_.ReadSymbol<kIntraPredictionModesUV - 1>(cdf));
+ }
+}
+
+int Tile::ReadMotionVectorComponent(const Block& block, const int component) {
+ const int context =
+ static_cast<int>(block.bp->prediction_parameters->use_intra_block_copy);
+ const bool sign = reader_.ReadSymbol(
+ symbol_decoder_context_.mv_sign_cdf[component][context]);
+ const int mv_class = reader_.ReadSymbol<kMvClassSymbolCount>(
+ symbol_decoder_context_.mv_class_cdf[component][context]);
+ int magnitude = 1;
+ int value;
+ uint16_t* fraction_cdf;
+ uint16_t* precision_cdf;
+ if (mv_class == 0) {
+ value = static_cast<int>(reader_.ReadSymbol(
+ symbol_decoder_context_.mv_class0_bit_cdf[component][context]));
+ fraction_cdf = symbol_decoder_context_
+ .mv_class0_fraction_cdf[component][context][value];
+ precision_cdf = symbol_decoder_context_
+ .mv_class0_high_precision_cdf[component][context];
+ } else {
+ assert(mv_class <= kMvBitSymbolCount);
+ value = 0;
+ for (int i = 0; i < mv_class; ++i) {
+ const int bit = static_cast<int>(reader_.ReadSymbol(
+ symbol_decoder_context_.mv_bit_cdf[component][context][i]));
+ value |= bit << i;
+ }
+ magnitude += 2 << (mv_class + 2);
+ fraction_cdf = symbol_decoder_context_.mv_fraction_cdf[component][context];
+ precision_cdf =
+ symbol_decoder_context_.mv_high_precision_cdf[component][context];
+ }
+ const int fraction =
+ (frame_header_.force_integer_mv == 0)
+ ? reader_.ReadSymbol<kMvFractionSymbolCount>(fraction_cdf)
+ : 3;
+ const int precision =
+ frame_header_.allow_high_precision_mv
+ ? static_cast<int>(reader_.ReadSymbol(precision_cdf))
+ : 1;
+ magnitude += (value << 3) | (fraction << 1) | precision;
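+  // For example, mv_class = 1 with a bit value of 1, fraction = 2 and
+  // precision = 1 gives 1 + (2 << 3) + ((1 << 3) | (2 << 1) | 1) = 30 before
+  // the sign is applied.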
+ return sign ? -magnitude : magnitude;
+}
+
+void Tile::ReadMotionVector(const Block& block, int index) {
+ BlockParameters& bp = *block.bp;
+ const int context =
+ static_cast<int>(block.bp->prediction_parameters->use_intra_block_copy);
+ const auto mv_joint =
+ static_cast<MvJointType>(reader_.ReadSymbol<kNumMvJointTypes>(
+ symbol_decoder_context_.mv_joint_cdf[context]));
+ if (mv_joint == kMvJointTypeHorizontalZeroVerticalNonZero ||
+ mv_joint == kMvJointTypeNonZero) {
+ bp.mv.mv[index].mv[0] = ReadMotionVectorComponent(block, 0);
+ }
+ if (mv_joint == kMvJointTypeHorizontalNonZeroVerticalZero ||
+ mv_joint == kMvJointTypeNonZero) {
+ bp.mv.mv[index].mv[1] = ReadMotionVectorComponent(block, 1);
+ }
+}
+
+void Tile::ReadFilterIntraModeInfo(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ prediction_parameters.use_filter_intra = false;
+ if (!sequence_header_.enable_filter_intra || bp.y_mode != kPredictionModeDc ||
+ bp.palette_mode_info.size[kPlaneTypeY] != 0 ||
+ !IsBlockDimensionLessThan64(block.size)) {
+ return;
+ }
+ prediction_parameters.use_filter_intra = reader_.ReadSymbol(
+ symbol_decoder_context_.use_filter_intra_cdf[block.size]);
+ if (prediction_parameters.use_filter_intra) {
+ prediction_parameters.filter_intra_mode = static_cast<FilterIntraPredictor>(
+ reader_.ReadSymbol<kNumFilterIntraPredictors>(
+ symbol_decoder_context_.filter_intra_mode_cdf));
+ }
+}
+
+bool Tile::DecodeIntraModeInfo(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ bp.skip = false;
+ if (frame_header_.segmentation.segment_id_pre_skip &&
+ !ReadIntraSegmentId(block)) {
+ return false;
+ }
+ bp.skip_mode = false;
+ ReadSkip(block);
+ if (!frame_header_.segmentation.segment_id_pre_skip &&
+ !ReadIntraSegmentId(block)) {
+ return false;
+ }
+ ReadCdef(block);
+ if (read_deltas_) {
+ ReadQuantizerIndexDelta(block);
+ ReadLoopFilterDelta(block);
+ read_deltas_ = false;
+ }
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ prediction_parameters.use_intra_block_copy = false;
+ if (frame_header_.allow_intrabc) {
+ prediction_parameters.use_intra_block_copy =
+ reader_.ReadSymbol(symbol_decoder_context_.intra_block_copy_cdf);
+ }
+ if (prediction_parameters.use_intra_block_copy) {
+ bp.is_inter = true;
+ bp.reference_frame[0] = kReferenceFrameIntra;
+ bp.reference_frame[1] = kReferenceFrameNone;
+ bp.y_mode = kPredictionModeDc;
+ bp.uv_mode = kPredictionModeDc;
+ prediction_parameters.motion_mode = kMotionModeSimple;
+ prediction_parameters.compound_prediction_type =
+ kCompoundPredictionTypeAverage;
+ bp.palette_mode_info.size[kPlaneTypeY] = 0;
+ bp.palette_mode_info.size[kPlaneTypeUV] = 0;
+ bp.interpolation_filter[0] = kInterpolationFilterBilinear;
+ bp.interpolation_filter[1] = kInterpolationFilterBilinear;
+ MvContexts dummy_mode_contexts;
+ FindMvStack(block, /*is_compound=*/false, &dummy_mode_contexts);
+ return AssignIntraMv(block);
+ }
+ bp.is_inter = false;
+ return ReadIntraBlockModeInfo(block, /*intra_y_mode=*/true);
+}
+
+int8_t Tile::ComputePredictedSegmentId(const Block& block) const {
+ // If prev_segment_ids_ is null, treat it as if it pointed to a segmentation
+ // map containing all 0s.
+ if (prev_segment_ids_ == nullptr) return 0;
+
+ const int x_limit = std::min(frame_header_.columns4x4 - block.column4x4,
+ static_cast<int>(block.width4x4));
+ const int y_limit = std::min(frame_header_.rows4x4 - block.row4x4,
+ static_cast<int>(block.height4x4));
+ int8_t id = 7;
+ for (int y = 0; y < y_limit; ++y) {
+ for (int x = 0; x < x_limit; ++x) {
+ const int8_t prev_segment_id =
+ prev_segment_ids_->segment_id(block.row4x4 + y, block.column4x4 + x);
+ id = std::min(id, prev_segment_id);
+ }
+ }
+ return id;
+}
+
+bool Tile::ReadInterSegmentId(const Block& block, bool pre_skip) {
+ BlockParameters& bp = *block.bp;
+ if (!frame_header_.segmentation.enabled) {
+ bp.segment_id = 0;
+ return true;
+ }
+ if (!frame_header_.segmentation.update_map) {
+ bp.segment_id = ComputePredictedSegmentId(block);
+ return true;
+ }
+ if (pre_skip) {
+ if (!frame_header_.segmentation.segment_id_pre_skip) {
+ bp.segment_id = 0;
+ return true;
+ }
+ } else if (bp.skip) {
+ bp.use_predicted_segment_id = false;
+ return ReadSegmentId(block);
+ }
+ if (frame_header_.segmentation.temporal_update) {
+ const int context =
+ (block.left_available[kPlaneY]
+ ? static_cast<int>(block.bp_left->use_predicted_segment_id)
+ : 0) +
+ (block.top_available[kPlaneY]
+ ? static_cast<int>(block.bp_top->use_predicted_segment_id)
+ : 0);
+ bp.use_predicted_segment_id = reader_.ReadSymbol(
+ symbol_decoder_context_.use_predicted_segment_id_cdf[context]);
+ if (bp.use_predicted_segment_id) {
+ bp.segment_id = ComputePredictedSegmentId(block);
+ return true;
+ }
+ }
+ return ReadSegmentId(block);
+}
+
+void Tile::ReadIsInter(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (bp.skip_mode) {
+ bp.is_inter = true;
+ return;
+ }
+ if (frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureReferenceFrame)) {
+ bp.is_inter =
+ frame_header_.segmentation
+ .feature_data[bp.segment_id][kSegmentFeatureReferenceFrame] !=
+ kReferenceFrameIntra;
+ return;
+ }
+ if (frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureGlobalMv)) {
+ bp.is_inter = true;
+ return;
+ }
+ int context = 0;
+ if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) {
+ context = (block.IsTopIntra() && block.IsLeftIntra())
+ ? 3
+ : static_cast<int>(block.IsTopIntra() || block.IsLeftIntra());
+ } else if (block.top_available[kPlaneY] || block.left_available[kPlaneY]) {
+ context = 2 * static_cast<int>(block.top_available[kPlaneY]
+ ? block.IsTopIntra()
+ : block.IsLeftIntra());
+ }
+ bp.is_inter =
+ reader_.ReadSymbol(symbol_decoder_context_.is_inter_cdf[context]);
+}
+
+bool Tile::ReadIntraBlockModeInfo(const Block& block, bool intra_y_mode) {
+ BlockParameters& bp = *block.bp;
+ bp.reference_frame[0] = kReferenceFrameIntra;
+ bp.reference_frame[1] = kReferenceFrameNone;
+ ReadPredictionModeY(block, intra_y_mode);
+ ReadIntraAngleInfo(block, kPlaneTypeY);
+ if (block.HasChroma()) {
+ ReadPredictionModeUV(block);
+ if (bp.uv_mode == kPredictionModeChromaFromLuma) {
+ ReadCflAlpha(block);
+ }
+ ReadIntraAngleInfo(block, kPlaneTypeUV);
+ }
+ ReadPaletteModeInfo(block);
+ ReadFilterIntraModeInfo(block);
+ return true;
+}
+
+CompoundReferenceType Tile::ReadCompoundReferenceType(const Block& block) {
+ // compound and inter.
+ const bool top_comp_inter = block.top_available[kPlaneY] &&
+ !block.IsTopIntra() && !block.IsTopSingle();
+ const bool left_comp_inter = block.left_available[kPlaneY] &&
+ !block.IsLeftIntra() && !block.IsLeftSingle();
+ // unidirectional compound.
+ const bool top_uni_comp =
+ top_comp_inter && IsSameDirectionReferencePair(block.TopReference(0),
+ block.TopReference(1));
+ const bool left_uni_comp =
+ left_comp_inter && IsSameDirectionReferencePair(block.LeftReference(0),
+ block.LeftReference(1));
+ int context;
+ if (block.top_available[kPlaneY] && !block.IsTopIntra() &&
+ block.left_available[kPlaneY] && !block.IsLeftIntra()) {
+ const int same_direction = static_cast<int>(IsSameDirectionReferencePair(
+ block.TopReference(0), block.LeftReference(0)));
+ if (!top_comp_inter && !left_comp_inter) {
+ context = 1 + MultiplyBy2(same_direction);
+ } else if (!top_comp_inter) {
+ context = left_uni_comp ? 3 + same_direction : 1;
+ } else if (!left_comp_inter) {
+ context = top_uni_comp ? 3 + same_direction : 1;
+ } else {
+ if (!top_uni_comp && !left_uni_comp) {
+ context = 0;
+ } else if (!top_uni_comp || !left_uni_comp) {
+ context = 2;
+ } else {
+ context = 3 + static_cast<int>(
+ (block.TopReference(0) == kReferenceFrameBackward) ==
+ (block.LeftReference(0) == kReferenceFrameBackward));
+ }
+ }
+ } else if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) {
+ if (top_comp_inter) {
+ context = 1 + MultiplyBy2(static_cast<int>(top_uni_comp));
+ } else if (left_comp_inter) {
+ context = 1 + MultiplyBy2(static_cast<int>(left_uni_comp));
+ } else {
+ context = 2;
+ }
+ } else if (top_comp_inter) {
+ context = MultiplyBy4(static_cast<int>(top_uni_comp));
+ } else if (left_comp_inter) {
+ context = MultiplyBy4(static_cast<int>(left_uni_comp));
+ } else {
+ context = 2;
+ }
+ return static_cast<CompoundReferenceType>(reader_.ReadSymbol(
+ symbol_decoder_context_.compound_reference_type_cdf[context]));
+}
+
+template <bool is_single, bool is_backward, int index>
+uint16_t* Tile::GetReferenceCdf(
+ const Block& block,
+ CompoundReferenceType type /*= kNumCompoundReferenceTypes*/) {
+ int context = 0;
+ if ((type == kCompoundReferenceUnidirectional && index == 0) ||
+ (is_single && index == 1)) {
+ // uni_comp_ref and single_ref_p1.
+ context =
+ GetReferenceContext(block, kReferenceFrameLast, kReferenceFrameGolden,
+ kReferenceFrameBackward, kReferenceFrameAlternate);
+ } else if (type == kCompoundReferenceUnidirectional && index == 1) {
+ // uni_comp_ref_p1.
+ context =
+ GetReferenceContext(block, kReferenceFrameLast2, kReferenceFrameLast2,
+ kReferenceFrameLast3, kReferenceFrameGolden);
+ } else if ((type == kCompoundReferenceUnidirectional && index == 2) ||
+ (type == kCompoundReferenceBidirectional && index == 2) ||
+ (is_single && index == 5)) {
+ // uni_comp_ref_p2, comp_ref_p2 and single_ref_p5.
+ context =
+ GetReferenceContext(block, kReferenceFrameLast3, kReferenceFrameLast3,
+ kReferenceFrameGolden, kReferenceFrameGolden);
+ } else if ((type == kCompoundReferenceBidirectional && index == 0) ||
+ (is_single && index == 3)) {
+ // comp_ref and single_ref_p3.
+ context =
+ GetReferenceContext(block, kReferenceFrameLast, kReferenceFrameLast2,
+ kReferenceFrameLast3, kReferenceFrameGolden);
+ } else if ((type == kCompoundReferenceBidirectional && index == 1) ||
+ (is_single && index == 4)) {
+ // comp_ref_p1 and single_ref_p4.
+ context =
+ GetReferenceContext(block, kReferenceFrameLast, kReferenceFrameLast,
+ kReferenceFrameLast2, kReferenceFrameLast2);
+ } else if ((is_single && index == 2) || (is_backward && index == 0)) {
+ // single_ref_p2 and comp_bwdref.
+ context = GetReferenceContext(
+ block, kReferenceFrameBackward, kReferenceFrameAlternate2,
+ kReferenceFrameAlternate, kReferenceFrameAlternate);
+ } else if ((is_single && index == 6) || (is_backward && index == 1)) {
+ // single_ref_p6 and comp_bwdref_p1.
+ context = GetReferenceContext(
+ block, kReferenceFrameBackward, kReferenceFrameBackward,
+ kReferenceFrameAlternate2, kReferenceFrameAlternate2);
+ }
+ if (is_single) {
+    // The index parameter for single references is offset by one since the
+    // spec uses a 1-based index for these elements.
+ return symbol_decoder_context_.single_reference_cdf[context][index - 1];
+ }
+ if (is_backward) {
+ return symbol_decoder_context_
+ .compound_backward_reference_cdf[context][index];
+ }
+ return symbol_decoder_context_.compound_reference_cdf[type][context][index];
+}
+
+void Tile::ReadReferenceFrames(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (bp.skip_mode) {
+ bp.reference_frame[0] = frame_header_.skip_mode_frame[0];
+ bp.reference_frame[1] = frame_header_.skip_mode_frame[1];
+ return;
+ }
+ if (frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureReferenceFrame)) {
+ bp.reference_frame[0] = static_cast<ReferenceFrameType>(
+ frame_header_.segmentation
+ .feature_data[bp.segment_id][kSegmentFeatureReferenceFrame]);
+ bp.reference_frame[1] = kReferenceFrameNone;
+ return;
+ }
+ if (frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureSkip) ||
+ frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureGlobalMv)) {
+ bp.reference_frame[0] = kReferenceFrameLast;
+ bp.reference_frame[1] = kReferenceFrameNone;
+ return;
+ }
+ const bool use_compound_reference =
+ frame_header_.reference_mode_select &&
+ std::min(block.width4x4, block.height4x4) >= 2 &&
+ reader_.ReadSymbol(symbol_decoder_context_.use_compound_reference_cdf
+ [GetUseCompoundReferenceContext(block)]);
+ if (use_compound_reference) {
+ CompoundReferenceType reference_type = ReadCompoundReferenceType(block);
+ if (reference_type == kCompoundReferenceUnidirectional) {
+ // uni_comp_ref.
+ if (reader_.ReadSymbol(
+ GetReferenceCdf<false, false, 0>(block, reference_type))) {
+ bp.reference_frame[0] = kReferenceFrameBackward;
+ bp.reference_frame[1] = kReferenceFrameAlternate;
+ return;
+ }
+ // uni_comp_ref_p1.
+ if (!reader_.ReadSymbol(
+ GetReferenceCdf<false, false, 1>(block, reference_type))) {
+ bp.reference_frame[0] = kReferenceFrameLast;
+ bp.reference_frame[1] = kReferenceFrameLast2;
+ return;
+ }
+ // uni_comp_ref_p2.
+ if (reader_.ReadSymbol(
+ GetReferenceCdf<false, false, 2>(block, reference_type))) {
+ bp.reference_frame[0] = kReferenceFrameLast;
+ bp.reference_frame[1] = kReferenceFrameGolden;
+ return;
+ }
+ bp.reference_frame[0] = kReferenceFrameLast;
+ bp.reference_frame[1] = kReferenceFrameLast3;
+ return;
+ }
+ assert(reference_type == kCompoundReferenceBidirectional);
+ // comp_ref.
+ if (reader_.ReadSymbol(
+ GetReferenceCdf<false, false, 0>(block, reference_type))) {
+ // comp_ref_p2.
+ bp.reference_frame[0] =
+ reader_.ReadSymbol(
+ GetReferenceCdf<false, false, 2>(block, reference_type))
+ ? kReferenceFrameGolden
+ : kReferenceFrameLast3;
+ } else {
+ // comp_ref_p1.
+ bp.reference_frame[0] =
+ reader_.ReadSymbol(
+ GetReferenceCdf<false, false, 1>(block, reference_type))
+ ? kReferenceFrameLast2
+ : kReferenceFrameLast;
+ }
+ // comp_bwdref.
+ if (reader_.ReadSymbol(GetReferenceCdf<false, true, 0>(block))) {
+ bp.reference_frame[1] = kReferenceFrameAlternate;
+ } else {
+ // comp_bwdref_p1.
+ bp.reference_frame[1] =
+ reader_.ReadSymbol(GetReferenceCdf<false, true, 1>(block))
+ ? kReferenceFrameAlternate2
+ : kReferenceFrameBackward;
+ }
+ return;
+ }
+ assert(!use_compound_reference);
+ bp.reference_frame[1] = kReferenceFrameNone;
+ // single_ref_p1.
+ if (reader_.ReadSymbol(GetReferenceCdf<true, false, 1>(block))) {
+ // single_ref_p2.
+ if (reader_.ReadSymbol(GetReferenceCdf<true, false, 2>(block))) {
+ bp.reference_frame[0] = kReferenceFrameAlternate;
+ return;
+ }
+ // single_ref_p6.
+ bp.reference_frame[0] =
+ reader_.ReadSymbol(GetReferenceCdf<true, false, 6>(block))
+ ? kReferenceFrameAlternate2
+ : kReferenceFrameBackward;
+ return;
+ }
+ // single_ref_p3.
+ if (reader_.ReadSymbol(GetReferenceCdf<true, false, 3>(block))) {
+ // single_ref_p5.
+ bp.reference_frame[0] =
+ reader_.ReadSymbol(GetReferenceCdf<true, false, 5>(block))
+ ? kReferenceFrameGolden
+ : kReferenceFrameLast3;
+ return;
+ }
+ // single_ref_p4.
+ bp.reference_frame[0] =
+ reader_.ReadSymbol(GetReferenceCdf<true, false, 4>(block))
+ ? kReferenceFrameLast2
+ : kReferenceFrameLast;
+}
+
+void Tile::ReadInterPredictionModeY(const Block& block,
+ const MvContexts& mode_contexts) {
+ BlockParameters& bp = *block.bp;
+ if (bp.skip_mode) {
+ bp.y_mode = kPredictionModeNearestNearestMv;
+ return;
+ }
+ if (frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureSkip) ||
+ frame_header_.segmentation.FeatureActive(bp.segment_id,
+ kSegmentFeatureGlobalMv)) {
+ bp.y_mode = kPredictionModeGlobalMv;
+ return;
+ }
+ if (bp.reference_frame[1] > kReferenceFrameIntra) {
+ const int idx0 = mode_contexts.reference_mv >> 1;
+ const int idx1 =
+ std::min(mode_contexts.new_mv, kCompoundModeNewMvContexts - 1);
+ const int context = kCompoundModeContextMap[idx0][idx1];
+ const int offset = reader_.ReadSymbol<kNumCompoundInterPredictionModes>(
+ symbol_decoder_context_.compound_prediction_mode_cdf[context]);
+ bp.y_mode =
+ static_cast<PredictionMode>(kPredictionModeNearestNearestMv + offset);
+ return;
+ }
+ // new_mv.
+ if (!reader_.ReadSymbol(
+ symbol_decoder_context_.new_mv_cdf[mode_contexts.new_mv])) {
+ bp.y_mode = kPredictionModeNewMv;
+ return;
+ }
+ // zero_mv.
+ if (!reader_.ReadSymbol(
+ symbol_decoder_context_.zero_mv_cdf[mode_contexts.zero_mv])) {
+ bp.y_mode = kPredictionModeGlobalMv;
+ return;
+ }
+ // ref_mv.
+ bp.y_mode =
+ reader_.ReadSymbol(
+ symbol_decoder_context_.reference_mv_cdf[mode_contexts.reference_mv])
+ ? kPredictionModeNearMv
+ : kPredictionModeNearestMv;
+}
+
+void Tile::ReadRefMvIndex(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ prediction_parameters.ref_mv_index = 0;
+ if (bp.y_mode != kPredictionModeNewMv &&
+ bp.y_mode != kPredictionModeNewNewMv &&
+ !kPredictionModeHasNearMvMask.Contains(bp.y_mode)) {
+ return;
+ }
+ const int start =
+ static_cast<int>(kPredictionModeHasNearMvMask.Contains(bp.y_mode));
+ prediction_parameters.ref_mv_index = start;
+ for (int i = start; i < start + 2; ++i) {
+ if (prediction_parameters.ref_mv_count <= i + 1) break;
+ // drl_mode in the spec.
+ const bool ref_mv_index_bit = reader_.ReadSymbol(
+ symbol_decoder_context_.ref_mv_index_cdf[GetRefMvIndexContext(
+ prediction_parameters.nearest_mv_count, i)]);
+ prediction_parameters.ref_mv_index = i + static_cast<int>(ref_mv_index_bit);
+ if (!ref_mv_index_bit) return;
+ }
+}
+
+void Tile::ReadInterIntraMode(const Block& block, bool is_compound) {
+ BlockParameters& bp = *block.bp;
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ prediction_parameters.inter_intra_mode = kNumInterIntraModes;
+ prediction_parameters.is_wedge_inter_intra = false;
+ if (bp.skip_mode || !sequence_header_.enable_interintra_compound ||
+ is_compound || !kIsInterIntraModeAllowedMask.Contains(block.size)) {
+ return;
+ }
+ // kSizeGroup[block.size] is guaranteed to be non-zero because of the block
+ // size constraint enforced in the above condition.
+ assert(kSizeGroup[block.size] - 1 >= 0);
+ if (!reader_.ReadSymbol(
+ symbol_decoder_context_
+ .is_inter_intra_cdf[kSizeGroup[block.size] - 1])) {
+ prediction_parameters.inter_intra_mode = kNumInterIntraModes;
+ return;
+ }
+ prediction_parameters.inter_intra_mode =
+ static_cast<InterIntraMode>(reader_.ReadSymbol<kNumInterIntraModes>(
+ symbol_decoder_context_
+ .inter_intra_mode_cdf[kSizeGroup[block.size] - 1]));
+ bp.reference_frame[1] = kReferenceFrameIntra;
+ prediction_parameters.angle_delta[kPlaneTypeY] = 0;
+ prediction_parameters.angle_delta[kPlaneTypeUV] = 0;
+ prediction_parameters.use_filter_intra = false;
+ prediction_parameters.is_wedge_inter_intra = reader_.ReadSymbol(
+ symbol_decoder_context_.is_wedge_inter_intra_cdf[block.size]);
+ if (!prediction_parameters.is_wedge_inter_intra) return;
+ prediction_parameters.wedge_index =
+ reader_.ReadSymbol<kWedgeIndexSymbolCount>(
+ symbol_decoder_context_.wedge_index_cdf[block.size]);
+ prediction_parameters.wedge_sign = 0;
+}
+
+void Tile::ReadMotionMode(const Block& block, bool is_compound) {
+ BlockParameters& bp = *block.bp;
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ const auto global_motion_type =
+ frame_header_.global_motion[bp.reference_frame[0]].type;
+ if (bp.skip_mode || !frame_header_.is_motion_mode_switchable ||
+ IsBlockDimension4(block.size) ||
+ (frame_header_.force_integer_mv == 0 &&
+ (bp.y_mode == kPredictionModeGlobalMv ||
+ bp.y_mode == kPredictionModeGlobalGlobalMv) &&
+ global_motion_type > kGlobalMotionTransformationTypeTranslation) ||
+ is_compound || bp.reference_frame[1] == kReferenceFrameIntra ||
+ !block.HasOverlappableCandidates()) {
+ prediction_parameters.motion_mode = kMotionModeSimple;
+ return;
+ }
+ prediction_parameters.num_warp_samples = 0;
+ int num_samples_scanned = 0;
+ memset(prediction_parameters.warp_estimate_candidates, 0,
+ sizeof(prediction_parameters.warp_estimate_candidates));
+ FindWarpSamples(block, &prediction_parameters.num_warp_samples,
+ &num_samples_scanned,
+ prediction_parameters.warp_estimate_candidates);
+ if (frame_header_.force_integer_mv != 0 ||
+ prediction_parameters.num_warp_samples == 0 ||
+ !frame_header_.allow_warped_motion || IsScaled(bp.reference_frame[0])) {
+ prediction_parameters.motion_mode =
+ reader_.ReadSymbol(symbol_decoder_context_.use_obmc_cdf[block.size])
+ ? kMotionModeObmc
+ : kMotionModeSimple;
+ return;
+ }
+ prediction_parameters.motion_mode =
+ static_cast<MotionMode>(reader_.ReadSymbol<kNumMotionModes>(
+ symbol_decoder_context_.motion_mode_cdf[block.size]));
+}
+
+uint16_t* Tile::GetIsExplicitCompoundTypeCdf(const Block& block) {
+ int context = 0;
+ if (block.top_available[kPlaneY]) {
+ if (!block.IsTopSingle()) {
+ context += static_cast<int>(block.bp_top->is_explicit_compound_type);
+ } else if (block.TopReference(0) == kReferenceFrameAlternate) {
+ context += 3;
+ }
+ }
+ if (block.left_available[kPlaneY]) {
+ if (!block.IsLeftSingle()) {
+ context += static_cast<int>(block.bp_left->is_explicit_compound_type);
+ } else if (block.LeftReference(0) == kReferenceFrameAlternate) {
+ context += 3;
+ }
+ }
+ return symbol_decoder_context_.is_explicit_compound_type_cdf[std::min(
+ context, kIsExplicitCompoundTypeContexts - 1)];
+}
+
+uint16_t* Tile::GetIsCompoundTypeAverageCdf(const Block& block) {
+ const BlockParameters& bp = *block.bp;
+ const ReferenceInfo& reference_info = *current_frame_.reference_info();
+ const int forward =
+ std::abs(reference_info.relative_distance_from[bp.reference_frame[0]]);
+ const int backward =
+ std::abs(reference_info.relative_distance_from[bp.reference_frame[1]]);
+ int context = (forward == backward) ? 3 : 0;
+ if (block.top_available[kPlaneY]) {
+ if (!block.IsTopSingle()) {
+ context += static_cast<int>(block.bp_top->is_compound_type_average);
+ } else if (block.TopReference(0) == kReferenceFrameAlternate) {
+ ++context;
+ }
+ }
+ if (block.left_available[kPlaneY]) {
+ if (!block.IsLeftSingle()) {
+ context += static_cast<int>(block.bp_left->is_compound_type_average);
+ } else if (block.LeftReference(0) == kReferenceFrameAlternate) {
+ ++context;
+ }
+ }
+ return symbol_decoder_context_.is_compound_type_average_cdf[context];
+}
+
+void Tile::ReadCompoundType(const Block& block, bool is_compound) {
+ BlockParameters& bp = *block.bp;
+ bp.is_explicit_compound_type = false;
+ bp.is_compound_type_average = true;
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ if (bp.skip_mode) {
+ prediction_parameters.compound_prediction_type =
+ kCompoundPredictionTypeAverage;
+ return;
+ }
+ if (is_compound) {
+ if (sequence_header_.enable_masked_compound) {
+ bp.is_explicit_compound_type =
+ reader_.ReadSymbol(GetIsExplicitCompoundTypeCdf(block));
+ }
+ if (bp.is_explicit_compound_type) {
+ if (kIsWedgeCompoundModeAllowed.Contains(block.size)) {
+ // Only kCompoundPredictionTypeWedge and
+ // kCompoundPredictionTypeDiffWeighted are signaled explicitly.
+ prediction_parameters.compound_prediction_type =
+ static_cast<CompoundPredictionType>(reader_.ReadSymbol(
+ symbol_decoder_context_.compound_type_cdf[block.size]));
+ } else {
+ prediction_parameters.compound_prediction_type =
+ kCompoundPredictionTypeDiffWeighted;
+ }
+ } else {
+ if (sequence_header_.enable_jnt_comp) {
+ bp.is_compound_type_average =
+ reader_.ReadSymbol(GetIsCompoundTypeAverageCdf(block));
+ prediction_parameters.compound_prediction_type =
+ bp.is_compound_type_average ? kCompoundPredictionTypeAverage
+ : kCompoundPredictionTypeDistance;
+ } else {
+ prediction_parameters.compound_prediction_type =
+ kCompoundPredictionTypeAverage;
+ return;
+ }
+ }
+ if (prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeWedge) {
+ prediction_parameters.wedge_index =
+ reader_.ReadSymbol<kWedgeIndexSymbolCount>(
+ symbol_decoder_context_.wedge_index_cdf[block.size]);
+ prediction_parameters.wedge_sign = static_cast<int>(reader_.ReadBit());
+ } else if (prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeDiffWeighted) {
+ prediction_parameters.mask_is_inverse =
+ static_cast<bool>(reader_.ReadBit());
+ }
+ return;
+ }
+ if (prediction_parameters.inter_intra_mode != kNumInterIntraModes) {
+ prediction_parameters.compound_prediction_type =
+ prediction_parameters.is_wedge_inter_intra
+ ? kCompoundPredictionTypeWedge
+ : kCompoundPredictionTypeIntra;
+ return;
+ }
+ prediction_parameters.compound_prediction_type =
+ kCompoundPredictionTypeAverage;
+}
+
+uint16_t* Tile::GetInterpolationFilterCdf(const Block& block, int direction) {
+ const BlockParameters& bp = *block.bp;
+ int context = MultiplyBy8(direction) +
+ MultiplyBy4(static_cast<int>(bp.reference_frame[1] >
+ kReferenceFrameIntra));
+ int top_type = kNumExplicitInterpolationFilters;
+ if (block.top_available[kPlaneY]) {
+ if (block.bp_top->reference_frame[0] == bp.reference_frame[0] ||
+ block.bp_top->reference_frame[1] == bp.reference_frame[0]) {
+ top_type = block.bp_top->interpolation_filter[direction];
+ }
+ }
+ int left_type = kNumExplicitInterpolationFilters;
+ if (block.left_available[kPlaneY]) {
+ if (block.bp_left->reference_frame[0] == bp.reference_frame[0] ||
+ block.bp_left->reference_frame[1] == bp.reference_frame[0]) {
+ left_type = block.bp_left->interpolation_filter[direction];
+ }
+ }
+ if (left_type == top_type) {
+ context += left_type;
+ } else if (left_type == kNumExplicitInterpolationFilters) {
+ context += top_type;
+ } else if (top_type == kNumExplicitInterpolationFilters) {
+ context += left_type;
+ } else {
+ context += kNumExplicitInterpolationFilters;
+ }
+ return symbol_decoder_context_.interpolation_filter_cdf[context];
+}
+
+void Tile::ReadInterpolationFilter(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (frame_header_.interpolation_filter != kInterpolationFilterSwitchable) {
+ static_assert(
+ sizeof(bp.interpolation_filter) / sizeof(bp.interpolation_filter[0]) ==
+ 2,
+ "Interpolation filter array size is not 2");
+ for (auto& interpolation_filter : bp.interpolation_filter) {
+ interpolation_filter = frame_header_.interpolation_filter;
+ }
+ return;
+ }
+ bool interpolation_filter_present = true;
+ if (bp.skip_mode ||
+ block.bp->prediction_parameters->motion_mode == kMotionModeLocalWarp) {
+ interpolation_filter_present = false;
+ } else if (!IsBlockDimension4(block.size) &&
+ bp.y_mode == kPredictionModeGlobalMv) {
+ interpolation_filter_present =
+ frame_header_.global_motion[bp.reference_frame[0]].type ==
+ kGlobalMotionTransformationTypeTranslation;
+ } else if (!IsBlockDimension4(block.size) &&
+ bp.y_mode == kPredictionModeGlobalGlobalMv) {
+ interpolation_filter_present =
+ frame_header_.global_motion[bp.reference_frame[0]].type ==
+ kGlobalMotionTransformationTypeTranslation ||
+ frame_header_.global_motion[bp.reference_frame[1]].type ==
+ kGlobalMotionTransformationTypeTranslation;
+ }
+ for (int i = 0; i < (sequence_header_.enable_dual_filter ? 2 : 1); ++i) {
+ bp.interpolation_filter[i] =
+ interpolation_filter_present
+ ? static_cast<InterpolationFilter>(
+ reader_.ReadSymbol<kNumExplicitInterpolationFilters>(
+ GetInterpolationFilterCdf(block, i)))
+ : kInterpolationFilterEightTap;
+ }
+ if (!sequence_header_.enable_dual_filter) {
+ bp.interpolation_filter[1] = bp.interpolation_filter[0];
+ }
+}
+
+bool Tile::ReadInterBlockModeInfo(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ bp.palette_mode_info.size[kPlaneTypeY] = 0;
+ bp.palette_mode_info.size[kPlaneTypeUV] = 0;
+ ReadReferenceFrames(block);
+ const bool is_compound = bp.reference_frame[1] > kReferenceFrameIntra;
+ MvContexts mode_contexts;
+ FindMvStack(block, is_compound, &mode_contexts);
+ ReadInterPredictionModeY(block, mode_contexts);
+ ReadRefMvIndex(block);
+ if (!AssignInterMv(block, is_compound)) return false;
+ ReadInterIntraMode(block, is_compound);
+ ReadMotionMode(block, is_compound);
+ ReadCompoundType(block, is_compound);
+ ReadInterpolationFilter(block);
+ return true;
+}
+
+bool Tile::DecodeInterModeInfo(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ block.bp->prediction_parameters->use_intra_block_copy = false;
+ bp.skip = false;
+ if (!ReadInterSegmentId(block, /*pre_skip=*/true)) return false;
+ ReadSkipMode(block);
+ if (bp.skip_mode) {
+ bp.skip = true;
+ } else {
+ ReadSkip(block);
+ }
+ if (!frame_header_.segmentation.segment_id_pre_skip &&
+ !ReadInterSegmentId(block, /*pre_skip=*/false)) {
+ return false;
+ }
+ ReadCdef(block);
+ if (read_deltas_) {
+ ReadQuantizerIndexDelta(block);
+ ReadLoopFilterDelta(block);
+ read_deltas_ = false;
+ }
+ ReadIsInter(block);
+ return bp.is_inter ? ReadInterBlockModeInfo(block)
+ : ReadIntraBlockModeInfo(block, /*intra_y_mode=*/false);
+}
+
+bool Tile::DecodeModeInfo(const Block& block) {
+ return IsIntraFrame(frame_header_.frame_type) ? DecodeIntraModeInfo(block)
+ : DecodeInterModeInfo(block);
+}
+
+} // namespace libgav1
diff --git a/src/tile/bitstream/palette.cc b/src/tile/bitstream/palette.cc
new file mode 100644
index 0000000..674d210
--- /dev/null
+++ b/src/tile/bitstream/palette.cc
@@ -0,0 +1,319 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <memory>
+
+#include "src/obu_parser.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+int Tile::GetPaletteCache(const Block& block, PlaneType plane_type,
+ uint16_t* const cache) {
+ const int top_size =
+ (block.top_available[kPlaneY] && Mod64(MultiplyBy4(block.row4x4)) != 0)
+ ? block.bp_top->palette_mode_info.size[plane_type]
+ : 0;
+ const int left_size = block.left_available[kPlaneY]
+ ? block.bp_left->palette_mode_info.size[plane_type]
+ : 0;
+ if (left_size == 0 && top_size == 0) return 0;
+ // Merge the left and top colors in sorted order and store them in |cache|.
+ uint16_t dummy[1];
+ const uint16_t* top = (top_size > 0)
+ ? block.bp_top->palette_mode_info.color[plane_type]
+ : dummy;
+ const uint16_t* left =
+ (left_size > 0) ? block.bp_left->palette_mode_info.color[plane_type]
+ : dummy;
+ std::merge(top, top + top_size, left, left + left_size, cache);
+ // Deduplicate the entries in |cache| and return the number of unique
+ // entries.
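+  // For example, top = {10, 20} and left = {15, 20} merge to {10, 15, 20, 20}
+  // and deduplicate to {10, 15, 20}, so 3 is returned.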
+ return static_cast<int>(
+ std::distance(cache, std::unique(cache, cache + left_size + top_size)));
+}
+
+void Tile::ReadPaletteColors(const Block& block, Plane plane) {
+ const PlaneType plane_type = GetPlaneType(plane);
+ uint16_t cache[2 * kMaxPaletteSize];
+ const int n = GetPaletteCache(block, plane_type, cache);
+ BlockParameters& bp = *block.bp;
+ const uint8_t palette_size = bp.palette_mode_info.size[plane_type];
+ uint16_t* const palette_color = bp.palette_mode_info.color[plane];
+ const int8_t bitdepth = sequence_header_.color_config.bitdepth;
+ int index = 0;
+ for (int i = 0; i < n && index < palette_size; ++i) {
+ if (reader_.ReadBit() != 0) { // use_palette_color_cache.
+ palette_color[index++] = cache[i];
+ }
+ }
+ const int merge_pivot = index;
+ if (index < palette_size) {
+ palette_color[index++] =
+ static_cast<uint16_t>(reader_.ReadLiteral(bitdepth));
+ }
+ const int max_value = (1 << bitdepth) - 1;
+ if (index < palette_size) {
+ int bits = bitdepth - 3 + static_cast<int>(reader_.ReadLiteral(2));
+ do {
+ const int delta = static_cast<int>(reader_.ReadLiteral(bits)) +
+ (plane_type == kPlaneTypeY ? 1 : 0);
+ palette_color[index] =
+ std::min(palette_color[index - 1] + delta, max_value);
+ if (palette_color[index] + (plane_type == kPlaneTypeY ? 1 : 0) >=
+ max_value) {
+ // Once the color exceeds max_value, all others can be set to max_value
+ // (since they are computed as a delta on top of the current color and
+ // then clipped).
+ Memset(&palette_color[index + 1], max_value, palette_size - index - 1);
+ break;
+ }
+ const int range = (1 << bitdepth) - palette_color[index] -
+ (plane_type == kPlaneTypeY ? 1 : 0);
+ bits = std::min(bits, CeilLog2(range));
+ } while (++index < palette_size);
+ }
+ // Palette colors are generated using two ascending arrays. So sorting them is
+ // simply a matter of merging the two sorted portions of the array.
+ std::inplace_merge(palette_color, palette_color + merge_pivot,
+ palette_color + palette_size);
+ if (plane_type == kPlaneTypeUV) {
+ uint16_t* const palette_color_v = bp.palette_mode_info.color[kPlaneV];
+ if (reader_.ReadBit() != 0) { // delta_encode_palette_colors_v.
+ const int bits = bitdepth - 4 + static_cast<int>(reader_.ReadLiteral(2));
+ palette_color_v[0] = reader_.ReadLiteral(bitdepth);
+ for (int i = 1; i < palette_size; ++i) {
+ int delta = static_cast<int>(reader_.ReadLiteral(bits));
+ if (delta != 0 && reader_.ReadBit() != 0) delta = -delta;
+ // This line is equivalent to the following lines in the spec:
+ // val = palette_colors_v[ idx - 1 ] + palette_delta_v
+ // if ( val < 0 ) val += maxVal
+ // if ( val >= maxVal ) val -= maxVal
+ // palette_colors_v[ idx ] = Clip1( val )
+ //
+ // The difference is that in the code, max_value is (1 << bitdepth) - 1.
+ // So "& max_value" has the desired effect of computing both the "if"
+ // conditions and the Clip.
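+        // For example, with bitdepth 8: 250 + 10 = 260 maps to 260 & 255 = 4
+        // (spec: 260 - 256 = 4), and 3 - 10 = -7 maps to -7 & 255 = 249
+        // (spec: -7 + 256 = 249).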
+ palette_color_v[i] = (palette_color_v[i - 1] + delta) & max_value;
+ }
+ } else {
+ for (int i = 0; i < palette_size; ++i) {
+ palette_color_v[i] =
+ static_cast<uint16_t>(reader_.ReadLiteral(bitdepth));
+ }
+ }
+ }
+}
+
+void Tile::ReadPaletteModeInfo(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (IsBlockSmallerThan8x8(block.size) || block.size > kBlock64x64 ||
+ !frame_header_.allow_screen_content_tools) {
+ bp.palette_mode_info.size[kPlaneTypeY] = 0;
+ bp.palette_mode_info.size[kPlaneTypeUV] = 0;
+ return;
+ }
+ const int block_size_context =
+ k4x4WidthLog2[block.size] + k4x4HeightLog2[block.size] - 2;
+ if (bp.y_mode == kPredictionModeDc) {
+ const int context =
+ static_cast<int>(block.top_available[kPlaneY] &&
+ block.bp_top->palette_mode_info.size[kPlaneTypeY] >
+ 0) +
+ static_cast<int>(block.left_available[kPlaneY] &&
+ block.bp_left->palette_mode_info.size[kPlaneTypeY] >
+ 0);
+ const bool has_palette_y = reader_.ReadSymbol(
+ symbol_decoder_context_.has_palette_y_cdf[block_size_context][context]);
+ if (has_palette_y) {
+ bp.palette_mode_info.size[kPlaneTypeY] =
+ kMinPaletteSize +
+ reader_.ReadSymbol<kPaletteSizeSymbolCount>(
+ symbol_decoder_context_.palette_y_size_cdf[block_size_context]);
+ ReadPaletteColors(block, kPlaneY);
+ }
+ }
+ if (bp.uv_mode == kPredictionModeDc && block.HasChroma()) {
+ const int context =
+ static_cast<int>(bp.palette_mode_info.size[kPlaneTypeY] > 0);
+ const bool has_palette_uv =
+ reader_.ReadSymbol(symbol_decoder_context_.has_palette_uv_cdf[context]);
+ if (has_palette_uv) {
+ bp.palette_mode_info.size[kPlaneTypeUV] =
+ kMinPaletteSize +
+ reader_.ReadSymbol<kPaletteSizeSymbolCount>(
+ symbol_decoder_context_.palette_uv_size_cdf[block_size_context]);
+ ReadPaletteColors(block, kPlaneU);
+ }
+ }
+}
+
+void Tile::PopulatePaletteColorContexts(
+ const Block& block, PlaneType plane_type, int i, int start, int end,
+ uint8_t color_order[kMaxPaletteSquare][kMaxPaletteSize],
+ uint8_t color_context[kMaxPaletteSquare]) {
+ const PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ for (int column = start, counter = 0; column >= end; --column, ++counter) {
+ const int row = i - column;
+ assert(row > 0 || column > 0);
+ const uint8_t top =
+ (row > 0)
+ ? prediction_parameters.color_index_map[plane_type][row - 1][column]
+ : 0;
+ const uint8_t left =
+ (column > 0)
+ ? prediction_parameters.color_index_map[plane_type][row][column - 1]
+ : 0;
+ uint8_t index_mask;
+ static_assert(kMaxPaletteSize <= 8, "");
+ int index;
+ if (column <= 0) {
+ color_context[counter] = 0;
+ color_order[counter][0] = top;
+ index_mask = 1 << top;
+ index = 1;
+ } else if (row <= 0) {
+ color_context[counter] = 0;
+ color_order[counter][0] = left;
+ index_mask = 1 << left;
+ index = 1;
+ } else {
+ const uint8_t top_left =
+ prediction_parameters
+ .color_index_map[plane_type][row - 1][column - 1];
+ index_mask = (1 << top) | (1 << left) | (1 << top_left);
+ if (top == left && top == top_left) {
+ color_context[counter] = 4;
+ color_order[counter][0] = top;
+ index = 1;
+ } else if (top == left) {
+ color_context[counter] = 3;
+ color_order[counter][0] = top;
+ color_order[counter][1] = top_left;
+ index = 2;
+ } else if (top == top_left) {
+ color_context[counter] = 2;
+ color_order[counter][0] = top_left;
+ color_order[counter][1] = left;
+ index = 2;
+ } else if (left == top_left) {
+ color_context[counter] = 2;
+ color_order[counter][0] = top_left;
+ color_order[counter][1] = top;
+ index = 2;
+ } else {
+ color_context[counter] = 1;
+ color_order[counter][0] = std::min(top, left);
+ color_order[counter][1] = std::max(top, left);
+ color_order[counter][2] = top_left;
+ index = 3;
+ }
+ }
+ // Even though only the first |palette_size| entries of this array are ever
+ // used, it is faster to populate all 8 because of the vectorization of the
+ // constant sized loop.
+ for (uint8_t j = 0; j < kMaxPaletteSize; ++j) {
+ if (BitMaskSet::MaskContainsValue(index_mask, j)) continue;
+ color_order[counter][index++] = j;
+ }
+ }
+}
+
+bool Tile::ReadPaletteTokens(const Block& block) {
+ const PaletteModeInfo& palette_mode_info = block.bp->palette_mode_info;
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ for (int plane_type = kPlaneTypeY;
+ plane_type < (block.HasChroma() ? kNumPlaneTypes : kPlaneTypeUV);
+ ++plane_type) {
+ const int palette_size = palette_mode_info.size[plane_type];
+ if (palette_size == 0) continue;
+ int block_height = block.height;
+ int block_width = block.width;
+ int screen_height = std::min(
+ block_height, MultiplyBy4(frame_header_.rows4x4 - block.row4x4));
+ int screen_width = std::min(
+ block_width, MultiplyBy4(frame_header_.columns4x4 - block.column4x4));
+ if (plane_type == kPlaneTypeUV) {
+ block_height >>= sequence_header_.color_config.subsampling_y;
+ block_width >>= sequence_header_.color_config.subsampling_x;
+ screen_height >>= sequence_header_.color_config.subsampling_y;
+ screen_width >>= sequence_header_.color_config.subsampling_x;
+ if (block_height < 4) {
+ block_height += 2;
+ screen_height += 2;
+ }
+ if (block_width < 4) {
+ block_width += 2;
+ screen_width += 2;
+ }
+ }
+ if (!prediction_parameters.color_index_map[plane_type].Reset(
+ block_height, block_width, /*zero_initialize=*/false)) {
+ return false;
+ }
+ int first_value = 0;
+ reader_.DecodeUniform(palette_size, &first_value);
+ prediction_parameters.color_index_map[plane_type][0][0] = first_value;
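+    // The remaining indices are decoded one anti-diagonal at a time: for a
+    // 4x4 map, i == 1 visits (0, 1) and (1, 0), i == 2 visits (0, 2), (1, 1)
+    // and (2, 0), and so on.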
+ for (int i = 1; i < screen_height + screen_width - 1; ++i) {
+ const int start = std::min(i, screen_width - 1);
+ const int end = std::max(0, i - screen_height + 1);
+ uint8_t color_order[kMaxPaletteSquare][kMaxPaletteSize];
+ uint8_t color_context[kMaxPaletteSquare];
+ PopulatePaletteColorContexts(block, static_cast<PlaneType>(plane_type), i,
+ start, end, color_order, color_context);
+ for (int j = start, counter = 0; j >= end; --j, ++counter) {
+ uint16_t* const cdf =
+ symbol_decoder_context_
+ .palette_color_index_cdf[plane_type]
+ [palette_size - kMinPaletteSize]
+ [color_context[counter]];
+ const int color_order_index = reader_.ReadSymbol(cdf, palette_size);
+ prediction_parameters.color_index_map[plane_type][i - j][j] =
+ color_order[counter][color_order_index];
+ }
+ }
+ if (screen_width < block_width) {
+ for (int i = 0; i < screen_height; ++i) {
+ memset(
+ &prediction_parameters.color_index_map[plane_type][i][screen_width],
+ prediction_parameters
+ .color_index_map[plane_type][i][screen_width - 1],
+ block_width - screen_width);
+ }
+ }
+ for (int i = screen_height; i < block_height; ++i) {
+ memcpy(
+ prediction_parameters.color_index_map[plane_type][i],
+ prediction_parameters.color_index_map[plane_type][screen_height - 1],
+ block_width);
+ }
+ }
+ return true;
+}
+
+} // namespace libgav1
diff --git a/src/tile/bitstream/partition.cc b/src/tile/bitstream/partition.cc
new file mode 100644
index 0000000..f3dbbb0
--- /dev/null
+++ b/src/tile/bitstream/partition.cc
@@ -0,0 +1,148 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+uint16_t PartitionCdfGatherHorizontalAlike(const uint16_t* const partition_cdf,
+ BlockSize block_size) {
+ // The spec computes the cdf value using the following formula (not writing
+ // partition_cdf[] and using short forms for partition names for clarity):
+ // cdf = None - H + V - S + S - HTS + HTS - HBS + HBS - VLS;
+ // if (block_size != 128x128) {
+ // cdf += VRS - H4;
+ // }
+ // After canceling out the repeated terms with opposite signs, we have:
+ // cdf = None - H + V - VLS;
+ // if (block_size != 128x128) {
+ // cdf += VRS - H4;
+ // }
+ uint16_t cdf = partition_cdf[kPartitionNone] -
+ partition_cdf[kPartitionHorizontal] +
+ partition_cdf[kPartitionVertical] -
+ partition_cdf[kPartitionVerticalWithLeftSplit];
+ if (block_size != kBlock128x128) {
+ cdf += partition_cdf[kPartitionVerticalWithRightSplit] -
+ partition_cdf[kPartitionHorizontal4];
+ }
+ return cdf;
+}
+
+uint16_t PartitionCdfGatherVerticalAlike(const uint16_t* const partition_cdf,
+ BlockSize block_size) {
+ // The spec computes the cdf value using the following formula (not writing
+ // partition_cdf[] and using short forms for partition names for clarity):
+ // cdf = H - V + V - S + HBS - VLS + VLS - VRS + S - HTS;
+ // if (block_size != 128x128) {
+ // cdf += H4 - V4;
+ // }
+ // V4 is always zero. So, after canceling out the repeated terms with opposite
+ // signs, we have:
+ // cdf = H + HBS - VRS - HTS;
+ // if (block_size != 128x128) {
+ // cdf += H4;
+ // }
+ // VRS is zero for 128x128 blocks. So, further simplifying we have:
+ // cdf = H + HBS - HTS;
+ // if (block_size != 128x128) {
+ // cdf += H4 - VRS;
+ // }
+ uint16_t cdf = partition_cdf[kPartitionHorizontal] +
+ partition_cdf[kPartitionHorizontalWithBottomSplit] -
+ partition_cdf[kPartitionHorizontalWithTopSplit];
+ if (block_size != kBlock128x128) {
+ cdf += partition_cdf[kPartitionHorizontal4] -
+ partition_cdf[kPartitionVerticalWithRightSplit];
+ }
+ return cdf;
+}
+
+} // namespace
+
+uint16_t* Tile::GetPartitionCdf(int row4x4, int column4x4,
+ BlockSize block_size) {
+ const int block_size_log2 = k4x4WidthLog2[block_size];
+ int top = 0;
+ if (IsTopInside(row4x4)) {
+ top = static_cast<int>(
+ k4x4WidthLog2[block_parameters_holder_.Find(row4x4 - 1, column4x4)
+ ->size] < block_size_log2);
+ }
+ int left = 0;
+ if (IsLeftInside(column4x4)) {
+ left = static_cast<int>(
+ k4x4HeightLog2[block_parameters_holder_.Find(row4x4, column4x4 - 1)
+ ->size] < block_size_log2);
+ }
+ const int context = left * 2 + top;
+ return symbol_decoder_context_.partition_cdf[block_size_log2 - 1][context];
+}
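// A hand-worked note on the context above (derived from the code): |top| is 1
// when the block above is narrower than the current block and |left| is 1 when
// the block to the left is shorter, so context is in [0, 3], with 3 meaning
// both neighbors are smaller.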
+
+bool Tile::ReadPartition(int row4x4, int column4x4, BlockSize block_size,
+ bool has_rows, bool has_columns,
+ Partition* const partition) {
+ if (IsBlockSmallerThan8x8(block_size)) {
+ *partition = kPartitionNone;
+ return true;
+ }
+ if (!has_rows && !has_columns) {
+ *partition = kPartitionSplit;
+ return true;
+ }
+ uint16_t* const partition_cdf =
+ GetPartitionCdf(row4x4, column4x4, block_size);
+ if (partition_cdf == nullptr) {
+ return false;
+ }
+ if (has_rows && has_columns) {
+ const int bsize_log2 = k4x4WidthLog2[block_size];
+ // The partition block size should be 8x8 or above.
+ assert(bsize_log2 > 0);
+ if (bsize_log2 == 1) {
+ *partition = static_cast<Partition>(
+ reader_.ReadSymbol<kPartitionSplit + 1>(partition_cdf));
+ } else if (bsize_log2 == 5) {
+ *partition = static_cast<Partition>(
+ reader_.ReadSymbol<kPartitionVerticalWithRightSplit + 1>(
+ partition_cdf));
+ } else {
+ *partition = static_cast<Partition>(
+ reader_.ReadSymbol<kMaxPartitionTypes>(partition_cdf));
+ }
+ } else if (has_columns) {
+ const uint16_t cdf =
+ PartitionCdfGatherVerticalAlike(partition_cdf, block_size);
+ *partition = reader_.ReadSymbolWithoutCdfUpdate(cdf) ? kPartitionSplit
+ : kPartitionHorizontal;
+ } else {
+ const uint16_t cdf =
+ PartitionCdfGatherHorizontalAlike(partition_cdf, block_size);
+ *partition = reader_.ReadSymbolWithoutCdfUpdate(cdf) ? kPartitionSplit
+ : kPartitionVertical;
+ }
+ return true;
+}
+
+} // namespace libgav1
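The telescoping simplification described in the comments of
PartitionCdfGatherHorizontalAlike above can be checked in isolation. Below is a
minimal standalone sketch (not part of libgav1; the values are arbitrary
placeholders for the CDF entries) verifying that the long-form spec sum
collapses to None - H + V - VLS:

#include <cassert>

int main() {
  // Arbitrary stand-ins for the partition CDF entries named in the comment.
  const int None = 100, H = 90, V = 80, S = 70, HTS = 60, HBS = 50, VLS = 40;
  // Long form from the spec: adjacent terms cancel in pairs.
  const int long_form = None - H + V - S + S - HTS + HTS - HBS + HBS - VLS;
  // Simplified form used by PartitionCdfGatherHorizontalAlike().
  const int short_form = None - H + V - VLS;
  assert(long_form == short_form);
  return 0;
}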
diff --git a/src/tile/bitstream/transform_size.cc b/src/tile/bitstream/transform_size.cc
new file mode 100644
index 0000000..b79851d
--- /dev/null
+++ b/src/tile/bitstream/transform_size.cc
@@ -0,0 +1,222 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/obu_parser.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/stack.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr uint8_t kMaxVariableTransformTreeDepth = 2;
+// Max_Tx_Depth array from section 5.11.5 in the spec with the following
+// modification: if an element is not zero, one is subtracted from it. That is
+// the only form in which this array is used.
+constexpr int kTxDepthCdfIndex[kMaxBlockSizes] = {
+ 0, 0, 1, 0, 0, 1, 2, 1, 1, 1, 2, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3};
+
+constexpr TransformSize kMaxTransformSizeRectangle[kMaxBlockSizes] = {
+ kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+ kTransformSize16x16, kTransformSize16x32, kTransformSize16x64,
+ kTransformSize32x8, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x64, kTransformSize64x16, kTransformSize64x32,
+ kTransformSize64x64, kTransformSize64x64, kTransformSize64x64,
+ kTransformSize64x64};
+
+TransformSize GetSquareTransformSize(uint8_t pixels) {
+ switch (pixels) {
+ case 128:
+ case 64:
+ return kTransformSize64x64;
+ case 32:
+ return kTransformSize32x32;
+ case 16:
+ return kTransformSize16x16;
+ case 8:
+ return kTransformSize8x8;
+ default:
+ return kTransformSize4x4;
+ }
+}
+
+} // namespace
+
+int Tile::GetTopTransformWidth(const Block& block, int row4x4, int column4x4,
+ bool ignore_skip) {
+ if (row4x4 == block.row4x4) {
+ if (!block.top_available[kPlaneY]) return 64;
+ const BlockParameters& bp_top =
+ *block_parameters_holder_.Find(row4x4 - 1, column4x4);
+ if ((ignore_skip || bp_top.skip) && bp_top.is_inter) {
+ return kBlockWidthPixels[bp_top.size];
+ }
+ }
+ return kTransformWidth[inter_transform_sizes_[row4x4 - 1][column4x4]];
+}
+
+int Tile::GetLeftTransformHeight(const Block& block, int row4x4, int column4x4,
+ bool ignore_skip) {
+ if (column4x4 == block.column4x4) {
+ if (!block.left_available[kPlaneY]) return 64;
+ const BlockParameters& bp_left =
+ *block_parameters_holder_.Find(row4x4, column4x4 - 1);
+ if ((ignore_skip || bp_left.skip) && bp_left.is_inter) {
+ return kBlockHeightPixels[bp_left.size];
+ }
+ }
+ return kTransformHeight[inter_transform_sizes_[row4x4][column4x4 - 1]];
+}
+
+TransformSize Tile::ReadFixedTransformSize(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (frame_header_.segmentation.lossless[bp.segment_id]) {
+ return kTransformSize4x4;
+ }
+ const TransformSize max_rect_tx_size = kMaxTransformSizeRectangle[block.size];
+ const bool allow_select = !bp.skip || !bp.is_inter;
+ if (block.size == kBlock4x4 || !allow_select ||
+ frame_header_.tx_mode != kTxModeSelect) {
+ return max_rect_tx_size;
+ }
+ const int max_tx_width = kTransformWidth[max_rect_tx_size];
+ const int max_tx_height = kTransformHeight[max_rect_tx_size];
+ const int top_width =
+ block.top_available[kPlaneY]
+ ? GetTopTransformWidth(block, block.row4x4, block.column4x4, true)
+ : 0;
+ const int left_height =
+ block.left_available[kPlaneY]
+ ? GetLeftTransformHeight(block, block.row4x4, block.column4x4, true)
+ : 0;
+ const auto context = static_cast<int>(top_width >= max_tx_width) +
+ static_cast<int>(left_height >= max_tx_height);
+ const int cdf_index = kTxDepthCdfIndex[block.size];
+ uint16_t* const cdf =
+ symbol_decoder_context_.tx_depth_cdf[cdf_index][context];
+ const int tx_depth = (cdf_index == 0)
+ ? static_cast<int>(reader_.ReadSymbol(cdf))
+ : reader_.ReadSymbol<3>(cdf);
+ assert(tx_depth < 3);
+ TransformSize tx_size = max_rect_tx_size;
+ if (tx_depth == 0) return tx_size;
+ tx_size = kSplitTransformSize[tx_size];
+ if (tx_depth == 1) return tx_size;
+ return kSplitTransformSize[tx_size];
+}
+
+void Tile::ReadVariableTransformTree(const Block& block, int row4x4,
+ int column4x4, TransformSize tx_size) {
+ const uint8_t pixels = std::max(block.width, block.height);
+ const TransformSize max_tx_size = GetSquareTransformSize(pixels);
+ const int context_delta = (kNumSquareTransformSizes - 1 -
+ TransformSizeToSquareTransformIndex(max_tx_size)) *
+ 6;
+
+ // Branching factor is 4 and maximum depth is 2. So the maximum stack size
+ // necessary is (4 - 1) + 4 = 7.
+ Stack<TransformTreeNode, 7> stack;
+ stack.Push(TransformTreeNode(column4x4, row4x4, tx_size, 0));
+
+ do {
+ TransformTreeNode node = stack.Pop();
+ const int tx_width4x4 = kTransformWidth4x4[node.tx_size];
+ const int tx_height4x4 = kTransformHeight4x4[node.tx_size];
+ if (node.tx_size != kTransformSize4x4 &&
+ node.depth != kMaxVariableTransformTreeDepth) {
+ const auto top =
+ static_cast<int>(GetTopTransformWidth(block, node.y, node.x, false) <
+ kTransformWidth[node.tx_size]);
+ const auto left = static_cast<int>(
+ GetLeftTransformHeight(block, node.y, node.x, false) <
+ kTransformHeight[node.tx_size]);
+ const int context =
+ static_cast<int>(max_tx_size > kTransformSize8x8 &&
+ kTransformSizeSquareMax[node.tx_size] !=
+ max_tx_size) *
+ 3 +
+ context_delta + top + left;
+ // tx_split.
+ if (reader_.ReadSymbol(symbol_decoder_context_.tx_split_cdf[context])) {
+ const TransformSize sub_tx_size = kSplitTransformSize[node.tx_size];
+ const int step_width4x4 = kTransformWidth4x4[sub_tx_size];
+ const int step_height4x4 = kTransformHeight4x4[sub_tx_size];
+ // The loops have to run in reverse order because we use a stack for
+ // DFS.
+ for (int i = tx_height4x4 - step_height4x4; i >= 0;
+ i -= step_height4x4) {
+ for (int j = tx_width4x4 - step_width4x4; j >= 0;
+ j -= step_width4x4) {
+ if (node.y + i >= frame_header_.rows4x4 ||
+ node.x + j >= frame_header_.columns4x4) {
+ continue;
+ }
+ stack.Push(TransformTreeNode(node.x + j, node.y + i, sub_tx_size,
+ node.depth + 1));
+ }
+ }
+ continue;
+ }
+ }
+ // tx_split is false.
+ for (int i = 0; i < tx_height4x4; ++i) {
+ static_assert(sizeof(TransformSize) == 1, "");
+ memset(&inter_transform_sizes_[node.y + i][node.x], node.tx_size,
+ tx_width4x4);
+ }
+ block_parameters_holder_.Find(node.y, node.x)->transform_size =
+ node.tx_size;
+ } while (!stack.Empty());
+}
+
+void Tile::DecodeTransformSize(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (frame_header_.tx_mode == kTxModeSelect && block.size > kBlock4x4 &&
+ bp.is_inter && !bp.skip &&
+ !frame_header_.segmentation.lossless[bp.segment_id]) {
+ const TransformSize max_tx_size = kMaxTransformSizeRectangle[block.size];
+ const int tx_width4x4 = kTransformWidth4x4[max_tx_size];
+ const int tx_height4x4 = kTransformHeight4x4[max_tx_size];
+ for (int row = block.row4x4; row < block.row4x4 + block.height4x4;
+ row += tx_height4x4) {
+ for (int column = block.column4x4;
+ column < block.column4x4 + block.width4x4; column += tx_width4x4) {
+ ReadVariableTransformTree(block, row, column, max_tx_size);
+ }
+ }
+ } else {
+ bp.transform_size = ReadFixedTransformSize(block);
+ for (int row = block.row4x4; row < block.row4x4 + block.height4x4; ++row) {
+ static_assert(sizeof(TransformSize) == 1, "");
+ memset(&inter_transform_sizes_[row][block.column4x4], bp.transform_size,
+ block.width4x4);
+ }
+ }
+}
+
+} // namespace libgav1
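The stack bound stated in the ReadVariableTransformTree comment above ("the
maximum stack size necessary is (4 - 1) + 4 = 7") can be reproduced with a
small standalone simulation (not libgav1 code): a DFS over a tree with
branching factor 4 whose nodes stop splitting at depth 2 never holds more than
7 entries on an explicit stack.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  std::vector<int> stack;  // Holds node depths; transform sizes are irrelevant.
  std::size_t peak = 0;
  stack.push_back(0);  // Root transform node at depth 0.
  while (!stack.empty()) {
    peak = std::max(peak, stack.size());
    const int depth = stack.back();
    stack.pop_back();
    if (depth < 2) {
      // tx_split: push the four children, as ReadVariableTransformTree does.
      for (int i = 0; i < 4; ++i) stack.push_back(depth + 1);
    }
  }
  assert(peak == 7);  // (4 - 1) + 4, matching the comment.
  return 0;
}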
diff --git a/src/tile/prediction.cc b/src/tile/prediction.cc
new file mode 100644
index 0000000..c5560a6
--- /dev/null
+++ b/src/tile/prediction.cc
@@ -0,0 +1,1361 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+
+#include "src/buffer_pool.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/motion_vector.h"
+#include "src/obu_parser.h"
+#include "src/prediction_mask.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+#include "src/warp_prediction.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/inter_intra_masks.inc"
+
+// Precision bits when scaling reference frames.
+constexpr int kReferenceScaleShift = 14;
+constexpr int kAngleStep = 3;
+constexpr int kPredictionModeToAngle[kIntraPredictionModesUV] = {
+ 0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0};
+
+// The following modes need both the left_column and top_row for intra
+// prediction. For directional modes, the left/top requirement is inferred from
+// the prediction angle. For Dc modes, it is inferred from whether or not
+// left/top is available.
+constexpr BitMaskSet kNeedsLeftAndTop(kPredictionModeSmooth,
+ kPredictionModeSmoothHorizontal,
+ kPredictionModeSmoothVertical,
+ kPredictionModePaeth);
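// A hand-worked illustration of the angle rule above, derived from the
// needs_top/needs_left expressions in Tile::IntraPrediction() below (ignoring
// the angle_delta adjustment):
//   base angle  45 (less than 90):       only the top row is needed.
//   base angle 135 (between 90 and 180): both top row and left column are needed.
//   base angle 203 (greater than 180):   only the left column is needed.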
+
+int16_t GetDirectionalIntraPredictorDerivative(const int angle) {
+ assert(angle >= 3);
+ assert(angle <= 87);
+ return kDirectionalIntraPredictorDerivative[DivideBy2(angle) - 1];
+}
+
+// Maps the block_size to an index as follows:
+// kBlock8x8 => 0.
+// kBlock8x16 => 1.
+// kBlock8x32 => 2.
+// kBlock16x8 => 3.
+// kBlock16x16 => 4.
+// kBlock16x32 => 5.
+// kBlock32x8 => 6.
+// kBlock32x16 => 7.
+// kBlock32x32 => 8.
+int GetWedgeBlockSizeIndex(BlockSize block_size) {
+ assert(block_size >= kBlock8x8);
+ return block_size - kBlock8x8 - static_cast<int>(block_size >= kBlock16x8) -
+ static_cast<int>(block_size >= kBlock32x8);
+}
+
+// Maps a dimension of 4, 8, 16 and 32 to indices 0, 1, 2 and 3 respectively.
+int GetInterIntraMaskLookupIndex(int dimension) {
+ assert(dimension == 4 || dimension == 8 || dimension == 16 ||
+ dimension == 32);
+ return FloorLog2(dimension) - 2;
+}
+
+// 7.11.2.9.
+int GetIntraEdgeFilterStrength(int width, int height, int filter_type,
+ int delta) {
+ const int sum = width + height;
+ delta = std::abs(delta);
+ if (filter_type == 0) {
+ if (sum <= 8) {
+ if (delta >= 56) return 1;
+ } else if (sum <= 16) {
+ if (delta >= 40) return 1;
+ } else if (sum <= 24) {
+ if (delta >= 32) return 3;
+ if (delta >= 16) return 2;
+ if (delta >= 8) return 1;
+ } else if (sum <= 32) {
+ if (delta >= 32) return 3;
+ if (delta >= 4) return 2;
+ return 1;
+ } else {
+ return 3;
+ }
+ } else {
+ if (sum <= 8) {
+ if (delta >= 64) return 2;
+ if (delta >= 40) return 1;
+ } else if (sum <= 16) {
+ if (delta >= 48) return 2;
+ if (delta >= 20) return 1;
+ } else if (sum <= 24) {
+ if (delta >= 4) return 3;
+ } else {
+ return 3;
+ }
+ }
+ return 0;
+}
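// A hand-worked trace of the thresholds above: for an 8x8 block (sum == 16)
// with filter_type == 0 and |delta| == 45, the sum <= 16 branch is taken and
// 45 >= 40, so the returned filter strength is 1.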
+
+// 7.11.2.10.
+bool DoIntraEdgeUpsampling(int width, int height, int filter_type, int delta) {
+ const int sum = width + height;
+ delta = std::abs(delta);
+ // This function should not be called when the prediction angle is 90 or 180.
+ assert(delta != 0);
+ if (delta >= 40) return false;
+ return (filter_type == 1) ? sum <= 8 : sum <= 16;
+}
+
+constexpr uint8_t kQuantizedDistanceWeight[4][2] = {
+ {2, 3}, {2, 5}, {2, 7}, {1, kMaxFrameDistance}};
+
+constexpr uint8_t kQuantizedDistanceLookup[4][2] = {
+ {9, 7}, {11, 5}, {12, 4}, {13, 3}};
+
+void GetDistanceWeights(const int distance[2], int weight[2]) {
+ // Note: distance[0] and distance[1] correspond to relative distance
+ // between current frame and reference frame [1] and [0], respectively.
+ const int order = static_cast<int>(distance[0] <= distance[1]);
+ if (distance[0] == 0 || distance[1] == 0) {
+ weight[0] = kQuantizedDistanceLookup[3][order];
+ weight[1] = kQuantizedDistanceLookup[3][1 - order];
+ } else {
+ int i;
+ for (i = 0; i < 3; ++i) {
+ const int weight_0 = kQuantizedDistanceWeight[i][order];
+ const int weight_1 = kQuantizedDistanceWeight[i][1 - order];
+ if (order == 0) {
+ if (distance[0] * weight_0 < distance[1] * weight_1) break;
+ } else {
+ if (distance[0] * weight_0 > distance[1] * weight_1) break;
+ }
+ }
+ weight[0] = kQuantizedDistanceLookup[i][order];
+ weight[1] = kQuantizedDistanceLookup[i][1 - order];
+ }
+}
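// A hand-worked trace of the selection loop above: with distance == {4, 2}
// (neither zero), order is 0; at i == 0, 4 * 2 < 2 * 3 is false; at i == 1,
// 4 * 2 < 2 * 5 is true, so the loop breaks with i == 1 and the weights become
// {11, 5} from kQuantizedDistanceLookup[1].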
+
+dsp::IntraPredictor GetIntraPredictor(PredictionMode mode, bool has_left,
+ bool has_top) {
+ if (mode == kPredictionModeDc) {
+ if (has_left && has_top) {
+ return dsp::kIntraPredictorDc;
+ }
+ if (has_left) {
+ return dsp::kIntraPredictorDcLeft;
+ }
+ if (has_top) {
+ return dsp::kIntraPredictorDcTop;
+ }
+ return dsp::kIntraPredictorDcFill;
+ }
+ switch (mode) {
+ case kPredictionModePaeth:
+ return dsp::kIntraPredictorPaeth;
+ case kPredictionModeSmooth:
+ return dsp::kIntraPredictorSmooth;
+ case kPredictionModeSmoothVertical:
+ return dsp::kIntraPredictorSmoothVertical;
+ case kPredictionModeSmoothHorizontal:
+ return dsp::kIntraPredictorSmoothHorizontal;
+ default:
+ return dsp::kNumIntraPredictors;
+ }
+}
+
+uint8_t* GetStartPoint(Array2DView<uint8_t>* const buffer, const int plane,
+ const int x, const int y, const int bitdepth) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) {
+ Array2DView<uint16_t> buffer16(
+ buffer[plane].rows(), buffer[plane].columns() / sizeof(uint16_t),
+ reinterpret_cast<uint16_t*>(&buffer[plane][0][0]));
+ return reinterpret_cast<uint8_t*>(&buffer16[y][x]);
+ }
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ static_cast<void>(bitdepth);
+ return &buffer[plane][y][x];
+}
+
+int GetPixelPositionFromHighScale(int start, int step, int offset) {
+ return (start + step * offset) >> kScaleSubPixelBits;
+}
+
+dsp::MaskBlendFunc GetMaskBlendFunc(const dsp::Dsp& dsp, bool is_inter_intra,
+ bool is_wedge_inter_intra,
+ int subsampling_x, int subsampling_y) {
+ return (is_inter_intra && !is_wedge_inter_intra)
+ ? dsp.mask_blend[0][/*is_inter_intra=*/true]
+ : dsp.mask_blend[subsampling_x + subsampling_y][is_inter_intra];
+}
+
+} // namespace
+
+template <typename Pixel>
+void Tile::IntraPrediction(const Block& block, Plane plane, int x, int y,
+ bool has_left, bool has_top, bool has_top_right,
+ bool has_bottom_left, PredictionMode mode,
+ TransformSize tx_size) {
+ const int width = 1 << kTransformWidthLog2[tx_size];
+ const int height = 1 << kTransformHeightLog2[tx_size];
+ const int x_shift = subsampling_x_[plane];
+ const int y_shift = subsampling_y_[plane];
+ const int max_x = (MultiplyBy4(frame_header_.columns4x4) >> x_shift) - 1;
+ const int max_y = (MultiplyBy4(frame_header_.rows4x4) >> y_shift) - 1;
+ // For performance reasons, do not initialize the following two buffers.
+ alignas(kMaxAlignment) Pixel top_row_data[160];
+ alignas(kMaxAlignment) Pixel left_column_data[160];
+#if LIBGAV1_MSAN
+ if (IsDirectionalMode(mode)) {
+ memset(top_row_data, 0, sizeof(top_row_data));
+ memset(left_column_data, 0, sizeof(left_column_data));
+ }
+#endif
+ // Some predictors use |top_row_data| and |left_column_data| with a negative
+ // offset to access pixels to the top-left of the current block. So leave
+ // some space at the start of the arrays so that those pixels can be
+ // populated without having to move the rest of the array.
+ Pixel* const top_row = top_row_data + 16;
+ Pixel* const left_column = left_column_data + 16;
+ const int bitdepth = sequence_header_.color_config.bitdepth;
+ const int top_and_left_size = width + height;
+ const bool is_directional_mode = IsDirectionalMode(mode);
+ const PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ const bool use_filter_intra =
+ (plane == kPlaneY && prediction_parameters.use_filter_intra);
+ const int prediction_angle =
+ is_directional_mode
+ ? kPredictionModeToAngle[mode] +
+ prediction_parameters.angle_delta[GetPlaneType(plane)] *
+ kAngleStep
+ : 0;
+ // Directional prediction requires buffers larger than the width or height.
+ const int top_size = is_directional_mode ? top_and_left_size : width;
+ const int left_size = is_directional_mode ? top_and_left_size : height;
+ const int top_right_size =
+ is_directional_mode ? (has_top_right ? 2 : 1) * width : width;
+ const int bottom_left_size =
+ is_directional_mode ? (has_bottom_left ? 2 : 1) * height : height;
+
+ Array2DView<Pixel> buffer(buffer_[plane].rows(),
+ buffer_[plane].columns() / sizeof(Pixel),
+ reinterpret_cast<Pixel*>(&buffer_[plane][0][0]));
+ const bool needs_top = use_filter_intra || kNeedsLeftAndTop.Contains(mode) ||
+ (is_directional_mode && prediction_angle < 180) ||
+ (mode == kPredictionModeDc && has_top);
+ const bool needs_left = use_filter_intra || kNeedsLeftAndTop.Contains(mode) ||
+ (is_directional_mode && prediction_angle > 90) ||
+ (mode == kPredictionModeDc && has_left);
+
+ const Pixel* top_row_src = buffer[y - 1];
+
+ // Determine if we need to retrieve the top row from
+ // |intra_prediction_buffer_|.
+ if ((needs_top || needs_left) && use_intra_prediction_buffer_) {
+ // Superblock index of block.row4x4. block.row4x4 is always in luma
+ // dimension (no subsampling).
+ const int current_superblock_index =
+ block.row4x4 >> (sequence_header_.use_128x128_superblock ? 5 : 4);
+ // Superblock index of y - 1. y is in the plane dimension (chroma planes
+ // could be subsampled).
+ const int plane_shift = (sequence_header_.use_128x128_superblock ? 7 : 6) -
+ subsampling_y_[plane];
+ const int top_row_superblock_index = (y - 1) >> plane_shift;
+ // If the superblock index of y - 1 is not that of the current superblock,
+ // then we will have to retrieve the top row from the
+ // |intra_prediction_buffer_|.
+ if (current_superblock_index != top_row_superblock_index) {
+ top_row_src = reinterpret_cast<const Pixel*>(
+ (*intra_prediction_buffer_)[plane].get());
+ }
+ }
+
+ if (needs_top) {
+ // Compute top_row.
+ if (has_top || has_left) {
+ const int left_index = has_left ? x - 1 : x;
+ top_row[-1] = has_top ? top_row_src[left_index] : buffer[y][left_index];
+ } else {
+ top_row[-1] = 1 << (bitdepth - 1);
+ }
+ if (!has_top && has_left) {
+ Memset(top_row, buffer[y][x - 1], top_size);
+ } else if (!has_top && !has_left) {
+ Memset(top_row, (1 << (bitdepth - 1)) - 1, top_size);
+ } else {
+ const int top_limit = std::min(max_x - x + 1, top_right_size);
+ memcpy(top_row, &top_row_src[x], top_limit * sizeof(Pixel));
+ // Even though it is safe to call Memset with a size of 0, accessing
+ // top_row_src[top_limit + x - 1] is not allowed when this condition is
+ // false.
+ if (top_size - top_limit > 0) {
+ Memset(top_row + top_limit, top_row_src[top_limit + x - 1],
+ top_size - top_limit);
+ }
+ }
+ }
+ if (needs_left) {
+ // Compute left_column.
+ if (has_top || has_left) {
+ const int left_index = has_left ? x - 1 : x;
+ left_column[-1] =
+ has_top ? top_row_src[left_index] : buffer[y][left_index];
+ } else {
+ left_column[-1] = 1 << (bitdepth - 1);
+ }
+ if (!has_left && has_top) {
+ Memset(left_column, top_row_src[x], left_size);
+ } else if (!has_left && !has_top) {
+ Memset(left_column, (1 << (bitdepth - 1)) + 1, left_size);
+ } else {
+ const int left_limit = std::min(max_y - y + 1, bottom_left_size);
+ for (int i = 0; i < left_limit; ++i) {
+ left_column[i] = buffer[y + i][x - 1];
+ }
+ // Even though it is safe to call Memset with a size of 0, accessing
+ // buffer[left_limit + y - 1][x - 1] is not allowed when this condition is
+ // false.
+ if (left_size - left_limit > 0) {
+ Memset(left_column + left_limit, buffer[left_limit + y - 1][x - 1],
+ left_size - left_limit);
+ }
+ }
+ }
+ Pixel* const dest = &buffer[y][x];
+ const ptrdiff_t dest_stride = buffer_[plane].columns();
+ if (use_filter_intra) {
+ dsp_.filter_intra_predictor(dest, dest_stride, top_row, left_column,
+ prediction_parameters.filter_intra_mode, width,
+ height);
+ } else if (is_directional_mode) {
+ DirectionalPrediction(block, plane, x, y, has_left, has_top, needs_left,
+ needs_top, prediction_angle, width, height, max_x,
+ max_y, tx_size, top_row, left_column);
+ } else {
+ const dsp::IntraPredictor predictor =
+ GetIntraPredictor(mode, has_left, has_top);
+ assert(predictor != dsp::kNumIntraPredictors);
+ dsp_.intra_predictors[tx_size][predictor](dest, dest_stride, top_row,
+ left_column);
+ }
+}
+
+template void Tile::IntraPrediction<uint8_t>(const Block& block, Plane plane,
+ int x, int y, bool has_left,
+ bool has_top, bool has_top_right,
+ bool has_bottom_left,
+ PredictionMode mode,
+ TransformSize tx_size);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void Tile::IntraPrediction<uint16_t>(const Block& block, Plane plane,
+ int x, int y, bool has_left,
+ bool has_top, bool has_top_right,
+ bool has_bottom_left,
+ PredictionMode mode,
+ TransformSize tx_size);
+#endif
+
+constexpr BitMaskSet kPredictionModeSmoothMask(kPredictionModeSmooth,
+ kPredictionModeSmoothHorizontal,
+ kPredictionModeSmoothVertical);
+
+bool Tile::IsSmoothPrediction(int row, int column, Plane plane) const {
+ const BlockParameters& bp = *block_parameters_holder_.Find(row, column);
+ PredictionMode mode;
+ if (plane == kPlaneY) {
+ mode = bp.y_mode;
+ } else {
+ if (bp.reference_frame[0] > kReferenceFrameIntra) return false;
+ mode = bp.uv_mode;
+ }
+ return kPredictionModeSmoothMask.Contains(mode);
+}
+
+int Tile::GetIntraEdgeFilterType(const Block& block, Plane plane) const {
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ if (block.top_available[plane]) {
+ const int row = block.row4x4 - 1 - (block.row4x4 & subsampling_y);
+ const int column = block.column4x4 + (~block.column4x4 & subsampling_x);
+ if (IsSmoothPrediction(row, column, plane)) return 1;
+ }
+ if (block.left_available[plane]) {
+ const int row = block.row4x4 + (~block.row4x4 & subsampling_y);
+ const int column = block.column4x4 - 1 - (block.column4x4 & subsampling_x);
+ if (IsSmoothPrediction(row, column, plane)) return 1;
+ }
+ return 0;
+}
+
+template <typename Pixel>
+void Tile::DirectionalPrediction(const Block& block, Plane plane, int x, int y,
+ bool has_left, bool has_top, bool needs_left,
+ bool needs_top, int prediction_angle,
+ int width, int height, int max_x, int max_y,
+ TransformSize tx_size, Pixel* const top_row,
+ Pixel* const left_column) {
+ Array2DView<Pixel> buffer(buffer_[plane].rows(),
+ buffer_[plane].columns() / sizeof(Pixel),
+ reinterpret_cast<Pixel*>(&buffer_[plane][0][0]));
+ Pixel* const dest = &buffer[y][x];
+ const ptrdiff_t stride = buffer_[plane].columns();
+ if (prediction_angle == 90) {
+ dsp_.intra_predictors[tx_size][dsp::kIntraPredictorVertical](
+ dest, stride, top_row, left_column);
+ return;
+ }
+ if (prediction_angle == 180) {
+ dsp_.intra_predictors[tx_size][dsp::kIntraPredictorHorizontal](
+ dest, stride, top_row, left_column);
+ return;
+ }
+
+ bool upsampled_top = false;
+ bool upsampled_left = false;
+ if (sequence_header_.enable_intra_edge_filter) {
+ const int filter_type = GetIntraEdgeFilterType(block, plane);
+ if (prediction_angle > 90 && prediction_angle < 180 &&
+ (width + height) >= 24) {
+ // 7.11.2.7.
+ left_column[-1] = top_row[-1] = RightShiftWithRounding(
+ left_column[0] * 5 + top_row[-1] * 6 + top_row[0] * 5, 4);
+ }
+ if (has_top && needs_top) {
+ const int strength = GetIntraEdgeFilterStrength(
+ width, height, filter_type, prediction_angle - 90);
+ if (strength > 0) {
+ const int num_pixels = std::min(width, max_x - x + 1) +
+ ((prediction_angle < 90) ? height : 0) + 1;
+ dsp_.intra_edge_filter(top_row - 1, num_pixels, strength);
+ }
+ }
+ if (has_left && needs_left) {
+ const int strength = GetIntraEdgeFilterStrength(
+ width, height, filter_type, prediction_angle - 180);
+ if (strength > 0) {
+ const int num_pixels = std::min(height, max_y - y + 1) +
+ ((prediction_angle > 180) ? width : 0) + 1;
+ dsp_.intra_edge_filter(left_column - 1, num_pixels, strength);
+ }
+ }
+ upsampled_top = DoIntraEdgeUpsampling(width, height, filter_type,
+ prediction_angle - 90);
+ if (upsampled_top && needs_top) {
+ const int num_pixels = width + ((prediction_angle < 90) ? height : 0);
+ dsp_.intra_edge_upsampler(top_row, num_pixels);
+ }
+ upsampled_left = DoIntraEdgeUpsampling(width, height, filter_type,
+ prediction_angle - 180);
+ if (upsampled_left && needs_left) {
+ const int num_pixels = height + ((prediction_angle > 180) ? width : 0);
+ dsp_.intra_edge_upsampler(left_column, num_pixels);
+ }
+ }
+
+ if (prediction_angle < 90) {
+ const int dx = GetDirectionalIntraPredictorDerivative(prediction_angle);
+ dsp_.directional_intra_predictor_zone1(dest, stride, top_row, width, height,
+ dx, upsampled_top);
+ } else if (prediction_angle < 180) {
+ const int dx =
+ GetDirectionalIntraPredictorDerivative(180 - prediction_angle);
+ const int dy =
+ GetDirectionalIntraPredictorDerivative(prediction_angle - 90);
+ dsp_.directional_intra_predictor_zone2(dest, stride, top_row, left_column,
+ width, height, dx, dy, upsampled_top,
+ upsampled_left);
+ } else {
+ assert(prediction_angle < 270);
+ const int dy =
+ GetDirectionalIntraPredictorDerivative(270 - prediction_angle);
+ dsp_.directional_intra_predictor_zone3(dest, stride, left_column, width,
+ height, dy, upsampled_left);
+ }
+}
+
+template <typename Pixel>
+void Tile::PalettePrediction(const Block& block, const Plane plane,
+ const int start_x, const int start_y, const int x,
+ const int y, const TransformSize tx_size) {
+ const int tx_width = kTransformWidth[tx_size];
+ const int tx_height = kTransformHeight[tx_size];
+ const uint16_t* const palette = block.bp->palette_mode_info.color[plane];
+ const PlaneType plane_type = GetPlaneType(plane);
+ const int x4 = MultiplyBy4(x);
+ const int y4 = MultiplyBy4(y);
+ Array2DView<Pixel> buffer(buffer_[plane].rows(),
+ buffer_[plane].columns() / sizeof(Pixel),
+ reinterpret_cast<Pixel*>(&buffer_[plane][0][0]));
+ for (int row = 0; row < tx_height; ++row) {
+ assert(block.bp->prediction_parameters
+ ->color_index_map[plane_type][y4 + row] != nullptr);
+ for (int column = 0; column < tx_width; ++column) {
+ buffer[start_y + row][start_x + column] =
+ palette[block.bp->prediction_parameters
+ ->color_index_map[plane_type][y4 + row][x4 + column]];
+ }
+ }
+}
+
+template void Tile::PalettePrediction<uint8_t>(
+ const Block& block, const Plane plane, const int start_x, const int start_y,
+ const int x, const int y, const TransformSize tx_size);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void Tile::PalettePrediction<uint16_t>(
+ const Block& block, const Plane plane, const int start_x, const int start_y,
+ const int x, const int y, const TransformSize tx_size);
+#endif
+
+template <typename Pixel>
+void Tile::ChromaFromLumaPrediction(const Block& block, const Plane plane,
+ const int start_x, const int start_y,
+ const TransformSize tx_size) {
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ const PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ Array2DView<Pixel> y_buffer(
+ buffer_[kPlaneY].rows(), buffer_[kPlaneY].columns() / sizeof(Pixel),
+ reinterpret_cast<Pixel*>(&buffer_[kPlaneY][0][0]));
+ if (!block.scratch_buffer->cfl_luma_buffer_valid) {
+ const int luma_x = start_x << subsampling_x;
+ const int luma_y = start_y << subsampling_y;
+ dsp_.cfl_subsamplers[tx_size][subsampling_x + subsampling_y](
+ block.scratch_buffer->cfl_luma_buffer,
+ prediction_parameters.max_luma_width - luma_x,
+ prediction_parameters.max_luma_height - luma_y,
+ reinterpret_cast<uint8_t*>(&y_buffer[luma_y][luma_x]),
+ buffer_[kPlaneY].columns());
+ block.scratch_buffer->cfl_luma_buffer_valid = true;
+ }
+ Array2DView<Pixel> buffer(buffer_[plane].rows(),
+ buffer_[plane].columns() / sizeof(Pixel),
+ reinterpret_cast<Pixel*>(&buffer_[plane][0][0]));
+ dsp_.cfl_intra_predictors[tx_size](
+ reinterpret_cast<uint8_t*>(&buffer[start_y][start_x]),
+ buffer_[plane].columns(), block.scratch_buffer->cfl_luma_buffer,
+ (plane == kPlaneU) ? prediction_parameters.cfl_alpha_u
+ : prediction_parameters.cfl_alpha_v);
+}
+
+template void Tile::ChromaFromLumaPrediction<uint8_t>(
+ const Block& block, const Plane plane, const int start_x, const int start_y,
+ const TransformSize tx_size);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void Tile::ChromaFromLumaPrediction<uint16_t>(
+ const Block& block, const Plane plane, const int start_x, const int start_y,
+ const TransformSize tx_size);
+#endif
+
+void Tile::InterIntraPrediction(
+ uint16_t* const prediction_0, const uint8_t* const prediction_mask,
+ const ptrdiff_t prediction_mask_stride,
+ const PredictionParameters& prediction_parameters,
+ const int prediction_width, const int prediction_height,
+ const int subsampling_x, const int subsampling_y, uint8_t* const dest,
+ const ptrdiff_t dest_stride) {
+ assert(prediction_mask != nullptr);
+ assert(prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeIntra ||
+ prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeWedge);
+ // The first buffer of InterIntra is from inter prediction.
+ // The second buffer is from intra prediction.
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (sequence_header_.color_config.bitdepth > 8) {
+ GetMaskBlendFunc(dsp_, /*is_inter_intra=*/true,
+ prediction_parameters.is_wedge_inter_intra, subsampling_x,
+ subsampling_y)(
+ prediction_0, reinterpret_cast<uint16_t*>(dest),
+ dest_stride / sizeof(uint16_t), prediction_mask, prediction_mask_stride,
+ prediction_width, prediction_height, dest, dest_stride);
+ return;
+ }
+#endif
+ const int function_index = prediction_parameters.is_wedge_inter_intra
+ ? subsampling_x + subsampling_y
+ : 0;
+ // |is_inter_intra| prediction values are stored in a Pixel buffer but it is
+ // currently declared as a uint16_t buffer.
+ // TODO(johannkoenig): convert the prediction buffer to a uint8_t buffer and
+ // remove the reinterpret_cast.
+ dsp_.inter_intra_mask_blend_8bpp[function_index](
+ reinterpret_cast<uint8_t*>(prediction_0), dest, dest_stride,
+ prediction_mask, prediction_mask_stride, prediction_width,
+ prediction_height);
+}
+
+void Tile::CompoundInterPrediction(
+ const Block& block, const uint8_t* const prediction_mask,
+ const ptrdiff_t prediction_mask_stride, const int prediction_width,
+ const int prediction_height, const int subsampling_x,
+ const int subsampling_y, const int candidate_row,
+ const int candidate_column, uint8_t* dest, const ptrdiff_t dest_stride) {
+ const PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+
+ void* prediction[2];
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ const int bitdepth = sequence_header_.color_config.bitdepth;
+ if (bitdepth > 8) {
+ prediction[0] = block.scratch_buffer->prediction_buffer[0];
+ prediction[1] = block.scratch_buffer->prediction_buffer[1];
+ } else {
+#endif
+ prediction[0] = block.scratch_buffer->compound_prediction_buffer_8bpp[0];
+ prediction[1] = block.scratch_buffer->compound_prediction_buffer_8bpp[1];
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ }
+#endif
+
+ switch (prediction_parameters.compound_prediction_type) {
+ case kCompoundPredictionTypeWedge:
+ case kCompoundPredictionTypeDiffWeighted:
+ GetMaskBlendFunc(dsp_, /*is_inter_intra=*/false,
+ prediction_parameters.is_wedge_inter_intra,
+ subsampling_x, subsampling_y)(
+ prediction[0], prediction[1],
+ /*prediction_stride=*/prediction_width, prediction_mask,
+ prediction_mask_stride, prediction_width, prediction_height, dest,
+ dest_stride);
+ break;
+ case kCompoundPredictionTypeDistance:
+ DistanceWeightedPrediction(prediction[0], prediction[1], prediction_width,
+ prediction_height, candidate_row,
+ candidate_column, dest, dest_stride);
+ break;
+ default:
+ assert(prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeAverage);
+ dsp_.average_blend(prediction[0], prediction[1], prediction_width,
+ prediction_height, dest, dest_stride);
+ break;
+ }
+}
+
+GlobalMotion* Tile::GetWarpParams(
+ const Block& block, const Plane plane, const int prediction_width,
+ const int prediction_height,
+ const PredictionParameters& prediction_parameters,
+ const ReferenceFrameType reference_type, bool* const is_local_valid,
+ GlobalMotion* const global_motion_params,
+ GlobalMotion* const local_warp_params) const {
+ if (prediction_width < 8 || prediction_height < 8 ||
+ frame_header_.force_integer_mv == 1) {
+ return nullptr;
+ }
+ if (plane == kPlaneY) {
+ *is_local_valid =
+ prediction_parameters.motion_mode == kMotionModeLocalWarp &&
+ WarpEstimation(
+ prediction_parameters.num_warp_samples, DivideBy4(prediction_width),
+ DivideBy4(prediction_height), block.row4x4, block.column4x4,
+ block.bp->mv.mv[0], prediction_parameters.warp_estimate_candidates,
+ local_warp_params) &&
+ SetupShear(local_warp_params);
+ }
+ if (prediction_parameters.motion_mode == kMotionModeLocalWarp &&
+ *is_local_valid) {
+ return local_warp_params;
+ }
+ if (!IsScaled(reference_type)) {
+ GlobalMotionTransformationType global_motion_type =
+ (reference_type != kReferenceFrameIntra)
+ ? global_motion_params->type
+ : kNumGlobalMotionTransformationTypes;
+ const bool is_global_valid =
+ IsGlobalMvBlock(block.bp->is_global_mv_block, global_motion_type) &&
+ SetupShear(global_motion_params);
+ // Valid global motion type implies reference type can't be intra.
+ assert(!is_global_valid || reference_type != kReferenceFrameIntra);
+ if (is_global_valid) return global_motion_params;
+ }
+ return nullptr;
+}
+
+bool Tile::InterPrediction(const Block& block, const Plane plane, const int x,
+ const int y, const int prediction_width,
+ const int prediction_height, int candidate_row,
+ int candidate_column, bool* const is_local_valid,
+ GlobalMotion* const local_warp_params) {
+ const int bitdepth = sequence_header_.color_config.bitdepth;
+ const BlockParameters& bp = *block.bp;
+ const BlockParameters& bp_reference =
+ *block_parameters_holder_.Find(candidate_row, candidate_column);
+ const bool is_compound =
+ bp_reference.reference_frame[1] > kReferenceFrameIntra;
+ assert(bp.is_inter);
+ const bool is_inter_intra = bp.reference_frame[1] == kReferenceFrameIntra;
+
+ const PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ uint8_t* const dest = GetStartPoint(buffer_, plane, x, y, bitdepth);
+ const ptrdiff_t dest_stride = buffer_[plane].columns(); // In bytes.
+ for (int index = 0; index < 1 + static_cast<int>(is_compound); ++index) {
+ const ReferenceFrameType reference_type =
+ bp_reference.reference_frame[index];
+ GlobalMotion global_motion_params =
+ frame_header_.global_motion[reference_type];
+ GlobalMotion* warp_params =
+ GetWarpParams(block, plane, prediction_width, prediction_height,
+ prediction_parameters, reference_type, is_local_valid,
+ &global_motion_params, local_warp_params);
+ if (warp_params != nullptr) {
+ if (!BlockWarpProcess(block, plane, index, x, y, prediction_width,
+ prediction_height, warp_params, is_compound,
+ is_inter_intra, dest, dest_stride)) {
+ return false;
+ }
+ } else {
+ const int reference_index =
+ prediction_parameters.use_intra_block_copy
+ ? -1
+ : frame_header_.reference_frame_index[reference_type -
+ kReferenceFrameLast];
+ if (!BlockInterPrediction(
+ block, plane, reference_index, bp_reference.mv.mv[index], x, y,
+ prediction_width, prediction_height, candidate_row,
+ candidate_column, block.scratch_buffer->prediction_buffer[index],
+ is_compound, is_inter_intra, dest, dest_stride)) {
+ return false;
+ }
+ }
+ }
+
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ ptrdiff_t prediction_mask_stride = 0;
+ const uint8_t* prediction_mask = nullptr;
+ if (prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeWedge) {
+ const Array2D<uint8_t>& wedge_mask =
+ wedge_masks_[GetWedgeBlockSizeIndex(block.size)]
+ [prediction_parameters.wedge_sign]
+ [prediction_parameters.wedge_index];
+ prediction_mask = wedge_mask[0];
+ prediction_mask_stride = wedge_mask.columns();
+ } else if (prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeIntra) {
+ // 7.11.3.13. The inter intra masks are precomputed and stored as a set of
+ // lookup tables.
+ assert(prediction_parameters.inter_intra_mode < kNumInterIntraModes);
+ prediction_mask =
+ kInterIntraMasks[prediction_parameters.inter_intra_mode]
+ [GetInterIntraMaskLookupIndex(prediction_width)]
+ [GetInterIntraMaskLookupIndex(prediction_height)];
+ prediction_mask_stride = prediction_width;
+ } else if (prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeDiffWeighted) {
+ if (plane == kPlaneY) {
+ assert(prediction_width >= 8);
+ assert(prediction_height >= 8);
+ dsp_.weight_mask[FloorLog2(prediction_width) - 3]
+ [FloorLog2(prediction_height) - 3]
+ [static_cast<int>(prediction_parameters.mask_is_inverse)](
+ block.scratch_buffer->prediction_buffer[0],
+ block.scratch_buffer->prediction_buffer[1],
+ block.scratch_buffer->weight_mask,
+ kMaxSuperBlockSizeInPixels);
+ }
+ prediction_mask = block.scratch_buffer->weight_mask;
+ prediction_mask_stride = kMaxSuperBlockSizeInPixels;
+ }
+
+ if (is_compound) {
+ CompoundInterPrediction(block, prediction_mask, prediction_mask_stride,
+ prediction_width, prediction_height, subsampling_x,
+ subsampling_y, candidate_row, candidate_column,
+ dest, dest_stride);
+ } else if (prediction_parameters.motion_mode == kMotionModeObmc) {
+ // Obmc mode is allowed only for single reference (!is_compound).
+ return ObmcPrediction(block, plane, prediction_width, prediction_height);
+ } else if (is_inter_intra) {
+ // InterIntra and obmc must be mutually exclusive.
+ InterIntraPrediction(
+ block.scratch_buffer->prediction_buffer[0], prediction_mask,
+ prediction_mask_stride, prediction_parameters, prediction_width,
+ prediction_height, subsampling_x, subsampling_y, dest, dest_stride);
+ }
+ return true;
+}
+
+bool Tile::ObmcBlockPrediction(const Block& block, const MotionVector& mv,
+ const Plane plane,
+ const int reference_frame_index, const int width,
+ const int height, const int x, const int y,
+ const int candidate_row,
+ const int candidate_column,
+ const ObmcDirection blending_direction) {
+ const int bitdepth = sequence_header_.color_config.bitdepth;
+ // Obmc's prediction needs to be clipped before blending with above/left
+ // prediction blocks.
+ // Obmc prediction is used only when is_compound is false. So it is safe to
+ // use prediction_buffer[1] as a temporary buffer for the Obmc prediction.
+ static_assert(sizeof(block.scratch_buffer->prediction_buffer[1]) >=
+ 64 * 64 * sizeof(uint16_t),
+ "");
+ auto* const obmc_buffer =
+ reinterpret_cast<uint8_t*>(block.scratch_buffer->prediction_buffer[1]);
+ const ptrdiff_t obmc_buffer_stride =
+ (bitdepth == 8) ? width : width * sizeof(uint16_t);
+ if (!BlockInterPrediction(block, plane, reference_frame_index, mv, x, y,
+ width, height, candidate_row, candidate_column,
+ nullptr, false, false, obmc_buffer,
+ obmc_buffer_stride)) {
+ return false;
+ }
+
+ uint8_t* const prediction = GetStartPoint(buffer_, plane, x, y, bitdepth);
+ const ptrdiff_t prediction_stride = buffer_[plane].columns();
+ dsp_.obmc_blend[blending_direction](prediction, prediction_stride, width,
+ height, obmc_buffer, obmc_buffer_stride);
+ return true;
+}
+
+bool Tile::ObmcPrediction(const Block& block, const Plane plane,
+ const int width, const int height) {
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ if (block.top_available[kPlaneY] &&
+ !IsBlockSmallerThan8x8(block.residual_size[plane])) {
+ const int num_limit = std::min(uint8_t{4}, k4x4WidthLog2[block.size]);
+ const int column4x4_max =
+ std::min(block.column4x4 + block.width4x4, frame_header_.columns4x4);
+ const int candidate_row = block.row4x4 - 1;
+ const int block_start_y = MultiplyBy4(block.row4x4) >> subsampling_y;
+ int column4x4 = block.column4x4;
+ const int prediction_height = std::min(height >> 1, 32 >> subsampling_y);
+ for (int i = 0, step; i < num_limit && column4x4 < column4x4_max;
+ column4x4 += step) {
+ const int candidate_column = column4x4 | 1;
+ const BlockParameters& bp_top =
+ *block_parameters_holder_.Find(candidate_row, candidate_column);
+ const int candidate_block_size = bp_top.size;
+ step = Clip3(kNum4x4BlocksWide[candidate_block_size], 2, 16);
+ if (bp_top.reference_frame[0] > kReferenceFrameIntra) {
+ i++;
+ const int candidate_reference_frame_index =
+ frame_header_.reference_frame_index[bp_top.reference_frame[0] -
+ kReferenceFrameLast];
+ const int prediction_width =
+ std::min(width, MultiplyBy4(step) >> subsampling_x);
+ if (!ObmcBlockPrediction(
+ block, bp_top.mv.mv[0], plane, candidate_reference_frame_index,
+ prediction_width, prediction_height,
+ MultiplyBy4(column4x4) >> subsampling_x, block_start_y,
+ candidate_row, candidate_column, kObmcDirectionVertical)) {
+ return false;
+ }
+ }
+ }
+ }
+
+ if (block.left_available[kPlaneY]) {
+ const int num_limit = std::min(uint8_t{4}, k4x4HeightLog2[block.size]);
+ const int row4x4_max =
+ std::min(block.row4x4 + block.height4x4, frame_header_.rows4x4);
+ const int candidate_column = block.column4x4 - 1;
+ int row4x4 = block.row4x4;
+ const int block_start_x = MultiplyBy4(block.column4x4) >> subsampling_x;
+ const int prediction_width = std::min(width >> 1, 32 >> subsampling_x);
+ for (int i = 0, step; i < num_limit && row4x4 < row4x4_max;
+ row4x4 += step) {
+ const int candidate_row = row4x4 | 1;
+ const BlockParameters& bp_left =
+ *block_parameters_holder_.Find(candidate_row, candidate_column);
+ const int candidate_block_size = bp_left.size;
+ step = Clip3(kNum4x4BlocksHigh[candidate_block_size], 2, 16);
+ if (bp_left.reference_frame[0] > kReferenceFrameIntra) {
+ i++;
+ const int candidate_reference_frame_index =
+ frame_header_.reference_frame_index[bp_left.reference_frame[0] -
+ kReferenceFrameLast];
+ const int prediction_height =
+ std::min(height, MultiplyBy4(step) >> subsampling_y);
+ if (!ObmcBlockPrediction(
+ block, bp_left.mv.mv[0], plane, candidate_reference_frame_index,
+ prediction_width, prediction_height, block_start_x,
+ MultiplyBy4(row4x4) >> subsampling_y, candidate_row,
+ candidate_column, kObmcDirectionHorizontal)) {
+ return false;
+ }
+ }
+ }
+ }
+ return true;
+}
+
+void Tile::DistanceWeightedPrediction(void* prediction_0, void* prediction_1,
+ const int width, const int height,
+ const int candidate_row,
+ const int candidate_column, uint8_t* dest,
+ ptrdiff_t dest_stride) {
+ int distance[2];
+ int weight[2];
+ for (int reference = 0; reference < 2; ++reference) {
+ const BlockParameters& bp =
+ *block_parameters_holder_.Find(candidate_row, candidate_column);
+ // Note: distance[0] and distance[1] correspond to relative distance
+ // between current frame and reference frame [1] and [0], respectively.
+ distance[1 - reference] = std::min(
+ std::abs(static_cast<int>(
+ current_frame_.reference_info()
+ ->relative_distance_from[bp.reference_frame[reference]])),
+ static_cast<int>(kMaxFrameDistance));
+ }
+ GetDistanceWeights(distance, weight);
+
+ dsp_.distance_weighted_blend(prediction_0, prediction_1, weight[0], weight[1],
+ width, height, dest, dest_stride);
+}
+
+void Tile::ScaleMotionVector(const MotionVector& mv, const Plane plane,
+ const int reference_frame_index, const int x,
+ const int y, int* const start_x,
+ int* const start_y, int* const step_x,
+ int* const step_y) {
+ const int reference_upscaled_width =
+ (reference_frame_index == -1)
+ ? frame_header_.upscaled_width
+ : reference_frames_[reference_frame_index]->upscaled_width();
+ const int reference_height =
+ (reference_frame_index == -1)
+ ? frame_header_.height
+ : reference_frames_[reference_frame_index]->frame_height();
+ assert(2 * frame_header_.width >= reference_upscaled_width &&
+ 2 * frame_header_.height >= reference_height &&
+ frame_header_.width <= 16 * reference_upscaled_width &&
+ frame_header_.height <= 16 * reference_height);
+ const bool is_scaled_x = reference_upscaled_width != frame_header_.width;
+ const bool is_scaled_y = reference_height != frame_header_.height;
+ const int half_sample = 1 << (kSubPixelBits - 1);
+ int orig_x = (x << kSubPixelBits) + ((2 * mv.mv[1]) >> subsampling_x_[plane]);
+ int orig_y = (y << kSubPixelBits) + ((2 * mv.mv[0]) >> subsampling_y_[plane]);
+ const int rounding_offset =
+ DivideBy2(1 << (kScaleSubPixelBits - kSubPixelBits));
+ if (is_scaled_x) {
+ const int scale_x = ((reference_upscaled_width << kReferenceScaleShift) +
+ DivideBy2(frame_header_.width)) /
+ frame_header_.width;
+ *step_x = RightShiftWithRoundingSigned(
+ scale_x, kReferenceScaleShift - kScaleSubPixelBits);
+ orig_x += half_sample;
+ // When the frame size is 4k or above, orig_x can exceed 16 bits and scale_x
+ // can be up to 15 bits, so we use int64_t to hold base_x.
+ const int64_t base_x = static_cast<int64_t>(orig_x) * scale_x -
+ (half_sample << kReferenceScaleShift);
+ *start_x =
+ RightShiftWithRoundingSigned(
+ base_x, kReferenceScaleShift + kSubPixelBits - kScaleSubPixelBits) +
+ rounding_offset;
+ } else {
+ *step_x = 1 << kScaleSubPixelBits;
+ *start_x = LeftShift(orig_x, 6) + rounding_offset;
+ }
+ if (is_scaled_y) {
+ const int scale_y = ((reference_height << kReferenceScaleShift) +
+ DivideBy2(frame_header_.height)) /
+ frame_header_.height;
+ *step_y = RightShiftWithRoundingSigned(
+ scale_y, kReferenceScaleShift - kScaleSubPixelBits);
+ orig_y += half_sample;
+ const int64_t base_y = static_cast<int64_t>(orig_y) * scale_y -
+ (half_sample << kReferenceScaleShift);
+ *start_y =
+ RightShiftWithRoundingSigned(
+ base_y, kReferenceScaleShift + kSubPixelBits - kScaleSubPixelBits) +
+ rounding_offset;
+ } else {
+ *step_y = 1 << kScaleSubPixelBits;
+ *start_y = LeftShift(orig_y, 6) + rounding_offset;
+ }
+}
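// A hand-worked example of the unscaled path above, assuming kSubPixelBits ==
// 4 and kScaleSubPixelBits == 10 (values consistent with LeftShift(orig_x, 6)
// and the rounding_offset computation): with x == 16, mv.mv[1] == 8 (one full
// pel in 1/8-pel units) and no subsampling, orig_x == (16 << 4) + 16 == 272,
// start_x == (272 << 6) + 32 == 17440, and
// GetPixelPositionFromHighScale(start_x, 0, 0) == 17440 >> 10 == 17, i.e. the
// prediction block starts one full pel to the right of x, as expected.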
+
+// static.
+bool Tile::GetReferenceBlockPosition(
+ const int reference_frame_index, const bool is_scaled, const int width,
+ const int height, const int ref_start_x, const int ref_last_x,
+ const int ref_start_y, const int ref_last_y, const int start_x,
+ const int start_y, const int step_x, const int step_y,
+ const int left_border, const int right_border, const int top_border,
+ const int bottom_border, int* ref_block_start_x, int* ref_block_start_y,
+ int* ref_block_end_x) {
+ *ref_block_start_x = GetPixelPositionFromHighScale(start_x, 0, 0);
+ *ref_block_start_y = GetPixelPositionFromHighScale(start_y, 0, 0);
+ if (reference_frame_index == -1) {
+ return false;
+ }
+ *ref_block_start_x -= kConvolveBorderLeftTop;
+ *ref_block_start_y -= kConvolveBorderLeftTop;
+ *ref_block_end_x = GetPixelPositionFromHighScale(start_x, step_x, width - 1) +
+ kConvolveBorderRight;
+ int ref_block_end_y =
+ GetPixelPositionFromHighScale(start_y, step_y, height - 1) +
+ kConvolveBorderBottom;
+ if (is_scaled) {
+ const int block_height =
+ (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+ kScaleSubPixelBits) +
+ kSubPixelTaps;
+ ref_block_end_y = *ref_block_start_y + block_height - 1;
+ }
+ // Determines if we need to extend beyond the left/right/top/bottom border.
+ return *ref_block_start_x < (ref_start_x - left_border) ||
+ *ref_block_end_x > (ref_last_x + right_border) ||
+ *ref_block_start_y < (ref_start_y - top_border) ||
+ ref_block_end_y > (ref_last_y + bottom_border);
+}
+
+// Builds a block as the input for convolve by copying the content of the
+// reference frame (either a decoded reference frame or the current frame).
+// |block_extended_width| is the combined width of the block and its borders.
+template <typename Pixel>
+void Tile::BuildConvolveBlock(
+ const Plane plane, const int reference_frame_index, const bool is_scaled,
+ const int height, const int ref_start_x, const int ref_last_x,
+ const int ref_start_y, const int ref_last_y, const int step_y,
+ const int ref_block_start_x, const int ref_block_end_x,
+ const int ref_block_start_y, uint8_t* block_buffer,
+ ptrdiff_t convolve_buffer_stride, ptrdiff_t block_extended_width) {
+ const YuvBuffer* const reference_buffer =
+ (reference_frame_index == -1)
+ ? current_frame_.buffer()
+ : reference_frames_[reference_frame_index]->buffer();
+ Array2DView<const Pixel> reference_block(
+ reference_buffer->height(plane),
+ reference_buffer->stride(plane) / sizeof(Pixel),
+ reinterpret_cast<const Pixel*>(reference_buffer->data(plane)));
+ auto* const block_head = reinterpret_cast<Pixel*>(block_buffer);
+ convolve_buffer_stride /= sizeof(Pixel);
+ int block_height = height + kConvolveBorderLeftTop + kConvolveBorderBottom;
+ if (is_scaled) {
+ block_height = (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+ kScaleSubPixelBits) +
+ kSubPixelTaps;
+ }
+ const int copy_start_x = Clip3(ref_block_start_x, ref_start_x, ref_last_x);
+ const int copy_start_y = Clip3(ref_block_start_y, ref_start_y, ref_last_y);
+ const int copy_end_x = Clip3(ref_block_end_x, copy_start_x, ref_last_x);
+ const int block_width = copy_end_x - copy_start_x + 1;
+ const bool extend_left = ref_block_start_x < ref_start_x;
+ const bool extend_right = ref_block_end_x > ref_last_x;
+ const bool out_of_left = copy_start_x > ref_block_end_x;
+ const bool out_of_right = copy_end_x < ref_block_start_x;
+ if (out_of_left || out_of_right) {
+ const int ref_x = out_of_left ? copy_start_x : copy_end_x;
+ Pixel* buf_ptr = block_head;
+ for (int y = 0, ref_y = copy_start_y; y < block_height; ++y) {
+ Memset(buf_ptr, reference_block[ref_y][ref_x], block_extended_width);
+ if (ref_block_start_y + y >= ref_start_y &&
+ ref_block_start_y + y < ref_last_y) {
+ ++ref_y;
+ }
+ buf_ptr += convolve_buffer_stride;
+ }
+ } else {
+ Pixel* buf_ptr = block_head;
+ const int left_width = copy_start_x - ref_block_start_x;
+ for (int y = 0, ref_y = copy_start_y; y < block_height; ++y) {
+ if (extend_left) {
+ Memset(buf_ptr, reference_block[ref_y][copy_start_x], left_width);
+ }
+ memcpy(buf_ptr + left_width, &reference_block[ref_y][copy_start_x],
+ block_width * sizeof(Pixel));
+ if (extend_right) {
+ Memset(buf_ptr + left_width + block_width,
+ reference_block[ref_y][copy_end_x],
+ block_extended_width - left_width - block_width);
+ }
+ if (ref_block_start_y + y >= ref_start_y &&
+ ref_block_start_y + y < ref_last_y) {
+ ++ref_y;
+ }
+ buf_ptr += convolve_buffer_stride;
+ }
+ }
+}
+
+bool Tile::BlockInterPrediction(
+ const Block& block, const Plane plane, const int reference_frame_index,
+ const MotionVector& mv, const int x, const int y, const int width,
+ const int height, const int candidate_row, const int candidate_column,
+ uint16_t* const prediction, const bool is_compound,
+ const bool is_inter_intra, uint8_t* const dest,
+ const ptrdiff_t dest_stride) {
+ const BlockParameters& bp =
+ *block_parameters_holder_.Find(candidate_row, candidate_column);
+ int start_x;
+ int start_y;
+ int step_x;
+ int step_y;
+ ScaleMotionVector(mv, plane, reference_frame_index, x, y, &start_x, &start_y,
+ &step_x, &step_y);
+ const int horizontal_filter_index = bp.interpolation_filter[1];
+ const int vertical_filter_index = bp.interpolation_filter[0];
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ // A reference_frame_index equal to -1 indicates using the current frame as
+ // the reference.
+ const YuvBuffer* const reference_buffer =
+ (reference_frame_index == -1)
+ ? current_frame_.buffer()
+ : reference_frames_[reference_frame_index]->buffer();
+ const int reference_upscaled_width =
+ (reference_frame_index == -1)
+ ? MultiplyBy4(frame_header_.columns4x4)
+ : reference_frames_[reference_frame_index]->upscaled_width();
+ const int reference_height =
+ (reference_frame_index == -1)
+ ? MultiplyBy4(frame_header_.rows4x4)
+ : reference_frames_[reference_frame_index]->frame_height();
+ const int ref_start_x = 0;
+ const int ref_last_x =
+ SubsampledValue(reference_upscaled_width, subsampling_x) - 1;
+ const int ref_start_y = 0;
+ const int ref_last_y = SubsampledValue(reference_height, subsampling_y) - 1;
+
+ const bool is_scaled = (reference_frame_index != -1) &&
+ (frame_header_.width != reference_upscaled_width ||
+ frame_header_.height != reference_height);
+ const int bitdepth = sequence_header_.color_config.bitdepth;
+ const int pixel_size = (bitdepth == 8) ? sizeof(uint8_t) : sizeof(uint16_t);
+ int ref_block_start_x;
+ int ref_block_start_y;
+ int ref_block_end_x;
+ const bool extend_block = GetReferenceBlockPosition(
+ reference_frame_index, is_scaled, width, height, ref_start_x, ref_last_x,
+ ref_start_y, ref_last_y, start_x, start_y, step_x, step_y,
+ reference_buffer->left_border(plane),
+ reference_buffer->right_border(plane),
+ reference_buffer->top_border(plane),
+ reference_buffer->bottom_border(plane), &ref_block_start_x,
+ &ref_block_start_y, &ref_block_end_x);
+
+ // In frame parallel mode, ensure that the reference block has been decoded
+ // and is available for referencing.
+ if (reference_frame_index != -1 && frame_parallel_) {
+ int reference_y_max;
+ if (is_scaled) {
+ // TODO(vigneshv): For now, we wait for the entire reference frame to be
+ // decoded if we are using scaled references. This will eventually be
+ // fixed.
+ reference_y_max = reference_height;
+ } else {
+ reference_y_max =
+ std::min(ref_block_start_y + height + kSubPixelTaps, ref_last_y);
+ // For U and V planes with subsampling, we need to multiply
+ // reference_y_max by 2 since we only track the progress of Y planes.
+ reference_y_max = LeftShift(reference_y_max, subsampling_y);
+ }
+ if (reference_frame_progress_cache_[reference_frame_index] <
+ reference_y_max &&
+ !reference_frames_[reference_frame_index]->WaitUntil(
+ reference_y_max,
+ &reference_frame_progress_cache_[reference_frame_index])) {
+ return false;
+ }
+ }
+
+ const uint8_t* block_start = nullptr;
+ ptrdiff_t convolve_buffer_stride;
+ if (!extend_block) {
+ const YuvBuffer* const reference_buffer =
+ (reference_frame_index == -1)
+ ? current_frame_.buffer()
+ : reference_frames_[reference_frame_index]->buffer();
+ convolve_buffer_stride = reference_buffer->stride(plane);
+ if (reference_frame_index == -1 || is_scaled) {
+ block_start = reference_buffer->data(plane) +
+ ref_block_start_y * reference_buffer->stride(plane) +
+ ref_block_start_x * pixel_size;
+ } else {
+ block_start = reference_buffer->data(plane) +
+ (ref_block_start_y + kConvolveBorderLeftTop) *
+ reference_buffer->stride(plane) +
+ (ref_block_start_x + kConvolveBorderLeftTop) * pixel_size;
+ }
+ } else {
+    // Because of scaling, the block width can be at most twice the current
+    // block's width.
+ auto block_extended_width = Align<ptrdiff_t>(
+ (2 * width + kConvolveBorderLeftTop + kConvolveBorderRight) *
+ pixel_size,
+ kMaxAlignment);
+ convolve_buffer_stride = block.scratch_buffer->convolve_block_buffer_stride;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) {
+ BuildConvolveBlock<uint16_t>(
+ plane, reference_frame_index, is_scaled, height, ref_start_x,
+ ref_last_x, ref_start_y, ref_last_y, step_y, ref_block_start_x,
+ ref_block_end_x, ref_block_start_y,
+ block.scratch_buffer->convolve_block_buffer.get(),
+ convolve_buffer_stride, block_extended_width);
+ } else {
+#endif
+ BuildConvolveBlock<uint8_t>(
+ plane, reference_frame_index, is_scaled, height, ref_start_x,
+ ref_last_x, ref_start_y, ref_last_y, step_y, ref_block_start_x,
+ ref_block_end_x, ref_block_start_y,
+ block.scratch_buffer->convolve_block_buffer.get(),
+ convolve_buffer_stride, block_extended_width);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ }
+#endif
+ block_start = block.scratch_buffer->convolve_block_buffer.get() +
+ (is_scaled ? 0
+ : kConvolveBorderLeftTop * convolve_buffer_stride +
+ kConvolveBorderLeftTop * pixel_size);
+ }
+
+ void* const output =
+ (is_compound || is_inter_intra) ? prediction : static_cast<void*>(dest);
+ ptrdiff_t output_stride = (is_compound || is_inter_intra)
+ ? /*prediction_stride=*/width
+ : dest_stride;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  // |is_inter_intra| calculations are written to the |prediction| buffer.
+  // Unlike the |is_compound| calculations, the output is Pixel and not
+  // uint16_t. convolve_func() expects |output_stride| to be in bytes and not
+  // Pixels, but |prediction_stride| is in units of uint16_t. Adjust
+  // |output_stride| to account for this.
+ if (is_inter_intra && sequence_header_.color_config.bitdepth > 8) {
+ output_stride *= 2;
+ }
+#endif
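+  // For example, with |is_inter_intra|, 10-bit content and width == 8, the
+  // prediction rows are 8 uint16_t values (16 bytes) apart, so |output_stride|
+  // becomes 16 after the adjustment above.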
+ assert(output != nullptr);
+ if (is_scaled) {
+ dsp::ConvolveScaleFunc convolve_func = dsp_.convolve_scale[is_compound];
+ assert(convolve_func != nullptr);
+
+ convolve_func(block_start, convolve_buffer_stride, horizontal_filter_index,
+ vertical_filter_index, start_x, start_y, step_x, step_y,
+ width, height, output, output_stride);
+ } else {
+ const int horizontal_filter_id = (start_x >> 6) & kSubPixelMask;
+ const int vertical_filter_id = (start_y >> 6) & kSubPixelMask;
+
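+    // The convolve function is selected by (1) whether the current frame is
+    // used as the reference (intra block copy), (2) whether the prediction is
+    // compound, and (3)/(4) whether there is a vertical/horizontal sub-pixel
+    // offset.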
+ dsp::ConvolveFunc convolve_func =
+ dsp_.convolve[reference_frame_index == -1][is_compound]
+ [vertical_filter_id != 0][horizontal_filter_id != 0];
+ assert(convolve_func != nullptr);
+
+ convolve_func(block_start, convolve_buffer_stride, horizontal_filter_index,
+ vertical_filter_index, horizontal_filter_id,
+ vertical_filter_id, width, height, output, output_stride);
+ }
+ return true;
+}
+
+bool Tile::BlockWarpProcess(const Block& block, const Plane plane,
+ const int index, const int block_start_x,
+ const int block_start_y, const int width,
+ const int height, GlobalMotion* const warp_params,
+ const bool is_compound, const bool is_inter_intra,
+ uint8_t* const dest, const ptrdiff_t dest_stride) {
+ assert(width >= 8 && height >= 8);
+ const BlockParameters& bp = *block.bp;
+ const int reference_frame_index =
+ frame_header_.reference_frame_index[bp.reference_frame[index] -
+ kReferenceFrameLast];
+ const uint8_t* const source =
+ reference_frames_[reference_frame_index]->buffer()->data(plane);
+ ptrdiff_t source_stride =
+ reference_frames_[reference_frame_index]->buffer()->stride(plane);
+ const int source_width =
+ reference_frames_[reference_frame_index]->buffer()->width(plane);
+ const int source_height =
+ reference_frames_[reference_frame_index]->buffer()->height(plane);
+ uint16_t* const prediction = block.scratch_buffer->prediction_buffer[index];
+
+  // In frame parallel mode, ensure that the reference block has been decoded
+  // and is available for referencing.
+ if (frame_parallel_) {
+ int reference_y_max = -1;
+ // Find out the maximum y-coordinate for warping.
+ for (int start_y = block_start_y; start_y < block_start_y + height;
+ start_y += 8) {
+ for (int start_x = block_start_x; start_x < block_start_x + width;
+ start_x += 8) {
+ const int src_x = (start_x + 4) << subsampling_x_[plane];
+ const int src_y = (start_y + 4) << subsampling_y_[plane];
+ const int dst_y = src_x * warp_params->params[4] +
+ src_y * warp_params->params[5] +
+ warp_params->params[1];
+ const int y4 = dst_y >> subsampling_y_[plane];
+ const int iy4 = y4 >> kWarpedModelPrecisionBits;
+ reference_y_max = std::max(iy4 + 8, reference_y_max);
+ }
+ }
+    // For subsampled U and V planes, scale reference_y_max back to luma
+    // coordinates since only the progress of the Y plane is tracked.
+ reference_y_max = LeftShift(reference_y_max, subsampling_y_[plane]);
+ if (reference_frame_progress_cache_[reference_frame_index] <
+ reference_y_max &&
+ !reference_frames_[reference_frame_index]->WaitUntil(
+ reference_y_max,
+ &reference_frame_progress_cache_[reference_frame_index])) {
+ return false;
+ }
+ }
+ if (is_compound) {
+ dsp_.warp_compound(source, source_stride, source_width, source_height,
+ warp_params->params, subsampling_x_[plane],
+ subsampling_y_[plane], block_start_x, block_start_y,
+ width, height, warp_params->alpha, warp_params->beta,
+ warp_params->gamma, warp_params->delta, prediction,
+ /*prediction_stride=*/width);
+ } else {
+ void* const output = is_inter_intra ? static_cast<void*>(prediction) : dest;
+ ptrdiff_t output_stride =
+ is_inter_intra ? /*prediction_stride=*/width : dest_stride;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ // |is_inter_intra| calculations are written to the |prediction| buffer.
+ // Unlike the |is_compound| calculations the output is Pixel and not
+ // uint16_t. warp_clip() expects |output_stride| to be in bytes and not
+ // Pixels. |prediction_stride| is in units of uint16_t. Adjust
+ // |output_stride| to account for this.
+ if (is_inter_intra && sequence_header_.color_config.bitdepth > 8) {
+ output_stride *= 2;
+ }
+#endif
+ dsp_.warp(source, source_stride, source_width, source_height,
+ warp_params->params, subsampling_x_[plane], subsampling_y_[plane],
+ block_start_x, block_start_y, width, height, warp_params->alpha,
+ warp_params->beta, warp_params->gamma, warp_params->delta, output,
+ output_stride);
+ }
+ return true;
+}
+
+} // namespace libgav1
diff --git a/src/tile/tile.cc b/src/tile/tile.cc
new file mode 100644
index 0000000..ee48f17
--- /dev/null
+++ b/src/tile/tile.cc
@@ -0,0 +1,2573 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/tile.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <climits>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <numeric>
+#include <type_traits>
+#include <utility>
+
+#include "src/frame_scratch_buffer.h"
+#include "src/motion_vector.h"
+#include "src/reconstruction.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/stack.h"
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/scan_tables.inc"
+
+// Range above kNumQuantizerBaseLevels beyond which the exponential Golomb
+// coding process is activated.
+constexpr int kQuantizerCoefficientBaseRange = 12;
+constexpr int kNumQuantizerBaseLevels = 2;
+constexpr int kCoeffBaseRangeMaxIterations =
+ kQuantizerCoefficientBaseRange / (kCoeffBaseRangeSymbolCount - 1);
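+// Assuming kCoeffBaseRangeSymbolCount is 4 (it is defined elsewhere), this
+// works out to 12 / 3 = 4 coeff_br reads per coefficient at most.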
+constexpr int kEntropyContextLeft = 0;
+constexpr int kEntropyContextTop = 1;
+
+constexpr uint8_t kAllZeroContextsByTopLeft[5][5] = {{1, 2, 2, 2, 3},
+ {2, 4, 4, 4, 5},
+ {2, 4, 4, 4, 5},
+ {2, 4, 4, 4, 5},
+ {3, 5, 5, 5, 6}};
+
+// The space complexity of DFS is O(branching_factor * max_depth). For the
+// parameter tree, branching_factor = 4 (there could be up to 4 children for
+// every node) and max_depth (excluding the root) = 5 (to go from a 128x128
+// block all the way to a 4x4 block). The worst-case stack size is 16, by
+// counting the number of 'o' nodes in the diagram:
+//
+// | 128x128 The highest level (corresponding to the
+// | root of the tree) has no node in the stack.
+// |-----------------+
+// | | | |
+// | o o o 64x64
+// |
+// |-----------------+
+// | | | |
+// | o o o 32x32 Higher levels have three nodes in the stack,
+// | because we pop one node off the stack before
+// |-----------------+ pushing its four children onto the stack.
+// | | | |
+// | o o o 16x16
+// |
+// |-----------------+
+// | | | |
+// | o o o 8x8
+// |
+// |-----------------+
+// | | | |
+// o o o o 4x4 Only the lowest level has four nodes in the
+// stack.
+constexpr int kDfsStackSize = 16;
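+// In the diagram above, the 64x64, 32x32, 16x16 and 8x8 levels contribute 3
+// nodes each and the 4x4 level contributes 4 nodes: 3 * 4 + 4 = 16.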
+
+// Mask indicating whether the transform sets contain a particular transform
+// type. If |tx_type| is present in |tx_set|, then the |tx_type|th LSB is set.
+constexpr BitMaskSet kTransformTypeInSetMask[kNumTransformSets] = {
+ BitMaskSet(0x1), BitMaskSet(0xE0F), BitMaskSet(0x20F),
+ BitMaskSet(0xFFFF), BitMaskSet(0xFFF), BitMaskSet(0x201)};
+
+constexpr PredictionMode
+ kFilterIntraModeToIntraPredictor[kNumFilterIntraPredictors] = {
+ kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal,
+ kPredictionModeD157, kPredictionModeDc};
+
+// Mask used to determine the index for mode_deltas lookup.
+constexpr BitMaskSet kPredictionModeDeltasMask(
+ kPredictionModeNearestMv, kPredictionModeNearMv, kPredictionModeNewMv,
+ kPredictionModeNearestNearestMv, kPredictionModeNearNearMv,
+ kPredictionModeNearestNewMv, kPredictionModeNewNearestMv,
+ kPredictionModeNearNewMv, kPredictionModeNewNearMv,
+ kPredictionModeNewNewMv);
+
+// This is computed as:
+// min(transform_width_log2, 5) + min(transform_height_log2, 5) - 4.
+constexpr uint8_t kEobMultiSizeLookup[kNumTransformSizes] = {
+ 0, 1, 2, 1, 2, 3, 4, 2, 3, 4, 5, 5, 4, 5, 6, 6, 5, 6, 6};
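+// For example, assuming the table follows the usual transform size order with
+// kTransformSize4x4 first and kTransformSize64x64 last: 4x4 gives
+// min(2, 5) + min(2, 5) - 4 = 0 (the first entry) and 64x64 gives
+// min(6, 5) + min(6, 5) - 4 = 6 (the last entry).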
+
+/* clang-format off */
+constexpr uint8_t kCoeffBaseContextOffset[kNumTransformSizes][5][5] = {
+ {{0, 1, 6, 6, 0}, {1, 6, 6, 21, 0}, {6, 6, 21, 21, 0}, {6, 21, 21, 21, 0},
+ {0, 0, 0, 0, 0}},
+ {{0, 11, 11, 11, 0}, {11, 11, 11, 11, 0}, {6, 6, 21, 21, 0},
+ {6, 21, 21, 21, 0}, {21, 21, 21, 21, 0}},
+ {{0, 11, 11, 11, 0}, {11, 11, 11, 11, 0}, {6, 6, 21, 21, 0},
+ {6, 21, 21, 21, 0}, {21, 21, 21, 21, 0}},
+ {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+ {16, 16, 21, 21, 21}, {0, 0, 0, 0, 0}},
+ {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+ {16, 16, 21, 21, 21}, {0, 0, 0, 0, 0}},
+ {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+ {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+ {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+ {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+ {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+ {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+ {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+ {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+ {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+ {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+ {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}}};
+/* clang-format on */
+
+// The table size is extended from 3 to 16 by repeating the last element so
+// that clipping of the row or column indices can be avoided.
+constexpr uint8_t kCoeffBasePositionContextOffset[16] = {
+ 26, 31, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36};
+
+constexpr PredictionMode kInterIntraToIntraMode[kNumInterIntraModes] = {
+ kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal,
+ kPredictionModeSmooth};
+
+// Number of horizontal luma samples before intra block copy can be used.
+constexpr int kIntraBlockCopyDelayPixels = 256;
+// Number of 64 by 64 blocks before intra block copy can be used.
+constexpr int kIntraBlockCopyDelay64x64Blocks = kIntraBlockCopyDelayPixels / 64;
+
+// Index [i][j] corresponds to the transform size of width 1 << (i + 2) and
+// height 1 << (j + 2).
+constexpr TransformSize k4x4SizeToTransformSize[5][5] = {
+ {kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kNumTransformSizes, kNumTransformSizes},
+ {kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kNumTransformSizes},
+ {kTransformSize16x4, kTransformSize16x8, kTransformSize16x16,
+ kTransformSize16x32, kTransformSize16x64},
+ {kNumTransformSizes, kTransformSize32x8, kTransformSize32x16,
+ kTransformSize32x32, kTransformSize32x64},
+ {kNumTransformSizes, kNumTransformSizes, kTransformSize64x16,
+ kTransformSize64x32, kTransformSize64x64}};
+
+// Defined in section 9.3 of the spec.
+constexpr TransformType kModeToTransformType[kIntraPredictionModesUV] = {
+ kTransformTypeDctDct, kTransformTypeDctAdst, kTransformTypeAdstDct,
+ kTransformTypeDctDct, kTransformTypeAdstAdst, kTransformTypeDctAdst,
+ kTransformTypeAdstDct, kTransformTypeAdstDct, kTransformTypeDctAdst,
+ kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct,
+ kTransformTypeAdstAdst, kTransformTypeDctDct};
+
+// Defined in section 5.11.47 of the spec. This array does not contain an entry
+// for kTransformSetDctOnly, so the first dimension needs to be
+// |kNumTransformSets| - 1.
+constexpr TransformType kInverseTransformTypeBySet[kNumTransformSets - 1][16] =
+ {{kTransformTypeIdentityIdentity, kTransformTypeDctDct,
+ kTransformTypeIdentityDct, kTransformTypeDctIdentity,
+ kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct},
+ {kTransformTypeIdentityIdentity, kTransformTypeDctDct,
+ kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct},
+ {kTransformTypeIdentityIdentity, kTransformTypeIdentityDct,
+ kTransformTypeDctIdentity, kTransformTypeIdentityAdst,
+ kTransformTypeAdstIdentity, kTransformTypeIdentityFlipadst,
+ kTransformTypeFlipadstIdentity, kTransformTypeDctDct,
+ kTransformTypeDctAdst, kTransformTypeAdstDct, kTransformTypeDctFlipadst,
+ kTransformTypeFlipadstDct, kTransformTypeAdstAdst,
+ kTransformTypeFlipadstFlipadst, kTransformTypeFlipadstAdst,
+ kTransformTypeAdstFlipadst},
+ {kTransformTypeIdentityIdentity, kTransformTypeIdentityDct,
+ kTransformTypeDctIdentity, kTransformTypeDctDct, kTransformTypeDctAdst,
+ kTransformTypeAdstDct, kTransformTypeDctFlipadst,
+ kTransformTypeFlipadstDct, kTransformTypeAdstAdst,
+ kTransformTypeFlipadstFlipadst, kTransformTypeFlipadstAdst,
+ kTransformTypeAdstFlipadst},
+ {kTransformTypeIdentityIdentity, kTransformTypeDctDct}};
+
+// Replaces all occurrences of 64x* and *x64 with 32x* and *x32 respectively.
+constexpr TransformSize kAdjustedTransformSize[kNumTransformSizes] = {
+ kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+ kTransformSize16x16, kTransformSize16x32, kTransformSize16x32,
+ kTransformSize32x8, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x32};
+
+// This is the same as the Max_Tx_Size_Rect array in the spec but with *x64 and
+// 64x* transforms replaced with *x32 and 32x* respectively.
+constexpr TransformSize kUVTransformSize[kMaxBlockSizes] = {
+ kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+ kTransformSize16x16, kTransformSize16x32, kTransformSize16x32,
+ kTransformSize32x8, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x32, kTransformSize32x32, kTransformSize32x32,
+ kTransformSize32x32};
+
+// The ith entry of this array is computed as:
+// DivideBy2(TransformSizeToSquareTransformIndex(kTransformSizeSquareMin[i]) +
+// TransformSizeToSquareTransformIndex(kTransformSizeSquareMax[i]) +
+// 1)
+constexpr uint8_t kTransformSizeContext[kNumTransformSizes] = {
+ 0, 1, 1, 1, 1, 2, 2, 1, 2, 2, 3, 3, 2, 3, 3, 4, 3, 4, 4};
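+// For example, for kTransformSize16x8 the square min is 8x8 and the square max
+// is 16x16; assuming TransformSizeToSquareTransformIndex() maps 4x4..64x64 to
+// 0..4, this gives DivideBy2(1 + 2 + 1) = 2, which matches the entry above.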
+
+constexpr int8_t kSgrProjDefaultMultiplier[2] = {-32, 31};
+
+constexpr int8_t kWienerDefaultFilter[kNumWienerCoefficients] = {3, -7, 15};
+
+// Maps compound prediction modes into single modes. For example,
+// kPredictionModeNearestNewMv maps to kPredictionModeNearestMv for index 0
+// and kPredictionModeNewMv for index 1. It is used to simplify the logic in
+// AssignMv (and avoid duplicate code). This is section 5.11.30 in the spec.
+constexpr PredictionMode
+ kCompoundToSinglePredictionMode[kNumCompoundInterPredictionModes][2] = {
+ {kPredictionModeNearestMv, kPredictionModeNearestMv},
+ {kPredictionModeNearMv, kPredictionModeNearMv},
+ {kPredictionModeNearestMv, kPredictionModeNewMv},
+ {kPredictionModeNewMv, kPredictionModeNearestMv},
+ {kPredictionModeNearMv, kPredictionModeNewMv},
+ {kPredictionModeNewMv, kPredictionModeNearMv},
+ {kPredictionModeGlobalMv, kPredictionModeGlobalMv},
+ {kPredictionModeNewMv, kPredictionModeNewMv},
+};
+PredictionMode GetSinglePredictionMode(int index, PredictionMode y_mode) {
+ if (y_mode < kPredictionModeNearestNearestMv) {
+ return y_mode;
+ }
+ const int lookup_index = y_mode - kPredictionModeNearestNearestMv;
+ assert(lookup_index >= 0);
+ return kCompoundToSinglePredictionMode[lookup_index][index];
+}
+
+// log2(dqDenom) in section 7.12.3 of the spec. We use the log2 value because
+// dqDenom is always a power of two and hence right shift can be used instead of
+// division.
+constexpr uint8_t kQuantizationShift[kNumTransformSizes] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 2, 1, 2, 2};
+
+// Returns the minimum of |length| and |max| - |start|. This is used to clamp
+// array indices when accessing arrays whose bound is equal to |max|.
+int GetNumElements(int length, int start, int max) {
+ return std::min(length, max - start);
+}
+
+template <typename T>
+void SetBlockValues(int rows, int columns, T value, T* dst, ptrdiff_t stride) {
+ // Specialize all columns cases (values in kTransformWidth4x4[]) for better
+ // performance.
+ switch (columns) {
+ case 1:
+ MemSetBlock<T>(rows, 1, value, dst, stride);
+ break;
+ case 2:
+ MemSetBlock<T>(rows, 2, value, dst, stride);
+ break;
+ case 4:
+ MemSetBlock<T>(rows, 4, value, dst, stride);
+ break;
+ case 8:
+ MemSetBlock<T>(rows, 8, value, dst, stride);
+ break;
+ default:
+ assert(columns == 16);
+ MemSetBlock<T>(rows, 16, value, dst, stride);
+ break;
+ }
+}
+
+void SetTransformType(const Tile::Block& block, int x4, int y4, int w4, int h4,
+ TransformType tx_type,
+ TransformType transform_types[32][32]) {
+ const int y_offset = y4 - block.row4x4;
+ const int x_offset = x4 - block.column4x4;
+ TransformType* const dst = &transform_types[y_offset][x_offset];
+ SetBlockValues<TransformType>(h4, w4, tx_type, dst, 32);
+}
+
+void StoreMotionFieldMvs(ReferenceFrameType reference_frame_to_store,
+ const MotionVector& mv_to_store, ptrdiff_t stride,
+ int rows, int columns,
+ ReferenceFrameType* reference_frame_row_start,
+ MotionVector* mv) {
+ static_assert(sizeof(*reference_frame_row_start) == sizeof(int8_t), "");
+ do {
+    // Don't reorder the following two memory-setting calls; some ARM CPUs are
+    // quite sensitive to the order.
+ memset(reference_frame_row_start, reference_frame_to_store, columns);
+ std::fill(mv, mv + columns, mv_to_store);
+ reference_frame_row_start += stride;
+ mv += stride;
+ } while (--rows != 0);
+}
+
+// The inverse transform process assumes that the quantized coefficients are
+// stored as a virtual 2D array of size |tx_width| x |tx_height|. If the
+// transform width is 64, this assumption is broken because the scan order used
+// for populating the coefficients of such transforms is the same as the one
+// used for the corresponding transform with width 32 (e.g. the scan order used
+// for 64x16 is the same as the one used for 32x16). So we must restore the
+// coefficients to their correct positions and clear the positions they
+// occupied.
+template <typename ResidualType>
+void MoveCoefficientsForTxWidth64(int clamped_tx_height, int tx_width,
+ ResidualType* residual) {
+ if (tx_width != 64) return;
+ const int rows = clamped_tx_height - 2;
+ auto* src = residual + 32 * rows;
+ residual += 64 * rows;
+ // Process 2 rows in each loop in reverse order to avoid overwrite.
+ int x = rows >> 1;
+ do {
+ // The 2 rows can be processed in order.
+ memcpy(residual, src, 32 * sizeof(src[0]));
+ memcpy(residual + 64, src + 32, 32 * sizeof(src[0]));
+ memset(src + 32, 0, 32 * sizeof(src[0]));
+ src -= 64;
+ residual -= 128;
+ } while (--x);
+ // Process the second row. The first row is already correct.
+ memcpy(residual + 64, src + 32, 32 * sizeof(src[0]));
+ memset(src + 32, 0, 32 * sizeof(src[0]));
+}
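+// For example, for a 64x16 transform the parsed coefficients occupy a
+// contiguous 32-wide layout; the routine above spreads row r from offset
+// 32 * r to offset 64 * r and zeroes the vacated half rows, working from the
+// bottom up (two rows per iteration) so that rows are not overwritten before
+// they are moved. Row 0 needs no move since 32 * 0 == 64 * 0.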
+
+void GetClampParameters(const Tile::Block& block, int min[2], int max[2]) {
+ // 7.10.2.14 (part 1). (also contains implementations of 5.11.53
+ // and 5.11.54).
+ constexpr int kMvBorder4x4 = 4;
+ const int row_border = kMvBorder4x4 + block.height4x4;
+ const int column_border = kMvBorder4x4 + block.width4x4;
+ const int macroblocks_to_top_edge = -block.row4x4;
+ const int macroblocks_to_bottom_edge =
+ block.tile.frame_header().rows4x4 - block.height4x4 - block.row4x4;
+ const int macroblocks_to_left_edge = -block.column4x4;
+ const int macroblocks_to_right_edge =
+ block.tile.frame_header().columns4x4 - block.width4x4 - block.column4x4;
+ min[0] = MultiplyBy32(macroblocks_to_top_edge - row_border);
+ min[1] = MultiplyBy32(macroblocks_to_left_edge - column_border);
+ max[0] = MultiplyBy32(macroblocks_to_bottom_edge + row_border);
+ max[1] = MultiplyBy32(macroblocks_to_right_edge + column_border);
+}
+
+// Section 8.3.2 in the spec, under coeff_base_eob.
+int GetCoeffBaseContextEob(TransformSize tx_size, int index) {
+ if (index == 0) return 0;
+ const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
+ const int tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
+ const int tx_height = kTransformHeight[adjusted_tx_size];
+ if (index <= DivideBy8(tx_height << tx_width_log2)) return 1;
+ if (index <= DivideBy4(tx_height << tx_width_log2)) return 2;
+ return 3;
+}
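+// For example, for a 16x16 transform (256 coefficients): index 0 gives context
+// 0, indices 1..32 give context 1, indices 33..64 give context 2, and the
+// remaining indices give context 3.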
+
+// Section 8.3.2 in the spec, under coeff_br. Optimized for end of block based
+// on the fact that {0, 1}, {1, 0}, {1, 1}, {0, 2} and {2, 0} will all be 0 in
+// the end of block case.
+int GetCoeffBaseRangeContextEob(int adjusted_tx_width_log2, int pos,
+ TransformClass tx_class) {
+ if (pos == 0) return 0;
+ const int tx_width = 1 << adjusted_tx_width_log2;
+ const int row = pos >> adjusted_tx_width_log2;
+ const int column = pos & (tx_width - 1);
+ // This return statement is equivalent to:
+ // return ((tx_class == kTransformClass2D && (row | column) < 2) ||
+ // (tx_class == kTransformClassHorizontal && column == 0) ||
+ // (tx_class == kTransformClassVertical && row == 0))
+ // ? 7
+ // : 14;
+ return 14 >> ((static_cast<int>(tx_class == kTransformClass2D) &
+ static_cast<int>((row | column) < 2)) |
+ (tx_class & static_cast<int>(column == 0)) |
+ ((tx_class >> 1) & static_cast<int>(row == 0)));
+}
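+// The bit trick above relies on kTransformClass2D, kTransformClassHorizontal
+// and kTransformClassVertical having the values 0, 1 and 2 respectively (an
+// assumption here); each masked term is then 0 or 1, and the result is either
+// 14 or 14 >> 1 == 7.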
+
+} // namespace
+
+Tile::Tile(int tile_number, const uint8_t* const data, size_t size,
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ RefCountedBuffer* const current_frame, const DecoderState& state,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ const WedgeMaskArray& wedge_masks,
+ const QuantizerMatrix& quantizer_matrix,
+ SymbolDecoderContext* const saved_symbol_decoder_context,
+ const SegmentationMap* prev_segment_ids,
+ PostFilter* const post_filter, const dsp::Dsp* const dsp,
+ ThreadPool* const thread_pool,
+ BlockingCounterWithStatus* const pending_tiles, bool frame_parallel,
+ bool use_intra_prediction_buffer)
+ : number_(tile_number),
+ row_(number_ / frame_header.tile_info.tile_columns),
+ column_(number_ % frame_header.tile_info.tile_columns),
+ data_(data),
+ size_(size),
+ read_deltas_(false),
+ subsampling_x_{0, sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_x},
+ subsampling_y_{0, sequence_header.color_config.subsampling_y,
+ sequence_header.color_config.subsampling_y},
+ current_quantizer_index_(frame_header.quantizer.base_index),
+ sequence_header_(sequence_header),
+ frame_header_(frame_header),
+ reference_frame_sign_bias_(state.reference_frame_sign_bias),
+ reference_frames_(state.reference_frame),
+ motion_field_(frame_scratch_buffer->motion_field),
+ reference_order_hint_(state.reference_order_hint),
+ wedge_masks_(wedge_masks),
+ quantizer_matrix_(quantizer_matrix),
+ reader_(data_, size_, frame_header_.enable_cdf_update),
+ symbol_decoder_context_(frame_scratch_buffer->symbol_decoder_context),
+ saved_symbol_decoder_context_(saved_symbol_decoder_context),
+ prev_segment_ids_(prev_segment_ids),
+ dsp_(*dsp),
+ post_filter_(*post_filter),
+ block_parameters_holder_(frame_scratch_buffer->block_parameters_holder),
+ quantizer_(sequence_header_.color_config.bitdepth,
+ &frame_header_.quantizer),
+ residual_size_((sequence_header_.color_config.bitdepth == 8)
+ ? sizeof(int16_t)
+ : sizeof(int32_t)),
+ intra_block_copy_lag_(
+ frame_header_.allow_intrabc
+ ? (sequence_header_.use_128x128_superblock ? 3 : 5)
+ : 1),
+ current_frame_(*current_frame),
+ cdef_index_(frame_scratch_buffer->cdef_index),
+ inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes),
+ thread_pool_(thread_pool),
+ residual_buffer_pool_(frame_scratch_buffer->residual_buffer_pool.get()),
+ tile_scratch_buffer_pool_(
+ &frame_scratch_buffer->tile_scratch_buffer_pool),
+ pending_tiles_(pending_tiles),
+ frame_parallel_(frame_parallel),
+ use_intra_prediction_buffer_(use_intra_prediction_buffer),
+ intra_prediction_buffer_(
+ use_intra_prediction_buffer_
+ ? &frame_scratch_buffer->intra_prediction_buffers.get()[row_]
+ : nullptr) {
+ row4x4_start_ = frame_header.tile_info.tile_row_start[row_];
+ row4x4_end_ = frame_header.tile_info.tile_row_start[row_ + 1];
+ column4x4_start_ = frame_header.tile_info.tile_column_start[column_];
+ column4x4_end_ = frame_header.tile_info.tile_column_start[column_ + 1];
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+ const int block_width4x4_log2 = k4x4HeightLog2[SuperBlockSize()];
+ superblock_rows_ =
+ (row4x4_end_ - row4x4_start_ + block_width4x4 - 1) >> block_width4x4_log2;
+ superblock_columns_ =
+ (column4x4_end_ - column4x4_start_ + block_width4x4 - 1) >>
+ block_width4x4_log2;
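+  // For example, a tile spanning 70 rows of 4x4 blocks with 128x128
+  // superblocks (block_width4x4 == 32, block_width4x4_log2 == 5) yields
+  // (70 + 31) >> 5 == 3 superblock rows.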
+ // If |split_parse_and_decode_| is true, we do the necessary setup for
+ // splitting the parsing and the decoding steps. This is done in the following
+ // two cases:
+ // 1) If there is multi-threading within a tile (this is done if
+ // |thread_pool_| is not nullptr and if there are at least as many
+ // superblock columns as |intra_block_copy_lag_|).
+ // 2) If |frame_parallel| is true.
+ split_parse_and_decode_ = (thread_pool_ != nullptr &&
+ superblock_columns_ > intra_block_copy_lag_) ||
+ frame_parallel;
+ if (frame_parallel_) {
+ reference_frame_progress_cache_.fill(INT_MIN);
+ }
+ memset(delta_lf_, 0, sizeof(delta_lf_));
+ delta_lf_all_zero_ = true;
+ const YuvBuffer& buffer = post_filter_.frame_buffer();
+ for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+ // Verify that the borders are big enough for Reconstruct(). max_tx_length
+ // is the maximum value of tx_width and tx_height for the plane.
+ const int max_tx_length = (plane == kPlaneY) ? 64 : 32;
+ // Reconstruct() may overwrite on the right. Since the right border of a
+ // row is followed in memory by the left border of the next row, the
+ // number of extra pixels to the right of a row is at least the sum of the
+ // left and right borders.
+ //
+ // Note: This assertion actually checks the sum of the left and right
+ // borders of post_filter_.GetUnfilteredBuffer(), which is a horizontally
+ // and vertically shifted version of |buffer|. Since the sum of the left and
+ // right borders is not changed by the shift, we can just check the sum of
+ // the left and right borders of |buffer|.
+ assert(buffer.left_border(plane) + buffer.right_border(plane) >=
+ max_tx_length - 1);
+ // Reconstruct() may overwrite on the bottom. We need an extra border row
+ // on the bottom because we need the left border of that row.
+ //
+ // Note: This assertion checks the bottom border of
+ // post_filter_.GetUnfilteredBuffer(). So we need to calculate the vertical
+ // shift that the PostFilter constructor applied to |buffer| and reduce the
+ // bottom border by that amount.
+#ifndef NDEBUG
+ const int vertical_shift = static_cast<int>(
+ (post_filter_.GetUnfilteredBuffer(plane) - buffer.data(plane)) /
+ buffer.stride(plane));
+ const int bottom_border = buffer.bottom_border(plane) - vertical_shift;
+ assert(bottom_border >= max_tx_length);
+#endif
+ // In AV1, a transform block of height H starts at a y coordinate that is
+ // a multiple of H. If a transform block at the bottom of the frame has
+ // height H, then Reconstruct() will write up to the row with index
+ // Align(buffer.height(plane), H) - 1. Therefore the maximum number of
+ // rows Reconstruct() may write to is
+ // Align(buffer.height(plane), max_tx_length).
+ buffer_[plane].Reset(Align(buffer.height(plane), max_tx_length),
+ buffer.stride(plane),
+ post_filter_.GetUnfilteredBuffer(plane));
+ const int plane_height =
+ SubsampledValue(frame_header_.height, subsampling_y_[plane]);
+ deblock_row_limit_[plane] =
+ std::min(frame_header_.rows4x4, DivideBy4(plane_height + 3)
+ << subsampling_y_[plane]);
+ const int plane_width =
+ SubsampledValue(frame_header_.width, subsampling_x_[plane]);
+ deblock_column_limit_[plane] =
+ std::min(frame_header_.columns4x4, DivideBy4(plane_width + 3)
+ << subsampling_x_[plane]);
+ }
+}
+
+bool Tile::Init() {
+ assert(coefficient_levels_.size() == dc_categories_.size());
+ for (size_t i = 0; i < coefficient_levels_.size(); ++i) {
+ const int contexts_per_plane = (i == kEntropyContextLeft)
+ ? frame_header_.rows4x4
+ : frame_header_.columns4x4;
+ if (!coefficient_levels_[i].Reset(PlaneCount(), contexts_per_plane)) {
+ LIBGAV1_DLOG(ERROR, "coefficient_levels_[%zu].Reset() failed.", i);
+ return false;
+ }
+ if (!dc_categories_[i].Reset(PlaneCount(), contexts_per_plane)) {
+ LIBGAV1_DLOG(ERROR, "dc_categories_[%zu].Reset() failed.", i);
+ return false;
+ }
+ }
+ if (split_parse_and_decode_) {
+ assert(residual_buffer_pool_ != nullptr);
+ if (!residual_buffer_threaded_.Reset(superblock_rows_, superblock_columns_,
+ /*zero_initialize=*/false)) {
+ LIBGAV1_DLOG(ERROR, "residual_buffer_threaded_.Reset() failed.");
+ return false;
+ }
+ } else {
+ // Add 32 * |kResidualPaddingVertical| padding to avoid bottom boundary
+ // checks when parsing quantized coefficients.
+ residual_buffer_ = MakeAlignedUniquePtr<uint8_t>(
+ 32, (4096 + 32 * kResidualPaddingVertical) * residual_size_);
+ if (residual_buffer_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Allocation of residual_buffer_ failed.");
+ return false;
+ }
+ prediction_parameters_.reset(new (std::nothrow) PredictionParameters());
+ if (prediction_parameters_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Allocation of prediction_parameters_ failed.");
+ return false;
+ }
+ }
+ if (frame_header_.use_ref_frame_mvs) {
+ assert(sequence_header_.enable_order_hint);
+ SetupMotionField(frame_header_, current_frame_, reference_frames_,
+ row4x4_start_, row4x4_end_, column4x4_start_,
+ column4x4_end_, &motion_field_);
+ }
+ ResetLoopRestorationParams();
+ return true;
+}
+
+template <ProcessingMode processing_mode, bool save_symbol_decoder_context>
+bool Tile::ProcessSuperBlockRow(int row4x4,
+ TileScratchBuffer* const scratch_buffer) {
+ if (row4x4 < row4x4_start_ || row4x4 >= row4x4_end_) return true;
+ assert(scratch_buffer != nullptr);
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+ for (int column4x4 = column4x4_start_; column4x4 < column4x4_end_;
+ column4x4 += block_width4x4) {
+ if (!ProcessSuperBlock(row4x4, column4x4, block_width4x4, scratch_buffer,
+ processing_mode)) {
+ LIBGAV1_DLOG(ERROR, "Error decoding super block row: %d column: %d",
+ row4x4, column4x4);
+ return false;
+ }
+ }
+ if (save_symbol_decoder_context && row4x4 + block_width4x4 >= row4x4_end_) {
+ SaveSymbolDecoderContext();
+ }
+ if (processing_mode == kProcessingModeDecodeOnly ||
+ processing_mode == kProcessingModeParseAndDecode) {
+ PopulateIntraPredictionBuffer(row4x4);
+ }
+ return true;
+}
+
+// Used in frame parallel mode. The symbol decoder context need not be saved in
+// this case since it was already saved when parsing completed.
+template bool Tile::ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+ int row4x4, TileScratchBuffer* scratch_buffer);
+// Used in non frame parallel mode.
+template bool Tile::ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+ int row4x4, TileScratchBuffer* scratch_buffer);
+
+void Tile::SaveSymbolDecoderContext() {
+ if (frame_header_.enable_frame_end_update_cdf &&
+ number_ == frame_header_.tile_info.context_update_id) {
+ *saved_symbol_decoder_context_ = symbol_decoder_context_;
+ }
+}
+
+bool Tile::ParseAndDecode() {
+ // If this is the main thread, we build the loop filter bit masks when parsing
+ // so that it happens in the current thread. This ensures that the main thread
+ // does as much work as possible.
+ if (split_parse_and_decode_) {
+ if (!ThreadedParseAndDecode()) return false;
+ SaveSymbolDecoderContext();
+ return true;
+ }
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ tile_scratch_buffer_pool_->Get();
+ if (scratch_buffer == nullptr) {
+ pending_tiles_->Decrement(false);
+ LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+ return false;
+ }
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+ for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_;
+ row4x4 += block_width4x4) {
+ if (!ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+ row4x4, scratch_buffer.get())) {
+ pending_tiles_->Decrement(false);
+ return false;
+ }
+ }
+ tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+ pending_tiles_->Decrement(true);
+ return true;
+}
+
+bool Tile::Parse() {
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ tile_scratch_buffer_pool_->Get();
+ if (scratch_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+ return false;
+ }
+ for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_;
+ row4x4 += block_width4x4) {
+ if (!ProcessSuperBlockRow<kProcessingModeParseOnly, false>(
+ row4x4, scratch_buffer.get())) {
+ return false;
+ }
+ }
+ tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+ SaveSymbolDecoderContext();
+ return true;
+}
+
+bool Tile::Decode(
+ std::mutex* const mutex, int* const superblock_row_progress,
+ std::condition_variable* const superblock_row_progress_condvar) {
+ const int block_width4x4 = sequence_header_.use_128x128_superblock ? 32 : 16;
+ const int block_width4x4_log2 =
+ sequence_header_.use_128x128_superblock ? 5 : 4;
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ tile_scratch_buffer_pool_->Get();
+ if (scratch_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+ return false;
+ }
+ for (int row4x4 = row4x4_start_, index = row4x4_start_ >> block_width4x4_log2;
+ row4x4 < row4x4_end_; row4x4 += block_width4x4, ++index) {
+ if (!ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+ row4x4, scratch_buffer.get())) {
+ return false;
+ }
+ if (post_filter_.DoDeblock()) {
+ // Apply vertical deblock filtering for all the columns in this tile
+ // except for the first 64 columns.
+ post_filter_.ApplyDeblockFilter(
+ kLoopFilterTypeVertical, row4x4,
+ column4x4_start_ + kNum4x4InLoopFilterUnit, column4x4_end_,
+ block_width4x4);
+ // If this is the first superblock row of the tile, then we cannot apply
+ // horizontal deblocking here since we don't know if the top row is
+ // available. So it will be done by the calling thread in that case.
+ if (row4x4 != row4x4_start_) {
+ // Apply horizontal deblock filtering for all the columns in this tile
+ // except for the first and the last 64 columns.
+ // Note about the last tile of each row: For the last tile,
+ // column4x4_end may not be a multiple of 16. In that case it is still
+ // okay to simply subtract 16 since ApplyDeblockFilter() will only do
+ // the filters in increments of 64 columns (or 32 columns for chroma
+ // with subsampling).
+ post_filter_.ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4,
+ column4x4_start_ + kNum4x4InLoopFilterUnit,
+ column4x4_end_ - kNum4x4InLoopFilterUnit, block_width4x4);
+ }
+ }
+ bool notify;
+ {
+ std::unique_lock<std::mutex> lock(*mutex);
+ notify = ++superblock_row_progress[index] ==
+ frame_header_.tile_info.tile_columns;
+ }
+ if (notify) {
+ // We are done decoding this superblock row. Notify the post filtering
+ // thread.
+ superblock_row_progress_condvar[index].notify_one();
+ }
+ }
+ tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+ return true;
+}
+
+bool Tile::ThreadedParseAndDecode() {
+ {
+ std::lock_guard<std::mutex> lock(threading_.mutex);
+ if (!threading_.sb_state.Reset(superblock_rows_, superblock_columns_)) {
+ pending_tiles_->Decrement(false);
+ LIBGAV1_DLOG(ERROR, "threading.sb_state.Reset() failed.");
+ return false;
+ }
+ // Account for the parsing job.
+ ++threading_.pending_jobs;
+ }
+
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+
+ // Begin parsing.
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ tile_scratch_buffer_pool_->Get();
+ if (scratch_buffer == nullptr) {
+ pending_tiles_->Decrement(false);
+ LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+ return false;
+ }
+ for (int row4x4 = row4x4_start_, row_index = 0; row4x4 < row4x4_end_;
+ row4x4 += block_width4x4, ++row_index) {
+ for (int column4x4 = column4x4_start_, column_index = 0;
+ column4x4 < column4x4_end_;
+ column4x4 += block_width4x4, ++column_index) {
+ if (!ProcessSuperBlock(row4x4, column4x4, block_width4x4,
+ scratch_buffer.get(), kProcessingModeParseOnly)) {
+ std::lock_guard<std::mutex> lock(threading_.mutex);
+ threading_.abort = true;
+ break;
+ }
+ std::unique_lock<std::mutex> lock(threading_.mutex);
+ if (threading_.abort) break;
+ threading_.sb_state[row_index][column_index] = kSuperBlockStateParsed;
+ // Schedule the decoding of this superblock if it is allowed.
+ if (CanDecode(row_index, column_index)) {
+ ++threading_.pending_jobs;
+ threading_.sb_state[row_index][column_index] =
+ kSuperBlockStateScheduled;
+ lock.unlock();
+ thread_pool_->Schedule(
+ [this, row_index, column_index, block_width4x4]() {
+ DecodeSuperBlock(row_index, column_index, block_width4x4);
+ });
+ }
+ }
+ std::lock_guard<std::mutex> lock(threading_.mutex);
+ if (threading_.abort) break;
+ }
+ tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+
+ // We are done parsing. We can return here since the calling thread will make
+ // sure that it waits for all the superblocks to be decoded.
+ //
+ // Finish using |threading_| before |pending_tiles_->Decrement()| because the
+ // Tile object could go out of scope as soon as |pending_tiles_->Decrement()|
+ // is called.
+ threading_.mutex.lock();
+ const bool no_pending_jobs = (--threading_.pending_jobs == 0);
+ const bool job_succeeded = !threading_.abort;
+ threading_.mutex.unlock();
+ if (no_pending_jobs) {
+ // We are done parsing and decoding this tile.
+ pending_tiles_->Decrement(job_succeeded);
+ }
+ return job_succeeded;
+}
+
+bool Tile::CanDecode(int row_index, int column_index) const {
+ assert(row_index >= 0);
+ assert(column_index >= 0);
+ // If |threading_.sb_state[row_index][column_index]| is not equal to
+ // kSuperBlockStateParsed, then return false. This is ok because if
+ // |threading_.sb_state[row_index][column_index]| is equal to:
+ // kSuperBlockStateNone - then the superblock is not yet parsed.
+ // kSuperBlockStateScheduled - then the superblock is already scheduled for
+ // decode.
+ // kSuperBlockStateDecoded - then the superblock has already been decoded.
+ if (row_index >= superblock_rows_ || column_index >= superblock_columns_ ||
+ threading_.sb_state[row_index][column_index] != kSuperBlockStateParsed) {
+ return false;
+ }
+ // First superblock has no dependencies.
+ if (row_index == 0 && column_index == 0) {
+ return true;
+ }
+  // Superblocks in the first row depend only on the superblock to their left.
+ if (row_index == 0) {
+ return threading_.sb_state[0][column_index - 1] == kSuperBlockStateDecoded;
+ }
+  // All other superblocks depend on the superblock to their left (if one
+  // exists) and the superblock to their top-right with a lag of
+  // |intra_block_copy_lag_| (if one exists).
+ const int top_right_column_index =
+ std::min(column_index + intra_block_copy_lag_, superblock_columns_ - 1);
+ return threading_.sb_state[row_index - 1][top_right_column_index] ==
+ kSuperBlockStateDecoded &&
+ (column_index == 0 ||
+ threading_.sb_state[row_index][column_index - 1] ==
+ kSuperBlockStateDecoded);
+}
+
+void Tile::DecodeSuperBlock(int row_index, int column_index,
+ int block_width4x4) {
+ const int row4x4 = row4x4_start_ + (row_index * block_width4x4);
+ const int column4x4 = column4x4_start_ + (column_index * block_width4x4);
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ tile_scratch_buffer_pool_->Get();
+ bool ok = scratch_buffer != nullptr;
+ if (ok) {
+ ok = ProcessSuperBlock(row4x4, column4x4, block_width4x4,
+ scratch_buffer.get(), kProcessingModeDecodeOnly);
+ tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+ }
+ std::unique_lock<std::mutex> lock(threading_.mutex);
+ if (ok) {
+ threading_.sb_state[row_index][column_index] = kSuperBlockStateDecoded;
+    // Candidate rows and columns where decoding could potentially begin (if it
+    // is allowed to do so). The candidates are:
+    // 1) The superblock to the bottom-left of the current superblock with a
+    // lag of |intra_block_copy_lag_| (or the beginning of the next superblock
+    // row if there are fewer than |intra_block_copy_lag_| superblock columns
+    // in the Tile).
+ // 2) The superblock to the right of the current superblock.
+ const int candidate_row_indices[] = {row_index + 1, row_index};
+ const int candidate_column_indices[] = {
+ std::max(0, column_index - intra_block_copy_lag_), column_index + 1};
+ for (size_t i = 0; i < std::extent<decltype(candidate_row_indices)>::value;
+ ++i) {
+ const int candidate_row_index = candidate_row_indices[i];
+ const int candidate_column_index = candidate_column_indices[i];
+ if (!CanDecode(candidate_row_index, candidate_column_index)) {
+ continue;
+ }
+ ++threading_.pending_jobs;
+ threading_.sb_state[candidate_row_index][candidate_column_index] =
+ kSuperBlockStateScheduled;
+ lock.unlock();
+ thread_pool_->Schedule([this, candidate_row_index, candidate_column_index,
+ block_width4x4]() {
+ DecodeSuperBlock(candidate_row_index, candidate_column_index,
+ block_width4x4);
+ });
+ lock.lock();
+ }
+ } else {
+ threading_.abort = true;
+ }
+ // Finish using |threading_| before |pending_tiles_->Decrement()| because the
+ // Tile object could go out of scope as soon as |pending_tiles_->Decrement()|
+ // is called.
+ const bool no_pending_jobs = (--threading_.pending_jobs == 0);
+ const bool job_succeeded = !threading_.abort;
+ lock.unlock();
+ if (no_pending_jobs) {
+ // We are done parsing and decoding this tile.
+ pending_tiles_->Decrement(job_succeeded);
+ }
+}
+
+void Tile::PopulateIntraPredictionBuffer(int row4x4) {
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+ if (!use_intra_prediction_buffer_ || row4x4 + block_width4x4 >= row4x4_end_) {
+ return;
+ }
+ const size_t pixel_size =
+ (sequence_header_.color_config.bitdepth == 8 ? sizeof(uint8_t)
+ : sizeof(uint16_t));
+ for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+ const int row_to_copy =
+ (MultiplyBy4(row4x4 + block_width4x4) >> subsampling_y_[plane]) - 1;
+ const size_t pixels_to_copy =
+ (MultiplyBy4(column4x4_end_ - column4x4_start_) >>
+ subsampling_x_[plane]) *
+ pixel_size;
+ const size_t column_start =
+ MultiplyBy4(column4x4_start_) >> subsampling_x_[plane];
+ void* start;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (sequence_header_.color_config.bitdepth > 8) {
+ Array2DView<uint16_t> buffer(
+ buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t),
+ reinterpret_cast<uint16_t*>(&buffer_[plane][0][0]));
+ start = &buffer[row_to_copy][column_start];
+ } else // NOLINT
+#endif
+ {
+ start = &buffer_[plane][row_to_copy][column_start];
+ }
+ memcpy((*intra_prediction_buffer_)[plane].get() + column_start * pixel_size,
+ start, pixels_to_copy);
+ }
+}
+
+int Tile::GetTransformAllZeroContext(const Block& block, Plane plane,
+ TransformSize tx_size, int x4, int y4,
+ int w4, int h4) {
+ const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
+ const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
+
+ const int tx_width = kTransformWidth[tx_size];
+ const int tx_height = kTransformHeight[tx_size];
+ const BlockSize plane_size = block.residual_size[plane];
+ const int block_width = kBlockWidthPixels[plane_size];
+ const int block_height = kBlockHeightPixels[plane_size];
+
+ int top = 0;
+ int left = 0;
+ const int num_top_elements = GetNumElements(w4, x4, max_x4x4);
+ const int num_left_elements = GetNumElements(h4, y4, max_y4x4);
+ if (plane == kPlaneY) {
+ if (block_width == tx_width && block_height == tx_height) return 0;
+ const uint8_t* coefficient_levels =
+ &coefficient_levels_[kEntropyContextTop][plane][x4];
+ for (int i = 0; i < num_top_elements; ++i) {
+ top = std::max(top, static_cast<int>(coefficient_levels[i]));
+ }
+ coefficient_levels = &coefficient_levels_[kEntropyContextLeft][plane][y4];
+ for (int i = 0; i < num_left_elements; ++i) {
+ left = std::max(left, static_cast<int>(coefficient_levels[i]));
+ }
+ assert(top <= 4);
+ assert(left <= 4);
+ // kAllZeroContextsByTopLeft is pre-computed based on the logic in the spec
+ // for top and left.
+ return kAllZeroContextsByTopLeft[top][left];
+ }
+ const uint8_t* coefficient_levels =
+ &coefficient_levels_[kEntropyContextTop][plane][x4];
+ const int8_t* dc_categories = &dc_categories_[kEntropyContextTop][plane][x4];
+ for (int i = 0; i < num_top_elements; ++i) {
+ top |= coefficient_levels[i];
+ top |= dc_categories[i];
+ }
+ coefficient_levels = &coefficient_levels_[kEntropyContextLeft][plane][y4];
+ dc_categories = &dc_categories_[kEntropyContextLeft][plane][y4];
+ for (int i = 0; i < num_left_elements; ++i) {
+ left |= coefficient_levels[i];
+ left |= dc_categories[i];
+ }
+ return static_cast<int>(top != 0) + static_cast<int>(left != 0) + 7 +
+ 3 * static_cast<int>(block_width * block_height >
+ tx_width * tx_height);
+}
+
+TransformSet Tile::GetTransformSet(TransformSize tx_size, bool is_inter) const {
+ const TransformSize tx_size_square_min = kTransformSizeSquareMin[tx_size];
+ const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
+ if (tx_size_square_max == kTransformSize64x64) return kTransformSetDctOnly;
+ if (is_inter) {
+ if (frame_header_.reduced_tx_set ||
+ tx_size_square_max == kTransformSize32x32) {
+ return kTransformSetInter3;
+ }
+ if (tx_size_square_min == kTransformSize16x16) return kTransformSetInter2;
+ return kTransformSetInter1;
+ }
+ if (tx_size_square_max == kTransformSize32x32) return kTransformSetDctOnly;
+ if (frame_header_.reduced_tx_set ||
+ tx_size_square_min == kTransformSize16x16) {
+ return kTransformSetIntra2;
+ }
+ return kTransformSetIntra1;
+}
+
+TransformType Tile::ComputeTransformType(const Block& block, Plane plane,
+ TransformSize tx_size, int block_x,
+ int block_y) {
+ const BlockParameters& bp = *block.bp;
+ const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
+ if (frame_header_.segmentation.lossless[bp.segment_id] ||
+ tx_size_square_max == kTransformSize64x64) {
+ return kTransformTypeDctDct;
+ }
+ if (plane == kPlaneY) {
+ return transform_types_[block_y - block.row4x4][block_x - block.column4x4];
+ }
+ const TransformSet tx_set = GetTransformSet(tx_size, bp.is_inter);
+ TransformType tx_type;
+ if (bp.is_inter) {
+ const int x4 =
+ std::max(block.column4x4, block_x << subsampling_x_[kPlaneU]);
+ const int y4 = std::max(block.row4x4, block_y << subsampling_y_[kPlaneU]);
+ tx_type = transform_types_[y4 - block.row4x4][x4 - block.column4x4];
+ } else {
+ tx_type = kModeToTransformType[bp.uv_mode];
+ }
+ return kTransformTypeInSetMask[tx_set].Contains(tx_type)
+ ? tx_type
+ : kTransformTypeDctDct;
+}
+
+void Tile::ReadTransformType(const Block& block, int x4, int y4,
+ TransformSize tx_size) {
+ BlockParameters& bp = *block.bp;
+ const TransformSet tx_set = GetTransformSet(tx_size, bp.is_inter);
+
+ TransformType tx_type = kTransformTypeDctDct;
+ if (tx_set != kTransformSetDctOnly &&
+ frame_header_.segmentation.qindex[bp.segment_id] > 0) {
+ const int cdf_index = SymbolDecoderContext::TxTypeIndex(tx_set);
+ const int cdf_tx_size_index =
+ TransformSizeToSquareTransformIndex(kTransformSizeSquareMin[tx_size]);
+ uint16_t* cdf;
+ if (bp.is_inter) {
+ cdf = symbol_decoder_context_
+ .inter_tx_type_cdf[cdf_index][cdf_tx_size_index];
+ switch (tx_set) {
+ case kTransformSetInter1:
+ tx_type = static_cast<TransformType>(reader_.ReadSymbol<16>(cdf));
+ break;
+ case kTransformSetInter2:
+ tx_type = static_cast<TransformType>(reader_.ReadSymbol<12>(cdf));
+ break;
+ default:
+ assert(tx_set == kTransformSetInter3);
+ tx_type = static_cast<TransformType>(reader_.ReadSymbol(cdf));
+ break;
+ }
+ } else {
+ const PredictionMode intra_direction =
+ block.bp->prediction_parameters->use_filter_intra
+ ? kFilterIntraModeToIntraPredictor[block.bp->prediction_parameters
+ ->filter_intra_mode]
+ : bp.y_mode;
+ cdf =
+ symbol_decoder_context_
+ .intra_tx_type_cdf[cdf_index][cdf_tx_size_index][intra_direction];
+ assert(tx_set == kTransformSetIntra1 || tx_set == kTransformSetIntra2);
+ tx_type = static_cast<TransformType>((tx_set == kTransformSetIntra1)
+ ? reader_.ReadSymbol<7>(cdf)
+ : reader_.ReadSymbol<5>(cdf));
+ }
+
+ // This array does not contain an entry for kTransformSetDctOnly, so the
+ // first dimension needs to be offset by 1.
+ tx_type = kInverseTransformTypeBySet[tx_set - 1][tx_type];
+ }
+ SetTransformType(block, x4, y4, kTransformWidth4x4[tx_size],
+ kTransformHeight4x4[tx_size], tx_type, transform_types_);
+}
+
+// Section 8.3.2 in the spec, under coeff_base and coeff_br.
+// Bottom boundary checks are avoided by the padded rows.
+// For a coefficient near the right boundary, the two right neighbors and the
+// one bottom-right neighbor may be out of bounds. We don't check the right
+// boundary for them, because the out-of-bounds neighbors project to positions
+// above the diagonal line that goes through the current coefficient, and those
+// positions are still all 0s according to the diagonal scan order.
+template <typename ResidualType>
+void Tile::ReadCoeffBase2D(
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
+ const int tx_width = 1 << adjusted_tx_width_log2;
+ for (int i = eob - 2; i >= 1; --i) {
+ const uint16_t pos = scan[i];
+ const int row = pos >> adjusted_tx_width_log2;
+ const int column = pos & (tx_width - 1);
+ auto* const quantized = &quantized_buffer[pos];
+ auto* const levels = &level_buffer[pos];
+ const int neighbor_sum = 1 + levels[1] + levels[tx_width] +
+ levels[tx_width + 1] + levels[2] +
+ levels[MultiplyBy2(tx_width)];
+ const int context =
+ ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+ kCoeffBaseContextOffset[tx_size][std::min(row, 4)][std::min(column, 4)];
+ int level =
+ reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+ levels[0] = level;
+ if (level > kNumQuantizerBaseLevels) {
+ // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+ // + 1, because we clip the overall output to 6 and the unclipped
+ // quantized values will always result in an output of greater than 6.
+ int context = std::min(6, DivideBy2(1 + quantized[1] + // {0, 1}
+ quantized[tx_width] + // {1, 0}
+ quantized[tx_width + 1])); // {1, 1}
+ context += 14 >> static_cast<int>((row | column) < 2);
+ level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+ }
+ quantized[0] = level;
+ }
+ // Read position 0.
+ {
+ auto* const quantized = &quantized_buffer[0];
+ int level = reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[0]);
+ level_buffer[0] = level;
+ if (level > kNumQuantizerBaseLevels) {
+ // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+ // + 1, because we clip the overall output to 6 and the unclipped
+ // quantized values will always result in an output of greater than 6.
+ const int context =
+ std::min(6, DivideBy2(1 + quantized[1] + // {0, 1}
+ quantized[tx_width] + // {1, 0}
+ quantized[tx_width + 1])); // {1, 1}
+ level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+ }
+ quantized[0] = level;
+ }
+}
+
+// Section 8.3.2 in the spec, under coeff_base and coeff_br.
+// Bottom boundary checks are avoided by the padded rows.
+// For a coefficient near the right boundary, the four right neighbors may be
+// out of bounds. We don't do the boundary check for the first three right
+// neighbors, because even for transform blocks with the smallest width of 4,
+// the first three out-of-bounds neighbors project to positions left of the
+// current coefficient, and those positions are still all 0s according to the
+// column scan order. However, when the transform block width is 4 and the
+// current coefficient is on the right boundary, its fourth right neighbor
+// projects to the position directly below it in the same column, which could
+// be nonzero. Therefore, we must skip the fourth right neighbor. To keep it
+// simple, for any coefficient, we always do the boundary check for its fourth
+// right neighbor.
+template <typename ResidualType>
+void Tile::ReadCoeffBaseHorizontal(
+ const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
+ const int tx_width = 1 << adjusted_tx_width_log2;
+ int i = eob - 2;
+ do {
+ const uint16_t pos = scan[i];
+ const int column = pos & (tx_width - 1);
+ auto* const quantized = &quantized_buffer[pos];
+ auto* const levels = &level_buffer[pos];
+ const int neighbor_sum =
+ 1 + (levels[1] + // {0, 1}
+ levels[tx_width] + // {1, 0}
+ levels[2] + // {0, 2}
+ levels[3] + // {0, 3}
+ ((column + 4 < tx_width) ? levels[4] : 0)); // {0, 4}
+ const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+ kCoeffBasePositionContextOffset[column];
+ int level =
+ reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+ levels[0] = level;
+ if (level > kNumQuantizerBaseLevels) {
+ // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+ // + 1, because we clip the overall output to 6 and the unclipped
+ // quantized values will always result in an output of greater than 6.
+ int context = std::min(6, DivideBy2(1 + quantized[1] + // {0, 1}
+ quantized[tx_width] + // {1, 0}
+ quantized[2])); // {0, 2}
+ if (pos != 0) {
+ context += 14 >> static_cast<int>(column == 0);
+ }
+ level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+ }
+ quantized[0] = level;
+ } while (--i >= 0);
+}
+
+// Section 8.3.2 in the spec, under coeff_base and coeff_br.
+// Bottom boundary checks are avoided by the padded rows.
+// Right boundary check is performed explicitly.
+template <typename ResidualType>
+void Tile::ReadCoeffBaseVertical(
+ const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
+ const int tx_width = 1 << adjusted_tx_width_log2;
+ int i = eob - 2;
+ do {
+ const uint16_t pos = scan[i];
+ const int row = pos >> adjusted_tx_width_log2;
+ const int column = pos & (tx_width - 1);
+ auto* const quantized = &quantized_buffer[pos];
+ auto* const levels = &level_buffer[pos];
+ const int neighbor_sum =
+ 1 + (((column + 1 < tx_width) ? levels[1] : 0) + // {0, 1}
+ levels[tx_width] + // {1, 0}
+ levels[MultiplyBy2(tx_width)] + // {2, 0}
+ levels[tx_width * 3] + // {3, 0}
+ levels[MultiplyBy4(tx_width)]); // {4, 0}
+ const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+ kCoeffBasePositionContextOffset[row];
+ int level =
+ reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+ levels[0] = level;
+ if (level > kNumQuantizerBaseLevels) {
+ // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+ // + 1, because we clip the overall output to 6 and the unclipped
+ // quantized values will always result in an output of greater than 6.
+ const int quantized_column1 = (column + 1 < tx_width) ? quantized[1] : 0;
+ int context =
+ std::min(6, DivideBy2(1 + quantized_column1 + // {0, 1}
+ quantized[tx_width] + // {1, 0}
+ quantized[MultiplyBy2(tx_width)])); // {2, 0}
+ if (pos != 0) {
+ context += 14 >> static_cast<int>(row == 0);
+ }
+ level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+ }
+ quantized[0] = level;
+ } while (--i >= 0);
+}
+
+int Tile::GetDcSignContext(int x4, int y4, int w4, int h4, Plane plane) {
+ const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
+ const int8_t* dc_categories = &dc_categories_[kEntropyContextTop][plane][x4];
+ // Keep |dc_sign| as an 8-bit value so that std::accumulate() avoids sign
+ // extensions.
+ int8_t dc_sign = std::accumulate(
+ dc_categories, dc_categories + GetNumElements(w4, x4, max_x4x4), 0);
+ const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
+ dc_categories = &dc_categories_[kEntropyContextLeft][plane][y4];
+ dc_sign = std::accumulate(
+ dc_categories, dc_categories + GetNumElements(h4, y4, max_y4x4), dc_sign);
+ // This return statement is equivalent to:
+ // if (dc_sign < 0) return 1;
+ // if (dc_sign > 0) return 2;
+ // return 0;
+ // And it is better than:
+ // return static_cast<int>(dc_sign != 0) + static_cast<int>(dc_sign > 0);
+ return static_cast<int>(dc_sign < 0) +
+ MultiplyBy2(static_cast<int>(dc_sign > 0));
+}
+
+void Tile::SetEntropyContexts(int x4, int y4, int w4, int h4, Plane plane,
+ uint8_t coefficient_level, int8_t dc_category) {
+ const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
+ const int num_top_elements = GetNumElements(w4, x4, max_x4x4);
+ memset(&coefficient_levels_[kEntropyContextTop][plane][x4], coefficient_level,
+ num_top_elements);
+ memset(&dc_categories_[kEntropyContextTop][plane][x4], dc_category,
+ num_top_elements);
+ const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
+ const int num_left_elements = GetNumElements(h4, y4, max_y4x4);
+ memset(&coefficient_levels_[kEntropyContextLeft][plane][y4],
+ coefficient_level, num_left_elements);
+ memset(&dc_categories_[kEntropyContextLeft][plane][y4], dc_category,
+ num_left_elements);
+}
+
+template <typename ResidualType, bool is_dc_coefficient>
+bool Tile::ReadSignAndApplyDequantization(
+ const uint16_t* const scan, int i, int q_value,
+ const uint8_t* const quantizer_matrix, int shift, int max_value,
+ uint16_t* const dc_sign_cdf, int8_t* const dc_category,
+ int* const coefficient_level, ResidualType* residual_buffer) {
+ const int pos = is_dc_coefficient ? 0 : scan[i];
+ // If residual_buffer[pos] is zero, then the rest of the function has no
+ // effect.
+ int level = residual_buffer[pos];
+ if (level == 0) return true;
+ const int sign = is_dc_coefficient
+ ? static_cast<int>(reader_.ReadSymbol(dc_sign_cdf))
+ : reader_.ReadBit();
+ if (level > kNumQuantizerBaseLevels + kQuantizerCoefficientBaseRange) {
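+ // The remainder of the level is Exp-Golomb coded: a unary prefix gives
+ // |length|, then |length| - 1 suffix bits complete the value |x|, and
+ // x - 1 is added to the level. A prefix longer than 20 bits is treated as a
+ // corrupt bitstream.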
+ int length = 0;
+ bool golomb_length_bit = false;
+ do {
+ golomb_length_bit = static_cast<bool>(reader_.ReadBit());
+ ++length;
+ if (length > 20) {
+ LIBGAV1_DLOG(ERROR, "Invalid golomb_length %d", length);
+ return false;
+ }
+ } while (!golomb_length_bit);
+ int x = 1;
+ for (int i = length - 2; i >= 0; --i) {
+ x = (x << 1) | reader_.ReadBit();
+ }
+ level += x - 1;
+ }
+ if (is_dc_coefficient) {
+ *dc_category = (sign != 0) ? -1 : 1;
+ }
+ level &= 0xfffff;
+ *coefficient_level += level;
+ // Apply dequantization. Step 1 of section 7.12.3 in the spec.
+ int q = q_value;
+ if (quantizer_matrix != nullptr) {
+ q = RightShiftWithRounding(q * quantizer_matrix[pos], 5);
+ }
+ // The intermediate multiplication can exceed 32 bits, so it has to be
+ // performed by promoting one of the values to int64_t.
+ int32_t dequantized_value = (static_cast<int64_t>(q) * level) & 0xffffff;
+ dequantized_value >>= shift;
+ // At this point:
+ // * |dequantized_value| is always non-negative.
+ // * |sign| can be either 0 or 1.
+ // * min_value = -(max_value + 1).
+ // We need to apply the following:
+ // dequantized_value = sign ? -dequantized_value : dequantized_value;
+ // dequantized_value = Clip3(dequantized_value, min_value, max_value);
+ //
+ // Note that -x == ~(x - 1).
+ //
+ // Now, the above two lines can be done with a std::min and an xor as follows:
+ dequantized_value = std::min(dequantized_value - sign, max_value) ^ -sign;
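+ // For example, with dequantized_value == 5: sign == 0 gives
+ // std::min(5, max_value) ^ 0 == 5, while sign == 1 gives
+ // std::min(4, max_value) ^ -1 == ~4 == -5.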
+ residual_buffer[pos] = dequantized_value;
+ return true;
+}
+
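+// Reads the "coeff_br" (coefficient base range) symbols for a single
+// coefficient. Each symbol adds up to kCoeffBaseRangeSymbolCount - 1 to the
+// level; a symbol smaller than that maximum terminates the loop, so at most
+// kCoeffBaseRangeMaxIterations symbols are read.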
+int Tile::ReadCoeffBaseRange(uint16_t* cdf) {
+ int level = 0;
+ for (int j = 0; j < kCoeffBaseRangeMaxIterations; ++j) {
+ const int coeff_base_range =
+ reader_.ReadSymbol<kCoeffBaseRangeSymbolCount>(cdf);
+ level += coeff_base_range;
+ if (coeff_base_range < (kCoeffBaseRangeSymbolCount - 1)) break;
+ }
+ return level;
+}
+
+template <typename ResidualType>
+int Tile::ReadTransformCoefficients(const Block& block, Plane plane,
+ int start_x, int start_y,
+ TransformSize tx_size,
+ TransformType* const tx_type) {
+ const int x4 = DivideBy4(start_x);
+ const int y4 = DivideBy4(start_y);
+ const int w4 = kTransformWidth4x4[tx_size];
+ const int h4 = kTransformHeight4x4[tx_size];
+ const int tx_size_context = kTransformSizeContext[tx_size];
+ int context =
+ GetTransformAllZeroContext(block, plane, tx_size, x4, y4, w4, h4);
+ const bool all_zero = reader_.ReadSymbol(
+ symbol_decoder_context_.all_zero_cdf[tx_size_context][context]);
+ if (all_zero) {
+ if (plane == kPlaneY) {
+ SetTransformType(block, x4, y4, w4, h4, kTransformTypeDctDct,
+ transform_types_);
+ }
+ SetEntropyContexts(x4, y4, w4, h4, plane, 0, 0);
+ // This is not used in this case, so it can be set to any value.
+ *tx_type = kNumTransformTypes;
+ return 0;
+ }
+ const int tx_width = kTransformWidth[tx_size];
+ const int tx_height = kTransformHeight[tx_size];
+ const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
+ const int adjusted_tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
+ const int tx_padding =
+ (1 << adjusted_tx_width_log2) * kResidualPaddingVertical;
+ auto* residual = reinterpret_cast<ResidualType*>(*block.residual);
+ // Clear padding to avoid bottom boundary checks when parsing quantized
+ // coefficients.
+ memset(residual, 0, (tx_width * tx_height + tx_padding) * residual_size_);
+ uint8_t level_buffer[(32 + kResidualPaddingVertical) * 32];
+ memset(
+ level_buffer, 0,
+ kTransformWidth[adjusted_tx_size] * kTransformHeight[adjusted_tx_size] +
+ tx_padding);
+ const int clamped_tx_height = std::min(tx_height, 32);
+ if (plane == kPlaneY) {
+ ReadTransformType(block, x4, y4, tx_size);
+ }
+ BlockParameters& bp = *block.bp;
+ *tx_type = ComputeTransformType(block, plane, tx_size, x4, y4);
+ const int eob_multi_size = kEobMultiSizeLookup[tx_size];
+ const PlaneType plane_type = GetPlaneType(plane);
+ const TransformClass tx_class = GetTransformClass(*tx_type);
+ context = static_cast<int>(tx_class != kTransformClass2D);
+ int eob_pt = 1;
+ switch (eob_multi_size) {
+ case 0:
+ eob_pt += reader_.ReadSymbol<kEobPt16SymbolCount>(
+ symbol_decoder_context_.eob_pt_16_cdf[plane_type][context]);
+ break;
+ case 1:
+ eob_pt += reader_.ReadSymbol<kEobPt32SymbolCount>(
+ symbol_decoder_context_.eob_pt_32_cdf[plane_type][context]);
+ break;
+ case 2:
+ eob_pt += reader_.ReadSymbol<kEobPt64SymbolCount>(
+ symbol_decoder_context_.eob_pt_64_cdf[plane_type][context]);
+ break;
+ case 3:
+ eob_pt += reader_.ReadSymbol<kEobPt128SymbolCount>(
+ symbol_decoder_context_.eob_pt_128_cdf[plane_type][context]);
+ break;
+ case 4:
+ eob_pt += reader_.ReadSymbol<kEobPt256SymbolCount>(
+ symbol_decoder_context_.eob_pt_256_cdf[plane_type][context]);
+ break;
+ case 5:
+ eob_pt += reader_.ReadSymbol<kEobPt512SymbolCount>(
+ symbol_decoder_context_.eob_pt_512_cdf[plane_type]);
+ break;
+ case 6:
+ default:
+ eob_pt += reader_.ReadSymbol<kEobPt1024SymbolCount>(
+ symbol_decoder_context_.eob_pt_1024_cdf[plane_type]);
+ break;
+ }
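+ // Map the end-of-block position class to its base value: eob_pt 1 -> 1,
+ // 2 -> 2, 3 -> 3, 4 -> 5, 5 -> 9, ..., i.e. (1 << (eob_pt - 2)) + 1 for
+ // eob_pt >= 2.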
+ int eob = (eob_pt < 2) ? eob_pt : ((1 << (eob_pt - 2)) + 1);
+ if (eob_pt >= 3) {
+ context = eob_pt - 3;
+ const bool eob_extra = reader_.ReadSymbol(
+ symbol_decoder_context_
+ .eob_extra_cdf[tx_size_context][plane_type][context]);
+ if (eob_extra) eob += 1 << (eob_pt - 3);
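+ // The remaining eob_pt - 3 bits are read as literals, each adding a
+ // successively smaller power of two to |eob|.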
+ for (int i = 1; i < eob_pt - 2; ++i) {
+ assert(eob_pt - i >= 3);
+ assert(eob_pt <= kEobPt1024SymbolCount);
+ if (static_cast<bool>(reader_.ReadBit())) {
+ eob += 1 << (eob_pt - i - 3);
+ }
+ }
+ }
+ const uint16_t* scan = kScan[tx_class][tx_size];
+ const int clamped_tx_size_context = std::min(tx_size_context, 3);
+ auto coeff_base_range_cdf =
+ symbol_decoder_context_
+ .coeff_base_range_cdf[clamped_tx_size_context][plane_type];
+ // Read the last coefficient.
+ {
+ context = GetCoeffBaseContextEob(tx_size, eob - 1);
+ const uint16_t pos = scan[eob - 1];
+ int level =
+ 1 + reader_.ReadSymbol<kCoeffBaseEobSymbolCount>(
+ symbol_decoder_context_
+ .coeff_base_eob_cdf[tx_size_context][plane_type][context]);
+ level_buffer[pos] = level;
+ if (level > kNumQuantizerBaseLevels) {
+ level +=
+ ReadCoeffBaseRange(coeff_base_range_cdf[GetCoeffBaseRangeContextEob(
+ adjusted_tx_width_log2, pos, tx_class)]);
+ }
+ residual[pos] = level;
+ }
+ if (eob > 1) {
+ // Read all the other coefficients.
+ // Lookup used to call the right variant of ReadCoeffBase*() based on the
+ // transform class.
+ static constexpr void (Tile::*kGetCoeffBaseFunc[])(
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* quantized_buffer,
+ uint8_t* level_buffer) = {&Tile::ReadCoeffBase2D<ResidualType>,
+ &Tile::ReadCoeffBaseHorizontal<ResidualType>,
+ &Tile::ReadCoeffBaseVertical<ResidualType>};
+ (this->*kGetCoeffBaseFunc[tx_class])(
+ scan, tx_size, adjusted_tx_width_log2, eob,
+ symbol_decoder_context_.coeff_base_cdf[tx_size_context][plane_type],
+ coeff_base_range_cdf, residual, level_buffer);
+ }
+ const int max_value = (1 << (7 + sequence_header_.color_config.bitdepth)) - 1;
+ const int current_quantizer_index = GetQIndex(
+ frame_header_.segmentation, bp.segment_id, current_quantizer_index_);
+ const int dc_q_value = quantizer_.GetDcValue(plane, current_quantizer_index);
+ const int ac_q_value = quantizer_.GetAcValue(plane, current_quantizer_index);
+ const int shift = kQuantizationShift[tx_size];
+ const uint8_t* const quantizer_matrix =
+ (frame_header_.quantizer.use_matrix &&
+ *tx_type < kTransformTypeIdentityIdentity &&
+ !frame_header_.segmentation.lossless[bp.segment_id] &&
+ frame_header_.quantizer.matrix_level[plane] < 15)
+ ? quantizer_matrix_[frame_header_.quantizer.matrix_level[plane]]
+ [plane_type][adjusted_tx_size]
+ .get()
+ : nullptr;
+ int coefficient_level = 0;
+ int8_t dc_category = 0;
+ uint16_t* const dc_sign_cdf =
+ (residual[0] != 0)
+ ? symbol_decoder_context_.dc_sign_cdf[plane_type][GetDcSignContext(
+ x4, y4, w4, h4, plane)]
+ : nullptr;
+ assert(scan[0] == 0);
+ if (!ReadSignAndApplyDequantization<ResidualType, /*is_dc_coefficient=*/true>(
+ scan, 0, dc_q_value, quantizer_matrix, shift, max_value, dc_sign_cdf,
+ &dc_category, &coefficient_level, residual)) {
+ return -1;
+ }
+ if (eob > 1) {
+ int i = 1;
+ do {
+ if (!ReadSignAndApplyDequantization<ResidualType,
+ /*is_dc_coefficient=*/false>(
+ scan, i, ac_q_value, quantizer_matrix, shift, max_value, nullptr,
+ nullptr, &coefficient_level, residual)) {
+ return -1;
+ }
+ } while (++i < eob);
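+ // For transforms with width 64, only the left 32 columns are coded, so the
+ // parsed coefficients are repositioned into the 64-wide residual layout
+ // here (a no-op for other transform widths).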
+ MoveCoefficientsForTxWidth64(clamped_tx_height, tx_width, residual);
+ }
+ SetEntropyContexts(x4, y4, w4, h4, plane, std::min(4, coefficient_level),
+ dc_category);
+ if (split_parse_and_decode_) {
+ *block.residual += tx_width * tx_height * residual_size_;
+ }
+ return eob;
+}
+
+// CALL_BITDEPTH_FUNCTION is a macro that calls the appropriate template
+// |function| depending on the value of |sequence_header_.color_config.bitdepth|
+// with the variadic arguments.
+#if LIBGAV1_MAX_BITDEPTH >= 10
+#define CALL_BITDEPTH_FUNCTION(function, ...) \
+ do { \
+ if (sequence_header_.color_config.bitdepth > 8) { \
+ function<uint16_t>(__VA_ARGS__); \
+ } else { \
+ function<uint8_t>(__VA_ARGS__); \
+ } \
+ } while (false)
+#else
+#define CALL_BITDEPTH_FUNCTION(function, ...) \
+ do { \
+ function<uint8_t>(__VA_ARGS__); \
+ } while (false)
+#endif
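+// For example, CALL_BITDEPTH_FUNCTION(IntraPrediction, block, plane, ...)
+// calls IntraPrediction<uint16_t>(block, plane, ...) when the bitdepth is
+// greater than 8 and IntraPrediction<uint8_t>(block, plane, ...) otherwise.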
+
+bool Tile::TransformBlock(const Block& block, Plane plane, int base_x,
+ int base_y, TransformSize tx_size, int x, int y,
+ ProcessingMode mode) {
+ BlockParameters& bp = *block.bp;
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ const int start_x = base_x + MultiplyBy4(x);
+ const int start_y = base_y + MultiplyBy4(y);
+ const int max_x = MultiplyBy4(frame_header_.columns4x4) >> subsampling_x;
+ const int max_y = MultiplyBy4(frame_header_.rows4x4) >> subsampling_y;
+ if (start_x >= max_x || start_y >= max_y) return true;
+ const int row = DivideBy4(start_y << subsampling_y);
+ const int column = DivideBy4(start_x << subsampling_x);
+ const int mask = sequence_header_.use_128x128_superblock ? 31 : 15;
+ const int sub_block_row4x4 = row & mask;
+ const int sub_block_column4x4 = column & mask;
+ const int step_x = kTransformWidth4x4[tx_size];
+ const int step_y = kTransformHeight4x4[tx_size];
+ const bool do_decode = mode == kProcessingModeDecodeOnly ||
+ mode == kProcessingModeParseAndDecode;
+ if (do_decode && !bp.is_inter) {
+ if (bp.palette_mode_info.size[GetPlaneType(plane)] > 0) {
+ CALL_BITDEPTH_FUNCTION(PalettePrediction, block, plane, start_x, start_y,
+ x, y, tx_size);
+ } else {
+ const PredictionMode mode =
+ (plane == kPlaneY)
+ ? bp.y_mode
+ : (bp.uv_mode == kPredictionModeChromaFromLuma ? kPredictionModeDc
+ : bp.uv_mode);
+ const int tr_row4x4 = (sub_block_row4x4 >> subsampling_y);
+ const int tr_column4x4 =
+ (sub_block_column4x4 >> subsampling_x) + step_x + 1;
+ const int bl_row4x4 = (sub_block_row4x4 >> subsampling_y) + step_y + 1;
+ const int bl_column4x4 = (sub_block_column4x4 >> subsampling_x);
+ const bool has_left = x > 0 || block.left_available[plane];
+ const bool has_top = y > 0 || block.top_available[plane];
+
+ CALL_BITDEPTH_FUNCTION(
+ IntraPrediction, block, plane, start_x, start_y, has_left, has_top,
+ block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
+ block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
+ mode, tx_size);
+ if (plane != kPlaneY && bp.uv_mode == kPredictionModeChromaFromLuma) {
+ CALL_BITDEPTH_FUNCTION(ChromaFromLumaPrediction, block, plane, start_x,
+ start_y, tx_size);
+ }
+ }
+ if (plane == kPlaneY) {
+ block.bp->prediction_parameters->max_luma_width =
+ start_x + MultiplyBy4(step_x);
+ block.bp->prediction_parameters->max_luma_height =
+ start_y + MultiplyBy4(step_y);
+ block.scratch_buffer->cfl_luma_buffer_valid = false;
+ }
+ }
+ if (!bp.skip) {
+ const int sb_row_index = SuperBlockRowIndex(block.row4x4);
+ const int sb_column_index = SuperBlockColumnIndex(block.column4x4);
+ if (mode == kProcessingModeDecodeOnly) {
+ TransformParameterQueue& tx_params =
+ *residual_buffer_threaded_[sb_row_index][sb_column_index]
+ ->transform_parameters();
+ ReconstructBlock(block, plane, start_x, start_y, tx_size,
+ tx_params.Type(), tx_params.NonZeroCoeffCount());
+ tx_params.Pop();
+ } else {
+ TransformType tx_type;
+ int non_zero_coeff_count;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (sequence_header_.color_config.bitdepth > 8) {
+ non_zero_coeff_count = ReadTransformCoefficients<int32_t>(
+ block, plane, start_x, start_y, tx_size, &tx_type);
+ } else // NOLINT
+#endif
+ {
+ non_zero_coeff_count = ReadTransformCoefficients<int16_t>(
+ block, plane, start_x, start_y, tx_size, &tx_type);
+ }
+ if (non_zero_coeff_count < 0) return false;
+ if (mode == kProcessingModeParseAndDecode) {
+ ReconstructBlock(block, plane, start_x, start_y, tx_size, tx_type,
+ non_zero_coeff_count);
+ } else {
+ assert(mode == kProcessingModeParseOnly);
+ residual_buffer_threaded_[sb_row_index][sb_column_index]
+ ->transform_parameters()
+ ->Push(non_zero_coeff_count, tx_type);
+ }
+ }
+ }
+ if (do_decode) {
+ bool* block_decoded =
+ &block.scratch_buffer
+ ->block_decoded[plane][(sub_block_row4x4 >> subsampling_y) + 1]
+ [(sub_block_column4x4 >> subsampling_x) + 1];
+ SetBlockValues<bool>(step_y, step_x, true, block_decoded,
+ TileScratchBuffer::kBlockDecodedStride);
+ }
+ return true;
+}
+
+bool Tile::TransformTree(const Block& block, int start_x, int start_y,
+ BlockSize plane_size, ProcessingMode mode) {
+ assert(plane_size <= kBlock64x64);
+ // The branching factor is 4 and the maximum depth is 4, so the maximum
+ // stack size required is (4 - 1) * 4 + 1 = 13.
+ Stack<TransformTreeNode, 13> stack;
+ // It is okay to cast BlockSize to TransformSize here since the enums are
+ // equivalent for all BlockSize values <= kBlock64x64.
+ stack.Push(TransformTreeNode(start_x, start_y,
+ static_cast<TransformSize>(plane_size)));
+
+ do {
+ TransformTreeNode node = stack.Pop();
+ const int row = DivideBy4(node.y);
+ const int column = DivideBy4(node.x);
+ if (row >= frame_header_.rows4x4 || column >= frame_header_.columns4x4) {
+ continue;
+ }
+ const TransformSize inter_tx_size = inter_transform_sizes_[row][column];
+ const int width = kTransformWidth[node.tx_size];
+ const int height = kTransformHeight[node.tx_size];
+ if (width <= kTransformWidth[inter_tx_size] &&
+ height <= kTransformHeight[inter_tx_size]) {
+ if (!TransformBlock(block, kPlaneY, node.x, node.y, node.tx_size, 0, 0,
+ mode)) {
+ return false;
+ }
+ continue;
+ }
+ // The split transform size lookup gives the transform size that we should
+ // push onto the stack.
+ // if (width > height) => transform size whose width is half.
+ // if (width < height) => transform size whose height is half.
+ // if (width == height) => transform size whose width and height are half.
+ const TransformSize split_tx_size = kSplitTransformSize[node.tx_size];
+ const int half_width = DivideBy2(width);
+ if (width > height) {
+ stack.Push(TransformTreeNode(node.x + half_width, node.y, split_tx_size));
+ stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
+ continue;
+ }
+ const int half_height = DivideBy2(height);
+ if (width < height) {
+ stack.Push(
+ TransformTreeNode(node.x, node.y + half_height, split_tx_size));
+ stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
+ continue;
+ }
+ stack.Push(TransformTreeNode(node.x + half_width, node.y + half_height,
+ split_tx_size));
+ stack.Push(TransformTreeNode(node.x, node.y + half_height, split_tx_size));
+ stack.Push(TransformTreeNode(node.x + half_width, node.y, split_tx_size));
+ stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
+ } while (!stack.Empty());
+ return true;
+}
+
+void Tile::ReconstructBlock(const Block& block, Plane plane, int start_x,
+ int start_y, TransformSize tx_size,
+ TransformType tx_type, int non_zero_coeff_count) {
+ // Reconstruction process. Steps 2 and 3 of Section 7.12.3 in the spec.
+ assert(non_zero_coeff_count >= 0);
+ if (non_zero_coeff_count == 0) return;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (sequence_header_.color_config.bitdepth > 8) {
+ Array2DView<uint16_t> buffer(
+ buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t),
+ reinterpret_cast<uint16_t*>(&buffer_[plane][0][0]));
+ Reconstruct(dsp_, tx_type, tx_size,
+ frame_header_.segmentation.lossless[block.bp->segment_id],
+ reinterpret_cast<int32_t*>(*block.residual), start_x, start_y,
+ &buffer, non_zero_coeff_count);
+ } else // NOLINT
+#endif
+ {
+ Reconstruct(dsp_, tx_type, tx_size,
+ frame_header_.segmentation.lossless[block.bp->segment_id],
+ reinterpret_cast<int16_t*>(*block.residual), start_x, start_y,
+ &buffer_[plane], non_zero_coeff_count);
+ }
+ if (split_parse_and_decode_) {
+ *block.residual +=
+ kTransformWidth[tx_size] * kTransformHeight[tx_size] * residual_size_;
+ }
+}
+
+bool Tile::Residual(const Block& block, ProcessingMode mode) {
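+ // Blocks larger than 64x64 are processed in 64x64 chunks; the residual of
+ // each chunk is handled independently.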
+ const int width_chunks = std::max(1, block.width >> 6);
+ const int height_chunks = std::max(1, block.height >> 6);
+ const BlockSize size_chunk4x4 =
+ (width_chunks > 1 || height_chunks > 1) ? kBlock64x64 : block.size;
+ const BlockParameters& bp = *block.bp;
+ for (int chunk_y = 0; chunk_y < height_chunks; ++chunk_y) {
+ for (int chunk_x = 0; chunk_x < width_chunks; ++chunk_x) {
+ const int num_planes = block.HasChroma() ? PlaneCount() : 1;
+ int plane = kPlaneY;
+ do {
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ // For the Y plane, when lossless is true, |bp.transform_size| is always
+ // kTransformSize4x4. So we can simply use |bp.transform_size| here as
+ // the Y plane's transform size (part of Section 5.11.37 in the spec).
+ const TransformSize tx_size =
+ (plane == kPlaneY) ? bp.transform_size : bp.uv_transform_size;
+ const BlockSize plane_size =
+ kPlaneResidualSize[size_chunk4x4][subsampling_x][subsampling_y];
+ assert(plane_size != kBlockInvalid);
+ if (bp.is_inter &&
+ !frame_header_.segmentation.lossless[bp.segment_id] &&
+ plane == kPlaneY) {
+ const int row_chunk4x4 = block.row4x4 + MultiplyBy16(chunk_y);
+ const int column_chunk4x4 = block.column4x4 + MultiplyBy16(chunk_x);
+ const int base_x = MultiplyBy4(column_chunk4x4 >> subsampling_x);
+ const int base_y = MultiplyBy4(row_chunk4x4 >> subsampling_y);
+ if (!TransformTree(block, base_x, base_y, plane_size, mode)) {
+ return false;
+ }
+ } else {
+ const int base_x = MultiplyBy4(block.column4x4 >> subsampling_x);
+ const int base_y = MultiplyBy4(block.row4x4 >> subsampling_y);
+ const int step_x = kTransformWidth4x4[tx_size];
+ const int step_y = kTransformHeight4x4[tx_size];
+ const int num4x4_wide = kNum4x4BlocksWide[plane_size];
+ const int num4x4_high = kNum4x4BlocksHigh[plane_size];
+ for (int y = 0; y < num4x4_high; y += step_y) {
+ for (int x = 0; x < num4x4_wide; x += step_x) {
+ if (!TransformBlock(
+ block, static_cast<Plane>(plane), base_x, base_y, tx_size,
+ x + (MultiplyBy16(chunk_x) >> subsampling_x),
+ y + (MultiplyBy16(chunk_y) >> subsampling_y), mode)) {
+ return false;
+ }
+ }
+ }
+ }
+ } while (++plane < num_planes);
+ }
+ }
+ return true;
+}
+
+// The purpose of this function is to limit the maximum size of motion vectors
+// and also, if use_intra_block_copy is true, to additionally constrain the
+// motion vector so that the data is fetched from parts of the tile that have
+// already been decoded and are not too close to the current block (in order to
+// make a pipelined decoder implementation feasible).
+bool Tile::IsMvValid(const Block& block, bool is_compound) const {
+ const BlockParameters& bp = *block.bp;
+ for (int i = 0; i < 1 + static_cast<int>(is_compound); ++i) {
+ for (int mv_component : bp.mv.mv[i].mv) {
+ if (std::abs(mv_component) >= (1 << 14)) {
+ return false;
+ }
+ }
+ }
+ if (!block.bp->prediction_parameters->use_intra_block_copy) {
+ return true;
+ }
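+ // Intra block copy motion vectors must be whole-pel. The components are in
+ // 1/8-pel units, so the low 3 bits of both 16-bit halves of |mv32| must be
+ // zero.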
+ if ((bp.mv.mv[0].mv32 & 0x00070007) != 0) {
+ return false;
+ }
+ const int delta_row = bp.mv.mv[0].mv[0] >> 3;
+ const int delta_column = bp.mv.mv[0].mv[1] >> 3;
+ int src_top_edge = MultiplyBy4(block.row4x4) + delta_row;
+ int src_left_edge = MultiplyBy4(block.column4x4) + delta_column;
+ const int src_bottom_edge = src_top_edge + block.height;
+ const int src_right_edge = src_left_edge + block.width;
+ if (block.HasChroma()) {
+ if (block.width < 8 && subsampling_x_[kPlaneU] != 0) {
+ src_left_edge -= 4;
+ }
+ if (block.height < 8 && subsampling_y_[kPlaneU] != 0) {
+ src_top_edge -= 4;
+ }
+ }
+ if (src_top_edge < MultiplyBy4(row4x4_start_) ||
+ src_left_edge < MultiplyBy4(column4x4_start_) ||
+ src_bottom_edge > MultiplyBy4(row4x4_end_) ||
+ src_right_edge > MultiplyBy4(column4x4_end_)) {
+ return false;
+ }
+ // sb_height_log2 = use_128x128_superblock ? log2(128) : log2(64)
+ const int sb_height_log2 =
+ 6 + static_cast<int>(sequence_header_.use_128x128_superblock);
+ const int active_sb_row = MultiplyBy4(block.row4x4) >> sb_height_log2;
+ const int active_64x64_block_column = MultiplyBy4(block.column4x4) >> 6;
+ const int src_sb_row = (src_bottom_edge - 1) >> sb_height_log2;
+ const int src_64x64_block_column = (src_right_edge - 1) >> 6;
+ const int total_64x64_blocks_per_row =
+ ((column4x4_end_ - column4x4_start_ - 1) >> 4) + 1;
+ const int active_64x64_block =
+ active_sb_row * total_64x64_blocks_per_row + active_64x64_block_column;
+ const int src_64x64_block =
+ src_sb_row * total_64x64_blocks_per_row + src_64x64_block_column;
+ if (src_64x64_block >= active_64x64_block - kIntraBlockCopyDelay64x64Blocks) {
+ return false;
+ }
+
+ // Wavefront constraint: use only top left area of frame for reference.
+ if (src_sb_row > active_sb_row) return false;
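+ // Each superblock row that the source block sits above the active row
+ // widens the allowed reference area to the right by |gradient| 64x64 block
+ // columns.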
+ const int gradient =
+ 1 + kIntraBlockCopyDelay64x64Blocks +
+ static_cast<int>(sequence_header_.use_128x128_superblock);
+ const int wavefront_offset = gradient * (active_sb_row - src_sb_row);
+ return src_64x64_block_column < active_64x64_block_column -
+ kIntraBlockCopyDelay64x64Blocks +
+ wavefront_offset;
+}
+
+bool Tile::AssignInterMv(const Block& block, bool is_compound) {
+ int min[2];
+ int max[2];
+ GetClampParameters(block, min, max);
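+ // Index 0 of |min| and |max| holds the row clamping bounds and index 1 the
+ // column clamping bounds, matching the mv[] component order.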
+ BlockParameters& bp = *block.bp;
+ const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+ if (is_compound) {
+ for (int i = 0; i < 2; ++i) {
+ const PredictionMode mode = GetSinglePredictionMode(i, bp.y_mode);
+ MotionVector predicted_mv;
+ if (mode == kPredictionModeGlobalMv) {
+ predicted_mv = prediction_parameters.global_mv[i];
+ } else {
+ const int ref_mv_index = (mode == kPredictionModeNearestMv ||
+ (mode == kPredictionModeNewMv &&
+ prediction_parameters.ref_mv_count <= 1))
+ ? 0
+ : prediction_parameters.ref_mv_index;
+ predicted_mv = prediction_parameters.reference_mv(ref_mv_index, i);
+ if (ref_mv_index < prediction_parameters.ref_mv_count) {
+ predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]);
+ predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]);
+ }
+ }
+ if (mode == kPredictionModeNewMv) {
+ ReadMotionVector(block, i);
+ bp.mv.mv[i].mv[0] += predicted_mv.mv[0];
+ bp.mv.mv[i].mv[1] += predicted_mv.mv[1];
+ } else {
+ bp.mv.mv[i] = predicted_mv;
+ }
+ }
+ } else {
+ const PredictionMode mode = GetSinglePredictionMode(0, bp.y_mode);
+ MotionVector predicted_mv;
+ if (mode == kPredictionModeGlobalMv) {
+ predicted_mv = prediction_parameters.global_mv[0];
+ } else {
+ const int ref_mv_index = (mode == kPredictionModeNearestMv ||
+ (mode == kPredictionModeNewMv &&
+ prediction_parameters.ref_mv_count <= 1))
+ ? 0
+ : prediction_parameters.ref_mv_index;
+ predicted_mv = prediction_parameters.reference_mv(ref_mv_index);
+ if (ref_mv_index < prediction_parameters.ref_mv_count) {
+ predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]);
+ predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]);
+ }
+ }
+ if (mode == kPredictionModeNewMv) {
+ ReadMotionVector(block, 0);
+ bp.mv.mv[0].mv[0] += predicted_mv.mv[0];
+ bp.mv.mv[0].mv[1] += predicted_mv.mv[1];
+ } else {
+ bp.mv.mv[0] = predicted_mv;
+ }
+ }
+ return IsMvValid(block, is_compound);
+}
+
+bool Tile::AssignIntraMv(const Block& block) {
+ // TODO(linfengz): Check if the clamping process is necessary.
+ int min[2];
+ int max[2];
+ GetClampParameters(block, min, max);
+ BlockParameters& bp = *block.bp;
+ const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+ const MotionVector& ref_mv_0 = prediction_parameters.reference_mv(0);
+ ReadMotionVector(block, 0);
+ if (ref_mv_0.mv32 == 0) {
+ const MotionVector& ref_mv_1 = prediction_parameters.reference_mv(1);
+ if (ref_mv_1.mv32 == 0) {
+ const int super_block_size4x4 = kNum4x4BlocksHigh[SuperBlockSize()];
+ if (block.row4x4 - super_block_size4x4 < row4x4_start_) {
+ bp.mv.mv[0].mv[1] -= MultiplyBy32(super_block_size4x4);
+ bp.mv.mv[0].mv[1] -= MultiplyBy8(kIntraBlockCopyDelayPixels);
+ } else {
+ bp.mv.mv[0].mv[0] -= MultiplyBy32(super_block_size4x4);
+ }
+ } else {
+ bp.mv.mv[0].mv[0] += Clip3(ref_mv_1.mv[0], min[0], max[0]);
+ bp.mv.mv[0].mv[1] += Clip3(ref_mv_1.mv[1], min[1], max[1]);
+ }
+ } else {
+ bp.mv.mv[0].mv[0] += Clip3(ref_mv_0.mv[0], min[0], max[0]);
+ bp.mv.mv[0].mv[1] += Clip3(ref_mv_0.mv[1], min[1], max[1]);
+ }
+ return IsMvValid(block, /*is_compound=*/false);
+}
+
+void Tile::ResetEntropyContext(const Block& block) {
+ const int num_planes = block.HasChroma() ? PlaneCount() : 1;
+ int plane = kPlaneY;
+ do {
+ const int subsampling_x = subsampling_x_[plane];
+ const int start_x = block.column4x4 >> subsampling_x;
+ const int end_x =
+ std::min((block.column4x4 + block.width4x4) >> subsampling_x,
+ frame_header_.columns4x4);
+ memset(&coefficient_levels_[kEntropyContextTop][plane][start_x], 0,
+ end_x - start_x);
+ memset(&dc_categories_[kEntropyContextTop][plane][start_x], 0,
+ end_x - start_x);
+ const int subsampling_y = subsampling_y_[plane];
+ const int start_y = block.row4x4 >> subsampling_y;
+ const int end_y =
+ std::min((block.row4x4 + block.height4x4) >> subsampling_y,
+ frame_header_.rows4x4);
+ memset(&coefficient_levels_[kEntropyContextLeft][plane][start_y], 0,
+ end_y - start_y);
+ memset(&dc_categories_[kEntropyContextLeft][plane][start_y], 0,
+ end_y - start_y);
+ } while (++plane < num_planes);
+}
+
+bool Tile::ComputePrediction(const Block& block) {
+ const BlockParameters& bp = *block.bp;
+ if (!bp.is_inter) return true;
+ const int mask =
+ (1 << (4 + static_cast<int>(sequence_header_.use_128x128_superblock))) -
+ 1;
+ const int sub_block_row4x4 = block.row4x4 & mask;
+ const int sub_block_column4x4 = block.column4x4 & mask;
+ const int plane_count = block.HasChroma() ? PlaneCount() : 1;
+ // Tracks whether this block applies local warping. The state is determined
+ // in the Y plane and carried over for use in the U/V planes.
+ // However, the U/V planes will not apply warping when the block size is
+ // smaller than 8x8, even if this variable is true.
+ bool is_local_valid = false;
+ // Local warping parameters, similar usage as is_local_valid.
+ GlobalMotion local_warp_params;
+ int plane = kPlaneY;
+ do {
+ const int8_t subsampling_x = subsampling_x_[plane];
+ const int8_t subsampling_y = subsampling_y_[plane];
+ const BlockSize plane_size = block.residual_size[plane];
+ const int block_width4x4 = kNum4x4BlocksWide[plane_size];
+ const int block_height4x4 = kNum4x4BlocksHigh[plane_size];
+ const int block_width = MultiplyBy4(block_width4x4);
+ const int block_height = MultiplyBy4(block_height4x4);
+ const int base_x = MultiplyBy4(block.column4x4 >> subsampling_x);
+ const int base_y = MultiplyBy4(block.row4x4 >> subsampling_y);
+ if (bp.reference_frame[1] == kReferenceFrameIntra) {
+ const int tr_row4x4 = sub_block_row4x4 >> subsampling_y;
+ const int tr_column4x4 =
+ (sub_block_column4x4 >> subsampling_x) + block_width4x4 + 1;
+ const int bl_row4x4 =
+ (sub_block_row4x4 >> subsampling_y) + block_height4x4;
+ const int bl_column4x4 = (sub_block_column4x4 >> subsampling_x) + 1;
+ const TransformSize tx_size =
+ k4x4SizeToTransformSize[k4x4WidthLog2[plane_size]]
+ [k4x4HeightLog2[plane_size]];
+ const bool has_left = block.left_available[plane];
+ const bool has_top = block.top_available[plane];
+ CALL_BITDEPTH_FUNCTION(
+ IntraPrediction, block, static_cast<Plane>(plane), base_x, base_y,
+ has_left, has_top,
+ block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
+ block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
+ kInterIntraToIntraMode[block.bp->prediction_parameters
+ ->inter_intra_mode],
+ tx_size);
+ }
+ int candidate_row = block.row4x4;
+ int candidate_column = block.column4x4;
+ bool some_use_intra = bp.reference_frame[0] == kReferenceFrameIntra;
+ if (!some_use_intra && plane != 0) {
+ candidate_row = (candidate_row >> subsampling_y) << subsampling_y;
+ candidate_column = (candidate_column >> subsampling_x) << subsampling_x;
+ if (candidate_row != block.row4x4) {
+ // Top block.
+ const BlockParameters& bp_top =
+ *block_parameters_holder_.Find(candidate_row, block.column4x4);
+ some_use_intra = bp_top.reference_frame[0] == kReferenceFrameIntra;
+ if (!some_use_intra && candidate_column != block.column4x4) {
+ // Top-left block.
+ const BlockParameters& bp_top_left =
+ *block_parameters_holder_.Find(candidate_row, candidate_column);
+ some_use_intra =
+ bp_top_left.reference_frame[0] == kReferenceFrameIntra;
+ }
+ }
+ if (!some_use_intra && candidate_column != block.column4x4) {
+ // Left block.
+ const BlockParameters& bp_left =
+ *block_parameters_holder_.Find(block.row4x4, candidate_column);
+ some_use_intra = bp_left.reference_frame[0] == kReferenceFrameIntra;
+ }
+ }
+ int prediction_width;
+ int prediction_height;
+ if (some_use_intra) {
+ candidate_row = block.row4x4;
+ candidate_column = block.column4x4;
+ prediction_width = block_width;
+ prediction_height = block_height;
+ } else {
+ prediction_width = block.width >> subsampling_x;
+ prediction_height = block.height >> subsampling_y;
+ }
+ int r = 0;
+ int y = 0;
+ do {
+ int c = 0;
+ int x = 0;
+ do {
+ if (!InterPrediction(block, static_cast<Plane>(plane), base_x + x,
+ base_y + y, prediction_width, prediction_height,
+ candidate_row + r, candidate_column + c,
+ &is_local_valid, &local_warp_params)) {
+ return false;
+ }
+ ++c;
+ x += prediction_width;
+ } while (x < block_width);
+ ++r;
+ y += prediction_height;
+ } while (y < block_height);
+ } while (++plane < plane_count);
+ return true;
+}
+
+#undef CALL_BITDEPTH_FUNCTION
+
+void Tile::PopulateDeblockFilterLevel(const Block& block) {
+ if (!post_filter_.DoDeblock()) return;
+ BlockParameters& bp = *block.bp;
+ const int mode_id =
+ static_cast<int>(kPredictionModeDeltasMask.Contains(bp.y_mode));
+ for (int i = 0; i < kFrameLfCount; ++i) {
+ if (delta_lf_all_zero_) {
+ bp.deblock_filter_level[i] = post_filter_.GetZeroDeltaDeblockFilterLevel(
+ bp.segment_id, i, bp.reference_frame[0], mode_id);
+ } else {
+ bp.deblock_filter_level[i] =
+ deblock_filter_levels_[bp.segment_id][i][bp.reference_frame[0]]
+ [mode_id];
+ }
+ }
+}
+
+bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
+ ParameterTree* const tree,
+ TileScratchBuffer* const scratch_buffer,
+ ResidualPtr* residual) {
+ // Do not process the block if the starting point is beyond the visible frame.
+ // This is equivalent to the has_row/has_column check in the
+ // decode_partition() section of the spec when partition equals
+ // kPartitionHorizontal or kPartitionVertical.
+ if (row4x4 >= frame_header_.rows4x4 ||
+ column4x4 >= frame_header_.columns4x4) {
+ return true;
+ }
+ BlockParameters& bp = *tree->parameters();
+ block_parameters_holder_.FillCache(row4x4, column4x4, block_size, &bp);
+ Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual);
+ bp.size = block_size;
+ bp.prediction_parameters =
+ split_parse_and_decode_ ? std::unique_ptr<PredictionParameters>(
+ new (std::nothrow) PredictionParameters())
+ : std::move(prediction_parameters_);
+ if (bp.prediction_parameters == nullptr) return false;
+ if (!DecodeModeInfo(block)) return false;
+ bp.is_global_mv_block = (bp.y_mode == kPredictionModeGlobalMv ||
+ bp.y_mode == kPredictionModeGlobalGlobalMv) &&
+ !IsBlockDimension4(bp.size);
+ PopulateDeblockFilterLevel(block);
+ if (!ReadPaletteTokens(block)) return false;
+ DecodeTransformSize(block);
+ // Part of Section 5.11.37 in the spec (implemented as a simple lookup).
+ bp.uv_transform_size = frame_header_.segmentation.lossless[bp.segment_id]
+ ? kTransformSize4x4
+ : kUVTransformSize[block.residual_size[kPlaneU]];
+ if (bp.skip) ResetEntropyContext(block);
+ if (split_parse_and_decode_) {
+ if (!Residual(block, kProcessingModeParseOnly)) return false;
+ } else {
+ if (!ComputePrediction(block) ||
+ !Residual(block, kProcessingModeParseAndDecode)) {
+ return false;
+ }
+ }
+ // If frame_header_.segmentation.enabled is false, bp.segment_id is 0 for all
+ // blocks. We don't need to save bp.segment_id in the current frame because
+ // the current frame's segmentation map will be cleared to all 0s.
+ //
+ // If frame_header_.segmentation.enabled is true and
+ // frame_header_.segmentation.update_map is false, we will copy the previous
+ // frame's segmentation map to the current frame. So we don't need to save
+ // bp.segment_id in the current frame either.
+ if (frame_header_.segmentation.enabled &&
+ frame_header_.segmentation.update_map) {
+ const int x_limit = std::min(frame_header_.columns4x4 - column4x4,
+ static_cast<int>(block.width4x4));
+ const int y_limit = std::min(frame_header_.rows4x4 - row4x4,
+ static_cast<int>(block.height4x4));
+ current_frame_.segmentation_map()->FillBlock(row4x4, column4x4, x_limit,
+ y_limit, bp.segment_id);
+ }
+ StoreMotionFieldMvsIntoCurrentFrame(block);
+ if (!split_parse_and_decode_) {
+ prediction_parameters_ = std::move(bp.prediction_parameters);
+ }
+ return true;
+}
+
+bool Tile::DecodeBlock(ParameterTree* const tree,
+ TileScratchBuffer* const scratch_buffer,
+ ResidualPtr* residual) {
+ const int row4x4 = tree->row4x4();
+ const int column4x4 = tree->column4x4();
+ if (row4x4 >= frame_header_.rows4x4 ||
+ column4x4 >= frame_header_.columns4x4) {
+ return true;
+ }
+ const BlockSize block_size = tree->block_size();
+ Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual);
+ if (!ComputePrediction(block) ||
+ !Residual(block, kProcessingModeDecodeOnly)) {
+ return false;
+ }
+ block.bp->prediction_parameters.reset(nullptr);
+ return true;
+}
+
+bool Tile::ProcessPartition(int row4x4_start, int column4x4_start,
+ ParameterTree* const root,
+ TileScratchBuffer* const scratch_buffer,
+ ResidualPtr* residual) {
+ Stack<ParameterTree*, kDfsStackSize> stack;
+
+ // Set up the first iteration.
+ ParameterTree* node = root;
+ int row4x4 = row4x4_start;
+ int column4x4 = column4x4_start;
+ BlockSize block_size = SuperBlockSize();
+
+ // DFS loop. If it sees a terminal node (leaf node), ProcessBlock is invoked.
+ // Otherwise, the children are pushed into the stack for future processing.
+ do {
+ if (!stack.Empty()) {
+ // Set up subsequent iterations.
+ node = stack.Pop();
+ row4x4 = node->row4x4();
+ column4x4 = node->column4x4();
+ block_size = node->block_size();
+ }
+ if (row4x4 >= frame_header_.rows4x4 ||
+ column4x4 >= frame_header_.columns4x4) {
+ continue;
+ }
+ const int block_width4x4 = kNum4x4BlocksWide[block_size];
+ assert(block_width4x4 == kNum4x4BlocksHigh[block_size]);
+ const int half_block4x4 = block_width4x4 >> 1;
+ const bool has_rows = (row4x4 + half_block4x4) < frame_header_.rows4x4;
+ const bool has_columns =
+ (column4x4 + half_block4x4) < frame_header_.columns4x4;
+ Partition partition;
+ if (!ReadPartition(row4x4, column4x4, block_size, has_rows, has_columns,
+ &partition)) {
+ LIBGAV1_DLOG(ERROR, "Failed to read partition for row: %d column: %d",
+ row4x4, column4x4);
+ return false;
+ }
+ const BlockSize sub_size = kSubSize[partition][block_size];
+ // Section 6.10.4: It is a requirement of bitstream conformance that
+ // get_plane_residual_size( subSize, 1 ) is not equal to BLOCK_INVALID
+ // every time subSize is computed.
+ if (sub_size == kBlockInvalid ||
+ kPlaneResidualSize[sub_size]
+ [sequence_header_.color_config.subsampling_x]
+ [sequence_header_.color_config.subsampling_y] ==
+ kBlockInvalid) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Invalid sub-block/plane size for row: %d column: %d partition: "
+ "%d block_size: %d sub_size: %d subsampling_x/y: %d, %d",
+ row4x4, column4x4, partition, block_size, sub_size,
+ sequence_header_.color_config.subsampling_x,
+ sequence_header_.color_config.subsampling_y);
+ return false;
+ }
+ if (!node->SetPartitionType(partition)) {
+ LIBGAV1_DLOG(ERROR, "node->SetPartitionType() failed.");
+ return false;
+ }
+ switch (partition) {
+ case kPartitionNone:
+ if (!ProcessBlock(row4x4, column4x4, sub_size, node, scratch_buffer,
+ residual)) {
+ return false;
+ }
+ break;
+ case kPartitionSplit:
+ // The children must be added in reverse order since a stack is being
+ // used.
+ for (int i = 3; i >= 0; --i) {
+ ParameterTree* const child = node->children(i);
+ assert(child != nullptr);
+ stack.Push(child);
+ }
+ break;
+ case kPartitionHorizontal:
+ case kPartitionVertical:
+ case kPartitionHorizontalWithTopSplit:
+ case kPartitionHorizontalWithBottomSplit:
+ case kPartitionVerticalWithLeftSplit:
+ case kPartitionVerticalWithRightSplit:
+ case kPartitionHorizontal4:
+ case kPartitionVertical4:
+ for (int i = 0; i < 4; ++i) {
+ ParameterTree* const child = node->children(i);
+ // Once a null child is seen, all the subsequent children will also be
+ // null.
+ if (child == nullptr) break;
+ if (!ProcessBlock(child->row4x4(), child->column4x4(),
+ child->block_size(), child, scratch_buffer,
+ residual)) {
+ return false;
+ }
+ }
+ break;
+ }
+ } while (!stack.Empty());
+ return true;
+}
+
+void Tile::ResetLoopRestorationParams() {
+ for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+ for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) {
+ reference_unit_info_[plane].sgr_proj_info.multiplier[i] =
+ kSgrProjDefaultMultiplier[i];
+ for (int j = 0; j < kNumWienerCoefficients; ++j) {
+ reference_unit_info_[plane].wiener_info.filter[i][j] =
+ kWienerDefaultFilter[j];
+ }
+ }
+ }
+}
+
+void Tile::ResetCdef(const int row4x4, const int column4x4) {
+ if (!sequence_header_.enable_cdef) return;
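+ // |cdef_index_| is stored at 64x64 granularity (16 4x4 units), hence the
+ // DivideBy16() of the 4x4 coordinates.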
+ const int row = DivideBy16(row4x4);
+ const int column = DivideBy16(column4x4);
+ cdef_index_[row][column] = -1;
+ if (sequence_header_.use_128x128_superblock) {
+ const int cdef_size4x4 = kNum4x4BlocksWide[kBlock64x64];
+ const int border_row = DivideBy16(row4x4 + cdef_size4x4);
+ const int border_column = DivideBy16(column4x4 + cdef_size4x4);
+ cdef_index_[row][border_column] = -1;
+ cdef_index_[border_row][column] = -1;
+ cdef_index_[border_row][border_column] = -1;
+ }
+}
+
+void Tile::ClearBlockDecoded(TileScratchBuffer* const scratch_buffer,
+ int row4x4, int column4x4) {
+ // Set everything to false.
+ memset(scratch_buffer->block_decoded, 0,
+ sizeof(scratch_buffer->block_decoded));
+ // Set specific edge cases to true.
+ const int sb_size4 = sequence_header_.use_128x128_superblock ? 32 : 16;
+ for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ const int sb_width4 = (column4x4_end_ - column4x4) >> subsampling_x;
+ const int sb_height4 = (row4x4_end_ - row4x4) >> subsampling_y;
+ // The memset is equivalent to the following lines in the spec:
+ // for ( x = -1; x <= ( sbSize4 >> subX ); x++ ) {
+ // if ( y < 0 && x < sbWidth4 ) {
+ // BlockDecoded[plane][y][x] = 1
+ // }
+ // }
+ const int num_elements =
+ std::min((sb_size4 >> subsampling_x_[plane]) + 1, sb_width4) + 1;
+ memset(&scratch_buffer->block_decoded[plane][0][0], 1, num_elements);
+ // The for loop is equivalent to the following lines in the spec:
+ // for ( y = -1; y <= ( sbSize4 >> subY ); y++ ) {
+ // if ( x < 0 && y < sbHeight4 ) {
+ // BlockDecoded[plane][y][x] = 1
+ // }
+ // }
+ // BlockDecoded[plane][sbSize4 >> subY][-1] = 0
+ for (int y = -1; y < std::min((sb_size4 >> subsampling_y), sb_height4);
+ ++y) {
+ scratch_buffer->block_decoded[plane][y + 1][0] = true;
+ }
+ }
+}
+
+bool Tile::ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4,
+ TileScratchBuffer* const scratch_buffer,
+ ProcessingMode mode) {
+ const bool parsing =
+ mode == kProcessingModeParseOnly || mode == kProcessingModeParseAndDecode;
+ const bool decoding = mode == kProcessingModeDecodeOnly ||
+ mode == kProcessingModeParseAndDecode;
+ if (parsing) {
+ read_deltas_ = frame_header_.delta_q.present;
+ ResetCdef(row4x4, column4x4);
+ }
+ if (decoding) {
+ ClearBlockDecoded(scratch_buffer, row4x4, column4x4);
+ }
+ const BlockSize block_size = SuperBlockSize();
+ if (parsing) {
+ ReadLoopRestorationCoefficients(row4x4, column4x4, block_size);
+ }
+ const int row = row4x4 / block_width4x4;
+ const int column = column4x4 / block_width4x4;
+ if (parsing && decoding) {
+ uint8_t* residual_buffer = residual_buffer_.get();
+ if (!ProcessPartition(row4x4, column4x4,
+ block_parameters_holder_.Tree(row, column),
+ scratch_buffer, &residual_buffer)) {
+ LIBGAV1_DLOG(ERROR, "Error decoding partition row: %d column: %d", row4x4,
+ column4x4);
+ return false;
+ }
+ return true;
+ }
+ const int sb_row_index = SuperBlockRowIndex(row4x4);
+ const int sb_column_index = SuperBlockColumnIndex(column4x4);
+ if (parsing) {
+ residual_buffer_threaded_[sb_row_index][sb_column_index] =
+ residual_buffer_pool_->Get();
+ if (residual_buffer_threaded_[sb_row_index][sb_column_index] == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to get residual buffer.");
+ return false;
+ }
+ uint8_t* residual_buffer =
+ residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
+ if (!ProcessPartition(row4x4, column4x4,
+ block_parameters_holder_.Tree(row, column),
+ scratch_buffer, &residual_buffer)) {
+ LIBGAV1_DLOG(ERROR, "Error parsing partition row: %d column: %d", row4x4,
+ column4x4);
+ return false;
+ }
+ } else {
+ uint8_t* residual_buffer =
+ residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
+ if (!DecodeSuperBlock(block_parameters_holder_.Tree(row, column),
+ scratch_buffer, &residual_buffer)) {
+ LIBGAV1_DLOG(ERROR, "Error decoding superblock row: %d column: %d",
+ row4x4, column4x4);
+ return false;
+ }
+ residual_buffer_pool_->Release(
+ std::move(residual_buffer_threaded_[sb_row_index][sb_column_index]));
+ }
+ return true;
+}
+
+bool Tile::DecodeSuperBlock(ParameterTree* const tree,
+ TileScratchBuffer* const scratch_buffer,
+ ResidualPtr* residual) {
+ Stack<ParameterTree*, kDfsStackSize> stack;
+ stack.Push(tree);
+ do {
+ ParameterTree* const node = stack.Pop();
+ if (node->partition() != kPartitionNone) {
+ for (int i = 3; i >= 0; --i) {
+ if (node->children(i) == nullptr) continue;
+ stack.Push(node->children(i));
+ }
+ continue;
+ }
+ if (!DecodeBlock(node, scratch_buffer, residual)) {
+ LIBGAV1_DLOG(ERROR, "Error decoding block row: %d column: %d",
+ node->row4x4(), node->column4x4());
+ return false;
+ }
+ } while (!stack.Empty());
+ return true;
+}
+
+void Tile::ReadLoopRestorationCoefficients(int row4x4, int column4x4,
+ BlockSize block_size) {
+ if (frame_header_.allow_intrabc) return;
+ LoopRestorationInfo* const restoration_info = post_filter_.restoration_info();
+ const bool is_superres_scaled =
+ frame_header_.width != frame_header_.upscaled_width;
+ for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+ LoopRestorationUnitInfo unit_info;
+ if (restoration_info->PopulateUnitInfoForSuperBlock(
+ static_cast<Plane>(plane), block_size, is_superres_scaled,
+ frame_header_.superres_scale_denominator, row4x4, column4x4,
+ &unit_info)) {
+ for (int unit_row = unit_info.row_start; unit_row < unit_info.row_end;
+ ++unit_row) {
+ for (int unit_column = unit_info.column_start;
+ unit_column < unit_info.column_end; ++unit_column) {
+ const int unit_id = unit_row * restoration_info->num_horizontal_units(
+ static_cast<Plane>(plane)) +
+ unit_column;
+ restoration_info->ReadUnitCoefficients(
+ &reader_, &symbol_decoder_context_, static_cast<Plane>(plane),
+ unit_id, &reference_unit_info_);
+ }
+ }
+ }
+ }
+}
+
+void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) {
+ if (frame_header_.refresh_frame_flags == 0 ||
+ IsIntraFrame(frame_header_.frame_type)) {
+ return;
+ }
+ // Iterate over odd rows/columns beginning at the first odd row/column for the
+ // block. It is done this way because motion field mvs are only needed at an
+ // 8x8 granularity.
+ const int row_start4x4 = block.row4x4 | 1;
+ const int row_limit4x4 =
+ std::min(block.row4x4 + block.height4x4, frame_header_.rows4x4);
+ if (row_start4x4 >= row_limit4x4) return;
+ const int column_start4x4 = block.column4x4 | 1;
+ const int column_limit4x4 =
+ std::min(block.column4x4 + block.width4x4, frame_header_.columns4x4);
+ if (column_start4x4 >= column_limit4x4) return;
+
+ // The largest reference MV component that can be saved.
+ constexpr int kRefMvsLimit = (1 << 12) - 1;
+ const BlockParameters& bp = *block.bp;
+ ReferenceInfo* reference_info = current_frame_.reference_info();
+ for (int i = 1; i >= 0; --i) {
+ const ReferenceFrameType reference_frame_to_store = bp.reference_frame[i];
+ // Must make a local copy so that StoreMotionFieldMvs() knows there is no
+ // overlap between load and store.
+ const MotionVector mv_to_store = bp.mv.mv[i];
+ const int mv_row = std::abs(mv_to_store.mv[MotionVector::kRow]);
+ const int mv_column = std::abs(mv_to_store.mv[MotionVector::kColumn]);
+ if (reference_frame_to_store > kReferenceFrameIntra &&
+ // kRefMvsLimit equals 0x0FFF, so we can first bitwise OR the two
+ // absolute values and then compare with kRefMvsLimit to save a branch.
+ // The next line is equivalent to:
+ // mv_row <= kRefMvsLimit && mv_column <= kRefMvsLimit
+ (mv_row | mv_column) <= kRefMvsLimit &&
+ reference_info->relative_distance_from[reference_frame_to_store] < 0) {
+ const int row_start8x8 = DivideBy2(row_start4x4);
+ const int row_limit8x8 = DivideBy2(row_limit4x4);
+ const int column_start8x8 = DivideBy2(column_start4x4);
+ const int column_limit8x8 = DivideBy2(column_limit4x4);
+ const int rows = row_limit8x8 - row_start8x8;
+ const int columns = column_limit8x8 - column_start8x8;
+ const ptrdiff_t stride = DivideBy2(current_frame_.columns4x4());
+ ReferenceFrameType* const reference_frame_row_start =
+ &reference_info
+ ->motion_field_reference_frame[row_start8x8][column_start8x8];
+ MotionVector* const mv =
+ &reference_info->motion_field_mv[row_start8x8][column_start8x8];
+
+ // Specialize the cases where columns is 1, 2, 4, 8 or 16. This allows
+ // memset() to be inlined and simplifies std::fill() for these cases.
+ if (columns <= 1) {
+ // Don't change the above condition to (columns == 1).
+ // Condition (columns <= 1) may help the compiler simplify the inlining
+ // of the general case of StoreMotionFieldMvs() by eliminating the
+ // (columns == 0) case.
+ assert(columns == 1);
+ StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+ 1, reference_frame_row_start, mv);
+ } else if (columns == 2) {
+ StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+ 2, reference_frame_row_start, mv);
+ } else if (columns == 4) {
+ StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+ 4, reference_frame_row_start, mv);
+ } else if (columns == 8) {
+ StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+ 8, reference_frame_row_start, mv);
+ } else if (columns == 16) {
+ StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+ 16, reference_frame_row_start, mv);
+ } else if (columns < 16) {
+ // This always-true condition (columns < 16) may help the compiler
+ // simplify the inlining of the following function.
+ // This general case is rare and usually only happens for blocks that
+ // contain the right boundary of the frame.
+ StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+ columns, reference_frame_row_start, mv);
+ } else {
+ assert(false);
+ }
+ return;
+ }
+ }
+}
+
+} // namespace libgav1
diff --git a/src/tile_scratch_buffer.cc b/src/tile_scratch_buffer.cc
new file mode 100644
index 0000000..0b5ac96
--- /dev/null
+++ b/src/tile_scratch_buffer.cc
@@ -0,0 +1,26 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/tile_scratch_buffer.h"
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+#if !LIBGAV1_CXX17
+// static
+constexpr int TileScratchBuffer::kBlockDecodedStride;
+#endif
+
+} // namespace libgav1
diff --git a/src/tile_scratch_buffer.h b/src/tile_scratch_buffer.h
new file mode 100644
index 0000000..3eaf8b8
--- /dev/null
+++ b/src/tile_scratch_buffer.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_TILE_SCRATCH_BUFFER_H_
+#define LIBGAV1_SRC_TILE_SCRATCH_BUFFER_H_
+
+#include <cstdint>
+#include <mutex> // NOLINT (unapproved c++11 header)
+
+#include "src/dsp/constants.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/stack.h"
+
+namespace libgav1 {
+
+// Buffer to facilitate decoding a superblock.
+struct TileScratchBuffer : public MaxAlignedAllocable {
+ static constexpr int kBlockDecodedStride = 34;
+
+ LIBGAV1_MUST_USE_RESULT bool Init(int bitdepth) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ const int pixel_size = (bitdepth == 8) ? 1 : 2;
+#else
+ assert(bitdepth == 8);
+ static_cast<void>(bitdepth);
+ const int pixel_size = 1;
+#endif
+
+ constexpr int unaligned_convolve_buffer_stride =
+ kMaxScaledSuperBlockSizeInPixels + kConvolveBorderLeftTop +
+ kConvolveBorderRight;
+ convolve_block_buffer_stride = Align<ptrdiff_t>(
+ unaligned_convolve_buffer_stride * pixel_size, kMaxAlignment);
+ constexpr int convolve_buffer_height = kMaxScaledSuperBlockSizeInPixels +
+ kConvolveBorderLeftTop +
+ kConvolveBorderBottom;
+
+ convolve_block_buffer = MakeAlignedUniquePtr<uint8_t>(
+ kMaxAlignment, convolve_buffer_height * convolve_block_buffer_stride);
+ return convolve_block_buffer != nullptr;
+ }
+
+ // kCompoundPredictionTypeDiffWeighted prediction mode needs a mask of the
+ // prediction block size. This buffer is used to store that mask. The masks
+ // will be created for the Y plane and will be re-used for the U & V planes.
+ alignas(kMaxAlignment) uint8_t weight_mask[kMaxSuperBlockSizeSquareInPixels];
+
+ // For each instance of the TileScratchBuffer, only one of the following
+ // buffers will be used at any given time, so it is ok to share them in a
+ // union.
+ union {
+ // Buffers used for prediction process.
+ // Compound prediction calculations always output 16-bit values. Depending
+ // on the bitdepth the values may be treated as int16_t or uint16_t. See
+ // src/dsp/convolve.cc and src/dsp/warp.cc for explanations.
+ // Inter/intra calculations output Pixel values.
+ // These buffers always use the block width as the stride, which packs the
+ // values tightly and simplifies loads/stores for small blocks.
+
+ // 10/12 bit compound prediction and 10/12 bit inter/intra prediction.
+ alignas(kMaxAlignment) uint16_t
+ prediction_buffer[2][kMaxSuperBlockSizeSquareInPixels];
+ // 8 bit compound prediction buffer.
+ alignas(kMaxAlignment) int16_t
+ compound_prediction_buffer_8bpp[2][kMaxSuperBlockSizeSquareInPixels];
+
+ // Union usage note: This is used only by functions in the "intra"
+ // prediction path.
+ //
+ // Buffer used for storing subsampled luma samples needed for CFL
+ // prediction. This buffer is used to avoid repetition of the subsampling
+ // for the V plane when it is already done for the U plane.
+ int16_t cfl_luma_buffer[kCflLumaBufferStride][kCflLumaBufferStride];
+ };
+
+ // Buffer used for convolve. The maximum size required for this buffer is:
+ // maximum block height (with scaling and border) = 2 * 128 + 3 + 4 = 263.
+ // maximum block stride (with scaling and border aligned to 16) =
+ // (2 * 128 + 3 + 8 + 5) * pixel_size = 272 * pixel_size.
+ // Where pixel_size is (bitdepth == 8) ? 1 : 2.
+ // Has an alignment of kMaxAlignment when allocated.
+ AlignedUniquePtr<uint8_t> convolve_block_buffer;
+ ptrdiff_t convolve_block_buffer_stride;
+
+ // Flag indicating whether the data in |cfl_luma_buffer| is valid.
+ bool cfl_luma_buffer_valid;
+
+ // Equivalent to BlockDecoded array in the spec. This stores the decoded
+ // state of every 4x4 block in a superblock. It has 1 row/column border on
+ // all 4 sides (hence the 34x34 dimension instead of 32x32). Note that the
+ // spec uses "-1" as an index to access the left and top borders. In the
+ // code, we treat the index (1, 1) as equivalent to the spec's (0, 0). So
+ // all accesses into this array will be offset by +1 when compared with the
+ // spec.
+ bool block_decoded[kMaxPlanes][kBlockDecodedStride][kBlockDecodedStride];
+};
+
+class TileScratchBufferPool {
+ public:
+ void Reset(int bitdepth) {
+ if (bitdepth_ == bitdepth) return;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ == 8 && bitdepth != 8) {
+ // We are going from a pixel size of 1 to a pixel size of 2. So invalidate
+ // the stack.
+ std::lock_guard<std::mutex> lock(mutex_);
+ while (!buffers_.Empty()) {
+ buffers_.Pop();
+ }
+ }
+#endif
+ bitdepth_ = bitdepth;
+ }
+
+ std::unique_ptr<TileScratchBuffer> Get() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (buffers_.Empty()) {
+ std::unique_ptr<TileScratchBuffer> scratch_buffer(new (std::nothrow)
+ TileScratchBuffer);
+ if (scratch_buffer == nullptr || !scratch_buffer->Init(bitdepth_)) {
+ return nullptr;
+ }
+ return scratch_buffer;
+ }
+ return buffers_.Pop();
+ }
+
+ void Release(std::unique_ptr<TileScratchBuffer> scratch_buffer) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ buffers_.Push(std::move(scratch_buffer));
+ }
+
+ private:
+ std::mutex mutex_;
+ // We will never need more than kMaxThreads scratch buffers since that is the
+ // maximum amount of work that will be done at any given time.
+ Stack<std::unique_ptr<TileScratchBuffer>, kMaxThreads> buffers_
+ LIBGAV1_GUARDED_BY(mutex_);
+ int bitdepth_ = 0;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_TILE_SCRATCH_BUFFER_H_
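A minimal usage sketch of the pool defined above: a buffer is borrowed with Get(), used for one superblock, and handed back with Release() so another worker (at most kMaxThreads of them) can reuse the allocation. The worker function name and the surrounding decoder plumbing are assumptions for illustration, not part of this patch.

#include <memory>
#include <utility>

#include "src/tile_scratch_buffer.h"

namespace libgav1 {

// Hypothetical tile worker sketch.
bool DecodeOneSuperBlock(TileScratchBufferPool* const pool) {
  std::unique_ptr<TileScratchBuffer> scratch = pool->Get();
  if (scratch == nullptr) return false;  // Allocation or Init() failed.
  // ... prediction/reconstruction would use scratch->prediction_buffer,
  // scratch->convolve_block_buffer, scratch->block_decoded, etc. ...
  pool->Release(std::move(scratch));  // Return the buffer for reuse.
  return true;
}

}  // namespace libgav1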
diff --git a/src/utils/array_2d.h b/src/utils/array_2d.h
new file mode 100644
index 0000000..2df6241
--- /dev/null
+++ b/src/utils/array_2d.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_ARRAY_2D_H_
+#define LIBGAV1_SRC_UTILS_ARRAY_2D_H_
+
+#include <cassert>
+#include <cstddef>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <type_traits>
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// Exposes a 1D allocated memory buffer as a 2D array.
+template <typename T>
+class Array2DView {
+ public:
+ Array2DView() = default;
+ Array2DView(int rows, int columns, T* const data) {
+ Reset(rows, columns, data);
+ }
+
+ // Copyable and Movable.
+ Array2DView(const Array2DView& rhs) = default;
+ Array2DView& operator=(const Array2DView& rhs) = default;
+
+ void Reset(int rows, int columns, T* const data) {
+ rows_ = rows;
+ columns_ = columns;
+ data_ = data;
+ }
+
+ int rows() const { return rows_; }
+ int columns() const { return columns_; }
+
+ T* operator[](int row) { return const_cast<T*>(GetRow(row)); }
+
+ const T* operator[](int row) const { return GetRow(row); }
+
+ private:
+ const T* GetRow(int row) const {
+ assert(row < rows_);
+ const ptrdiff_t offset = static_cast<ptrdiff_t>(row) * columns_;
+ return data_ + offset;
+ }
+
+ int rows_ = 0;
+ int columns_ = 0;
+ T* data_ = nullptr;
+};
+
+// Allocates and owns the contiguous memory and exposes an Array2DView of
+// dimension |rows| x |columns|.
+template <typename T>
+class Array2D {
+ public:
+ Array2D() = default;
+
+ // Copyable and Movable.
+ Array2D(const Array2D& rhs) = default;
+ Array2D& operator=(const Array2D& rhs) = default;
+
+ LIBGAV1_MUST_USE_RESULT bool Reset(int rows, int columns,
+ bool zero_initialize = true) {
+ size_ = rows * columns;
+ // If T is not a trivial type, we should always reallocate the data_
+ // buffer, so that the destructors of any existing objects are invoked.
+ if (!std::is_trivial<T>::value || allocated_size_ < size_) {
+ // Note: This invokes the global operator new if T is a non-class type,
+ // such as integer or enum types, or a class type that is not derived
+ // from libgav1::Allocable, such as std::unique_ptr. If we enforce a
+ // maximum allocation size or keep track of our own heap memory
+ // consumption, we will need to handle the allocations here that use the
+ // global operator new.
+ if (zero_initialize) {
+ data_.reset(new (std::nothrow) T[size_]());
+ } else {
+ data_.reset(new (std::nothrow) T[size_]);
+ }
+ if (data_ == nullptr) {
+ allocated_size_ = 0;
+ return false;
+ }
+ allocated_size_ = size_;
+ } else if (zero_initialize) {
+ // Cast the data_ pointer to void* to avoid the GCC -Wclass-memaccess
+ // warning. The memset is safe because T is a trivial type.
+ void* dest = data_.get();
+ memset(dest, 0, sizeof(T) * size_);
+ }
+ data_view_.Reset(rows, columns, data_.get());
+ return true;
+ }
+
+ int rows() const { return data_view_.rows(); }
+ int columns() const { return data_view_.columns(); }
+ size_t size() const { return size_; }
+ T* data() { return data_.get(); }
+ const T* data() const { return data_.get(); }
+
+ T* operator[](int row) { return data_view_[row]; }
+
+ const T* operator[](int row) const { return data_view_[row]; }
+
+ private:
+ std::unique_ptr<T[]> data_ = nullptr;
+ size_t allocated_size_ = 0;
+ size_t size_ = 0;
+ Array2DView<T> data_view_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_ARRAY_2D_H_
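As a quick illustration of the two classes above, a small sketch; the function name and the dimensions are made up for the example, not taken from the library.

#include <cstdio>

#include "src/utils/array_2d.h"

namespace libgav1 {

bool Array2DExample() {
  Array2D<int> grid;
  // Allocates 4 * 6 ints and zero-initializes them (the default).
  if (!grid.Reset(/*rows=*/4, /*columns=*/6)) return false;
  grid[2][3] = 7;
  // A non-owning 2D view over the same memory; rows are indexed the same way.
  Array2DView<int> view(grid.rows(), grid.columns(), grid.data());
  printf("%d\n", view[2][3]);  // Prints 7.
  return true;
}

}  // namespace libgav1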
diff --git a/src/utils/bit_mask_set.h b/src/utils/bit_mask_set.h
new file mode 100644
index 0000000..7371753
--- /dev/null
+++ b/src/utils/bit_mask_set.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_BIT_MASK_SET_H_
+#define LIBGAV1_SRC_UTILS_BIT_MASK_SET_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+
+// This class is used to check if a given value is equal to one of the several
+// predetermined values using a bit mask instead of a chain of comparisons and
+// ||s. This usually results in fewer instructions.
+//
+// Usage:
+// constexpr BitMaskSet set(value1, value2);
+// set.Contains(value1) => returns true.
+// set.Contains(value3) => returns false.
+class BitMaskSet {
+ public:
+ explicit constexpr BitMaskSet(uint32_t mask) : mask_(mask) {}
+
+ constexpr BitMaskSet(int v1, int v2) : mask_((1U << v1) | (1U << v2)) {}
+
+ constexpr BitMaskSet(int v1, int v2, int v3)
+ : mask_((1U << v1) | (1U << v2) | (1U << v3)) {}
+
+ constexpr BitMaskSet(int v1, int v2, int v3, int v4)
+ : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4)) {}
+
+ constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5)
+ : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5)) {}
+
+ constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6)
+ : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) |
+ (1U << v6)) {}
+
+ constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6, int v7)
+ : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) |
+ (1U << v6) | (1U << v7)) {}
+
+ constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
+ int v8, int v9)
+ : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) |
+ (1U << v6) | (1U << v7) | (1U << v8) | (1U << v9)) {}
+
+ constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
+ int v8, int v9, int v10)
+ : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) |
+ (1U << v6) | (1U << v7) | (1U << v8) | (1U << v9) | (1U << v10)) {
+ }
+
+ constexpr bool Contains(uint8_t value) const {
+ return MaskContainsValue(mask_, value);
+ }
+
+ static constexpr bool MaskContainsValue(uint32_t mask, uint8_t value) {
+ return ((mask >> value) & 1) != 0;
+ }
+
+ private:
+ const uint32_t mask_;
+};
+
+} // namespace libgav1
+#endif // LIBGAV1_SRC_UTILS_BIT_MASK_SET_H_
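To make the mask arithmetic concrete, a short compile-time sketch; the values 3 and 5 and the variable name are arbitrary choices for the example.

#include "src/utils/bit_mask_set.h"

namespace libgav1 {

// A set containing 3 and 5 has mask_ == (1 << 3) | (1 << 5) == 0x28, so
// Contains() reduces to a single shift-and-mask instead of two comparisons.
constexpr BitMaskSet kExampleSet(3, 5);
static_assert(kExampleSet.Contains(3), "");
static_assert(kExampleSet.Contains(5), "");
static_assert(!kExampleSet.Contains(4), "");
static_assert(BitMaskSet::MaskContainsValue(0x28, 5), "");

}  // namespace libgav1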
diff --git a/src/utils/bit_reader.cc b/src/utils/bit_reader.cc
new file mode 100644
index 0000000..3234128
--- /dev/null
+++ b/src/utils/bit_reader.cc
@@ -0,0 +1,117 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/bit_reader.h"
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace {
+
+bool Assign(int* const value, int assignment, bool return_value) {
+ *value = assignment;
+ return return_value;
+}
+
+// 5.9.29.
+int InverseRecenter(int r, int v) {
+ if (v > (r << 1)) {
+ return v;
+ }
+ if ((v & 1) != 0) {
+ return r - ((v + 1) >> 1);
+ }
+ return r + (v >> 1);
+}
+
+} // namespace
+
+bool BitReader::DecodeSignedSubexpWithReference(int low, int high,
+ int reference, int control,
+ int* const value) {
+ if (!DecodeUnsignedSubexpWithReference(high - low, reference - low, control,
+ value)) {
+ return false;
+ }
+ *value += low;
+ return true;
+}
+
+bool BitReader::DecodeUniform(int n, int* const value) {
+ if (n <= 1) {
+ return Assign(value, 0, true);
+ }
+ const int w = FloorLog2(n) + 1;
+ const int m = (1 << w) - n;
+ assert(w - 1 < 32);
+ const int v = static_cast<int>(ReadLiteral(w - 1));
+ if (v == -1) {
+ return Assign(value, 0, false);
+ }
+ if (v < m) {
+ return Assign(value, v, true);
+ }
+ const int extra_bit = ReadBit();
+ if (extra_bit == -1) {
+ return Assign(value, 0, false);
+ }
+ return Assign(value, (v << 1) - m + extra_bit, true);
+}
+
+bool BitReader::DecodeUnsignedSubexpWithReference(int mx, int reference,
+ int control,
+ int* const value) {
+ int v;
+ if (!DecodeSubexp(mx, control, &v)) return false;
+ if ((reference << 1) <= mx) {
+ *value = InverseRecenter(reference, v);
+ } else {
+ *value = mx - 1 - InverseRecenter(mx - 1 - reference, v);
+ }
+ return true;
+}
+
+bool BitReader::DecodeSubexp(int num_symbols, int control, int* const value) {
+ int i = 0;
+ int mk = 0;
+ while (true) {
+ const int b = (i != 0) ? control + i - 1 : control;
+ if (b >= 32) {
+ return Assign(value, 0, false);
+ }
+ const int a = 1 << b;
+ if (num_symbols <= mk + 3 * a) {
+ if (!DecodeUniform(num_symbols - mk, value)) return false;
+ *value += mk;
+ return true;
+ }
+ const int8_t subexp_more_bits = ReadBit();
+ if (subexp_more_bits == -1) return false;
+ if (subexp_more_bits != 0) {
+ ++i;
+ mk += a;
+ } else {
+ const int subexp_bits = static_cast<int>(ReadLiteral(b));
+ if (subexp_bits == -1) {
+ return Assign(value, 0, false);
+ }
+ return Assign(value, subexp_bits + mk, true);
+ }
+ }
+}
+
+} // namespace libgav1
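For reference, DecodeUniform() implements the ns(n) code from the spec: with |n| = 5 it computes w = 3 and m = 3, so the values 0..2 consume two bits and the values 3..4 consume three. The standalone program below is not part of the library; it only enumerates those code lengths using the same w/m computation.

#include <cstdio>

int main() {
  const int n = 5;
  // Same computation as DecodeUniform(): w = FloorLog2(n) + 1, m = 2^w - n.
  int floor_log2 = 0;
  while ((n >> (floor_log2 + 1)) != 0) ++floor_log2;
  const int w = floor_log2 + 1;
  const int m = (1 << w) - n;
  for (int value = 0; value < n; ++value) {
    // Values below |m| fit in the short (w - 1)-bit code; the rest need the
    // extra bit read by DecodeUniform().
    const int bits = (value < m) ? w - 1 : w;
    printf("value %d uses %d bit(s)\n", value, bits);
  }
  return 0;
}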
diff --git a/src/utils/bit_reader.h b/src/utils/bit_reader.h
new file mode 100644
index 0000000..5a10e12
--- /dev/null
+++ b/src/utils/bit_reader.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_BIT_READER_H_
+#define LIBGAV1_SRC_UTILS_BIT_READER_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+
+class BitReader {
+ public:
+ virtual ~BitReader() = default;
+
+ virtual int ReadBit() = 0;
+ // |num_bits| has to be <= 32. The function returns a value in the range [0,
+ // 2^num_bits - 1] (inclusive) on success and -1 on failure.
+ virtual int64_t ReadLiteral(int num_bits) = 0;
+
+ bool DecodeSignedSubexpWithReference(int low, int high, int reference,
+ int control, int* value); // 5.9.26.
+ // Decodes a nonnegative integer with maximum number of values |n| (i.e.,
+ // output in range 0..n-1) by following the process specified in Section
+ // 4.10.7 ns(n) and Section 4.10.10 NS(n) of the spec.
+ bool DecodeUniform(int n, int* value);
+
+ private:
+ // Helper functions for DecodeSignedSubexpWithReference.
+ bool DecodeUnsignedSubexpWithReference(int mx, int reference, int control,
+ int* value); // 5.9.27.
+ bool DecodeSubexp(int num_symbols, int control, int* value); // 5.9.28.
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_BIT_READER_H_
diff --git a/src/utils/block_parameters_holder.cc b/src/utils/block_parameters_holder.cc
new file mode 100644
index 0000000..3ccdb9b
--- /dev/null
+++ b/src/utils/block_parameters_holder.cc
@@ -0,0 +1,107 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/block_parameters_holder.h"
+
+#include <algorithm>
+
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/parameter_tree.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+namespace {
+
+// Returns the number of super block rows/columns for |value4x4| where value4x4
+// is either rows4x4 or columns4x4.
+int RowsOrColumns4x4ToSuperBlocks(int value4x4, bool use_128x128_superblock) {
+ return use_128x128_superblock ? DivideBy128(MultiplyBy4(value4x4) + 127)
+ : DivideBy64(MultiplyBy4(value4x4) + 63);
+}
+
+} // namespace
+
+bool BlockParametersHolder::Reset(int rows4x4, int columns4x4,
+ bool use_128x128_superblock) {
+ rows4x4_ = rows4x4;
+ columns4x4_ = columns4x4;
+ use_128x128_superblock_ = use_128x128_superblock;
+ if (!block_parameters_cache_.Reset(rows4x4_, columns4x4_)) {
+ LIBGAV1_DLOG(ERROR, "block_parameters_cache_.Reset() failed.");
+ return false;
+ }
+ const int rows =
+ RowsOrColumns4x4ToSuperBlocks(rows4x4_, use_128x128_superblock_);
+ const int columns =
+ RowsOrColumns4x4ToSuperBlocks(columns4x4_, use_128x128_superblock_);
+ const BlockSize sb_size =
+ use_128x128_superblock_ ? kBlock128x128 : kBlock64x64;
+ const int multiplier = kNum4x4BlocksWide[sb_size];
+ if (!trees_.Reset(rows, columns, /*zero_initialize=*/false)) {
+ LIBGAV1_DLOG(ERROR, "trees_.Reset() failed.");
+ return false;
+ }
+ for (int i = 0; i < rows; ++i) {
+ for (int j = 0; j < columns; ++j) {
+ trees_[i][j] =
+ ParameterTree::Create(i * multiplier, j * multiplier, sb_size);
+ if (trees_[i][j] == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Allocation of trees_[%d][%d] failed.", i, j);
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+void BlockParametersHolder::FillCache(int row4x4, int column4x4,
+ BlockSize block_size,
+ BlockParameters* const bp) {
+ int rows = std::min(static_cast<int>(kNum4x4BlocksHigh[block_size]),
+ rows4x4_ - row4x4);
+ const int columns = std::min(static_cast<int>(kNum4x4BlocksWide[block_size]),
+ columns4x4_ - column4x4);
+ auto* bp_dst = &block_parameters_cache_[row4x4][column4x4];
+ // Specialize the common |columns| cases (values in kNum4x4BlocksWide[]) for
+ // better performance.
+ if (columns == 1) {
+ SetBlock<BlockParameters*>(rows, 1, bp, bp_dst, columns4x4_);
+ } else if (columns == 2) {
+ SetBlock<BlockParameters*>(rows, 2, bp, bp_dst, columns4x4_);
+ } else if (columns == 4) {
+ SetBlock<BlockParameters*>(rows, 4, bp, bp_dst, columns4x4_);
+ } else if (columns == 8) {
+ SetBlock<BlockParameters*>(rows, 8, bp, bp_dst, columns4x4_);
+ } else if (columns == 16) {
+ SetBlock<BlockParameters*>(rows, 16, bp, bp_dst, columns4x4_);
+ } else if (columns == 32) {
+ SetBlock<BlockParameters*>(rows, 32, bp, bp_dst, columns4x4_);
+ } else {
+ do {
+ // The following loop has better performance than std::fill(), which has
+ // some overhead in checking for a zero loop count.
+ int x = columns;
+ auto* d = bp_dst;
+ do {
+ *d++ = bp;
+ } while (--x != 0);
+ bp_dst += columns4x4_;
+ } while (--rows != 0);
+ }
+}
+
+} // namespace libgav1
diff --git a/src/utils/block_parameters_holder.h b/src/utils/block_parameters_holder.h
new file mode 100644
index 0000000..35543c3
--- /dev/null
+++ b/src/utils/block_parameters_holder.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_
+#define LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_
+
+#include <memory>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/parameter_tree.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// Holds a 2D array of |ParameterTree| objects. Each tree stores the parameters
+// corresponding to a superblock.
+class BlockParametersHolder {
+ public:
+ BlockParametersHolder() = default;
+
+ // Not copyable or movable.
+ BlockParametersHolder(const BlockParametersHolder&) = delete;
+ BlockParametersHolder& operator=(const BlockParametersHolder&) = delete;
+
+ // If |use_128x128_superblock| is true, 128x128 superblocks will be used,
+ // otherwise 64x64 superblocks will be used.
+ LIBGAV1_MUST_USE_RESULT bool Reset(int rows4x4, int columns4x4,
+ bool use_128x128_superblock);
+
+ // Finds the BlockParameters corresponding to |row4x4| and |column4x4|. This
+ // is done as a simple look up of the |block_parameters_cache_| matrix.
+ // Returns nullptr if the BlockParameters cannot be found.
+ BlockParameters* Find(int row4x4, int column4x4) const {
+ return block_parameters_cache_[row4x4][column4x4];
+ }
+
+ BlockParameters** Address(int row4x4, int column4x4) {
+ return block_parameters_cache_.data() + row4x4 * columns4x4_ + column4x4;
+ }
+
+ BlockParameters* const* Address(int row4x4, int column4x4) const {
+ return block_parameters_cache_.data() + row4x4 * columns4x4_ + column4x4;
+ }
+
+ int columns4x4() const { return columns4x4_; }
+
+ // Returns the ParameterTree corresponding to superblock starting at (|row|,
+ // |column|).
+ ParameterTree* Tree(int row, int column) { return trees_[row][column].get(); }
+
+ // Fills the cache matrix for the block starting at |row4x4|, |column4x4| of
+ // size |block_size| with the pointer |bp|.
+ void FillCache(int row4x4, int column4x4, BlockSize block_size,
+ BlockParameters* bp);
+
+ private:
+ int rows4x4_ = 0;
+ int columns4x4_ = 0;
+ bool use_128x128_superblock_ = false;
+ Array2D<std::unique_ptr<ParameterTree>> trees_;
+
+ // This is a 2D array of size |rows4x4_| * |columns4x4_|. It is filled in by
+ // FillCache() and used by Find() to perform each lookup with exactly one
+ // array access (instead of traversing the entire tree).
+ Array2D<BlockParameters*> block_parameters_cache_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_
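A small sketch of the Reset()/FillCache()/Find() flow described above; the function name and the 16x16 4x4-block dimensions are illustrative, and the BlockParameters object is assumed to come from the caller.

#include "src/utils/block_parameters_holder.h"

namespace libgav1 {

bool BlockParametersHolderExample(BlockParameters* const bp) {
  BlockParametersHolder holder;
  // A 64x64 pixel area is 16x16 4x4 blocks; use 64x64 superblocks.
  if (!holder.Reset(/*rows4x4=*/16, /*columns4x4=*/16,
                    /*use_128x128_superblock=*/false)) {
    return false;
  }
  // Record that the 8x8 block starting at 4x4 position (4, 8) uses |bp| ...
  holder.FillCache(/*row4x4=*/4, /*column4x4=*/8, kBlock8x8, bp);
  // ... and any 4x4 unit inside that block now finds it in one lookup.
  return holder.Find(5, 9) == bp;
}

}  // namespace libgav1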
diff --git a/src/utils/blocking_counter.h b/src/utils/blocking_counter.h
new file mode 100644
index 0000000..6d664f8
--- /dev/null
+++ b/src/utils/blocking_counter.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_BLOCKING_COUNTER_H_
+#define LIBGAV1_SRC_UTILS_BLOCKING_COUNTER_H_
+
+#include <cassert>
+#include <condition_variable> // NOLINT (unapproved c++11 header)
+#include <mutex> // NOLINT (unapproved c++11 header)
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// Implementation of a Blocking Counter that is used for the "fork-join"
+// use case. Typical usage would be as follows:
+// BlockingCounter counter(num_jobs);
+// - spawn the jobs.
+// - call counter.Wait() on the master thread.
+// - worker threads will call counter.Decrement().
+// - master thread will return from counter.Wait() when all workers are
+// complete.
+template <bool has_failure_status>
+class BlockingCounterImpl {
+ public:
+ explicit BlockingCounterImpl(int initial_count)
+ : count_(initial_count), job_failed_(false) {}
+
+ // Increment the counter by |count|. This must be called before Wait() is
+ // called. This must be called from the same thread that will call Wait().
+ void IncrementBy(int count) {
+ assert(count >= 0);
+ std::unique_lock<std::mutex> lock(mutex_);
+ count_ += count;
+ }
+
+ // Decrement the counter by 1. This function can be called only when
+ // |has_failure_status| is false, i.e., when this class is being used via the
+ // |BlockingCounter| alias.
+ void Decrement() {
+ static_assert(!has_failure_status, "");
+ std::unique_lock<std::mutex> lock(mutex_);
+ if (--count_ == 0) {
+ condition_.notify_one();
+ }
+ }
+
+ // Decrement the counter by 1. This function can be called only when
+ // |has_failure_status| is true, i.e., when this class is being used via the
+ // |BlockingCounterWithStatus| alias. |job_succeeded| is used to update the
+ // state of |job_failed_|.
+ void Decrement(bool job_succeeded) {
+ static_assert(has_failure_status, "");
+ std::unique_lock<std::mutex> lock(mutex_);
+ job_failed_ |= !job_succeeded;
+ if (--count_ == 0) {
+ condition_.notify_one();
+ }
+ }
+
+ // Block until the counter becomes 0. This function can be called only once
+ // per object. If |has_failure_status| is true, true is returned if all the
+ // jobs succeeded and false is returned if any of the jobs failed. If
+ // |has_failure_status| is false, this function always returns true.
+ bool Wait() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ condition_.wait(lock, [this]() { return count_ == 0; });
+ // If |has_failure_status| is false, we simply return true.
+ return has_failure_status ? !job_failed_ : true;
+ }
+
+ private:
+ std::mutex mutex_;
+ std::condition_variable condition_;
+ int count_ LIBGAV1_GUARDED_BY(mutex_);
+ bool job_failed_ LIBGAV1_GUARDED_BY(mutex_);
+};
+
+using BlockingCounterWithStatus = BlockingCounterImpl<true>;
+using BlockingCounter = BlockingCounterImpl<false>;
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_BLOCKING_COUNTER_H_
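A minimal fork-join sketch of the usage described in the class comment above; the thread management shown here is only illustrative and is not how the library's own thread pool drives the counter.

#include <thread>
#include <vector>

#include "src/utils/blocking_counter.h"

namespace libgav1 {

void RunJobsAndWait(int num_jobs) {
  BlockingCounter counter(num_jobs);
  std::vector<std::thread> workers;
  for (int i = 0; i < num_jobs; ++i) {
    workers.emplace_back([&counter]() {
      // ... perform the work for this job ...
      counter.Decrement();  // Signal completion of one job.
    });
  }
  counter.Wait();  // Returns once every worker has called Decrement().
  for (auto& worker : workers) worker.join();
}

}  // namespace libgav1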
diff --git a/src/utils/common.h b/src/utils/common.h
new file mode 100644
index 0000000..ae43c2b
--- /dev/null
+++ b/src/utils/common.h
@@ -0,0 +1,534 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_COMMON_H_
+#define LIBGAV1_SRC_UTILS_COMMON_H_
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#pragma intrinsic(_BitScanForward)
+#pragma intrinsic(_BitScanReverse)
+#if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)
+#pragma intrinsic(_BitScanReverse64)
+#define HAVE_BITSCANREVERSE64
+#endif // defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)
+#endif // defined(_MSC_VER)
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <type_traits>
+
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// Aligns |value| to the desired |alignment|. |alignment| must be a power of 2.
+template <typename T>
+inline T Align(T value, T alignment) {
+ assert(alignment != 0);
+ const T alignment_mask = alignment - 1;
+ return (value + alignment_mask) & ~alignment_mask;
+}
+
+// Aligns |addr| to the desired |alignment|. |alignment| must be a power of 2.
+inline uint8_t* AlignAddr(uint8_t* const addr, const uintptr_t alignment) {
+ const auto value = reinterpret_cast<uintptr_t>(addr);
+ return reinterpret_cast<uint8_t*>(Align(value, alignment));
+}
+
+inline int32_t Clip3(int32_t value, int32_t low, int32_t high) {
+ return value < low ? low : (value > high ? high : value);
+}
+
+template <typename Pixel>
+void ExtendLine(void* const line_start, const int width, const int left,
+ const int right) {
+ auto* const start = static_cast<Pixel*>(line_start);
+ const Pixel* src = start;
+ Pixel* dst = start - left;
+ // Copy to left and right borders.
+ Memset(dst, src[0], left);
+ Memset(dst + left + width, src[width - 1], right);
+}
+
+// The following two templates set a block of data, whose rows may not be
+// contiguous in memory, to |value|. Compilers usually generate several
+// branches to handle different cases of |columns| when inlining memset() and
+// std::fill(), and these branches unfortunately end up inside the |rows|
+// loop, so calling these templates directly could be inefficient. It is
+// recommended to specialize common cases of |columns| (such as 1, 2, 4, 8, 16
+// and 32) before falling back to the generic case of |columns|; a sketch of
+// this pattern follows SetBlock() below. The code size may be larger, but the
+// speed gains are significant.
+// Call template MemSetBlock<> when sizeof(|T|) is 1.
+// Call template SetBlock<> when sizeof(|T|) is larger than 1.
+template <typename T>
+void MemSetBlock(int rows, int columns, T value, T* dst, ptrdiff_t stride) {
+ static_assert(sizeof(T) == 1, "");
+ do {
+ memset(dst, value, columns);
+ dst += stride;
+ } while (--rows != 0);
+}
+
+template <typename T>
+void SetBlock(int rows, int columns, T value, T* dst, ptrdiff_t stride) {
+ do {
+ std::fill(dst, dst + columns, value);
+ dst += stride;
+ } while (--rows != 0);
+}
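A sketch of the specialization pattern recommended above. The wrapper name SetRectangle is hypothetical; FillCache() in src/utils/block_parameters_holder.cc (added later in this patch) is the real in-tree example of the idea.

#include <cstddef>

#include "src/utils/common.h"

namespace libgav1 {

// Dispatch the common widths with compile-time constants so the branches on
// |columns| are resolved once, before SetBlock()'s row loop runs.
template <typename T>
void SetRectangle(int rows, int columns, T value, T* dst, ptrdiff_t stride) {
  if (columns == 4) {
    SetBlock<T>(rows, 4, value, dst, stride);
  } else if (columns == 8) {
    SetBlock<T>(rows, 8, value, dst, stride);
  } else if (columns == 16) {
    SetBlock<T>(rows, 16, value, dst, stride);
  } else {
    SetBlock<T>(rows, columns, value, dst, stride);
  }
}

}  // namespace libgav1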
+
+#if defined(__GNUC__)
+
+inline int CountLeadingZeros(uint32_t n) {
+ assert(n != 0);
+ return __builtin_clz(n);
+}
+
+inline int CountLeadingZeros(uint64_t n) {
+ assert(n != 0);
+ return __builtin_clzll(n);
+}
+
+inline int CountTrailingZeros(uint32_t n) {
+ assert(n != 0);
+ return __builtin_ctz(n);
+}
+
+#elif defined(_MSC_VER)
+
+inline int CountLeadingZeros(uint32_t n) {
+ assert(n != 0);
+ unsigned long first_set_bit; // NOLINT(runtime/int)
+ const unsigned char bit_set = _BitScanReverse(&first_set_bit, n);
+ assert(bit_set != 0);
+ static_cast<void>(bit_set);
+ return 31 ^ static_cast<int>(first_set_bit);
+}
+
+inline int CountLeadingZeros(uint64_t n) {
+ assert(n != 0);
+ unsigned long first_set_bit; // NOLINT(runtime/int)
+#if defined(HAVE_BITSCANREVERSE64)
+ const unsigned char bit_set =
+ _BitScanReverse64(&first_set_bit, static_cast<unsigned __int64>(n));
+#else // !defined(HAVE_BITSCANREVERSE64)
+ const auto n_hi = static_cast<unsigned long>(n >> 32); // NOLINT(runtime/int)
+ if (n_hi != 0) {
+ const unsigned char bit_set = _BitScanReverse(&first_set_bit, n_hi);
+ assert(bit_set != 0);
+ static_cast<void>(bit_set);
+ return 31 ^ static_cast<int>(first_set_bit);
+ }
+ const unsigned char bit_set = _BitScanReverse(
+ &first_set_bit, static_cast<unsigned long>(n)); // NOLINT(runtime/int)
+#endif // defined(HAVE_BITSCANREVERSE64)
+ assert(bit_set != 0);
+ static_cast<void>(bit_set);
+ return 63 ^ static_cast<int>(first_set_bit);
+}
+
+#undef HAVE_BITSCANREVERSE64
+
+inline int CountTrailingZeros(uint32_t n) {
+ assert(n != 0);
+ unsigned long first_set_bit; // NOLINT(runtime/int)
+ const unsigned char bit_set = _BitScanForward(&first_set_bit, n);
+ assert(bit_set != 0);
+ static_cast<void>(bit_set);
+ return static_cast<int>(first_set_bit);
+}
+
+#else // !defined(__GNUC__) && !defined(_MSC_VER)
+
+template <const int kMSB, typename T>
+inline int CountLeadingZeros(T n) {
+ assert(n != 0);
+ const T msb = T{1} << kMSB;
+ int count = 0;
+ while ((n & msb) == 0) {
+ ++count;
+ n <<= 1;
+ }
+ return count;
+}
+
+inline int CountLeadingZeros(uint32_t n) { return CountLeadingZeros<31>(n); }
+
+inline int CountLeadingZeros(uint64_t n) { return CountLeadingZeros<63>(n); }
+
+// This is the algorithm on the left in Figure 5-23, Hacker's Delight, Second
+// Edition, page 109. The book says:
+// If the number of trailing 0's is expected to be small or large, then the
+// simple loops shown in Figure 5-23 are quite fast.
+inline int CountTrailingZeros(uint32_t n) {
+ assert(n != 0);
+ // Create a word with 1's at the positions of the trailing 0's in |n|, and
+ // 0's elsewhere (e.g., 01011000 => 00000111).
+ n = ~n & (n - 1);
+ int count = 0;
+ while (n != 0) {
+ ++count;
+ n >>= 1;
+ }
+ return count;
+}
+
+#endif // defined(__GNUC__)
+
+inline int FloorLog2(int32_t n) {
+ assert(n > 0);
+ return 31 ^ CountLeadingZeros(static_cast<uint32_t>(n));
+}
+
+inline int FloorLog2(uint32_t n) {
+ assert(n > 0);
+ return 31 ^ CountLeadingZeros(n);
+}
+
+inline int FloorLog2(int64_t n) {
+ assert(n > 0);
+ return 63 ^ CountLeadingZeros(static_cast<uint64_t>(n));
+}
+
+inline int FloorLog2(uint64_t n) {
+ assert(n > 0);
+ return 63 ^ CountLeadingZeros(n);
+}
+
+inline int CeilLog2(unsigned int n) {
+ // The expression FloorLog2(n - 1) + 1 is undefined not only for n == 0 but
+ // also for n == 1, so this expression must be guarded by the n < 2 test. An
+ // alternative implementation is:
+ // return (n == 0) ? 0 : FloorLog2(n) + static_cast<int>((n & (n - 1)) != 0);
+ return (n < 2) ? 0 : FloorLog2(n - 1) + 1;
+}
+
+inline int RightShiftWithCeiling(int value, int bits) {
+ assert(bits > 0);
+ return (value + (1 << bits) - 1) >> bits;
+}
+
+inline int32_t RightShiftWithRounding(int32_t value, int bits) {
+ assert(bits >= 0);
+ return (value + ((1 << bits) >> 1)) >> bits;
+}
+
+inline uint32_t RightShiftWithRounding(uint32_t value, int bits) {
+ assert(bits >= 0);
+ return (value + ((1 << bits) >> 1)) >> bits;
+}
+
+// This variant is used when |value| can exceed 32 bits, although the final
+// result must always fit into int32_t.
+inline int32_t RightShiftWithRounding(int64_t value, int bits) {
+ assert(bits >= 0);
+ return static_cast<int32_t>((value + ((int64_t{1} << bits) >> 1)) >> bits);
+}
+
+inline int32_t RightShiftWithRoundingSigned(int32_t value, int bits) {
+ assert(bits > 0);
+ // The next line is equivalent to:
+ // return (value >= 0) ? RightShiftWithRounding(value, bits)
+ // : -RightShiftWithRounding(-value, bits);
+ return RightShiftWithRounding(value + (value >> 31), bits);
+}
+
+// This variant is used when |value| can exceed 32 bits, although the final
+// result must always fit into int32_t.
+inline int32_t RightShiftWithRoundingSigned(int64_t value, int bits) {
+ assert(bits > 0);
+ // The next line is equivalent to:
+ // return (value >= 0) ? RightShiftWithRounding(value, bits)
+ // : -RightShiftWithRounding(-value, bits);
+ return RightShiftWithRounding(value + (value >> 63), bits);
+}
+
+constexpr int DivideBy2(int n) { return n >> 1; }
+constexpr int DivideBy4(int n) { return n >> 2; }
+constexpr int DivideBy8(int n) { return n >> 3; }
+constexpr int DivideBy16(int n) { return n >> 4; }
+constexpr int DivideBy32(int n) { return n >> 5; }
+constexpr int DivideBy64(int n) { return n >> 6; }
+constexpr int DivideBy128(int n) { return n >> 7; }
+
+// Convert |value| to unsigned before shifting to avoid undefined behavior with
+// negative values.
+inline int LeftShift(int value, int bits) {
+ assert(bits >= 0);
+ assert(value >= -(int64_t{1} << (31 - bits)));
+ assert(value <= (int64_t{1} << (31 - bits)) - ((bits == 0) ? 1 : 0));
+ return static_cast<int>(static_cast<uint32_t>(value) << bits);
+}
+inline int MultiplyBy2(int n) { return LeftShift(n, 1); }
+inline int MultiplyBy4(int n) { return LeftShift(n, 2); }
+inline int MultiplyBy8(int n) { return LeftShift(n, 3); }
+inline int MultiplyBy16(int n) { return LeftShift(n, 4); }
+inline int MultiplyBy32(int n) { return LeftShift(n, 5); }
+inline int MultiplyBy64(int n) { return LeftShift(n, 6); }
+
+constexpr int Mod32(int n) { return n & 0x1f; }
+constexpr int Mod64(int n) { return n & 0x3f; }
+
+//------------------------------------------------------------------------------
+// Bitstream functions
+
+constexpr bool IsIntraFrame(FrameType type) {
+ return type == kFrameKey || type == kFrameIntraOnly;
+}
+
+inline TransformClass GetTransformClass(TransformType tx_type) {
+ constexpr BitMaskSet kTransformClassVerticalMask(
+ kTransformTypeIdentityDct, kTransformTypeIdentityAdst,
+ kTransformTypeIdentityFlipadst);
+ if (kTransformClassVerticalMask.Contains(tx_type)) {
+ return kTransformClassVertical;
+ }
+ constexpr BitMaskSet kTransformClassHorizontalMask(
+ kTransformTypeDctIdentity, kTransformTypeAdstIdentity,
+ kTransformTypeFlipadstIdentity);
+ if (kTransformClassHorizontalMask.Contains(tx_type)) {
+ return kTransformClassHorizontal;
+ }
+ return kTransformClass2D;
+}
+
+inline int RowOrColumn4x4ToPixel(int row_or_column4x4, Plane plane,
+ int8_t subsampling) {
+ return MultiplyBy4(row_or_column4x4) >> (plane == kPlaneY ? 0 : subsampling);
+}
+
+constexpr PlaneType GetPlaneType(Plane plane) {
+ return static_cast<PlaneType>(plane != kPlaneY);
+}
+
+// 5.11.44.
+constexpr bool IsDirectionalMode(PredictionMode mode) {
+ return mode >= kPredictionModeVertical && mode <= kPredictionModeD67;
+}
+
+// 5.9.3.
+//
+// |a| and |b| are order hints, treated as unsigned order_hint_bits-bit
+// integers. |order_hint_shift_bits| equals (32 - order_hint_bits) % 32.
+// order_hint_bits is at most 8, so |order_hint_shift_bits| is zero or a
+// value between 24 and 31 (inclusive).
+//
+// If |order_hint_shift_bits| is zero, |a| and |b| are both zeros, and the
+// result is zero. If |order_hint_shift_bits| is not zero, returns the
+// signed difference |a| - |b| using "modular arithmetic". More precisely, the
+// signed difference |a| - |b| is treated as a signed order_hint_bits-bit
+// integer and cast to an int. The returned difference is between
+// -(1 << (order_hint_bits - 1)) and (1 << (order_hint_bits - 1)) - 1
+// (inclusive).
+//
+// NOTE: |a| and |b| are the order_hint_bits least significant bits of the
+// actual values. This function returns the signed difference between the
+// actual values. The returned difference is correct as long as the actual
+// values are not more than 1 << (order_hint_bits - 1) - 1 apart.
+//
+// Example: Suppose order_hint_bits is 4 and |order_hint_shift_bits|
+// is 28. Then |a| and |b| are in the range [0, 15], and the actual values for
+// |a| and |b| must not be more than 7 apart. (If the actual values for |a| and
+// |b| are exactly 8 apart, this function cannot tell whether the actual value
+// for |a| is before or after the actual value for |b|.)
+//
+// First, consider the order hints 2 and 6. For this simple case, we have
+// GetRelativeDistance(2, 6, 28) = 2 - 6 = -4, and
+// GetRelativeDistance(6, 2, 28) = 6 - 2 = 4.
+//
+// On the other hand, consider the order hints 2 and 14. The order hints are
+// 12 (> 7) apart, so we need to use the actual values instead. The actual
+// values may be 34 (= 2 mod 16) and 30 (= 14 mod 16), respectively. Therefore
+// we have
+// GetRelativeDistance(2, 14, 28) = 34 - 30 = 4, and
+// GetRelativeDistance(14, 2, 28) = 30 - 34 = -4.
+//
+// The following comments apply only to specific CPUs' SIMD implementations,
+// such as intrinsics code.
+// For the two shift operations in this function, if the SIMD packed data is
+// 16 bits wide, try to use |order_hint_shift_bits| - 16 as the number of bits
+// to shift; if the SIMD packed data is 8 bits wide, try to use
+// |order_hint_shift_bits| - 24 as the number of bits to shift.
+// |order_hint_shift_bits| - 16 and |order_hint_shift_bits| - 24 could be -16
+// or -24. In these cases, diff is 0, and shifting left or right by -16 or -24
+// bits is defined for x86 SIMD instructions and ARM NEON instructions, and
+// the result of shifting 0 is still 0. There is no guarantee that this
+// behavior and result apply to other CPUs' SIMD instructions.
+inline int GetRelativeDistance(const unsigned int a, const unsigned int b,
+ const unsigned int order_hint_shift_bits) {
+ const int diff = a - b;
+ assert(order_hint_shift_bits <= 31);
+ if (order_hint_shift_bits == 0) {
+ assert(a == 0);
+ assert(b == 0);
+ } else {
+ assert(order_hint_shift_bits >= 24); // i.e., order_hint_bits <= 8
+ assert(a < (1u << (32 - order_hint_shift_bits)));
+ assert(b < (1u << (32 - order_hint_shift_bits)));
+ assert(diff < (1 << (32 - order_hint_shift_bits)));
+ assert(diff >= -(1 << (32 - order_hint_shift_bits)));
+ }
+ // Sign extend the result of subtracting the values.
+ // Cast to unsigned int and then left shift to avoid undefined behavior with
+ // negative values. Cast to int to do the sign extension through right shift.
+ // This requires the right shift of a signed integer be an arithmetic shift,
+ // which is true for clang, gcc, and Visual C++.
+ // These two casts do not generate extra instructions.
+ // Don't use LeftShift(diff) since a valid diff may fail its assertions.
+ // For example, in GetRelativeDistance(2, 14, 28), diff equals -12, which is
+ // less than the minimum value allowed by LeftShift(), namely -8.
+ // The next 3 lines are equivalent to:
+ // const int order_hint_bits = Mod32(32 - order_hint_shift_bits);
+ // const int m = (1 << order_hint_bits) >> 1;
+ // return (diff & (m - 1)) - (diff & m);
+ return static_cast<int>(static_cast<unsigned int>(diff)
+ << order_hint_shift_bits) >>
+ order_hint_shift_bits;
+}
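The worked examples in the comment can be checked directly; this tiny helper is not part of the library and only asserts the values given above.

#include <cassert>

#include "src/utils/common.h"

namespace libgav1 {

inline void CheckRelativeDistanceExamples() {
  assert(GetRelativeDistance(2, 6, 28) == -4);
  assert(GetRelativeDistance(6, 2, 28) == 4);
  assert(GetRelativeDistance(2, 14, 28) == 4);
  assert(GetRelativeDistance(14, 2, 28) == -4);
}

}  // namespace libgav1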
+
+// Applies |sign| (must be 0 or -1) to |value|, i.e.,
+// return (sign == 0) ? value : -value;
+// and does so without a branch.
+constexpr int ApplySign(int value, int sign) { return (value ^ sign) - sign; }
+
+// 7.9.3. (without the clamp for numerator and denominator).
+inline void GetMvProjection(const MotionVector& mv, int numerator,
+ int division_multiplier,
+ MotionVector* projection_mv) {
+ // Allow |numerator| to be 0 so that this function can be called
+ // unconditionally. When numerator is 0, |projection_mv| will be 0, and this
+ // is what we want.
+ assert(std::abs(numerator) <= kMaxFrameDistance);
+ for (int i = 0; i < 2; ++i) {
+ projection_mv->mv[i] =
+ Clip3(RightShiftWithRoundingSigned(
+ mv.mv[i] * numerator * division_multiplier, 14),
+ -kProjectionMvClamp, kProjectionMvClamp);
+ }
+}
+
+// 7.9.4.
+constexpr int Project(int value, int delta, int dst_sign) {
+ return value + ApplySign(delta / 64, dst_sign);
+}
+
+inline bool IsBlockSmallerThan8x8(BlockSize size) {
+ return size < kBlock8x8 && size != kBlock4x16;
+}
+
+// Returns true if either the width or the height of the block is equal to
+// four.
+inline bool IsBlockDimension4(BlockSize size) {
+ return size < kBlock8x8 || size == kBlock16x4;
+}
+
+// Converts bitdepth 8, 10, and 12 to array index 0, 1, and 2, respectively.
+constexpr int BitdepthToArrayIndex(int bitdepth) { return (bitdepth - 8) >> 1; }
+
+// Maps a square transform to an index in the range [0, 4]. kTransformSize4x4
+// maps to 0, kTransformSize8x8 maps to 1, and so on.
+inline int TransformSizeToSquareTransformIndex(TransformSize tx_size) {
+ assert(kTransformWidth[tx_size] == kTransformHeight[tx_size]);
+
+ // The values of the square transform sizes happen to be in the right
+ // ranges, so we can just divide them by 4 to get the indexes.
+ static_assert(
+ std::is_unsigned<std::underlying_type<TransformSize>::type>::value, "");
+ static_assert(kTransformSize4x4 < 4, "");
+ static_assert(4 <= kTransformSize8x8 && kTransformSize8x8 < 8, "");
+ static_assert(8 <= kTransformSize16x16 && kTransformSize16x16 < 12, "");
+ static_assert(12 <= kTransformSize32x32 && kTransformSize32x32 < 16, "");
+ static_assert(16 <= kTransformSize64x64 && kTransformSize64x64 < 20, "");
+ return DivideBy4(tx_size);
+}
+
+// Gets the corresponding Y/U/V position, to set and get filter masks
+// in deblock filtering.
+// Returns |luma_position| for the Y plane, whose subsampling must be 0.
+// Returns the odd position for the U/V planes when they are subsampled.
+constexpr int GetDeblockPosition(const int luma_position,
+ const int subsampling) {
+ return luma_position | subsampling;
+}
+
+// Returns the size of the residual buffer required to hold the residual values
+// for a block or frame of size |rows| by |columns| (taking into account
+// |subsampling_x|, |subsampling_y| and |residual_size|). |residual_size| is the
+// number of bytes required to represent one residual value.
+inline size_t GetResidualBufferSize(const int rows, const int columns,
+ const int subsampling_x,
+ const int subsampling_y,
+ const size_t residual_size) {
+ // The subsampling multipliers are:
+ // Both x and y are subsampled: 3 / 2.
+ // Only x or y is subsampled: 2 / 1 (which is equivalent to 4 / 2).
+ // Both x and y are not subsampled: 3 / 1 (which is equivalent to 6 / 2).
+ // So we compute the final subsampling multiplier as follows:
+ // multiplier = (2 + (4 >> subsampling_x >> subsampling_y)) / 2.
+ // Add 32 * |kResidualPaddingVertical| padding to avoid bottom boundary checks
+ // when parsing quantized coefficients.
+ const int subsampling_multiplier_num =
+ 2 + (4 >> subsampling_x >> subsampling_y);
+ const int number_elements =
+ (rows * columns * subsampling_multiplier_num) >> 1;
+ const int tx_padding = 32 * kResidualPaddingVertical;
+ return residual_size * (number_elements + tx_padding);
+}
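For instance, with 4:2:0 subsampling the multiplier works out to (2 + 1) / 2 = 3 / 2, i.e. the luma plane plus two quarter-size chroma planes. A small standalone sketch with an illustrative function name:

#include <cstdio>

#include "src/utils/common.h"

namespace libgav1 {

void PrintResidualBufferSizeExample() {
  // A 64x64 area with 4:2:0 subsampling and 2-byte (high bitdepth) residuals:
  // number_elements = 64 * 64 * 3 / 2 = 6144, plus the transform padding.
  const size_t size = GetResidualBufferSize(
      /*rows=*/64, /*columns=*/64, /*subsampling_x=*/1, /*subsampling_y=*/1,
      /*residual_size=*/2);
  printf("residual buffer bytes: %zu\n", size);
}

}  // namespace libgav1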
+
+// This function is equivalent to:
+// std::min({kTransformWidthLog2[tx_size] - 2,
+// kTransformWidthLog2[left_tx_size] - 2,
+// 2});
+constexpr LoopFilterTransformSizeId GetTransformSizeIdWidth(
+ TransformSize tx_size, TransformSize left_tx_size) {
+ return static_cast<LoopFilterTransformSizeId>(
+ static_cast<int>(tx_size > kTransformSize4x16 &&
+ left_tx_size > kTransformSize4x16) +
+ static_cast<int>(tx_size > kTransformSize8x32 &&
+ left_tx_size > kTransformSize8x32));
+}
+
+// This is used for 7.11.3.4 Block Inter Prediction Process, to select convolve
+// filters.
+inline int GetFilterIndex(const int filter_index, const int length) {
+ if (length <= 4) {
+ if (filter_index == kInterpolationFilterEightTap ||
+ filter_index == kInterpolationFilterEightTapSharp) {
+ return 4;
+ }
+ if (filter_index == kInterpolationFilterEightTapSmooth) {
+ return 5;
+ }
+ }
+ return filter_index;
+}
+
+// This has results identical to RightShiftWithRounding() since |subsampling|
+// can only be 0 or 1.
+constexpr int SubsampledValue(int value, int subsampling) {
+ return (value + subsampling) >> subsampling;
+}
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_COMMON_H_
diff --git a/src/utils/compiler_attributes.h b/src/utils/compiler_attributes.h
new file mode 100644
index 0000000..e122426
--- /dev/null
+++ b/src/utils/compiler_attributes.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_COMPILER_ATTRIBUTES_H_
+#define LIBGAV1_SRC_UTILS_COMPILER_ATTRIBUTES_H_
+
+// A collection of compiler attribute checks and defines to control for
+// compatibility across toolchains.
+
+//------------------------------------------------------------------------------
+// Language version, attribute and feature helpers.
+
+// Detect C++17 support. Visual Studio sets __cplusplus to 199711L by default
+// unless compiled with /Zc:__cplusplus, so use the value controlled by /std
+// instead.
+// https://docs.microsoft.com/en-us/cpp/build/reference/zc-cplusplus
+#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+#define LIBGAV1_CXX17 1
+#else
+#define LIBGAV1_CXX17 0
+#endif
+
+#if defined(__has_attribute)
+#define LIBGAV1_HAS_ATTRIBUTE __has_attribute
+#else
+#define LIBGAV1_HAS_ATTRIBUTE(x) 0
+#endif
+
+#if defined(__has_feature)
+#define LIBGAV1_HAS_FEATURE __has_feature
+#else
+#define LIBGAV1_HAS_FEATURE(x) 0
+#endif
+
+//------------------------------------------------------------------------------
+// Sanitizer attributes.
+
+#if LIBGAV1_HAS_FEATURE(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
+#define LIBGAV1_ASAN 1
+#else
+#define LIBGAV1_ASAN 0
+#endif
+
+#if LIBGAV1_HAS_FEATURE(memory_sanitizer)
+#define LIBGAV1_MSAN 1
+#else
+#define LIBGAV1_MSAN 0
+#endif
+
+#if LIBGAV1_HAS_FEATURE(thread_sanitizer) || defined(__SANITIZE_THREAD__)
+#define LIBGAV1_TSAN 1
+#else
+#define LIBGAV1_TSAN 0
+#endif
+
+//------------------------------------------------------------------------------
+// AddressSanitizer support.
+
+// Define the macros for AddressSanitizer manual memory poisoning. See
+// https://github.com/google/sanitizers/wiki/AddressSanitizerManualPoisoning.
+#if LIBGAV1_ASAN
+#include <sanitizer/asan_interface.h>
+#else
+#define ASAN_POISON_MEMORY_REGION(addr, size) \
+ (static_cast<void>(addr), static_cast<void>(size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size) \
+ (static_cast<void>(addr), static_cast<void>(size))
+#endif
+
+//------------------------------------------------------------------------------
+// Function attributes.
+// GCC: https://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html
+// Clang: https://clang.llvm.org/docs/AttributeReference.html
+
+#if defined(__GNUC__)
+#define LIBGAV1_ALWAYS_INLINE __attribute__((always_inline)) inline
+#elif defined(_MSC_VER)
+#define LIBGAV1_ALWAYS_INLINE __forceinline
+#else
+#define LIBGAV1_ALWAYS_INLINE inline
+#endif
+
+// LIBGAV1_MUST_USE_RESULT
+//
+// Tells the compiler to warn about unused results.
+//
+// When annotating a function, it must appear as the first part of the
+// declaration or definition. The compiler will warn if the return value from
+// such a function is unused:
+//
+// LIBGAV1_MUST_USE_RESULT Sprocket* AllocateSprocket();
+// AllocateSprocket(); // Triggers a warning.
+//
+// When annotating a class, it is equivalent to annotating every function which
+// returns an instance.
+//
+// class LIBGAV1_MUST_USE_RESULT Sprocket {};
+// Sprocket(); // Triggers a warning.
+//
+// Sprocket MakeSprocket();
+// MakeSprocket(); // Triggers a warning.
+//
+// Note that references and pointers are not instances:
+//
+// Sprocket* SprocketPointer();
+// SprocketPointer(); // Does *not* trigger a warning.
+//
+// LIBGAV1_MUST_USE_RESULT allows using cast-to-void to suppress the unused
+// result warning. To support that, warn_unused_result is used only with clang
+// and not with gcc. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66425
+#if LIBGAV1_HAS_ATTRIBUTE(nodiscard)
+#define LIBGAV1_MUST_USE_RESULT [[nodiscard]]
+#elif defined(__clang__) && LIBGAV1_HAS_ATTRIBUTE(warn_unused_result)
+#define LIBGAV1_MUST_USE_RESULT __attribute__((warn_unused_result))
+#else
+#define LIBGAV1_MUST_USE_RESULT
+#endif
+
+// LIBGAV1_PRINTF_ATTRIBUTE
+//
+// Tells the compiler to perform `printf` format string checking if the
+// compiler supports it; see the 'format' attribute in
+// <https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html>.
+//
+// Note: As the GCC manual states, "[s]ince non-static C++ methods
+// have an implicit 'this' argument, the arguments of such methods
+// should be counted from two, not one."
+#if LIBGAV1_HAS_ATTRIBUTE(format) || (defined(__GNUC__) && !defined(__clang__))
+#define LIBGAV1_PRINTF_ATTRIBUTE(string_index, first_to_check) \
+ __attribute__((__format__(__printf__, string_index, first_to_check)))
+#else
+#define LIBGAV1_PRINTF_ATTRIBUTE(string_index, first_to_check)
+#endif
+
+//------------------------------------------------------------------------------
+// Thread annotations.
+
+// LIBGAV1_GUARDED_BY()
+//
+// Documents if a shared field or global variable needs to be protected by a
+// mutex. LIBGAV1_GUARDED_BY() allows the user to specify a particular mutex
+// that should be held when accessing the annotated variable.
+//
+// Although this annotation cannot be applied to local variables, a local
+// variable and its associated mutex can often be combined into a small class
+// or struct, thereby allowing the annotation.
+//
+// Example:
+//
+// class Foo {
+// Mutex mu_;
+// int p1_ LIBGAV1_GUARDED_BY(mu_);
+// ...
+// };
+// TODO(b/132506370): this can be reenabled after a local MutexLock
+// implementation is added with proper thread annotations.
+#if 0 // LIBGAV1_HAS_ATTRIBUTE(guarded_by)
+#define LIBGAV1_GUARDED_BY(x) __attribute__((guarded_by(x)))
+#else
+#define LIBGAV1_GUARDED_BY(x)
+#endif
+
+//------------------------------------------------------------------------------
+
+#undef LIBGAV1_HAS_ATTRIBUTE
+#undef LIBGAV1_HAS_FEATURE
+
+#endif // LIBGAV1_SRC_UTILS_COMPILER_ATTRIBUTES_H_
diff --git a/src/utils/constants.cc b/src/utils/constants.cc
new file mode 100644
index 0000000..80d7acb
--- /dev/null
+++ b/src/utils/constants.cc
@@ -0,0 +1,874 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+const uint8_t k4x4WidthLog2[kMaxBlockSizes] = {0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
+ 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5};
+
+const uint8_t k4x4HeightLog2[kMaxBlockSizes] = {
+ 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4, 1, 2, 3, 4, 2, 3, 4, 5, 4, 5};
+
+const uint8_t kNum4x4BlocksWide[kMaxBlockSizes] = {
+ 1, 1, 1, 2, 2, 2, 2, 4, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32};
+
+const uint8_t kNum4x4BlocksHigh[kMaxBlockSizes] = {
+ 1, 2, 4, 1, 2, 4, 8, 1, 2, 4, 8, 16, 2, 4, 8, 16, 4, 8, 16, 32, 16, 32};
+
+const uint8_t kBlockWidthPixels[kMaxBlockSizes] = {
+ 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16,
+ 16, 32, 32, 32, 32, 64, 64, 64, 64, 128, 128};
+
+const uint8_t kBlockHeightPixels[kMaxBlockSizes] = {
+ 4, 8, 16, 4, 8, 16, 32, 4, 8, 16, 32,
+ 64, 8, 16, 32, 64, 16, 32, 64, 128, 64, 128};
+
+// 9.3 -- Partition_Subsize[]
+const BlockSize kSubSize[kMaxPartitionTypes][kMaxBlockSizes] = {
+ // kPartitionNone
+ {kBlock4x4, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x32,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x64, kBlockInvalid,
+ kBlockInvalid, kBlock128x128},
+ // kPartitionHorizontal
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x4,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x32, kBlockInvalid,
+ kBlockInvalid, kBlock128x64},
+ // kPartitionVertical
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x32,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x64, kBlockInvalid,
+ kBlockInvalid, kBlock64x128},
+ // kPartitionSplit
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x4,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x32, kBlockInvalid,
+ kBlockInvalid, kBlock64x64},
+ // kPartitionHorizontalWithTopSplit
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x4,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x32, kBlockInvalid,
+ kBlockInvalid, kBlock128x64},
+ // kPartitionHorizontalWithBottomSplit
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x4,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x32, kBlockInvalid,
+ kBlockInvalid, kBlock128x64},
+ // kPartitionVerticalWithLeftSplit
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x32,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x64, kBlockInvalid,
+ kBlockInvalid, kBlock64x128},
+ // kPartitionVerticalWithRightSplit
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x32,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x64, kBlockInvalid,
+ kBlockInvalid, kBlock64x128},
+ // kPartitionHorizontal4
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x4,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x16, kBlockInvalid,
+ kBlockInvalid, kBlockInvalid},
+ // kPartitionVertical4
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x32,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x64, kBlockInvalid,
+ kBlockInvalid, kBlockInvalid}};
+
+// 5.11.38 (implemented as a simple lookup; the first dimension is the block
+// size, the second and third are subsampling_x and subsampling_y).
+const BlockSize kPlaneResidualSize[kMaxBlockSizes][2][2] = {
+ {{kBlock4x4, kBlock4x4}, {kBlock4x4, kBlock4x4}},
+ {{kBlock4x8, kBlock4x4}, {kBlockInvalid, kBlock4x4}},
+ {{kBlock4x16, kBlock4x8}, {kBlockInvalid, kBlock4x8}},
+ {{kBlock8x4, kBlockInvalid}, {kBlock4x4, kBlock4x4}},
+ {{kBlock8x8, kBlock8x4}, {kBlock4x8, kBlock4x4}},
+ {{kBlock8x16, kBlock8x8}, {kBlockInvalid, kBlock4x8}},
+ {{kBlock8x32, kBlock8x16}, {kBlockInvalid, kBlock4x16}},
+ {{kBlock16x4, kBlockInvalid}, {kBlock8x4, kBlock8x4}},
+ {{kBlock16x8, kBlockInvalid}, {kBlock8x8, kBlock8x4}},
+ {{kBlock16x16, kBlock16x8}, {kBlock8x16, kBlock8x8}},
+ {{kBlock16x32, kBlock16x16}, {kBlockInvalid, kBlock8x16}},
+ {{kBlock16x64, kBlock16x32}, {kBlockInvalid, kBlock8x32}},
+ {{kBlock32x8, kBlockInvalid}, {kBlock16x8, kBlock16x4}},
+ {{kBlock32x16, kBlockInvalid}, {kBlock16x16, kBlock16x8}},
+ {{kBlock32x32, kBlock32x16}, {kBlock16x32, kBlock16x16}},
+ {{kBlock32x64, kBlock32x32}, {kBlockInvalid, kBlock16x32}},
+ {{kBlock64x16, kBlockInvalid}, {kBlock32x16, kBlock32x8}},
+ {{kBlock64x32, kBlockInvalid}, {kBlock32x32, kBlock32x16}},
+ {{kBlock64x64, kBlock64x32}, {kBlock32x64, kBlock32x32}},
+ {{kBlock64x128, kBlock64x64}, {kBlockInvalid, kBlock32x64}},
+ {{kBlock128x64, kBlockInvalid}, {kBlock64x64, kBlock64x32}},
+ {{kBlock128x128, kBlock128x64}, {kBlock64x128, kBlock64x64}}};
+
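As a hedged illustration of the lookup described above the table (not part of the patch itself; the CheckPlaneResidualSizeExamples name is invented, and only the constants declared in src/utils/constants.h are assumed):

```cpp
#include <cassert>

#include "src/utils/constants.h"

namespace libgav1 {

// Illustrative only: under 4:2:0 subsampling (subsampling_x = subsampling_y =
// 1) a 16x16 block has an 8x8 chroma residual; under 4:2:2 (subsampling_x = 1,
// subsampling_y = 0) it has an 8x16 chroma residual.
inline void CheckPlaneResidualSizeExamples() {
  assert(kPlaneResidualSize[kBlock16x16][1][1] == kBlock8x8);
  assert(kPlaneResidualSize[kBlock16x16][1][0] == kBlock8x16);
}

}  // namespace libgav1
```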
+const int16_t kProjectionMvDivisionLookup[kMaxFrameDistance + 1] = {
+ 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638,
+ 1489, 1365, 1260, 1170, 1092, 1024, 963, 910, 862, 819, 780,
+ 744, 712, 682, 655, 630, 606, 585, 564, 546, 528};
+
+const uint8_t kTransformWidth[kNumTransformSizes] = {
+ 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 32, 32, 32, 32, 64, 64, 64};
+
+const uint8_t kTransformHeight[kNumTransformSizes] = {
+ 4, 8, 16, 4, 8, 16, 32, 4, 8, 16, 32, 64, 8, 16, 32, 64, 16, 32, 64};
+
+const uint8_t kTransformWidth4x4[kNumTransformSizes] = {
+ 1, 1, 1, 2, 2, 2, 2, 4, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16};
+
+const uint8_t kTransformHeight4x4[kNumTransformSizes] = {
+ 1, 2, 4, 1, 2, 4, 8, 1, 2, 4, 8, 16, 2, 4, 8, 16, 4, 8, 16};
+
+const uint8_t kTransformWidthLog2[kNumTransformSizes] = {
+ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6};
+
+const uint8_t kTransformHeightLog2[kNumTransformSizes] = {
+ 2, 3, 4, 2, 3, 4, 5, 2, 3, 4, 5, 6, 3, 4, 5, 6, 4, 5, 6};
+
+// 9.3 -- Split_Tx_Size[]
+const TransformSize kSplitTransformSize[kNumTransformSizes] = {
+ kTransformSize4x4, kTransformSize4x4, kTransformSize4x8,
+ kTransformSize4x4, kTransformSize4x4, kTransformSize8x8,
+ kTransformSize8x16, kTransformSize8x4, kTransformSize8x8,
+ kTransformSize8x8, kTransformSize16x16, kTransformSize16x32,
+ kTransformSize16x8, kTransformSize16x16, kTransformSize16x16,
+ kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x32};
+
+// Square transform of size min(w,h).
+const TransformSize kTransformSizeSquareMin[kNumTransformSizes] = {
+ kTransformSize4x4, kTransformSize4x4, kTransformSize4x4,
+ kTransformSize4x4, kTransformSize8x8, kTransformSize8x8,
+ kTransformSize8x8, kTransformSize4x4, kTransformSize8x8,
+ kTransformSize16x16, kTransformSize16x16, kTransformSize16x16,
+ kTransformSize8x8, kTransformSize16x16, kTransformSize32x32,
+ kTransformSize32x32, kTransformSize16x16, kTransformSize32x32,
+ kTransformSize64x64};
+
+// Square transform of size max(w,h).
+const TransformSize kTransformSizeSquareMax[kNumTransformSizes] = {
+ kTransformSize4x4, kTransformSize8x8, kTransformSize16x16,
+ kTransformSize8x8, kTransformSize8x8, kTransformSize16x16,
+ kTransformSize32x32, kTransformSize16x16, kTransformSize16x16,
+ kTransformSize16x16, kTransformSize32x32, kTransformSize64x64,
+ kTransformSize32x32, kTransformSize32x32, kTransformSize32x32,
+ kTransformSize64x64, kTransformSize64x64, kTransformSize64x64,
+ kTransformSize64x64};
+
+const uint8_t kNumTransformTypesInSet[kNumTransformSets] = {1, 7, 5, 16, 12, 2};
+
+const uint8_t kSgrProjParams[1 << kSgrProjParamsBits][4] = {
+ {2, 12, 1, 4}, {2, 15, 1, 6}, {2, 18, 1, 8}, {2, 21, 1, 9},
+ {2, 24, 1, 10}, {2, 29, 1, 11}, {2, 36, 1, 12}, {2, 45, 1, 13},
+ {2, 56, 1, 14}, {2, 68, 1, 15}, {0, 0, 1, 5}, {0, 0, 1, 8},
+ {0, 0, 1, 11}, {0, 0, 1, 14}, {2, 30, 0, 0}, {2, 75, 0, 0}};
+
+const int8_t kSgrProjMultiplierMin[2] = {-96, -32};
+
+const int8_t kSgrProjMultiplierMax[2] = {31, 95};
+
+const int8_t kWienerTapsMin[3] = {-5, -23, -17};
+
+const int8_t kWienerTapsMax[3] = {10, 8, 46};
+
+// This was modified from Upscale_Filter as defined in AV1 Section 7.16, in
+// order to support 16-bit packed NEON operations.
+// The sign of each tap is: - + - + + - + -
+alignas(16) const uint8_t
+ kUpscaleFilterUnsigned[kSuperResFilterShifts][kSuperResFilterTaps] = {
+ {0, 0, 0, 128, 0, 0, 0, 0}, {0, 0, 1, 128, 2, 1, 0, 0},
+ {0, 1, 3, 127, 4, 2, 1, 0}, {0, 1, 4, 127, 6, 3, 1, 0},
+ {0, 2, 6, 126, 8, 3, 1, 0}, {0, 2, 7, 125, 11, 4, 1, 0},
+ {1, 2, 8, 125, 13, 5, 2, 0}, {1, 3, 9, 124, 15, 6, 2, 0},
+ {1, 3, 10, 123, 18, 6, 2, 1}, {1, 3, 11, 122, 20, 7, 3, 1},
+ {1, 4, 12, 121, 22, 8, 3, 1}, {1, 4, 13, 120, 25, 9, 3, 1},
+ {1, 4, 14, 118, 28, 9, 3, 1}, {1, 4, 15, 117, 30, 10, 4, 1},
+ {1, 5, 16, 116, 32, 11, 4, 1}, {1, 5, 16, 114, 35, 12, 4, 1},
+ {1, 5, 17, 112, 38, 12, 4, 1}, {1, 5, 18, 111, 40, 13, 5, 1},
+ {1, 5, 18, 109, 43, 14, 5, 1}, {1, 6, 19, 107, 45, 14, 5, 1},
+ {1, 6, 19, 105, 48, 15, 5, 1}, {1, 6, 19, 103, 51, 16, 5, 1},
+ {1, 6, 20, 101, 53, 16, 6, 1}, {1, 6, 20, 99, 56, 17, 6, 1},
+ {1, 6, 20, 97, 58, 17, 6, 1}, {1, 6, 20, 95, 61, 18, 6, 1},
+ {2, 7, 20, 93, 64, 18, 6, 2}, {2, 7, 20, 91, 66, 19, 6, 1},
+ {2, 7, 20, 88, 69, 19, 6, 1}, {2, 7, 20, 86, 71, 19, 6, 1},
+ {2, 7, 20, 84, 74, 20, 7, 2}, {2, 7, 20, 81, 76, 20, 7, 1},
+ {2, 7, 20, 79, 79, 20, 7, 2}, {1, 7, 20, 76, 81, 20, 7, 2},
+ {2, 7, 20, 74, 84, 20, 7, 2}, {1, 6, 19, 71, 86, 20, 7, 2},
+ {1, 6, 19, 69, 88, 20, 7, 2}, {1, 6, 19, 66, 91, 20, 7, 2},
+ {2, 6, 18, 64, 93, 20, 7, 2}, {1, 6, 18, 61, 95, 20, 6, 1},
+ {1, 6, 17, 58, 97, 20, 6, 1}, {1, 6, 17, 56, 99, 20, 6, 1},
+ {1, 6, 16, 53, 101, 20, 6, 1}, {1, 5, 16, 51, 103, 19, 6, 1},
+ {1, 5, 15, 48, 105, 19, 6, 1}, {1, 5, 14, 45, 107, 19, 6, 1},
+ {1, 5, 14, 43, 109, 18, 5, 1}, {1, 5, 13, 40, 111, 18, 5, 1},
+ {1, 4, 12, 38, 112, 17, 5, 1}, {1, 4, 12, 35, 114, 16, 5, 1},
+ {1, 4, 11, 32, 116, 16, 5, 1}, {1, 4, 10, 30, 117, 15, 4, 1},
+ {1, 3, 9, 28, 118, 14, 4, 1}, {1, 3, 9, 25, 120, 13, 4, 1},
+ {1, 3, 8, 22, 121, 12, 4, 1}, {1, 3, 7, 20, 122, 11, 3, 1},
+ {1, 2, 6, 18, 123, 10, 3, 1}, {0, 2, 6, 15, 124, 9, 3, 1},
+ {0, 2, 5, 13, 125, 8, 2, 1}, {0, 1, 4, 11, 125, 7, 2, 0},
+ {0, 1, 3, 8, 126, 6, 2, 0}, {0, 1, 3, 6, 127, 4, 1, 0},
+ {0, 1, 2, 4, 127, 3, 1, 0}, {0, 0, 1, 2, 128, 1, 0, 0},
+};
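The comment above the table gives the fixed sign pattern of the taps; the following hedged sketch (the SignedUpscaleTap helper name is invented for illustration) shows how the signed Upscale_Filter taps from the spec could be recovered from this unsigned table:

```cpp
#include <cstdint>

#include "src/utils/constants.h"

namespace libgav1 {

// Illustrative only: reapply the fixed per-tap sign pattern - + - + + - + - to
// an entry of kUpscaleFilterUnsigned to recover the signed tap value.
inline int SignedUpscaleTap(int phase, int tap) {
  static const int8_t kTapSign[kSuperResFilterTaps] = {-1, 1, -1, 1,
                                                       1, -1, 1, -1};
  return kTapSign[tap] * kUpscaleFilterUnsigned[phase][tap];
}

}  // namespace libgav1
```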
+
+alignas(8) const int8_t
+ kWarpedFilters8[3 * kWarpedPixelPrecisionShifts + 1][8] = {
+ // [-1, 0).
+ {0, 0, 127, 1, 0, 0, 0, 0},
+ {0, -1, 127, 2, 0, 0, 0, 0},
+ {1, -3, 127, 4, -1, 0, 0, 0},
+ {1, -4, 126, 6, -2, 1, 0, 0},
+ {1, -5, 126, 8, -3, 1, 0, 0},
+ {1, -6, 125, 11, -4, 1, 0, 0},
+ {1, -7, 124, 13, -4, 1, 0, 0},
+ {2, -8, 123, 15, -5, 1, 0, 0},
+ {2, -9, 122, 18, -6, 1, 0, 0},
+ {2, -10, 121, 20, -6, 1, 0, 0},
+ {2, -11, 120, 22, -7, 2, 0, 0},
+ {2, -12, 119, 25, -8, 2, 0, 0},
+ {3, -13, 117, 27, -8, 2, 0, 0},
+ {3, -13, 116, 29, -9, 2, 0, 0},
+ {3, -14, 114, 32, -10, 3, 0, 0},
+ {3, -15, 113, 35, -10, 2, 0, 0},
+ {3, -15, 111, 37, -11, 3, 0, 0},
+ {3, -16, 109, 40, -11, 3, 0, 0},
+ {3, -16, 108, 42, -12, 3, 0, 0},
+ {4, -17, 106, 45, -13, 3, 0, 0},
+ {4, -17, 104, 47, -13, 3, 0, 0},
+ {4, -17, 102, 50, -14, 3, 0, 0},
+ {4, -17, 100, 52, -14, 3, 0, 0},
+ {4, -18, 98, 55, -15, 4, 0, 0},
+ {4, -18, 96, 58, -15, 3, 0, 0},
+ {4, -18, 94, 60, -16, 4, 0, 0},
+ {4, -18, 91, 63, -16, 4, 0, 0},
+ {4, -18, 89, 65, -16, 4, 0, 0},
+ {4, -18, 87, 68, -17, 4, 0, 0},
+ {4, -18, 85, 70, -17, 4, 0, 0},
+ {4, -18, 82, 73, -17, 4, 0, 0},
+ {4, -18, 80, 75, -17, 4, 0, 0},
+ {4, -18, 78, 78, -18, 4, 0, 0},
+ {4, -17, 75, 80, -18, 4, 0, 0},
+ {4, -17, 73, 82, -18, 4, 0, 0},
+ {4, -17, 70, 85, -18, 4, 0, 0},
+ {4, -17, 68, 87, -18, 4, 0, 0},
+ {4, -16, 65, 89, -18, 4, 0, 0},
+ {4, -16, 63, 91, -18, 4, 0, 0},
+ {4, -16, 60, 94, -18, 4, 0, 0},
+ {3, -15, 58, 96, -18, 4, 0, 0},
+ {4, -15, 55, 98, -18, 4, 0, 0},
+ {3, -14, 52, 100, -17, 4, 0, 0},
+ {3, -14, 50, 102, -17, 4, 0, 0},
+ {3, -13, 47, 104, -17, 4, 0, 0},
+ {3, -13, 45, 106, -17, 4, 0, 0},
+ {3, -12, 42, 108, -16, 3, 0, 0},
+ {3, -11, 40, 109, -16, 3, 0, 0},
+ {3, -11, 37, 111, -15, 3, 0, 0},
+ {2, -10, 35, 113, -15, 3, 0, 0},
+ {3, -10, 32, 114, -14, 3, 0, 0},
+ {2, -9, 29, 116, -13, 3, 0, 0},
+ {2, -8, 27, 117, -13, 3, 0, 0},
+ {2, -8, 25, 119, -12, 2, 0, 0},
+ {2, -7, 22, 120, -11, 2, 0, 0},
+ {1, -6, 20, 121, -10, 2, 0, 0},
+ {1, -6, 18, 122, -9, 2, 0, 0},
+ {1, -5, 15, 123, -8, 2, 0, 0},
+ {1, -4, 13, 124, -7, 1, 0, 0},
+ {1, -4, 11, 125, -6, 1, 0, 0},
+ {1, -3, 8, 126, -5, 1, 0, 0},
+ {1, -2, 6, 126, -4, 1, 0, 0},
+ {0, -1, 4, 127, -3, 1, 0, 0},
+ {0, 0, 2, 127, -1, 0, 0, 0},
+ // [0, 1).
+ {0, 0, 0, 127, 1, 0, 0, 0},
+ {0, 0, -1, 127, 2, 0, 0, 0},
+ {0, 1, -3, 127, 4, -2, 1, 0},
+ {0, 1, -5, 127, 6, -2, 1, 0},
+ {0, 2, -6, 126, 8, -3, 1, 0},
+ {-1, 2, -7, 126, 11, -4, 2, -1},
+ {-1, 3, -8, 125, 13, -5, 2, -1},
+ {-1, 3, -10, 124, 16, -6, 3, -1},
+ {-1, 4, -11, 123, 18, -7, 3, -1},
+ {-1, 4, -12, 122, 20, -7, 3, -1},
+ {-1, 4, -13, 121, 23, -8, 3, -1},
+ {-2, 5, -14, 120, 25, -9, 4, -1},
+ {-1, 5, -15, 119, 27, -10, 4, -1},
+ {-1, 5, -16, 118, 30, -11, 4, -1},
+ {-2, 6, -17, 116, 33, -12, 5, -1},
+ {-2, 6, -17, 114, 35, -12, 5, -1},
+ {-2, 6, -18, 113, 38, -13, 5, -1},
+ {-2, 7, -19, 111, 41, -14, 6, -2},
+ {-2, 7, -19, 110, 43, -15, 6, -2},
+ {-2, 7, -20, 108, 46, -15, 6, -2},
+ {-2, 7, -20, 106, 49, -16, 6, -2},
+ {-2, 7, -21, 104, 51, -16, 7, -2},
+ {-2, 7, -21, 102, 54, -17, 7, -2},
+ {-2, 8, -21, 100, 56, -18, 7, -2},
+ {-2, 8, -22, 98, 59, -18, 7, -2},
+ {-2, 8, -22, 96, 62, -19, 7, -2},
+ {-2, 8, -22, 94, 64, -19, 7, -2},
+ {-2, 8, -22, 91, 67, -20, 8, -2},
+ {-2, 8, -22, 89, 69, -20, 8, -2},
+ {-2, 8, -22, 87, 72, -21, 8, -2},
+ {-2, 8, -21, 84, 74, -21, 8, -2},
+ {-2, 8, -22, 82, 77, -21, 8, -2},
+ {-2, 8, -21, 79, 79, -21, 8, -2},
+ {-2, 8, -21, 77, 82, -22, 8, -2},
+ {-2, 8, -21, 74, 84, -21, 8, -2},
+ {-2, 8, -21, 72, 87, -22, 8, -2},
+ {-2, 8, -20, 69, 89, -22, 8, -2},
+ {-2, 8, -20, 67, 91, -22, 8, -2},
+ {-2, 7, -19, 64, 94, -22, 8, -2},
+ {-2, 7, -19, 62, 96, -22, 8, -2},
+ {-2, 7, -18, 59, 98, -22, 8, -2},
+ {-2, 7, -18, 56, 100, -21, 8, -2},
+ {-2, 7, -17, 54, 102, -21, 7, -2},
+ {-2, 7, -16, 51, 104, -21, 7, -2},
+ {-2, 6, -16, 49, 106, -20, 7, -2},
+ {-2, 6, -15, 46, 108, -20, 7, -2},
+ {-2, 6, -15, 43, 110, -19, 7, -2},
+ {-2, 6, -14, 41, 111, -19, 7, -2},
+ {-1, 5, -13, 38, 113, -18, 6, -2},
+ {-1, 5, -12, 35, 114, -17, 6, -2},
+ {-1, 5, -12, 33, 116, -17, 6, -2},
+ {-1, 4, -11, 30, 118, -16, 5, -1},
+ {-1, 4, -10, 27, 119, -15, 5, -1},
+ {-1, 4, -9, 25, 120, -14, 5, -2},
+ {-1, 3, -8, 23, 121, -13, 4, -1},
+ {-1, 3, -7, 20, 122, -12, 4, -1},
+ {-1, 3, -7, 18, 123, -11, 4, -1},
+ {-1, 3, -6, 16, 124, -10, 3, -1},
+ {-1, 2, -5, 13, 125, -8, 3, -1},
+ {-1, 2, -4, 11, 126, -7, 2, -1},
+ {0, 1, -3, 8, 126, -6, 2, 0},
+ {0, 1, -2, 6, 127, -5, 1, 0},
+ {0, 1, -2, 4, 127, -3, 1, 0},
+ {0, 0, 0, 2, 127, -1, 0, 0},
+ // [1, 2).
+ {0, 0, 0, 1, 127, 0, 0, 0},
+ {0, 0, 0, -1, 127, 2, 0, 0},
+ {0, 0, 1, -3, 127, 4, -1, 0},
+ {0, 0, 1, -4, 126, 6, -2, 1},
+ {0, 0, 1, -5, 126, 8, -3, 1},
+ {0, 0, 1, -6, 125, 11, -4, 1},
+ {0, 0, 1, -7, 124, 13, -4, 1},
+ {0, 0, 2, -8, 123, 15, -5, 1},
+ {0, 0, 2, -9, 122, 18, -6, 1},
+ {0, 0, 2, -10, 121, 20, -6, 1},
+ {0, 0, 2, -11, 120, 22, -7, 2},
+ {0, 0, 2, -12, 119, 25, -8, 2},
+ {0, 0, 3, -13, 117, 27, -8, 2},
+ {0, 0, 3, -13, 116, 29, -9, 2},
+ {0, 0, 3, -14, 114, 32, -10, 3},
+ {0, 0, 3, -15, 113, 35, -10, 2},
+ {0, 0, 3, -15, 111, 37, -11, 3},
+ {0, 0, 3, -16, 109, 40, -11, 3},
+ {0, 0, 3, -16, 108, 42, -12, 3},
+ {0, 0, 4, -17, 106, 45, -13, 3},
+ {0, 0, 4, -17, 104, 47, -13, 3},
+ {0, 0, 4, -17, 102, 50, -14, 3},
+ {0, 0, 4, -17, 100, 52, -14, 3},
+ {0, 0, 4, -18, 98, 55, -15, 4},
+ {0, 0, 4, -18, 96, 58, -15, 3},
+ {0, 0, 4, -18, 94, 60, -16, 4},
+ {0, 0, 4, -18, 91, 63, -16, 4},
+ {0, 0, 4, -18, 89, 65, -16, 4},
+ {0, 0, 4, -18, 87, 68, -17, 4},
+ {0, 0, 4, -18, 85, 70, -17, 4},
+ {0, 0, 4, -18, 82, 73, -17, 4},
+ {0, 0, 4, -18, 80, 75, -17, 4},
+ {0, 0, 4, -18, 78, 78, -18, 4},
+ {0, 0, 4, -17, 75, 80, -18, 4},
+ {0, 0, 4, -17, 73, 82, -18, 4},
+ {0, 0, 4, -17, 70, 85, -18, 4},
+ {0, 0, 4, -17, 68, 87, -18, 4},
+ {0, 0, 4, -16, 65, 89, -18, 4},
+ {0, 0, 4, -16, 63, 91, -18, 4},
+ {0, 0, 4, -16, 60, 94, -18, 4},
+ {0, 0, 3, -15, 58, 96, -18, 4},
+ {0, 0, 4, -15, 55, 98, -18, 4},
+ {0, 0, 3, -14, 52, 100, -17, 4},
+ {0, 0, 3, -14, 50, 102, -17, 4},
+ {0, 0, 3, -13, 47, 104, -17, 4},
+ {0, 0, 3, -13, 45, 106, -17, 4},
+ {0, 0, 3, -12, 42, 108, -16, 3},
+ {0, 0, 3, -11, 40, 109, -16, 3},
+ {0, 0, 3, -11, 37, 111, -15, 3},
+ {0, 0, 2, -10, 35, 113, -15, 3},
+ {0, 0, 3, -10, 32, 114, -14, 3},
+ {0, 0, 2, -9, 29, 116, -13, 3},
+ {0, 0, 2, -8, 27, 117, -13, 3},
+ {0, 0, 2, -8, 25, 119, -12, 2},
+ {0, 0, 2, -7, 22, 120, -11, 2},
+ {0, 0, 1, -6, 20, 121, -10, 2},
+ {0, 0, 1, -6, 18, 122, -9, 2},
+ {0, 0, 1, -5, 15, 123, -8, 2},
+ {0, 0, 1, -4, 13, 124, -7, 1},
+ {0, 0, 1, -4, 11, 125, -6, 1},
+ {0, 0, 1, -3, 8, 126, -5, 1},
+ {0, 0, 1, -2, 6, 126, -4, 1},
+ {0, 0, 0, -1, 4, 127, -3, 1},
+ {0, 0, 0, 0, 2, 127, -1, 0},
+ // dummy, replicate row index 191.
+ {0, 0, 0, 0, 2, 127, -1, 0}};
+
+alignas(16) const int16_t
+ kWarpedFilters[3 * kWarpedPixelPrecisionShifts + 1][8] = {
+ // [-1, 0).
+ {0, 0, 127, 1, 0, 0, 0, 0},
+ {0, -1, 127, 2, 0, 0, 0, 0},
+ {1, -3, 127, 4, -1, 0, 0, 0},
+ {1, -4, 126, 6, -2, 1, 0, 0},
+ {1, -5, 126, 8, -3, 1, 0, 0},
+ {1, -6, 125, 11, -4, 1, 0, 0},
+ {1, -7, 124, 13, -4, 1, 0, 0},
+ {2, -8, 123, 15, -5, 1, 0, 0},
+ {2, -9, 122, 18, -6, 1, 0, 0},
+ {2, -10, 121, 20, -6, 1, 0, 0},
+ {2, -11, 120, 22, -7, 2, 0, 0},
+ {2, -12, 119, 25, -8, 2, 0, 0},
+ {3, -13, 117, 27, -8, 2, 0, 0},
+ {3, -13, 116, 29, -9, 2, 0, 0},
+ {3, -14, 114, 32, -10, 3, 0, 0},
+ {3, -15, 113, 35, -10, 2, 0, 0},
+ {3, -15, 111, 37, -11, 3, 0, 0},
+ {3, -16, 109, 40, -11, 3, 0, 0},
+ {3, -16, 108, 42, -12, 3, 0, 0},
+ {4, -17, 106, 45, -13, 3, 0, 0},
+ {4, -17, 104, 47, -13, 3, 0, 0},
+ {4, -17, 102, 50, -14, 3, 0, 0},
+ {4, -17, 100, 52, -14, 3, 0, 0},
+ {4, -18, 98, 55, -15, 4, 0, 0},
+ {4, -18, 96, 58, -15, 3, 0, 0},
+ {4, -18, 94, 60, -16, 4, 0, 0},
+ {4, -18, 91, 63, -16, 4, 0, 0},
+ {4, -18, 89, 65, -16, 4, 0, 0},
+ {4, -18, 87, 68, -17, 4, 0, 0},
+ {4, -18, 85, 70, -17, 4, 0, 0},
+ {4, -18, 82, 73, -17, 4, 0, 0},
+ {4, -18, 80, 75, -17, 4, 0, 0},
+ {4, -18, 78, 78, -18, 4, 0, 0},
+ {4, -17, 75, 80, -18, 4, 0, 0},
+ {4, -17, 73, 82, -18, 4, 0, 0},
+ {4, -17, 70, 85, -18, 4, 0, 0},
+ {4, -17, 68, 87, -18, 4, 0, 0},
+ {4, -16, 65, 89, -18, 4, 0, 0},
+ {4, -16, 63, 91, -18, 4, 0, 0},
+ {4, -16, 60, 94, -18, 4, 0, 0},
+ {3, -15, 58, 96, -18, 4, 0, 0},
+ {4, -15, 55, 98, -18, 4, 0, 0},
+ {3, -14, 52, 100, -17, 4, 0, 0},
+ {3, -14, 50, 102, -17, 4, 0, 0},
+ {3, -13, 47, 104, -17, 4, 0, 0},
+ {3, -13, 45, 106, -17, 4, 0, 0},
+ {3, -12, 42, 108, -16, 3, 0, 0},
+ {3, -11, 40, 109, -16, 3, 0, 0},
+ {3, -11, 37, 111, -15, 3, 0, 0},
+ {2, -10, 35, 113, -15, 3, 0, 0},
+ {3, -10, 32, 114, -14, 3, 0, 0},
+ {2, -9, 29, 116, -13, 3, 0, 0},
+ {2, -8, 27, 117, -13, 3, 0, 0},
+ {2, -8, 25, 119, -12, 2, 0, 0},
+ {2, -7, 22, 120, -11, 2, 0, 0},
+ {1, -6, 20, 121, -10, 2, 0, 0},
+ {1, -6, 18, 122, -9, 2, 0, 0},
+ {1, -5, 15, 123, -8, 2, 0, 0},
+ {1, -4, 13, 124, -7, 1, 0, 0},
+ {1, -4, 11, 125, -6, 1, 0, 0},
+ {1, -3, 8, 126, -5, 1, 0, 0},
+ {1, -2, 6, 126, -4, 1, 0, 0},
+ {0, -1, 4, 127, -3, 1, 0, 0},
+ {0, 0, 2, 127, -1, 0, 0, 0},
+ // [0, 1).
+ {0, 0, 0, 127, 1, 0, 0, 0},
+ {0, 0, -1, 127, 2, 0, 0, 0},
+ {0, 1, -3, 127, 4, -2, 1, 0},
+ {0, 1, -5, 127, 6, -2, 1, 0},
+ {0, 2, -6, 126, 8, -3, 1, 0},
+ {-1, 2, -7, 126, 11, -4, 2, -1},
+ {-1, 3, -8, 125, 13, -5, 2, -1},
+ {-1, 3, -10, 124, 16, -6, 3, -1},
+ {-1, 4, -11, 123, 18, -7, 3, -1},
+ {-1, 4, -12, 122, 20, -7, 3, -1},
+ {-1, 4, -13, 121, 23, -8, 3, -1},
+ {-2, 5, -14, 120, 25, -9, 4, -1},
+ {-1, 5, -15, 119, 27, -10, 4, -1},
+ {-1, 5, -16, 118, 30, -11, 4, -1},
+ {-2, 6, -17, 116, 33, -12, 5, -1},
+ {-2, 6, -17, 114, 35, -12, 5, -1},
+ {-2, 6, -18, 113, 38, -13, 5, -1},
+ {-2, 7, -19, 111, 41, -14, 6, -2},
+ {-2, 7, -19, 110, 43, -15, 6, -2},
+ {-2, 7, -20, 108, 46, -15, 6, -2},
+ {-2, 7, -20, 106, 49, -16, 6, -2},
+ {-2, 7, -21, 104, 51, -16, 7, -2},
+ {-2, 7, -21, 102, 54, -17, 7, -2},
+ {-2, 8, -21, 100, 56, -18, 7, -2},
+ {-2, 8, -22, 98, 59, -18, 7, -2},
+ {-2, 8, -22, 96, 62, -19, 7, -2},
+ {-2, 8, -22, 94, 64, -19, 7, -2},
+ {-2, 8, -22, 91, 67, -20, 8, -2},
+ {-2, 8, -22, 89, 69, -20, 8, -2},
+ {-2, 8, -22, 87, 72, -21, 8, -2},
+ {-2, 8, -21, 84, 74, -21, 8, -2},
+ {-2, 8, -22, 82, 77, -21, 8, -2},
+ {-2, 8, -21, 79, 79, -21, 8, -2},
+ {-2, 8, -21, 77, 82, -22, 8, -2},
+ {-2, 8, -21, 74, 84, -21, 8, -2},
+ {-2, 8, -21, 72, 87, -22, 8, -2},
+ {-2, 8, -20, 69, 89, -22, 8, -2},
+ {-2, 8, -20, 67, 91, -22, 8, -2},
+ {-2, 7, -19, 64, 94, -22, 8, -2},
+ {-2, 7, -19, 62, 96, -22, 8, -2},
+ {-2, 7, -18, 59, 98, -22, 8, -2},
+ {-2, 7, -18, 56, 100, -21, 8, -2},
+ {-2, 7, -17, 54, 102, -21, 7, -2},
+ {-2, 7, -16, 51, 104, -21, 7, -2},
+ {-2, 6, -16, 49, 106, -20, 7, -2},
+ {-2, 6, -15, 46, 108, -20, 7, -2},
+ {-2, 6, -15, 43, 110, -19, 7, -2},
+ {-2, 6, -14, 41, 111, -19, 7, -2},
+ {-1, 5, -13, 38, 113, -18, 6, -2},
+ {-1, 5, -12, 35, 114, -17, 6, -2},
+ {-1, 5, -12, 33, 116, -17, 6, -2},
+ {-1, 4, -11, 30, 118, -16, 5, -1},
+ {-1, 4, -10, 27, 119, -15, 5, -1},
+ {-1, 4, -9, 25, 120, -14, 5, -2},
+ {-1, 3, -8, 23, 121, -13, 4, -1},
+ {-1, 3, -7, 20, 122, -12, 4, -1},
+ {-1, 3, -7, 18, 123, -11, 4, -1},
+ {-1, 3, -6, 16, 124, -10, 3, -1},
+ {-1, 2, -5, 13, 125, -8, 3, -1},
+ {-1, 2, -4, 11, 126, -7, 2, -1},
+ {0, 1, -3, 8, 126, -6, 2, 0},
+ {0, 1, -2, 6, 127, -5, 1, 0},
+ {0, 1, -2, 4, 127, -3, 1, 0},
+ {0, 0, 0, 2, 127, -1, 0, 0},
+ // [1, 2).
+ {0, 0, 0, 1, 127, 0, 0, 0},
+ {0, 0, 0, -1, 127, 2, 0, 0},
+ {0, 0, 1, -3, 127, 4, -1, 0},
+ {0, 0, 1, -4, 126, 6, -2, 1},
+ {0, 0, 1, -5, 126, 8, -3, 1},
+ {0, 0, 1, -6, 125, 11, -4, 1},
+ {0, 0, 1, -7, 124, 13, -4, 1},
+ {0, 0, 2, -8, 123, 15, -5, 1},
+ {0, 0, 2, -9, 122, 18, -6, 1},
+ {0, 0, 2, -10, 121, 20, -6, 1},
+ {0, 0, 2, -11, 120, 22, -7, 2},
+ {0, 0, 2, -12, 119, 25, -8, 2},
+ {0, 0, 3, -13, 117, 27, -8, 2},
+ {0, 0, 3, -13, 116, 29, -9, 2},
+ {0, 0, 3, -14, 114, 32, -10, 3},
+ {0, 0, 3, -15, 113, 35, -10, 2},
+ {0, 0, 3, -15, 111, 37, -11, 3},
+ {0, 0, 3, -16, 109, 40, -11, 3},
+ {0, 0, 3, -16, 108, 42, -12, 3},
+ {0, 0, 4, -17, 106, 45, -13, 3},
+ {0, 0, 4, -17, 104, 47, -13, 3},
+ {0, 0, 4, -17, 102, 50, -14, 3},
+ {0, 0, 4, -17, 100, 52, -14, 3},
+ {0, 0, 4, -18, 98, 55, -15, 4},
+ {0, 0, 4, -18, 96, 58, -15, 3},
+ {0, 0, 4, -18, 94, 60, -16, 4},
+ {0, 0, 4, -18, 91, 63, -16, 4},
+ {0, 0, 4, -18, 89, 65, -16, 4},
+ {0, 0, 4, -18, 87, 68, -17, 4},
+ {0, 0, 4, -18, 85, 70, -17, 4},
+ {0, 0, 4, -18, 82, 73, -17, 4},
+ {0, 0, 4, -18, 80, 75, -17, 4},
+ {0, 0, 4, -18, 78, 78, -18, 4},
+ {0, 0, 4, -17, 75, 80, -18, 4},
+ {0, 0, 4, -17, 73, 82, -18, 4},
+ {0, 0, 4, -17, 70, 85, -18, 4},
+ {0, 0, 4, -17, 68, 87, -18, 4},
+ {0, 0, 4, -16, 65, 89, -18, 4},
+ {0, 0, 4, -16, 63, 91, -18, 4},
+ {0, 0, 4, -16, 60, 94, -18, 4},
+ {0, 0, 3, -15, 58, 96, -18, 4},
+ {0, 0, 4, -15, 55, 98, -18, 4},
+ {0, 0, 3, -14, 52, 100, -17, 4},
+ {0, 0, 3, -14, 50, 102, -17, 4},
+ {0, 0, 3, -13, 47, 104, -17, 4},
+ {0, 0, 3, -13, 45, 106, -17, 4},
+ {0, 0, 3, -12, 42, 108, -16, 3},
+ {0, 0, 3, -11, 40, 109, -16, 3},
+ {0, 0, 3, -11, 37, 111, -15, 3},
+ {0, 0, 2, -10, 35, 113, -15, 3},
+ {0, 0, 3, -10, 32, 114, -14, 3},
+ {0, 0, 2, -9, 29, 116, -13, 3},
+ {0, 0, 2, -8, 27, 117, -13, 3},
+ {0, 0, 2, -8, 25, 119, -12, 2},
+ {0, 0, 2, -7, 22, 120, -11, 2},
+ {0, 0, 1, -6, 20, 121, -10, 2},
+ {0, 0, 1, -6, 18, 122, -9, 2},
+ {0, 0, 1, -5, 15, 123, -8, 2},
+ {0, 0, 1, -4, 13, 124, -7, 1},
+ {0, 0, 1, -4, 11, 125, -6, 1},
+ {0, 0, 1, -3, 8, 126, -5, 1},
+ {0, 0, 1, -2, 6, 126, -4, 1},
+ {0, 0, 0, -1, 4, 127, -3, 1},
+ {0, 0, 0, 0, 2, 127, -1, 0},
+ // dummy, replicate row index 191.
+ {0, 0, 0, 0, 2, 127, -1, 0}};
+
+// Every value in |kSubPixelFilters| is even. Divide by 2 to simplify
+// calculations by reducing the range by 1 bit.
+alignas(8) const int8_t kHalfSubPixelFilters[6][16][8] = {
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 1, -3, 63, 4, -1, 0, 0},
+ {0, 1, -5, 61, 9, -2, 0, 0},
+ {0, 1, -6, 58, 14, -4, 1, 0},
+ {0, 1, -7, 55, 19, -5, 1, 0},
+ {0, 1, -7, 51, 24, -6, 1, 0},
+ {0, 1, -8, 47, 29, -6, 1, 0},
+ {0, 1, -7, 42, 33, -6, 1, 0},
+ {0, 1, -7, 38, 38, -7, 1, 0},
+ {0, 1, -6, 33, 42, -7, 1, 0},
+ {0, 1, -6, 29, 47, -8, 1, 0},
+ {0, 1, -6, 24, 51, -7, 1, 0},
+ {0, 1, -5, 19, 55, -7, 1, 0},
+ {0, 1, -4, 14, 58, -6, 1, 0},
+ {0, 0, -2, 9, 61, -5, 1, 0},
+ {0, 0, -1, 4, 63, -3, 1, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 1, 14, 31, 17, 1, 0, 0},
+ {0, 0, 13, 31, 18, 2, 0, 0},
+ {0, 0, 11, 31, 20, 2, 0, 0},
+ {0, 0, 10, 30, 21, 3, 0, 0},
+ {0, 0, 9, 29, 22, 4, 0, 0},
+ {0, 0, 8, 28, 23, 5, 0, 0},
+ {0, -1, 8, 27, 24, 6, 0, 0},
+ {0, -1, 7, 26, 26, 7, -1, 0},
+ {0, 0, 6, 24, 27, 8, -1, 0},
+ {0, 0, 5, 23, 28, 8, 0, 0},
+ {0, 0, 4, 22, 29, 9, 0, 0},
+ {0, 0, 3, 21, 30, 10, 0, 0},
+ {0, 0, 2, 20, 31, 11, 0, 0},
+ {0, 0, 2, 18, 31, 13, 0, 0},
+ {0, 0, 1, 17, 31, 14, 1, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {-1, 1, -3, 63, 4, -1, 1, 0},
+ {-1, 3, -6, 62, 8, -3, 2, -1},
+ {-1, 4, -9, 60, 13, -5, 3, -1},
+ {-2, 5, -11, 58, 19, -7, 3, -1},
+ {-2, 5, -11, 54, 24, -9, 4, -1},
+ {-2, 5, -12, 50, 30, -10, 4, -1},
+ {-2, 5, -12, 45, 35, -11, 5, -1},
+ {-2, 6, -12, 40, 40, -12, 6, -2},
+ {-1, 5, -11, 35, 45, -12, 5, -2},
+ {-1, 4, -10, 30, 50, -12, 5, -2},
+ {-1, 4, -9, 24, 54, -11, 5, -2},
+ {-1, 3, -7, 19, 58, -11, 5, -2},
+ {-1, 3, -5, 13, 60, -9, 4, -1},
+ {-1, 2, -3, 8, 62, -6, 3, -1},
+ {0, 1, -1, 4, 63, -3, 1, -1}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 0, 0, 60, 4, 0, 0, 0},
+ {0, 0, 0, 56, 8, 0, 0, 0},
+ {0, 0, 0, 52, 12, 0, 0, 0},
+ {0, 0, 0, 48, 16, 0, 0, 0},
+ {0, 0, 0, 44, 20, 0, 0, 0},
+ {0, 0, 0, 40, 24, 0, 0, 0},
+ {0, 0, 0, 36, 28, 0, 0, 0},
+ {0, 0, 0, 32, 32, 0, 0, 0},
+ {0, 0, 0, 28, 36, 0, 0, 0},
+ {0, 0, 0, 24, 40, 0, 0, 0},
+ {0, 0, 0, 20, 44, 0, 0, 0},
+ {0, 0, 0, 16, 48, 0, 0, 0},
+ {0, 0, 0, 12, 52, 0, 0, 0},
+ {0, 0, 0, 8, 56, 0, 0, 0},
+ {0, 0, 0, 4, 60, 0, 0, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 0, -2, 63, 4, -1, 0, 0},
+ {0, 0, -4, 61, 9, -2, 0, 0},
+ {0, 0, -5, 58, 14, -3, 0, 0},
+ {0, 0, -6, 55, 19, -4, 0, 0},
+ {0, 0, -6, 51, 24, -5, 0, 0},
+ {0, 0, -7, 47, 29, -5, 0, 0},
+ {0, 0, -6, 42, 33, -5, 0, 0},
+ {0, 0, -6, 38, 38, -6, 0, 0},
+ {0, 0, -5, 33, 42, -6, 0, 0},
+ {0, 0, -5, 29, 47, -7, 0, 0},
+ {0, 0, -5, 24, 51, -6, 0, 0},
+ {0, 0, -4, 19, 55, -6, 0, 0},
+ {0, 0, -3, 14, 58, -5, 0, 0},
+ {0, 0, -2, 9, 61, -4, 0, 0},
+ {0, 0, -1, 4, 63, -2, 0, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 0, 15, 31, 17, 1, 0, 0},
+ {0, 0, 13, 31, 18, 2, 0, 0},
+ {0, 0, 11, 31, 20, 2, 0, 0},
+ {0, 0, 10, 30, 21, 3, 0, 0},
+ {0, 0, 9, 29, 22, 4, 0, 0},
+ {0, 0, 8, 28, 23, 5, 0, 0},
+ {0, 0, 7, 27, 24, 6, 0, 0},
+ {0, 0, 6, 26, 26, 6, 0, 0},
+ {0, 0, 6, 24, 27, 7, 0, 0},
+ {0, 0, 5, 23, 28, 8, 0, 0},
+ {0, 0, 4, 22, 29, 9, 0, 0},
+ {0, 0, 3, 21, 30, 10, 0, 0},
+ {0, 0, 2, 20, 31, 11, 0, 0},
+ {0, 0, 2, 18, 31, 13, 0, 0},
+ {0, 0, 1, 17, 31, 15, 0, 0}}};
+
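A hedged check of the halving described above the table (the function name is invented): doubling any row of kHalfSubPixelFilters recovers 7-bit filter taps that sum to 128 (1 << kFilterBits), so every half-filter row sums to 64.

```cpp
#include <cassert>

#include "src/utils/constants.h"

namespace libgav1 {

// Illustrative only: each half-filter row sums to 64; doubling it yields the
// original filter whose taps sum to 128 (1 << kFilterBits).
inline void CheckHalfSubPixelFilterSums() {
  for (int filter = 0; filter < 6; ++filter) {
    for (int subpixel = 0; subpixel < 16; ++subpixel) {
      int sum = 0;
      for (int tap = 0; tap < kSubPixelTaps; ++tap) {
        sum += kHalfSubPixelFilters[filter][subpixel][tap];
      }
      assert(sum == 64);
      assert(2 * sum == (1 << kFilterBits));
    }
  }
}

}  // namespace libgav1
```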
+// Absolute values of |kHalfSubPixelFilters|. Used in situations where we know
+// the pattern of the signs and account for it in other ways.
+const uint8_t kAbsHalfSubPixelFilters[6][16][8] = {
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 1, 3, 63, 4, 1, 0, 0},
+ {0, 1, 5, 61, 9, 2, 0, 0},
+ {0, 1, 6, 58, 14, 4, 1, 0},
+ {0, 1, 7, 55, 19, 5, 1, 0},
+ {0, 1, 7, 51, 24, 6, 1, 0},
+ {0, 1, 8, 47, 29, 6, 1, 0},
+ {0, 1, 7, 42, 33, 6, 1, 0},
+ {0, 1, 7, 38, 38, 7, 1, 0},
+ {0, 1, 6, 33, 42, 7, 1, 0},
+ {0, 1, 6, 29, 47, 8, 1, 0},
+ {0, 1, 6, 24, 51, 7, 1, 0},
+ {0, 1, 5, 19, 55, 7, 1, 0},
+ {0, 1, 4, 14, 58, 6, 1, 0},
+ {0, 0, 2, 9, 61, 5, 1, 0},
+ {0, 0, 1, 4, 63, 3, 1, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 1, 14, 31, 17, 1, 0, 0},
+ {0, 0, 13, 31, 18, 2, 0, 0},
+ {0, 0, 11, 31, 20, 2, 0, 0},
+ {0, 0, 10, 30, 21, 3, 0, 0},
+ {0, 0, 9, 29, 22, 4, 0, 0},
+ {0, 0, 8, 28, 23, 5, 0, 0},
+ {0, 1, 8, 27, 24, 6, 0, 0},
+ {0, 1, 7, 26, 26, 7, 1, 0},
+ {0, 0, 6, 24, 27, 8, 1, 0},
+ {0, 0, 5, 23, 28, 8, 0, 0},
+ {0, 0, 4, 22, 29, 9, 0, 0},
+ {0, 0, 3, 21, 30, 10, 0, 0},
+ {0, 0, 2, 20, 31, 11, 0, 0},
+ {0, 0, 2, 18, 31, 13, 0, 0},
+ {0, 0, 1, 17, 31, 14, 1, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {1, 1, 3, 63, 4, 1, 1, 0},
+ {1, 3, 6, 62, 8, 3, 2, 1},
+ {1, 4, 9, 60, 13, 5, 3, 1},
+ {2, 5, 11, 58, 19, 7, 3, 1},
+ {2, 5, 11, 54, 24, 9, 4, 1},
+ {2, 5, 12, 50, 30, 10, 4, 1},
+ {2, 5, 12, 45, 35, 11, 5, 1},
+ {2, 6, 12, 40, 40, 12, 6, 2},
+ {1, 5, 11, 35, 45, 12, 5, 2},
+ {1, 4, 10, 30, 50, 12, 5, 2},
+ {1, 4, 9, 24, 54, 11, 5, 2},
+ {1, 3, 7, 19, 58, 11, 5, 2},
+ {1, 3, 5, 13, 60, 9, 4, 1},
+ {1, 2, 3, 8, 62, 6, 3, 1},
+ {0, 1, 1, 4, 63, 3, 1, 1}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 0, 0, 60, 4, 0, 0, 0},
+ {0, 0, 0, 56, 8, 0, 0, 0},
+ {0, 0, 0, 52, 12, 0, 0, 0},
+ {0, 0, 0, 48, 16, 0, 0, 0},
+ {0, 0, 0, 44, 20, 0, 0, 0},
+ {0, 0, 0, 40, 24, 0, 0, 0},
+ {0, 0, 0, 36, 28, 0, 0, 0},
+ {0, 0, 0, 32, 32, 0, 0, 0},
+ {0, 0, 0, 28, 36, 0, 0, 0},
+ {0, 0, 0, 24, 40, 0, 0, 0},
+ {0, 0, 0, 20, 44, 0, 0, 0},
+ {0, 0, 0, 16, 48, 0, 0, 0},
+ {0, 0, 0, 12, 52, 0, 0, 0},
+ {0, 0, 0, 8, 56, 0, 0, 0},
+ {0, 0, 0, 4, 60, 0, 0, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 0, 2, 63, 4, 1, 0, 0},
+ {0, 0, 4, 61, 9, 2, 0, 0},
+ {0, 0, 5, 58, 14, 3, 0, 0},
+ {0, 0, 6, 55, 19, 4, 0, 0},
+ {0, 0, 6, 51, 24, 5, 0, 0},
+ {0, 0, 7, 47, 29, 5, 0, 0},
+ {0, 0, 6, 42, 33, 5, 0, 0},
+ {0, 0, 6, 38, 38, 6, 0, 0},
+ {0, 0, 5, 33, 42, 6, 0, 0},
+ {0, 0, 5, 29, 47, 7, 0, 0},
+ {0, 0, 5, 24, 51, 6, 0, 0},
+ {0, 0, 4, 19, 55, 6, 0, 0},
+ {0, 0, 3, 14, 58, 5, 0, 0},
+ {0, 0, 2, 9, 61, 4, 0, 0},
+ {0, 0, 1, 4, 63, 2, 0, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 0, 15, 31, 17, 1, 0, 0},
+ {0, 0, 13, 31, 18, 2, 0, 0},
+ {0, 0, 11, 31, 20, 2, 0, 0},
+ {0, 0, 10, 30, 21, 3, 0, 0},
+ {0, 0, 9, 29, 22, 4, 0, 0},
+ {0, 0, 8, 28, 23, 5, 0, 0},
+ {0, 0, 7, 27, 24, 6, 0, 0},
+ {0, 0, 6, 26, 26, 6, 0, 0},
+ {0, 0, 6, 24, 27, 7, 0, 0},
+ {0, 0, 5, 23, 28, 8, 0, 0},
+ {0, 0, 4, 22, 29, 9, 0, 0},
+ {0, 0, 3, 21, 30, 10, 0, 0},
+ {0, 0, 2, 20, 31, 11, 0, 0},
+ {0, 0, 2, 18, 31, 13, 0, 0},
+ {0, 0, 1, 17, 31, 15, 0, 0}}};
+
+// 9.3 -- Dr_Intra_Derivative[]
+// This is a more compact version of the table from the spec. The value
+// angle / 2 - 1 is used as the lookup index (a small sketch follows the
+// table). Note that angle / 3 - 1 would work too, but the calculation becomes
+// more costly.
+const int16_t kDirectionalIntraPredictorDerivative[44] = {
+ // Approx angle
+ 1023, 0, // 3, ...
+ 547, // 6, ...
+ 372, 0, 0, // 9, ...
+ 273, // 14, ...
+ 215, 0, // 17, ...
+ 178, // 20, ...
+ 151, 0, // 23, ... (113 & 203 are base angles)
+ 132, // 26, ...
+ 116, 0, // 29, ...
+ 102, 0, // 32, ...
+ 90, // 36, ...
+ 80, 0, // 39, ...
+ 71, // 42, ...
+ 64, 0, // 45, ... (45 & 135 are base angles)
+ 57, // 48, ...
+ 51, 0, // 51, ...
+ 45, 0, // 54, ...
+ 40, // 58, ...
+ 35, 0, // 61, ...
+ 31, // 64, ...
+ 27, 0, // 67, ... (67 & 157 are base angles)
+ 23, // 70, ...
+ 19, 0, // 73, ...
+ 15, 0, // 76, ...
+ 11, 0, // 81, ...
+ 7, // 84, ...
+ 3, // 87, ...
+};
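A hedged sketch of the angle / 2 - 1 lookup mentioned above the table (GetDirectionalDerivative is an invented helper name; the decoder's own indexing logic may differ in detail):

```cpp
#include <cassert>
#include <cstdint>

#include "src/utils/constants.h"

namespace libgav1 {

// Illustrative only: maps a prediction angle in degrees to its derivative.
// The 45 degree base angle maps to index 45 / 2 - 1 = 21, whose entry is 64.
inline int16_t GetDirectionalDerivative(int angle) {
  return kDirectionalIntraPredictorDerivative[angle / 2 - 1];
}

inline void CheckDirectionalDerivativeExample() {
  assert(GetDirectionalDerivative(45) == 64);
}

}  // namespace libgav1
```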
+
+const uint8_t kDeblockFilterLevelIndex[kMaxPlanes][kNumLoopFilterTypes] = {
+ {0, 1}, {2, 2}, {3, 3}};
+
+} // namespace libgav1
diff --git a/src/utils/constants.h b/src/utils/constants.h
new file mode 100644
index 0000000..34cf56d
--- /dev/null
+++ b/src/utils/constants.h
@@ -0,0 +1,744 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_CONSTANTS_H_
+#define LIBGAV1_SRC_UTILS_CONSTANTS_H_
+
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/utils/bit_mask_set.h"
+
+namespace libgav1 {
+
+// Returns the number of elements between begin (inclusive) and end (inclusive).
+constexpr int EnumRangeLength(int begin, int end) { return end - begin + 1; }
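Because both endpoints are inclusive, a quick hedged check (the values are chosen only for illustration and the declaration above is assumed to be in scope):

```cpp
// Illustrative only: an inclusive range such as [2, 5] has 4 elements.
static_assert(EnumRangeLength(2, 5) == 4, "inclusive on both ends");
static_assert(EnumRangeLength(0, 0) == 1, "a single-element range");
```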
+
+enum {
+// Maximum number of threads that the library will ever create.
+#if defined(LIBGAV1_MAX_THREADS) && LIBGAV1_MAX_THREADS > 0
+ kMaxThreads = LIBGAV1_MAX_THREADS
+#else
+ kMaxThreads = 128
+#endif
+}; // anonymous enum
+
+enum {
+ kInvalidMvValue = -32768,
+ kCdfMaxProbability = 32768,
+ kBlockWidthCount = 5,
+ kMaxSegments = 8,
+ kMinQuantizer = 0,
+ kMinLossyQuantizer = 1,
+ kMaxQuantizer = 255,
+ // Quantizer matrix is used only when level < 15.
+ kNumQuantizerLevelsForQuantizerMatrix = 15,
+ kFrameLfCount = 4,
+ kMaxLoopFilterValue = 63,
+ kNum4x4In64x64 = 256,
+ kMaxAngleDelta = 3,
+ kDirectionalIntraModes = 8,
+ kMaxSuperBlockSizeLog2 = 7,
+ kMinSuperBlockSizeLog2 = 6,
+ kGlobalMotionReadControl = 3,
+ kSuperResScaleNumerator = 8,
+ kBooleanSymbolCount = 2,
+ kRestorationTypeSymbolCount = 3,
+ kSgrProjParamsBits = 4,
+ kSgrProjPrecisionBits = 7,
+ // Padding on left and right side of a restoration block.
+ // 3 is enough, but padding to 4 is more efficient, and makes the temporary
+ // source buffer 8-pixel aligned.
+ kRestorationHorizontalBorder = 4,
+ // Padding on top and bottom side of a restoration block.
+ kRestorationVerticalBorder = 2,
+ kCdefBorder = 2, // Padding on each side of a cdef block.
+ kConvolveBorderLeftTop = 3, // Left/top padding of a convolve block.
+ // Right/bottom padding of a convolve block. This needs to be 4 at minimum,
+ // but was increased to simplify the SIMD loads in
+ // ConvolveCompoundScale2D_NEON() and ConvolveScale2D_NEON().
+ kConvolveBorderRight = 8,
+ kConvolveBorderBottom = 4,
+ kSubPixelTaps = 8,
+ kWienerFilterBits = 7,
+ kWienerFilterTaps = 7,
+ kMaxPaletteSize = 8,
+ kMinPaletteSize = 2,
+ kMaxPaletteSquare = 64,
+ kBorderPixels = 64,
+ // The final blending process for film grain needs room to overwrite and read
+ // with SIMD instructions. The maximum overwrite is 7 pixels, but the border
+ // is required to be a multiple of 32 by YuvBuffer::Realloc, so that
+ // subsampled chroma borders are 16-aligned.
+ kBorderPixelsFilmGrain = 32,
+ // These constants are the minimum left, right, top, and bottom border sizes
+ // in pixels as an extension of the frame boundary. The minimum border sizes
+ // are derived from the following requirements:
+ // - Warp_C() may read up to 13 pixels before or after a row.
+ // - Warp_NEON() may read up to 13 pixels before a row. It may read up to 14
+ // pixels after a row, but the value of the last read pixel is not used.
+ // - Warp_C() and Warp_NEON() may read up to 13 pixels above the top row and
+ // 13 pixels below the bottom row.
+ kMinLeftBorderPixels = 13,
+ kMinRightBorderPixels = 13,
+ kMinTopBorderPixels = 13,
+ kMinBottomBorderPixels = 13,
+ kWarpedModelPrecisionBits = 16,
+ kMaxRefMvStackSize = 8,
+ kMaxLeastSquaresSamples = 8,
+ kMaxTemporalMvCandidates = 19,
+  // The SIMD implementations of motion vector projection functions always
+ // process 2 or 4 elements together, so we pad the corresponding buffers to
+ // size 20.
+ kMaxTemporalMvCandidatesWithPadding = 20,
+ kMaxSuperBlockSizeInPixels = 128,
+ kMaxScaledSuperBlockSizeInPixels = 128 * 2,
+ kMaxSuperBlockSizeSquareInPixels = 128 * 128,
+ kNum4x4InLoopFilterUnit = 16,
+ kNum4x4InLoopRestorationUnit = 16,
+ kProjectionMvClamp = (1 << 14) - 1, // == 16383
+ kProjectionMvMaxHorizontalOffset = 8,
+ kCdefUnitSize = 64,
+ kCdefUnitSizeWithBorders = kCdefUnitSize + 2 * kCdefBorder,
+ kRestorationUnitOffset = 8,
+ // Loop restoration's processing unit size is fixed as 64x64.
+ kRestorationUnitHeight = 64,
+ kRestorationUnitWidth = 256,
+ kRestorationUnitHeightWithBorders =
+ kRestorationUnitHeight + 2 * kRestorationVerticalBorder,
+ kRestorationUnitWidthWithBorders =
+ kRestorationUnitWidth + 2 * kRestorationHorizontalBorder,
+ kSuperResFilterBits = 6,
+ kSuperResFilterShifts = 1 << kSuperResFilterBits,
+ kSuperResFilterTaps = 8,
+ kSuperResScaleBits = 14,
+ kSuperResExtraBits = kSuperResScaleBits - kSuperResFilterBits,
+ kSuperResScaleMask = (1 << 14) - 1,
+ kSuperResHorizontalBorder = 4,
+ kSuperResVerticalBorder = 1,
+ // The SIMD implementations of superres calculate up to 15 extra upscaled
+  // pixels, which will over-read up to 15 downscaled pixels at the end of each
+ // row. Set the padding to 16 for alignment purposes.
+ kSuperResHorizontalPadding = 16,
+ // TODO(chengchen): consider merging these constants:
+  // kFilterBits, kWienerFilterBits, and kSgrProjPrecisionBits, which are all
+  // 7. They are designed to match AV1 convolution, which increases coefficient
+  // values by up to 7 bits. We could combine them and use kFilterBits only.
+ kFilterBits = 7,
+  // A sub pixel represents a pixel location that is not at an integer
+  // position. Sub pixel positions are expressed in 1/16 (1 << kSubPixelBits)
+  // units of an integer pixel. Sub pixel values are interpolated from adjacent
+  // integer pixel values; the interpolation is a filtering process (see the
+  // sketch after this enum).
+ kSubPixelBits = 4,
+ kSubPixelMask = (1 << kSubPixelBits) - 1,
+ // Precision bits when computing inter prediction locations.
+ kScaleSubPixelBits = 10,
+ kWarpParamRoundingBits = 6,
+ // Number of fractional bits of lookup in divisor lookup table.
+ kDivisorLookupBits = 8,
+ // Number of fractional bits of entries in divisor lookup table.
+ kDivisorLookupPrecisionBits = 14,
+ // Number of phases used in warped filtering.
+ kWarpedPixelPrecisionShifts = 1 << 6,
+ kResidualPaddingVertical = 4,
+ kWedgeMaskMasterSize = 64,
+ kMaxFrameDistance = 31,
+ kReferenceFrameScalePrecision = 14,
+ kNumWienerCoefficients = 3,
+ kLoopFilterMaxModeDeltas = 2,
+ kMaxCdefStrengths = 8,
+ kCdefLargeValue = 0x4000, // Used to indicate where CDEF is not available.
+ kMaxTileColumns = 64,
+ kMaxTileRows = 64,
+ kMaxOperatingPoints = 32,
+ // There can be a maximum of 4 spatial layers and 8 temporal layers.
+ kMaxLayers = 32,
+ // The cache line size should ideally be queried at run time. 64 is a common
+  // cache line size of x86 CPUs. Web searches showed the cache line size of
+  // ARM CPUs is 32 or 64 bytes, so aligning to a 64-byte boundary will work
+  // for all CPUs that we care about, even though it is excessive for some ARM
+  // CPUs.
+ //
+ // On Linux, the cache line size can be looked up with the command:
+ // getconf LEVEL1_DCACHE_LINESIZE
+ kCacheLineSize = 64,
+}; // anonymous enum
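The sub pixel sketch referenced above: a minimal, hedged illustration (the function name and the position value are invented) of splitting a position expressed in 1/16-pel units into an integer pixel and a fractional offset, where the fraction selects a row of the sub-pixel filters.

```cpp
#include <cassert>

#include "src/utils/constants.h"

namespace libgav1 {

// Illustrative only: position 37 in 1/16-pel units is pixel 2 plus a
// fractional offset of 5/16; the fraction indexes a sub-pixel filter row.
inline void CheckSubPixelDecomposition() {
  const int position = 37;  // Hypothetical position in 1/16-pel units.
  const int integer_pixel = position >> kSubPixelBits;  // == 2
  const int fraction = position & kSubPixelMask;        // == 5
  assert(integer_pixel == 2);
  assert(fraction == 5);
  assert(position == (integer_pixel << kSubPixelBits) + fraction);
}

}  // namespace libgav1
```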
+
+enum FrameType : uint8_t {
+ kFrameKey,
+ kFrameInter,
+ kFrameIntraOnly,
+ kFrameSwitch
+};
+
+enum Plane : uint8_t { kPlaneY, kPlaneU, kPlaneV };
+enum : uint8_t { kMaxPlanesMonochrome = kPlaneY + 1, kMaxPlanes = kPlaneV + 1 };
+
+// The plane types, called luma and chroma in the spec.
+enum PlaneType : uint8_t { kPlaneTypeY, kPlaneTypeUV, kNumPlaneTypes };
+
+enum ReferenceFrameType : int8_t {
+ kReferenceFrameNone = -1,
+ kReferenceFrameIntra,
+ kReferenceFrameLast,
+ kReferenceFrameLast2,
+ kReferenceFrameLast3,
+ kReferenceFrameGolden,
+ kReferenceFrameBackward,
+ kReferenceFrameAlternate2,
+ kReferenceFrameAlternate,
+ kNumReferenceFrameTypes,
+ kNumInterReferenceFrameTypes =
+ EnumRangeLength(kReferenceFrameLast, kReferenceFrameAlternate),
+ kNumForwardReferenceTypes =
+ EnumRangeLength(kReferenceFrameLast, kReferenceFrameGolden),
+ kNumBackwardReferenceTypes =
+ EnumRangeLength(kReferenceFrameBackward, kReferenceFrameAlternate)
+};
+
+enum {
+ // Unidirectional compound reference pairs that are signaled explicitly:
+ // {kReferenceFrameLast, kReferenceFrameLast2},
+ // {kReferenceFrameLast, kReferenceFrameLast3},
+ // {kReferenceFrameLast, kReferenceFrameGolden},
+ // {kReferenceFrameBackward, kReferenceFrameAlternate}
+ kExplicitUnidirectionalCompoundReferences = 4,
+ // Other unidirectional compound reference pairs:
+ // {kReferenceFrameLast2, kReferenceFrameLast3},
+ // {kReferenceFrameLast2, kReferenceFrameGolden},
+ // {kReferenceFrameLast3, kReferenceFrameGolden},
+ // {kReferenceFrameBackward, kReferenceFrameAlternate2},
+ // {kReferenceFrameAlternate2, kReferenceFrameAlternate}
+ kUnidirectionalCompoundReferences =
+ kExplicitUnidirectionalCompoundReferences + 5,
+}; // anonymous enum
+
+enum BlockSize : uint8_t {
+ kBlock4x4,
+ kBlock4x8,
+ kBlock4x16,
+ kBlock8x4,
+ kBlock8x8,
+ kBlock8x16,
+ kBlock8x32,
+ kBlock16x4,
+ kBlock16x8,
+ kBlock16x16,
+ kBlock16x32,
+ kBlock16x64,
+ kBlock32x8,
+ kBlock32x16,
+ kBlock32x32,
+ kBlock32x64,
+ kBlock64x16,
+ kBlock64x32,
+ kBlock64x64,
+ kBlock64x128,
+ kBlock128x64,
+ kBlock128x128,
+ kMaxBlockSizes,
+ kBlockInvalid
+};
+
+// Partition types. R: Recursive
+//
+// None Horizontal Vertical Split
+// +-------+ +-------+ +---+---+ +---+---+
+// | | | | | | | | R | R |
+// | | +-------+ | | | +---+---+
+// | | | | | | | | R | R |
+// +-------+ +-------+ +---+---+ +---+---+
+//
+// Horizontal Horizontal Vertical Vertical
+// with top with bottom with left with right
+// split split split split
+// +---+---+ +-------+ +---+---+ +---+---+
+// | | | | | | | | | | |
+// +---+---+ +---+---+ +---+ | | +---+
+// | | | | | | | | | | |
+// +-------+ +---+---+ +---+---+ +---+---+
+//
+// Horizontal4 Vertical4
+// +-----+ +-+-+-+
+// +-----+ | | | |
+// +-----+ | | | |
+// +-----+ +-+-+-+
+enum Partition : uint8_t {
+ kPartitionNone,
+ kPartitionHorizontal,
+ kPartitionVertical,
+ kPartitionSplit,
+ kPartitionHorizontalWithTopSplit,
+ kPartitionHorizontalWithBottomSplit,
+ kPartitionVerticalWithLeftSplit,
+ kPartitionVerticalWithRightSplit,
+ kPartitionHorizontal4,
+ kPartitionVertical4
+};
+enum : uint8_t { kMaxPartitionTypes = kPartitionVertical4 + 1 };
+
+enum PredictionMode : uint8_t {
+ // Intra prediction modes.
+ kPredictionModeDc,
+ kPredictionModeVertical,
+ kPredictionModeHorizontal,
+ kPredictionModeD45,
+ kPredictionModeD135,
+ kPredictionModeD113,
+ kPredictionModeD157,
+ kPredictionModeD203,
+ kPredictionModeD67,
+ kPredictionModeSmooth,
+ kPredictionModeSmoothVertical,
+ kPredictionModeSmoothHorizontal,
+ kPredictionModePaeth,
+ kPredictionModeChromaFromLuma,
+ // Single inter prediction modes.
+ kPredictionModeNearestMv,
+ kPredictionModeNearMv,
+ kPredictionModeGlobalMv,
+ kPredictionModeNewMv,
+ // Compound inter prediction modes.
+ kPredictionModeNearestNearestMv,
+ kPredictionModeNearNearMv,
+ kPredictionModeNearestNewMv,
+ kPredictionModeNewNearestMv,
+ kPredictionModeNearNewMv,
+ kPredictionModeNewNearMv,
+ kPredictionModeGlobalGlobalMv,
+ kPredictionModeNewNewMv,
+ kNumPredictionModes,
+ kNumCompoundInterPredictionModes =
+ EnumRangeLength(kPredictionModeNearestNearestMv, kPredictionModeNewNewMv),
+ kIntraPredictionModesY =
+ EnumRangeLength(kPredictionModeDc, kPredictionModePaeth),
+ kIntraPredictionModesUV =
+ EnumRangeLength(kPredictionModeDc, kPredictionModeChromaFromLuma),
+ kPredictionModeInvalid = 255
+};
+
+enum InterIntraMode : uint8_t {
+ kInterIntraModeDc,
+ kInterIntraModeVertical,
+ kInterIntraModeHorizontal,
+ kInterIntraModeSmooth,
+ kNumInterIntraModes
+};
+
+enum MotionMode : uint8_t {
+ kMotionModeSimple,
+ kMotionModeObmc, // Overlapped block motion compensation.
+ kMotionModeLocalWarp,
+ kNumMotionModes
+};
+
+enum TxMode : uint8_t {
+ kTxModeOnly4x4,
+ kTxModeLargest,
+ kTxModeSelect,
+ kNumTxModes
+};
+
+// These enums are named as kType1Type2 where Type1 is the transform type for
+// the rows and Type2 is the transform type for the columns.
+enum TransformType : uint8_t {
+ kTransformTypeDctDct,
+ kTransformTypeAdstDct,
+ kTransformTypeDctAdst,
+ kTransformTypeAdstAdst,
+ kTransformTypeFlipadstDct,
+ kTransformTypeDctFlipadst,
+ kTransformTypeFlipadstFlipadst,
+ kTransformTypeAdstFlipadst,
+ kTransformTypeFlipadstAdst,
+ kTransformTypeIdentityIdentity,
+ kTransformTypeIdentityDct,
+ kTransformTypeDctIdentity,
+ kTransformTypeIdentityAdst,
+ kTransformTypeAdstIdentity,
+ kTransformTypeIdentityFlipadst,
+ kTransformTypeFlipadstIdentity,
+ kNumTransformTypes
+};
+
+constexpr BitMaskSet kTransformFlipColumnsMask(kTransformTypeFlipadstDct,
+ kTransformTypeFlipadstAdst,
+ kTransformTypeFlipadstIdentity,
+ kTransformTypeFlipadstFlipadst);
+constexpr BitMaskSet kTransformFlipRowsMask(kTransformTypeDctFlipadst,
+ kTransformTypeAdstFlipadst,
+ kTransformTypeIdentityFlipadst,
+ kTransformTypeFlipadstFlipadst);
+
+enum TransformSize : uint8_t {
+ kTransformSize4x4,
+ kTransformSize4x8,
+ kTransformSize4x16,
+ kTransformSize8x4,
+ kTransformSize8x8,
+ kTransformSize8x16,
+ kTransformSize8x32,
+ kTransformSize16x4,
+ kTransformSize16x8,
+ kTransformSize16x16,
+ kTransformSize16x32,
+ kTransformSize16x64,
+ kTransformSize32x8,
+ kTransformSize32x16,
+ kTransformSize32x32,
+ kTransformSize32x64,
+ kTransformSize64x16,
+ kTransformSize64x32,
+ kTransformSize64x64,
+ kNumTransformSizes
+};
+
+enum TransformSet : uint8_t {
+ // DCT Only (1).
+ kTransformSetDctOnly,
+ // 2D-DCT and 2D-ADST without flip (4) + Identity (1) + 1D Horizontal/Vertical
+ // DCT (2) = Total (7).
+ kTransformSetIntra1,
+ // 2D-DCT and 2D-ADST without flip (4) + Identity (1) = Total (5).
+ kTransformSetIntra2,
+ // All transforms = Total (16).
+ kTransformSetInter1,
+ // 2D-DCT and 2D-ADST with flip (9) + Identity (1) + 1D Horizontal/Vertical
+ // DCT (2) = Total (12).
+ kTransformSetInter2,
+ // DCT (1) + Identity (1) = Total (2).
+ kTransformSetInter3,
+ kNumTransformSets
+};
+
+enum TransformClass : uint8_t {
+ kTransformClass2D,
+ kTransformClassHorizontal,
+ kTransformClassVertical,
+ kNumTransformClasses
+};
+
+enum FilterIntraPredictor : uint8_t {
+ kFilterIntraPredictorDc,
+ kFilterIntraPredictorVertical,
+ kFilterIntraPredictorHorizontal,
+ kFilterIntraPredictorD157,
+ kFilterIntraPredictorPaeth,
+ kNumFilterIntraPredictors
+};
+
+enum ObmcDirection : uint8_t {
+ kObmcDirectionVertical,
+ kObmcDirectionHorizontal,
+ kNumObmcDirections
+};
+
+// In AV1 the name of the filter refers to the direction of filter application.
+// Horizontal refers to the column edge and vertical to the row edge.
+enum LoopFilterType : uint8_t {
+ kLoopFilterTypeVertical,
+ kLoopFilterTypeHorizontal,
+ kNumLoopFilterTypes
+};
+
+enum LoopFilterTransformSizeId : uint8_t {
+ kLoopFilterTransformSizeId4x4,
+ kLoopFilterTransformSizeId8x8,
+ kLoopFilterTransformSizeId16x16,
+ kNumLoopFilterTransformSizeIds
+};
+
+enum LoopRestorationType : uint8_t {
+ kLoopRestorationTypeNone,
+ kLoopRestorationTypeSwitchable,
+ kLoopRestorationTypeWiener,
+ kLoopRestorationTypeSgrProj, // self guided projection filter.
+ kNumLoopRestorationTypes
+};
+
+enum CompoundReferenceType : uint8_t {
+ kCompoundReferenceUnidirectional,
+ kCompoundReferenceBidirectional,
+ kNumCompoundReferenceTypes
+};
+
+enum CompoundPredictionType : uint8_t {
+ kCompoundPredictionTypeWedge,
+ kCompoundPredictionTypeDiffWeighted,
+ kCompoundPredictionTypeAverage,
+ kCompoundPredictionTypeIntra,
+ kCompoundPredictionTypeDistance,
+ kNumCompoundPredictionTypes,
+ // Number of compound prediction types that are explicitly signaled in the
+ // bitstream (in the compound_type syntax element).
+ kNumExplicitCompoundPredictionTypes = 2
+};
+
+enum InterpolationFilter : uint8_t {
+ kInterpolationFilterEightTap,
+ kInterpolationFilterEightTapSmooth,
+ kInterpolationFilterEightTapSharp,
+ kInterpolationFilterBilinear,
+ kInterpolationFilterSwitchable,
+ kNumInterpolationFilters,
+ // Number of interpolation filters that can be explicitly signaled in the
+ // compressed headers (when the uncompressed headers allow switchable
+ // interpolation filters) of the bitstream.
+ kNumExplicitInterpolationFilters = EnumRangeLength(
+ kInterpolationFilterEightTap, kInterpolationFilterEightTapSharp)
+};
+
+enum MvJointType : uint8_t {
+ kMvJointTypeZero,
+ kMvJointTypeHorizontalNonZeroVerticalZero,
+ kMvJointTypeHorizontalZeroVerticalNonZero,
+ kMvJointTypeNonZero,
+ kNumMvJointTypes
+};
+
+enum ObuType : int8_t {
+ kObuInvalid = -1,
+ kObuSequenceHeader = 1,
+ kObuTemporalDelimiter = 2,
+ kObuFrameHeader = 3,
+ kObuTileGroup = 4,
+ kObuMetadata = 5,
+ kObuFrame = 6,
+ kObuRedundantFrameHeader = 7,
+ kObuTileList = 8,
+ kObuPadding = 15,
+};
+
+//------------------------------------------------------------------------------
+// ToString()
+//
+// These functions are meant to be used only in debug logging and within tests.
+// They are defined inline to avoid including the strings in the release
+// library when logging is disabled; unreferenced functions will not be added to
+// any object file in that case.
+
+inline const char* ToString(const BlockSize size) {
+ switch (size) {
+ case kBlock4x4:
+ return "kBlock4x4";
+ case kBlock4x8:
+ return "kBlock4x8";
+ case kBlock4x16:
+ return "kBlock4x16";
+ case kBlock8x4:
+ return "kBlock8x4";
+ case kBlock8x8:
+ return "kBlock8x8";
+ case kBlock8x16:
+ return "kBlock8x16";
+ case kBlock8x32:
+ return "kBlock8x32";
+ case kBlock16x4:
+ return "kBlock16x4";
+ case kBlock16x8:
+ return "kBlock16x8";
+ case kBlock16x16:
+ return "kBlock16x16";
+ case kBlock16x32:
+ return "kBlock16x32";
+ case kBlock16x64:
+ return "kBlock16x64";
+ case kBlock32x8:
+ return "kBlock32x8";
+ case kBlock32x16:
+ return "kBlock32x16";
+ case kBlock32x32:
+ return "kBlock32x32";
+ case kBlock32x64:
+ return "kBlock32x64";
+ case kBlock64x16:
+ return "kBlock64x16";
+ case kBlock64x32:
+ return "kBlock64x32";
+ case kBlock64x64:
+ return "kBlock64x64";
+ case kBlock64x128:
+ return "kBlock64x128";
+ case kBlock128x64:
+ return "kBlock128x64";
+ case kBlock128x128:
+ return "kBlock128x128";
+ case kMaxBlockSizes:
+ return "kMaxBlockSizes";
+ case kBlockInvalid:
+ return "kBlockInvalid";
+ }
+ abort();
+}
+
+inline const char* ToString(const InterIntraMode mode) {
+ switch (mode) {
+ case kInterIntraModeDc:
+ return "kInterIntraModeDc";
+ case kInterIntraModeVertical:
+ return "kInterIntraModeVertical";
+ case kInterIntraModeHorizontal:
+ return "kInterIntraModeHorizontal";
+ case kInterIntraModeSmooth:
+ return "kInterIntraModeSmooth";
+ case kNumInterIntraModes:
+ return "kNumInterIntraModes";
+ }
+ abort();
+}
+
+inline const char* ToString(const ObmcDirection direction) {
+ switch (direction) {
+ case kObmcDirectionVertical:
+ return "kObmcDirectionVertical";
+ case kObmcDirectionHorizontal:
+ return "kObmcDirectionHorizontal";
+ case kNumObmcDirections:
+ return "kNumObmcDirections";
+ }
+ abort();
+}
+
+inline const char* ToString(const LoopRestorationType type) {
+ switch (type) {
+ case kLoopRestorationTypeNone:
+ return "kLoopRestorationTypeNone";
+ case kLoopRestorationTypeSwitchable:
+ return "kLoopRestorationTypeSwitchable";
+ case kLoopRestorationTypeWiener:
+ return "kLoopRestorationTypeWiener";
+ case kLoopRestorationTypeSgrProj:
+ return "kLoopRestorationTypeSgrProj";
+ case kNumLoopRestorationTypes:
+ return "kNumLoopRestorationTypes";
+ }
+ abort();
+}
+
+inline const char* ToString(const TransformType type) {
+ switch (type) {
+ case kTransformTypeDctDct:
+ return "kTransformTypeDctDct";
+ case kTransformTypeAdstDct:
+ return "kTransformTypeAdstDct";
+ case kTransformTypeDctAdst:
+ return "kTransformTypeDctAdst";
+ case kTransformTypeAdstAdst:
+ return "kTransformTypeAdstAdst";
+ case kTransformTypeFlipadstDct:
+ return "kTransformTypeFlipadstDct";
+ case kTransformTypeDctFlipadst:
+ return "kTransformTypeDctFlipadst";
+ case kTransformTypeFlipadstFlipadst:
+ return "kTransformTypeFlipadstFlipadst";
+ case kTransformTypeAdstFlipadst:
+ return "kTransformTypeAdstFlipadst";
+ case kTransformTypeFlipadstAdst:
+ return "kTransformTypeFlipadstAdst";
+ case kTransformTypeIdentityIdentity:
+ return "kTransformTypeIdentityIdentity";
+ case kTransformTypeIdentityDct:
+ return "kTransformTypeIdentityDct";
+ case kTransformTypeDctIdentity:
+ return "kTransformTypeDctIdentity";
+ case kTransformTypeIdentityAdst:
+ return "kTransformTypeIdentityAdst";
+ case kTransformTypeAdstIdentity:
+ return "kTransformTypeAdstIdentity";
+ case kTransformTypeIdentityFlipadst:
+ return "kTransformTypeIdentityFlipadst";
+ case kTransformTypeFlipadstIdentity:
+ return "kTransformTypeFlipadstIdentity";
+    // Case to quiet the compiler.
+ case kNumTransformTypes:
+ return "kNumTransformTypes";
+ }
+ abort();
+}
+
+//------------------------------------------------------------------------------
+
+extern const uint8_t k4x4WidthLog2[kMaxBlockSizes];
+
+extern const uint8_t k4x4HeightLog2[kMaxBlockSizes];
+
+extern const uint8_t kNum4x4BlocksWide[kMaxBlockSizes];
+
+extern const uint8_t kNum4x4BlocksHigh[kMaxBlockSizes];
+
+extern const uint8_t kBlockWidthPixels[kMaxBlockSizes];
+
+extern const uint8_t kBlockHeightPixels[kMaxBlockSizes];
+
+extern const BlockSize kSubSize[kMaxPartitionTypes][kMaxBlockSizes];
+
+extern const BlockSize kPlaneResidualSize[kMaxBlockSizes][2][2];
+
+extern const int16_t kProjectionMvDivisionLookup[kMaxFrameDistance + 1];
+
+extern const uint8_t kTransformWidth[kNumTransformSizes];
+
+extern const uint8_t kTransformHeight[kNumTransformSizes];
+
+extern const uint8_t kTransformWidth4x4[kNumTransformSizes];
+
+extern const uint8_t kTransformHeight4x4[kNumTransformSizes];
+
+extern const uint8_t kTransformWidthLog2[kNumTransformSizes];
+
+extern const uint8_t kTransformHeightLog2[kNumTransformSizes];
+
+extern const TransformSize kSplitTransformSize[kNumTransformSizes];
+
+// Square transform of size min(w,h).
+extern const TransformSize kTransformSizeSquareMin[kNumTransformSizes];
+
+// Square transform of size max(w,h).
+extern const TransformSize kTransformSizeSquareMax[kNumTransformSizes];
+
+extern const uint8_t kNumTransformTypesInSet[kNumTransformSets];
+
+extern const uint8_t kSgrProjParams[1 << kSgrProjParamsBits][4];
+
+extern const int8_t kSgrProjMultiplierMin[2];
+
+extern const int8_t kSgrProjMultiplierMax[2];
+
+extern const int8_t kWienerTapsMin[3];
+
+extern const int8_t kWienerTapsMax[3];
+
+extern const uint8_t kUpscaleFilterUnsigned[kSuperResFilterShifts]
+ [kSuperResFilterTaps];
+
+// An int8_t version of the kWarpedFilters array.
+// Note: The array could be removed with a performance penalty.
+extern const int8_t kWarpedFilters8[3 * kWarpedPixelPrecisionShifts + 1][8];
+
+extern const int16_t kWarpedFilters[3 * kWarpedPixelPrecisionShifts + 1][8];
+
+extern const int8_t kHalfSubPixelFilters[6][16][8];
+
+extern const uint8_t kAbsHalfSubPixelFilters[6][16][8];
+
+extern const int16_t kDirectionalIntraPredictorDerivative[44];
+
+extern const uint8_t kDeblockFilterLevelIndex[kMaxPlanes][kNumLoopFilterTypes];
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_CONSTANTS_H_
diff --git a/src/utils/cpu.cc b/src/utils/cpu.cc
new file mode 100644
index 0000000..a6b7057
--- /dev/null
+++ b/src/utils/cpu.cc
@@ -0,0 +1,84 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/cpu.h"
+
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#include <cpuid.h>
+#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+#include <immintrin.h> // _xgetbv
+#include <intrin.h>
+#endif
+
+namespace libgav1 {
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
+ defined(_M_X64)
+namespace {
+
+#if defined(__GNUC__)
+void CpuId(int leaf, uint32_t info[4]) {
+ __cpuid_count(leaf, 0 /*ecx=subleaf*/, info[0], info[1], info[2], info[3]);
+}
+
+uint64_t Xgetbv() {
+ const uint32_t ecx = 0; // ecx specifies the extended control register
+ uint32_t eax;
+ uint32_t edx;
+ __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(ecx));
+ return (static_cast<uint64_t>(edx) << 32) | eax;
+}
+#else // _MSC_VER
+void CpuId(int leaf, uint32_t info[4]) {
+ __cpuidex(reinterpret_cast<int*>(info), leaf, 0 /*ecx=subleaf*/);
+}
+
+uint64_t Xgetbv() { return _xgetbv(0); }
+#endif // __GNUC__
+
+} // namespace
+
+uint32_t GetCpuInfo() {
+ uint32_t info[4];
+
+ // Get the highest feature value cpuid supports
+ CpuId(0, info);
+ const int max_cpuid_value = info[0];
+ if (max_cpuid_value < 1) return 0;
+
+ CpuId(1, info);
+ uint32_t features = 0;
+ if ((info[3] & (1 << 26)) != 0) features |= kSSE2;
+ if ((info[2] & (1 << 9)) != 0) features |= kSSSE3;
+ if ((info[2] & (1 << 19)) != 0) features |= kSSE4_1;
+
+ // Bits 27 (OSXSAVE) & 28 (256-bit AVX)
+ if ((info[2] & (3 << 27)) == (3 << 27)) {
+ // XMM state and YMM state enabled by the OS
+ if ((Xgetbv() & 0x6) == 0x6) {
+ features |= kAVX;
+ if (max_cpuid_value >= 7) {
+ CpuId(7, info);
+ if ((info[1] & (1 << 5)) != 0) features |= kAVX2;
+ }
+ }
+ }
+
+ return features;
+}
+#else
+uint32_t GetCpuInfo() { return 0; }
+#endif // x86 || x86_64
+
+} // namespace libgav1
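A hedged usage sketch for GetCpuInfo(): callers test the returned mask against the CpuFeatures flags declared in src/utils/cpu.h (below) to choose a SIMD path. The program and its messages are illustrative only.

```cpp
#include <cstdint>
#include <cstdio>

#include "src/utils/cpu.h"

int main() {
  const uint32_t features = libgav1::GetCpuInfo();
  if ((features & libgav1::kAVX2) != 0) {
    std::printf("AVX2 code paths are available.\n");
  } else if ((features & libgav1::kSSE4_1) != 0) {
    std::printf("SSE4.1 code paths are available.\n");
  } else {
    std::printf("Falling back to portable C code paths.\n");
  }
  return 0;
}
```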
diff --git a/src/utils/cpu.h b/src/utils/cpu.h
new file mode 100644
index 0000000..630b251
--- /dev/null
+++ b/src/utils/cpu.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_CPU_H_
+#define LIBGAV1_SRC_UTILS_CPU_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+
+#if defined(__i386__) || defined(__x86_64__)
+#define LIBGAV1_X86
+#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+#define LIBGAV1_X86
+#define LIBGAV1_X86_MSVC
+#endif
+
+#if defined(LIBGAV1_X86)
+
+#if !defined(LIBGAV1_ENABLE_SSE4_1)
+#define LIBGAV1_ENABLE_SSE4_1 1
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+#if !defined(LIBGAV1_ENABLE_AVX2)
+#define LIBGAV1_ENABLE_AVX2 1
+#endif // !defined(LIBGAV1_ENABLE_AVX2)
+#else // !LIBGAV1_ENABLE_SSE4_1
+// Disable AVX2 when SSE4.1 is disabled as it may rely on shared components.
+#undef LIBGAV1_ENABLE_AVX2
+#define LIBGAV1_ENABLE_AVX2 0
+#endif // LIBGAV1_ENABLE_SSE4_1
+
+#else // !LIBGAV1_X86
+
+#undef LIBGAV1_ENABLE_AVX2
+#define LIBGAV1_ENABLE_AVX2 0
+#undef LIBGAV1_ENABLE_SSE4_1
+#define LIBGAV1_ENABLE_SSE4_1 0
+
+#endif // LIBGAV1_X86
+
+// For x86, LIBGAV1_TARGETING_* indicates that the source being built targets
+// (at least) that instruction set. This prevents disabling other instruction
+// sets when the current instruction set isn't a global target, e.g., when
+// building *_avx2.cc with -mavx2 but the remaining files without that flag.
+#if LIBGAV1_ENABLE_AVX2 && defined(__AVX2__)
+#define LIBGAV1_TARGETING_AVX2 1
+#else
+#define LIBGAV1_TARGETING_AVX2 0
+#endif
+
+// Note: using LIBGAV1_X86_MSVC here isn't completely correct for Visual
+// Studio, but MSVC provides no equivalent of __SSE4_1__.
+// LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS will be enabled in dsp.h to compensate.
+#if LIBGAV1_ENABLE_SSE4_1 && (defined(__SSE4_1__) || defined(LIBGAV1_X86_MSVC))
+#define LIBGAV1_TARGETING_SSE4_1 1
+#else
+#define LIBGAV1_TARGETING_SSE4_1 0
+#endif
+
+#undef LIBGAV1_X86
+
+#if !defined(LIBGAV1_ENABLE_NEON)
+// TODO(jzern): add support for _M_ARM64.
+#if defined(__ARM_NEON__) || defined(__aarch64__) || \
+ (defined(_MSC_VER) && defined(_M_ARM))
+#define LIBGAV1_ENABLE_NEON 1
+#else
+#define LIBGAV1_ENABLE_NEON 0
+#endif
+#endif // !defined(LIBGAV1_ENABLE_NEON)
+
+enum CpuFeatures : uint8_t {
+ kSSE2 = 1 << 0,
+#define LIBGAV1_CPU_SSE2 (1 << 0)
+ kSSSE3 = 1 << 1,
+#define LIBGAV1_CPU_SSSE3 (1 << 1)
+ kSSE4_1 = 1 << 2,
+#define LIBGAV1_CPU_SSE4_1 (1 << 2)
+ kAVX = 1 << 3,
+#define LIBGAV1_CPU_AVX (1 << 3)
+ kAVX2 = 1 << 4,
+#define LIBGAV1_CPU_AVX2 (1 << 4)
+ kNEON = 1 << 5,
+#define LIBGAV1_CPU_NEON (1 << 5)
+};
+
+// Returns a bit-wise OR of CpuFeatures supported by this platform.
+uint32_t GetCpuInfo();
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_CPU_H_
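
The LIBGAV1_TARGETING_* macros defined above are meant to guard whole translation units that are compiled with per-file instruction-set flags. A sketch of that pattern, with a hypothetical file name and placeholder comments (the real per-ISA dsp sources follow a similar shape):

// some_filter_sse4.cc (hypothetical), compiled with -msse4.1 or under MSVC.
#include "src/utils/cpu.h"

#if LIBGAV1_TARGETING_SSE4_1
#include <smmintrin.h>

namespace libgav1 {

// SSE4.1-specific kernels would live here. Because the guard tests what this
// translation unit itself targets, files built without -msse4.1 keep their
// own (non-SSE4.1) code paths enabled.

}  // namespace libgav1
#endif  // LIBGAV1_TARGETING_SSE4_1
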
diff --git a/src/utils/dynamic_buffer.h b/src/utils/dynamic_buffer.h
new file mode 100644
index 0000000..b51345a
--- /dev/null
+++ b/src/utils/dynamic_buffer.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_DYNAMIC_BUFFER_H_
+#define LIBGAV1_SRC_UTILS_DYNAMIC_BUFFER_H_
+
+#include <memory>
+#include <new>
+
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+template <typename T>
+class DynamicBuffer {
+ public:
+ T* get() { return buffer_.get(); }
+ const T* get() const { return buffer_.get(); }
+
+ // Resizes the buffer so that it can hold at least |size| elements. Existing
+ // contents will be destroyed when resizing to a larger size.
+ //
+ // Returns true on success. If Resize() returns false, then subsequent calls
+ // to get() will return nullptr.
+ bool Resize(size_t size) {
+ if (size <= size_) return true;
+ buffer_.reset(new (std::nothrow) T[size]);
+ if (buffer_ == nullptr) {
+ size_ = 0;
+ return false;
+ }
+ size_ = size;
+ return true;
+ }
+
+ private:
+ std::unique_ptr<T[]> buffer_;
+ size_t size_ = 0;
+};
+
+template <typename T, int alignment>
+class AlignedDynamicBuffer {
+ public:
+ T* get() { return buffer_.get(); }
+
+ // Resizes the buffer so that it can hold at least |size| elements. Existing
+ // contents will be destroyed when resizing to a larger size.
+ //
+ // Returns true on success. If Resize() returns false, then subsequent calls
+ // to get() will return nullptr.
+ bool Resize(size_t size) {
+ if (size <= size_) return true;
+ buffer_ = MakeAlignedUniquePtr<T>(alignment, size);
+ if (buffer_ == nullptr) {
+ size_ = 0;
+ return false;
+ }
+ size_ = size;
+ return true;
+ }
+
+ private:
+ AlignedUniquePtr<T> buffer_;
+ size_t size_ = 0;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_DYNAMIC_BUFFER_H_
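
A short usage sketch; the element type and sizes are arbitrary. Resize() is checked on every call because a failed allocation leaves get() returning nullptr, while a request no larger than the current capacity reuses the existing allocation:

#include <cstdint>

#include "src/utils/dynamic_buffer.h"

bool ScratchBufferExample(size_t size) {
  libgav1::DynamicBuffer<int16_t> scratch;
  if (!scratch.Resize(size)) return false;  // allocation failure
  int16_t* const data = scratch.get();
  // Shrinking (or equal-sized) requests are no-ops, so |data| stays valid.
  if (!scratch.Resize(size / 2)) return false;
  return data == scratch.get();  // true: no reallocation happened
}
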
diff --git a/src/utils/entropy_decoder.cc b/src/utils/entropy_decoder.cc
new file mode 100644
index 0000000..bf21199
--- /dev/null
+++ b/src/utils/entropy_decoder.cc
@@ -0,0 +1,1117 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/entropy_decoder.h"
+
+#include <cassert>
+#include <cstring>
+
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+#if defined(__ARM_NEON__) || defined(__aarch64__) || \
+ (defined(_MSC_VER) && defined(_M_ARM))
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 1
+#else
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 0
+#endif
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+#include <arm_neon.h>
+#endif
+
+#if defined(__SSE2__) || defined(LIBGAV1_X86_MSVC)
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 1
+#else
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 0
+#endif
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+#include <emmintrin.h>
+#endif
+
+namespace libgav1 {
+namespace {
+
+constexpr uint32_t kReadBitMask = ~255;
+constexpr int kCdfPrecision = 6;
+constexpr int kMinimumProbabilityPerSymbol = 4;
+
+// This function computes the "cur" variable as specified inside the do-while
+// loop in Section 8.2.6 of the spec. Its return value decreases monotonically
+// as |index| increases (note that the |cdf| array is sorted in decreasing
+// order).
+uint32_t ScaleCdf(uint32_t values_in_range_shifted, const uint16_t* const cdf,
+ int index, int symbol_count) {
+ return ((values_in_range_shifted * (cdf[index] >> kCdfPrecision)) >> 1) +
+ (kMinimumProbabilityPerSymbol * (symbol_count - index));
+}
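+// A worked example of the formula above, with illustrative values only: when
+// values_in_range_ is 32768 the caller passes values_in_range_shifted == 128
+// (32768 >> 8); with cdf[index] == 16000, symbol_count == 4 and index == 1
+// this returns ((128 * (16000 >> 6)) >> 1) + 4 * (4 - 1) = 16000 + 12 = 16012.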
+
+void UpdateCdf(uint16_t* const cdf, const int symbol_count, const int symbol) {
+ const uint16_t count = cdf[symbol_count];
+ // rate is computed in the spec as:
+ // 3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
+ // In this case cdf[N] is |count|.
+ // Min(FloorLog2(N), 2) is 1 for symbol_count == {2, 3} and 2 for all
+ // symbol_count > 3. So the equation becomes:
+ // 4 + (count > 15) + (count > 31) + (symbol_count > 3).
+ // Note that the largest value for count is 32 (it is not incremented beyond
+ // 32). So using that information:
+ // count >> 4 is 0 for count from 0 to 15.
+ // count >> 4 is 1 for count from 16 to 31.
+  // count >> 4 is 2 for count == 32.
+ // Now, the equation becomes:
+ // 4 + (count >> 4) + (symbol_count > 3).
+ // Since (count >> 4) can only be 0 or 1 or 2, the addition could be replaced
+ // with bitwise or:
+ // (4 | (count >> 4)) + (symbol_count > 3).
+ // but using addition will allow the compiler to eliminate an operation when
+ // symbol_count is known and this function is inlined.
+ const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count > 3);
+ // Hints for further optimizations:
+ //
+ // 1. clang can vectorize this for loop with width 4, even though the loop
+ // contains an if-else statement. Therefore, it may be advantageous to use
+ // "i < symbol_count" as the loop condition when symbol_count is 8, 12, or 16
+ // (a multiple of 4 that's not too small).
+ //
+ // 2. The for loop can be rewritten in the following form, which would enable
+ // clang to vectorize the loop with width 8:
+ //
+ // const int rounding = (1 << rate) - 1;
+ // for (int i = 0; i < symbol_count - 1; ++i) {
+ // const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
+ // cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
+ // }
+ //
+ // The subtraction (a - cdf[i]) relies on the overflow semantics of unsigned
+ // integer arithmetic. The result of the unsigned subtraction is cast to a
+ // signed integer and right-shifted. This requires the right shift of a
+ // signed integer be an arithmetic shift, which is true for clang, gcc, and
+ // Visual C++.
+ assert(symbol_count - 1 > 0);
+ int i = 0;
+ do {
+ if (i < symbol) {
+ cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
+ } else {
+ cdf[i] -= cdf[i] >> rate;
+ }
+ } while (++i < symbol_count - 1);
+ cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+}
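+// A worked example of the loop above, with illustrative values only: for
+// symbol_count == 4, cdf == {28000, 16000, 6000, 0, 20} and symbol == 1,
+// rate is (20 >> 4) + 4 + 1 == 6, so
+//   cdf[0] += (32768 - 28000) >> 6  -> 28074 (pulled toward the maximum),
+//   cdf[1] -= 16000 >> 6            -> 15750 (pulled toward 0),
+//   cdf[2] -= 6000 >> 6             -> 5907,
+// while cdf[3] stays 0 and the count in cdf[4] becomes 21.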
+
+// Define the UpdateCdfN functions. UpdateCdfN is a specialized implementation
+// of UpdateCdf based on the fact that symbol_count == N. UpdateCdfN uses the
+// SIMD instruction sets if available.
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+// The UpdateCdf() method contains the following for loop:
+//
+// for (int i = 0; i < symbol_count - 1; ++i) {
+// if (i < symbol) {
+// cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
+// } else {
+// cdf[i] -= cdf[i] >> rate;
+// }
+// }
+//
+// It can be rewritten in the following two forms, which are amenable to SIMD
+// implementations:
+//
+// const int rounding = (1 << rate) - 1;
+// for (int i = 0; i < symbol_count - 1; ++i) {
+// const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
+// cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
+// }
+//
+// or:
+//
+// const int rounding = (1 << rate) - 1;
+// for (int i = 0; i < symbol_count - 1; ++i) {
+// const uint16_t a = (i < symbol) ? (kCdfMaxProbability - rounding) : 0;
+// cdf[i] -= static_cast<int16_t>(cdf[i] - a) >> rate;
+// }
+//
+// The following ARM NEON implementations use a modified version of the first
+// form, using the comparison mask and unsigned rollover to avoid the need to
+// calculate rounding.
+//
+// The cdf array has symbol_count + 1 elements. The first symbol_count elements
+// are the CDF. The last element is a count that is initialized to 0 and may
+// grow up to 32. The for loop in UpdateCdf updates the CDF in the array. Since
+// cdf[symbol_count - 1] is always 0, the for loop does not update
+// cdf[symbol_count - 1]. However, it would be correct to have the for loop
+// update cdf[symbol_count - 1] anyway: since symbol_count - 1 >= symbol, the
+// for loop would take the else branch when i is symbol_count - 1:
+// cdf[i] -= cdf[i] >> rate;
+// Since cdf[symbol_count - 1] is 0, cdf[symbol_count - 1] would still be 0
+// after the update. The ARM NEON implementations take advantage of this in the
+// following two cases:
+// 1. When symbol_count is 8 or 16, the vectorized code updates the first
+// symbol_count elements in the array.
+// 2. When symbol_count is 7, the vectorized code updates all the 8 elements in
+// the cdf array. Since an invalid CDF value is written into cdf[7], the
+// count in cdf[7] needs to be fixed up after the vectorized code.
+
+void UpdateCdf5(uint16_t* const cdf, const int symbol) {
+ uint16x4_t cdf_vec = vld1_u16(cdf);
+ const uint16_t count = cdf[5];
+ const int rate = (count >> 4) + 5;
+ const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
+ const uint16x4_t index = vcreate_u16(0x0003000200010000);
+ const uint16x4_t symbol_vec = vdup_n_u16(symbol);
+ const uint16x4_t mask = vcge_u16(index, symbol_vec);
+ // i < symbol: 32768, i >= symbol: 65535.
+ const uint16x4_t a = vorr_u16(mask, cdf_max_probability);
+ // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
+ const int16x4_t diff = vreinterpret_s16_u16(vsub_u16(a, cdf_vec));
+ // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
+ const uint16x4_t cdf_offset = vsub_u16(cdf_vec, mask);
+ const int16x4_t negative_rate = vdup_n_s16(-rate);
+ // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
+ const uint16x4_t delta = vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
+ // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
+ // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
+ cdf_vec = vadd_u16(cdf_offset, delta);
+ vst1_u16(cdf, cdf_vec);
+ cdf[5] = count + static_cast<uint16_t>(count < 32);
+}
+
+// This version works for |symbol_count| = 7, 8, or 9.
+// See UpdateCdf5 for implementation details.
+template <int symbol_count>
+void UpdateCdf7To9(uint16_t* const cdf, const int symbol) {
+ static_assert(symbol_count >= 7 && symbol_count <= 9, "");
+ uint16x8_t cdf_vec = vld1q_u16(cdf);
+ const uint16_t count = cdf[symbol_count];
+ const int rate = (count >> 4) + 5;
+ const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+ const uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
+ vcreate_u16(0x0007000600050004));
+ const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+ const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+ const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+ const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
+ const int16x8_t negative_rate = vdupq_n_s16(-rate);
+ const uint16x8_t delta =
+ vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+ cdf_vec = vaddq_u16(cdf_offset, delta);
+ vst1q_u16(cdf, cdf_vec);
+ cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
+}
+
+void UpdateCdf7(uint16_t* const cdf, const int symbol) {
+ UpdateCdf7To9<7>(cdf, symbol);
+}
+
+void UpdateCdf8(uint16_t* const cdf, const int symbol) {
+ UpdateCdf7To9<8>(cdf, symbol);
+}
+
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+ UpdateCdf7To9<9>(cdf, symbol);
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf11(uint16_t* const cdf, const int symbol) {
+ uint16x8_t cdf_vec = vld1q_u16(cdf + 2);
+ const uint16_t count = cdf[11];
+ cdf[11] = count + static_cast<uint16_t>(count < 32);
+ const int rate = (count >> 4) + 5;
+ if (symbol > 1) {
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+ const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+ const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+ const int16x8_t negative_rate = vdupq_n_s16(-rate);
+ const uint16x8_t index = vcombine_u16(vcreate_u16(0x0005000400030002),
+ vcreate_u16(0x0009000800070006));
+ const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+ const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+ const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
+ const uint16x8_t delta =
+ vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+ cdf_vec = vaddq_u16(cdf_offset, delta);
+ vst1q_u16(cdf + 2, cdf_vec);
+ } else {
+ if (symbol != 0) {
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ } else {
+ cdf[0] -= cdf[0] >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ }
+ const int16x8_t negative_rate = vdupq_n_s16(-rate);
+ const uint16x8_t delta = vshlq_u16(cdf_vec, negative_rate);
+ cdf_vec = vsubq_u16(cdf_vec, delta);
+ vst1q_u16(cdf + 2, cdf_vec);
+ }
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf13(uint16_t* const cdf, const int symbol) {
+ uint16x8_t cdf_vec0 = vld1q_u16(cdf);
+ uint16x8_t cdf_vec1 = vld1q_u16(cdf + 4);
+ const uint16_t count = cdf[13];
+ const int rate = (count >> 4) + 5;
+ const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+ const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+ const int16x8_t negative_rate = vdupq_n_s16(-rate);
+
+ uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
+ vcreate_u16(0x0007000600050004));
+ uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+ uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+ int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec0));
+ uint16x8_t cdf_offset = vsubq_u16(cdf_vec0, mask);
+ uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+ cdf_vec0 = vaddq_u16(cdf_offset, delta);
+ vst1q_u16(cdf, cdf_vec0);
+
+ index = vcombine_u16(vcreate_u16(0x0007000600050004),
+ vcreate_u16(0x000b000a00090008));
+ mask = vcgeq_u16(index, symbol_vec);
+ a = vorrq_u16(mask, cdf_max_probability);
+ diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec1));
+ cdf_offset = vsubq_u16(cdf_vec1, mask);
+ delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+ cdf_vec1 = vaddq_u16(cdf_offset, delta);
+ vst1q_u16(cdf + 4, cdf_vec1);
+
+ cdf[13] = count + static_cast<uint16_t>(count < 32);
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf16(uint16_t* const cdf, const int symbol) {
+ uint16x8_t cdf_vec = vld1q_u16(cdf);
+ const uint16_t count = cdf[16];
+ const int rate = (count >> 4) + 5;
+ const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+ const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+ const int16x8_t negative_rate = vdupq_n_s16(-rate);
+
+ uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
+ vcreate_u16(0x0007000600050004));
+ uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+ uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+ int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+ uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
+ uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+ cdf_vec = vaddq_u16(cdf_offset, delta);
+ vst1q_u16(cdf, cdf_vec);
+
+ cdf_vec = vld1q_u16(cdf + 8);
+ index = vcombine_u16(vcreate_u16(0x000b000a00090008),
+ vcreate_u16(0x000f000e000d000c));
+ mask = vcgeq_u16(index, symbol_vec);
+ a = vorrq_u16(mask, cdf_max_probability);
+ diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+ cdf_offset = vsubq_u16(cdf_vec, mask);
+ delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+ cdf_vec = vaddq_u16(cdf_offset, delta);
+ vst1q_u16(cdf + 8, cdf_vec);
+
+ cdf[16] = count + static_cast<uint16_t>(count < 32);
+}
+
+#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+
+inline __m128i LoadLo8(const void* a) {
+ return _mm_loadl_epi64(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadUnaligned16(const void* a) {
+ return _mm_loadu_si128(static_cast<const __m128i*>(a));
+}
+
+inline void StoreLo8(void* a, const __m128i v) {
+ _mm_storel_epi64(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreUnaligned16(void* a, const __m128i v) {
+ _mm_storeu_si128(static_cast<__m128i*>(a), v);
+}
+
+void UpdateCdf5(uint16_t* const cdf, const int symbol) {
+ __m128i cdf_vec = LoadLo8(cdf);
+ const uint16_t count = cdf[5];
+ const int rate = (count >> 4) + 5;
+ const __m128i cdf_max_probability =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
+ const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
+ const __m128i symbol_vec = _mm_shufflelo_epi16(_mm_cvtsi32_si128(symbol), 0);
+ // i >= symbol.
+ const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+ // i < symbol: 32768, i >= symbol: 65535.
+ const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+ // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
+ const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+ // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
+ const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+ // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
+ const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+ // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
+ // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
+ cdf_vec = _mm_add_epi16(cdf_offset, delta);
+ StoreLo8(cdf, cdf_vec);
+ cdf[5] = count + static_cast<uint16_t>(count < 32);
+}
+
+// This version works for |symbol_count| = 7, 8, or 9.
+// See UpdateCdf5 for implementation details.
+template <int symbol_count>
+void UpdateCdf7To9(uint16_t* const cdf, const int symbol) {
+ static_assert(symbol_count >= 7 && symbol_count <= 9, "");
+ __m128i cdf_vec = LoadUnaligned16(cdf);
+ const uint16_t count = cdf[symbol_count];
+ const int rate = (count >> 4) + 5;
+ const __m128i cdf_max_probability =
+ _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+ const __m128i index =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+ const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+ const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+ const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+ const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+ const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+ const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+ cdf_vec = _mm_add_epi16(cdf_offset, delta);
+ StoreUnaligned16(cdf, cdf_vec);
+ cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
+}
+
+void UpdateCdf7(uint16_t* const cdf, const int symbol) {
+ UpdateCdf7To9<7>(cdf, symbol);
+}
+
+void UpdateCdf8(uint16_t* const cdf, const int symbol) {
+ UpdateCdf7To9<8>(cdf, symbol);
+}
+
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+ UpdateCdf7To9<9>(cdf, symbol);
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf11(uint16_t* const cdf, const int symbol) {
+ __m128i cdf_vec = LoadUnaligned16(cdf + 2);
+ const uint16_t count = cdf[11];
+ cdf[11] = count + static_cast<uint16_t>(count < 32);
+ const int rate = (count >> 4) + 5;
+ if (symbol > 1) {
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+ const __m128i cdf_max_probability =
+ _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+ const __m128i index =
+ _mm_set_epi32(0x000a0009, 0x00080007, 0x00060005, 0x00040003);
+ const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+ const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+ const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+ const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+ const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+ const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+ cdf_vec = _mm_add_epi16(cdf_offset, delta);
+ StoreUnaligned16(cdf + 2, cdf_vec);
+ } else {
+ if (symbol != 0) {
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ } else {
+ cdf[0] -= cdf[0] >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ }
+ const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
+ cdf_vec = _mm_sub_epi16(cdf_vec, delta);
+ StoreUnaligned16(cdf + 2, cdf_vec);
+ }
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf13(uint16_t* const cdf, const int symbol) {
+ __m128i cdf_vec0 = LoadLo8(cdf);
+ __m128i cdf_vec1 = LoadUnaligned16(cdf + 4);
+ const uint16_t count = cdf[13];
+ const int rate = (count >> 4) + 5;
+ const __m128i cdf_max_probability =
+ _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+ const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+
+ const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
+ const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+ const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+ const __m128i diff = _mm_sub_epi16(a, cdf_vec0);
+ const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask);
+ const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+ cdf_vec0 = _mm_add_epi16(cdf_offset, delta);
+ StoreLo8(cdf, cdf_vec0);
+
+ const __m128i index1 =
+ _mm_set_epi32(0x000c000b, 0x000a0009, 0x00080007, 0x00060005);
+ const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec);
+ const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability);
+ const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1);
+ const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1);
+ const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
+ cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1);
+ StoreUnaligned16(cdf + 4, cdf_vec1);
+
+ cdf[13] = count + static_cast<uint16_t>(count < 32);
+}
+
+void UpdateCdf16(uint16_t* const cdf, const int symbol) {
+ __m128i cdf_vec0 = LoadUnaligned16(cdf);
+ const uint16_t count = cdf[16];
+ const int rate = (count >> 4) + 5;
+ const __m128i cdf_max_probability =
+ _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+ const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+
+ const __m128i index =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+ const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+ const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+ const __m128i diff = _mm_sub_epi16(a, cdf_vec0);
+ const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask);
+ const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+ cdf_vec0 = _mm_add_epi16(cdf_offset, delta);
+ StoreUnaligned16(cdf, cdf_vec0);
+
+ __m128i cdf_vec1 = LoadUnaligned16(cdf + 8);
+ const __m128i index1 =
+ _mm_set_epi32(0x0010000f, 0x000e000d, 0x000c000b, 0x000a0009);
+ const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec);
+ const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability);
+ const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1);
+ const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1);
+ const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
+ cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1);
+ StoreUnaligned16(cdf + 8, cdf_vec1);
+
+ cdf[16] = count + static_cast<uint16_t>(count < 32);
+}
+
+#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+
+void UpdateCdf5(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 5, symbol);
+}
+
+void UpdateCdf7(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 7, symbol);
+}
+
+void UpdateCdf8(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 8, symbol);
+}
+
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 9, symbol);
+}
+
+void UpdateCdf11(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 11, symbol);
+}
+
+void UpdateCdf13(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 13, symbol);
+}
+
+void UpdateCdf16(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 16, symbol);
+}
+
+#endif // LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+#endif // LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+inline DaalaBitReader::WindowSize HostToBigEndian(
+ const DaalaBitReader::WindowSize x) {
+ static_assert(sizeof(x) == 4 || sizeof(x) == 8, "");
+#if defined(__GNUC__)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ return (sizeof(x) == 8) ? __builtin_bswap64(x) : __builtin_bswap32(x);
+#else
+ return x;
+#endif
+#elif defined(_WIN32)
+ // Note Windows targets are assumed to be little endian.
+ return static_cast<DaalaBitReader::WindowSize>(
+ (sizeof(x) == 8) ? _byteswap_uint64(static_cast<unsigned __int64>(x))
+ : _byteswap_ulong(static_cast<unsigned long>(x)));
+#else
+#error Unknown compiler!
+#endif // defined(__GNUC__)
+}
+
+} // namespace
+
+#if !LIBGAV1_CXX17
+constexpr int DaalaBitReader::kWindowSize; // static.
+#endif
+
+DaalaBitReader::DaalaBitReader(const uint8_t* data, size_t size,
+ bool allow_update_cdf)
+ : data_(data),
+ data_end_(data + size),
+ data_memcpy_end_((size >= sizeof(WindowSize))
+ ? data + size - sizeof(WindowSize) + 1
+ : data),
+ allow_update_cdf_(allow_update_cdf),
+ values_in_range_(kCdfMaxProbability) {
+ if (data_ < data_memcpy_end_) {
+ // This is a simplified version of PopulateBits() which loads 8 extra bits
+ // and skips the unnecessary shifts of value and window_diff_.
+ WindowSize value;
+ memcpy(&value, data_, sizeof(value));
+ data_ += sizeof(value);
+ window_diff_ = HostToBigEndian(value) ^ -1;
+ // Note the initial value of bits_ is larger than kMaxCachedBits as it's
+ // used to restore the most significant 0 bit that would be present after
+ // PopulateBits() when we extract the first symbol value.
+ // As shown in Section 8.2.2 Initialization process for symbol decoder,
+ // which uses a fixed offset to read the symbol values, the most
+ // significant bit is always 0:
+ // The variable numBits is set equal to Min( sz * 8, 15).
+ // The variable buf is read using the f(numBits) parsing process.
+ // The variable paddedBuf is set equal to ( buf << (15 - numBits) ).
+ // The variable SymbolValue is set to ((1 << 15) - 1) ^ paddedBuf.
+ bits_ = kWindowSize - 15;
+ return;
+ }
+ window_diff_ = 0;
+ bits_ = -15;
+ PopulateBits();
+}
+
+// This is similar to the ReadSymbol() implementation but it is optimized based
+// on the following facts:
+// * The probability is fixed at half. So some multiplications can be replaced
+// with bit operations.
+// * Symbol count is fixed at 2.
+int DaalaBitReader::ReadBit() {
+ const uint32_t curr =
+ ((values_in_range_ & kReadBitMask) >> 1) + kMinimumProbabilityPerSymbol;
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+ int bit = 1;
+ if (symbol_value >= curr) {
+ values_in_range_ -= curr;
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+ bit = 0;
+ } else {
+ values_in_range_ = curr;
+ }
+ NormalizeRange();
+ return bit;
+}
+
+int64_t DaalaBitReader::ReadLiteral(int num_bits) {
+ assert(num_bits <= 32);
+ assert(num_bits > 0);
+ uint32_t literal = 0;
+ int bit = num_bits - 1;
+ do {
+ // ARM can combine a shift operation with a constant number of bits with
+ // some other operations, such as the OR operation.
+ // Here is an ARM disassembly example:
+ // orr w1, w0, w1, lsl #1
+    // which left-shifts register w1 by 1 bit and ORs the shifted result with
+    // register w0.
+ // The next 2 lines are equivalent to:
+ // literal |= static_cast<uint32_t>(ReadBit()) << bit;
+ literal <<= 1;
+ literal |= static_cast<uint32_t>(ReadBit());
+ } while (--bit >= 0);
+ return literal;
+}
+
+int DaalaBitReader::ReadSymbol(uint16_t* const cdf, int symbol_count) {
+ const int symbol = ReadSymbolImpl(cdf, symbol_count);
+ if (allow_update_cdf_) {
+ UpdateCdf(cdf, symbol_count, symbol);
+ }
+ return symbol;
+}
+
+bool DaalaBitReader::ReadSymbol(uint16_t* cdf) {
+ assert(cdf[1] == 0);
+ const bool symbol = ReadSymbolImpl(cdf[0]) != 0;
+ if (allow_update_cdf_) {
+ const uint16_t count = cdf[2];
+ // rate is computed in the spec as:
+ // 3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
+ // In this case N is 2 and cdf[N] is |count|. So the equation becomes:
+ // 4 + (count > 15) + (count > 31)
+ // Note that the largest value for count is 32 (it is not incremented beyond
+ // 32). So using that information:
+ // count >> 4 is 0 for count from 0 to 15.
+ // count >> 4 is 1 for count from 16 to 31.
+ // count >> 4 is 2 for count == 32.
+ // Now, the equation becomes:
+ // 4 + (count >> 4).
+ // Since (count >> 4) can only be 0 or 1 or 2, the addition can be replaced
+ // with bitwise or. So the final equation is:
+ // 4 | (count >> 4).
+ const int rate = 4 | (count >> 4);
+ if (symbol) {
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ } else {
+ cdf[0] -= cdf[0] >> rate;
+ }
+ cdf[2] += static_cast<uint16_t>(count < 32);
+ }
+ return symbol;
+}
+
+bool DaalaBitReader::ReadSymbolWithoutCdfUpdate(uint16_t cdf) {
+ return ReadSymbolImpl(cdf) != 0;
+}
+
+template <int symbol_count>
+int DaalaBitReader::ReadSymbol(uint16_t* const cdf) {
+ static_assert(symbol_count >= 3 && symbol_count <= 16, "");
+ if (symbol_count == 3 || symbol_count == 4) {
+ return ReadSymbol3Or4(cdf, symbol_count);
+ }
+ int symbol;
+ if (symbol_count == 8) {
+ symbol = ReadSymbolImpl8(cdf);
+ } else if (symbol_count <= 13) {
+ symbol = ReadSymbolImpl(cdf, symbol_count);
+ } else {
+ symbol = ReadSymbolImplBinarySearch(cdf, symbol_count);
+ }
+ if (allow_update_cdf_) {
+ if (symbol_count == 5) {
+ UpdateCdf5(cdf, symbol);
+ } else if (symbol_count == 7) {
+ UpdateCdf7(cdf, symbol);
+ } else if (symbol_count == 8) {
+ UpdateCdf8(cdf, symbol);
+ } else if (symbol_count == 9) {
+ UpdateCdf9(cdf, symbol);
+ } else if (symbol_count == 11) {
+ UpdateCdf11(cdf, symbol);
+ } else if (symbol_count == 13) {
+ UpdateCdf13(cdf, symbol);
+ } else if (symbol_count == 16) {
+ UpdateCdf16(cdf, symbol);
+ } else {
+ UpdateCdf(cdf, symbol_count, symbol);
+ }
+ }
+ return symbol;
+}
+
+int DaalaBitReader::ReadSymbolImpl(const uint16_t* const cdf,
+ int symbol_count) {
+ assert(cdf[symbol_count - 1] == 0);
+ --symbol_count;
+ uint32_t curr = values_in_range_;
+ int symbol = -1;
+ uint32_t prev;
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+ uint32_t delta = kMinimumProbabilityPerSymbol * symbol_count;
+ // Search through the |cdf| array to determine where the scaled cdf value and
+ // |symbol_value| cross over.
+ do {
+ prev = curr;
+ curr = (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1) +
+ delta;
+ delta -= kMinimumProbabilityPerSymbol;
+ } while (symbol_value < curr);
+ values_in_range_ = prev - curr;
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+ NormalizeRange();
+ return symbol;
+}
+
+int DaalaBitReader::ReadSymbolImplBinarySearch(const uint16_t* const cdf,
+ int symbol_count) {
+ assert(cdf[symbol_count - 1] == 0);
+ assert(symbol_count > 1 && symbol_count <= 16);
+ --symbol_count;
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+ // Search through the |cdf| array to determine where the scaled cdf value and
+ // |symbol_value| cross over. Since the CDFs are sorted, we can use binary
+ // search to do this. Let |symbol| be the index of the first |cdf| array
+ // entry whose scaled cdf value is less than or equal to |symbol_value|. The
+ // binary search maintains the invariant:
+ // low <= symbol <= high + 1
+ // and terminates when low == high + 1.
+ int low = 0;
+ int high = symbol_count - 1;
+ // The binary search maintains the invariants that |prev| is the scaled cdf
+ // value for low - 1 and |curr| is the scaled cdf value for high + 1. (By
+ // convention, the scaled cdf value for -1 is values_in_range_.) When the
+ // binary search terminates, |prev| is the scaled cdf value for symbol - 1
+ // and |curr| is the scaled cdf value for |symbol|.
+ uint32_t prev = values_in_range_;
+ uint32_t curr = 0;
+ const uint32_t values_in_range_shifted = values_in_range_ >> 8;
+ do {
+ const int mid = DivideBy2(low + high);
+ const uint32_t scaled_cdf =
+ ScaleCdf(values_in_range_shifted, cdf, mid, symbol_count);
+ if (symbol_value < scaled_cdf) {
+ low = mid + 1;
+ prev = scaled_cdf;
+ } else {
+ high = mid - 1;
+ curr = scaled_cdf;
+ }
+ } while (low <= high);
+ assert(low == high + 1);
+ // At this point, |low| is the symbol that has been decoded.
+ values_in_range_ = prev - curr;
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+ NormalizeRange();
+ return low;
+}
+
+int DaalaBitReader::ReadSymbolImpl(uint16_t cdf) {
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+ const uint32_t curr =
+ (((values_in_range_ >> 8) * (cdf >> kCdfPrecision)) >> 1) +
+ kMinimumProbabilityPerSymbol;
+ const int symbol = static_cast<int>(symbol_value < curr);
+ if (symbol == 1) {
+ values_in_range_ = curr;
+ } else {
+ values_in_range_ -= curr;
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+ }
+ NormalizeRange();
+ return symbol;
+}
+
+// Equivalent to ReadSymbol(cdf, [3,4]), with the ReadSymbolImpl and UpdateCdf
+// calls inlined.
+int DaalaBitReader::ReadSymbol3Or4(uint16_t* const cdf,
+ const int symbol_count) {
+ assert(cdf[symbol_count - 1] == 0);
+ uint32_t curr = values_in_range_;
+ uint32_t prev;
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+ uint32_t delta = kMinimumProbabilityPerSymbol * (symbol_count - 1);
+ const uint32_t values_in_range_shifted = values_in_range_ >> 8;
+
+ // Search through the |cdf| array to determine where the scaled cdf value and
+ // |symbol_value| cross over. If allow_update_cdf_ is true, update the |cdf|
+ // array.
+ //
+ // The original code is:
+ //
+ // int symbol = -1;
+ // do {
+ // prev = curr;
+ // curr =
+ // ((values_in_range_shifted * (cdf[++symbol] >> kCdfPrecision)) >> 1)
+ // + delta;
+ // delta -= kMinimumProbabilityPerSymbol;
+ // } while (symbol_value < curr);
+ // if (allow_update_cdf_) {
+ // UpdateCdf(cdf, [3,4], symbol);
+ // }
+ //
+ // The do-while loop is unrolled with three or four iterations, and the
+ // UpdateCdf call is inlined and merged into the iterations.
+ int symbol = 0;
+ // Iteration 0.
+ prev = curr;
+ curr =
+ ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
+ if (symbol_value >= curr) {
+ // symbol == 0.
+ if (allow_update_cdf_) {
+ // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/0).
+ const uint16_t count = cdf[symbol_count];
+ cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+ const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
+ if (symbol_count == 4) {
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+ // 1. On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM
+ // NEON code is slower. Consider using the C version if __arm__ is
+ // defined.
+ // 2. The ARM NEON code (compiled for arm64) is slightly slower on
+ // Samsung Galaxy S8+ (SM-G955FD).
+ uint16x4_t cdf_vec = vld1_u16(cdf);
+ const int16x4_t negative_rate = vdup_n_s16(-rate);
+ const uint16x4_t delta = vshl_u16(cdf_vec, negative_rate);
+ cdf_vec = vsub_u16(cdf_vec, delta);
+ vst1_u16(cdf, cdf_vec);
+#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+ __m128i cdf_vec = LoadLo8(cdf);
+ const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
+ cdf_vec = _mm_sub_epi16(cdf_vec, delta);
+ StoreLo8(cdf, cdf_vec);
+#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+ cdf[0] -= cdf[0] >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ cdf[2] -= cdf[2] >> rate;
+#endif
+ } else { // symbol_count == 3.
+ cdf[0] -= cdf[0] >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ }
+ }
+ goto found;
+ }
+ ++symbol;
+ delta -= kMinimumProbabilityPerSymbol;
+ // Iteration 1.
+ prev = curr;
+ curr =
+ ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
+ if (symbol_value >= curr) {
+ // symbol == 1.
+ if (allow_update_cdf_) {
+ // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/1).
+ const uint16_t count = cdf[symbol_count];
+ cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+ const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ if (symbol_count == 4) cdf[2] -= cdf[2] >> rate;
+ }
+ goto found;
+ }
+ ++symbol;
+ if (symbol_count == 4) {
+ delta -= kMinimumProbabilityPerSymbol;
+ // Iteration 2.
+ prev = curr;
+ curr = ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) +
+ delta;
+ if (symbol_value >= curr) {
+ // symbol == 2.
+ if (allow_update_cdf_) {
+ // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/2).
+ const uint16_t count = cdf[4];
+ cdf[4] += static_cast<uint16_t>(count < 32);
+ const int rate = (count >> 4) + 5;
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+ cdf[2] -= cdf[2] >> rate;
+ }
+ goto found;
+ }
+ ++symbol;
+ }
+ // |delta| is 0 for the last iteration.
+ // Iteration 2 (symbol_count == 3) or 3 (symbol_count == 4).
+ prev = curr;
+ // Since cdf[symbol_count - 1] is 0 and |delta| is 0, |curr| is also 0.
+ curr = 0;
+ // symbol == [2,3].
+ if (allow_update_cdf_) {
+ // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/[2,3]).
+ const uint16_t count = cdf[symbol_count];
+ cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+ const int rate = (4 | (count >> 4)) + static_cast<int>(symbol_count == 4);
+ if (symbol_count == 4) {
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+ // On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM NEON
+ // code is a tiny bit slower. Consider using the C version if __arm__ is
+ // defined.
+ uint16x4_t cdf_vec = vld1_u16(cdf);
+ const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
+ const int16x4_t diff =
+ vreinterpret_s16_u16(vsub_u16(cdf_max_probability, cdf_vec));
+ const int16x4_t negative_rate = vdup_n_s16(-rate);
+ const uint16x4_t delta =
+ vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
+ cdf_vec = vadd_u16(cdf_vec, delta);
+ vst1_u16(cdf, cdf_vec);
+ cdf[3] = 0;
+#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+ __m128i cdf_vec = LoadLo8(cdf);
+ const __m128i cdf_max_probability =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
+ const __m128i diff = _mm_sub_epi16(cdf_max_probability, cdf_vec);
+ const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+ cdf_vec = _mm_add_epi16(cdf_vec, delta);
+ StoreLo8(cdf, cdf_vec);
+ cdf[3] = 0;
+#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+ cdf[2] += (kCdfMaxProbability - cdf[2]) >> rate;
+#endif
+ } else { // symbol_count == 3.
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+ }
+ }
+found:
+ // End of unrolled do-while loop.
+
+ values_in_range_ = prev - curr;
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+ NormalizeRange();
+ return symbol;
+}
+
+int DaalaBitReader::ReadSymbolImpl8(const uint16_t* const cdf) {
+ assert(cdf[7] == 0);
+ uint32_t curr = values_in_range_;
+ uint32_t prev;
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+ uint32_t delta = kMinimumProbabilityPerSymbol * 7;
+ // Search through the |cdf| array to determine where the scaled cdf value and
+ // |symbol_value| cross over.
+ //
+ // The original code is:
+ //
+ // int symbol = -1;
+ // do {
+ // prev = curr;
+ // curr =
+ // (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1)
+ // + delta;
+ // delta -= kMinimumProbabilityPerSymbol;
+ // } while (symbol_value < curr);
+ //
+ // The do-while loop is unrolled with eight iterations.
+ int symbol = 0;
+
+#define READ_SYMBOL_ITERATION \
+ prev = curr; \
+ curr = (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + \
+ delta; \
+ if (symbol_value >= curr) goto found; \
+ ++symbol; \
+ delta -= kMinimumProbabilityPerSymbol
+
+ READ_SYMBOL_ITERATION; // Iteration 0.
+ READ_SYMBOL_ITERATION; // Iteration 1.
+ READ_SYMBOL_ITERATION; // Iteration 2.
+ READ_SYMBOL_ITERATION; // Iteration 3.
+ READ_SYMBOL_ITERATION; // Iteration 4.
+ READ_SYMBOL_ITERATION; // Iteration 5.
+
+ // The last two iterations can be simplified, so they don't use the
+ // READ_SYMBOL_ITERATION macro.
+#undef READ_SYMBOL_ITERATION
+
+ // Iteration 6.
+ prev = curr;
+ curr =
+ (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
+ if (symbol_value >= curr) goto found; // symbol == 6.
+ ++symbol;
+ // |delta| is 0 for the last iteration.
+ // Iteration 7.
+ prev = curr;
+ // Since cdf[7] is 0 and |delta| is 0, |curr| is also 0.
+ curr = 0;
+ // symbol == 7.
+found:
+ // End of unrolled do-while loop.
+
+ values_in_range_ = prev - curr;
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+ NormalizeRange();
+ return symbol;
+}
+
+void DaalaBitReader::PopulateBits() {
+ constexpr int kMaxCachedBits = kWindowSize - 16;
+#if defined(__aarch64__)
+ // Fast path: read eight bytes and add the first six bytes to window_diff_.
+ // This fast path makes the following assumptions.
+ // 1. We assume that unaligned load of uint64_t is fast.
+ // 2. When there are enough bytes in data_, the for loop below reads 6 or 7
+ // bytes depending on the value of bits_. This fast path always reads 6
+ // bytes, which results in more calls to PopulateBits(). We assume that
+ // making more calls to a faster PopulateBits() is overall a win.
+ // NOTE: Although this fast path could also be used on x86_64, it hurts
+ // performance (measured on Lenovo ThinkStation P920 running Linux). (The
+ // reason is still unknown.) Therefore this fast path is only used on arm64.
+ static_assert(kWindowSize == 64, "");
+ if (data_ < data_memcpy_end_) {
+ uint64_t value;
+ // arm64 supports unaligned loads, so this memcpy call is compiled to a
+ // single ldr instruction.
+ memcpy(&value, data_, sizeof(value));
+ data_ += kMaxCachedBits >> 3;
+ value = HostToBigEndian(value) ^ -1;
+ value >>= kWindowSize - kMaxCachedBits;
+ window_diff_ = value | (window_diff_ << kMaxCachedBits);
+ bits_ += kMaxCachedBits;
+ return;
+ }
+#endif
+
+ const uint8_t* data = data_;
+ int bits = bits_;
+ WindowSize window_diff = window_diff_;
+
+ int count = kWindowSize - 9 - (bits + 15);
+  // The fast path above, if compiled, would cause clang 8.0.7 to vectorize
+  // this loop. Since -15 <= bits_ <= -1, this loop has only 6 or 7 iterations
+  // when WindowSize is 64 bits. So it is not profitable to vectorize this
+  // loop. Note that clang 8.0.7 does not vectorize this loop if the fast path
+  // above is not compiled.
+
+#ifdef __clang__
+#pragma clang loop vectorize(disable) interleave(disable)
+#endif
+ for (; count >= 0 && data < data_end_; count -= 8) {
+ const uint8_t value = *data++ ^ -1;
+ window_diff = static_cast<WindowSize>(value) | (window_diff << 8);
+ bits += 8;
+ }
+ assert(bits <= kMaxCachedBits);
+ if (data == data_end_) {
+ // Shift in some 1s. This is equivalent to providing fake 0 data bits.
+ window_diff = ((window_diff + 1) << (kMaxCachedBits - bits)) - 1;
+ bits = kMaxCachedBits;
+ }
+
+ data_ = data;
+ bits_ = bits;
+ window_diff_ = window_diff;
+}
+
+void DaalaBitReader::NormalizeRange() {
+ const int bits_used = 15 ^ FloorLog2(values_in_range_);
+ bits_ -= bits_used;
+ values_in_range_ <<= bits_used;
+ if (bits_ < 0) PopulateBits();
+}
+
+// Explicit instantiations.
+template int DaalaBitReader::ReadSymbol<3>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<4>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<5>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<6>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<7>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<8>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<9>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<10>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<11>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<12>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<13>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<14>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<16>(uint16_t* cdf);
+
+} // namespace libgav1
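
A sketch of how the reader is driven end to end. The buffer contents and the starting CDF values below are made up for illustration; real CDF tables come from the AV1 defaults and are adapted in place when allow_update_cdf is true:

#include <cstddef>
#include <cstdint>

#include "src/utils/entropy_decoder.h"

bool DecodeExample(const uint8_t* data, size_t size) {
  libgav1::DaalaBitReader reader(data, size, /*allow_update_cdf=*/true);
  // A 4-symbol CDF: entries 0..3 hold the decreasing CDF with a terminal 0,
  // entry 4 is the adaptation counter.
  uint16_t cdf[5] = {24576, 16384, 8192, 0, 0};
  const int symbol = reader.ReadSymbol<4>(cdf);  // also adapts |cdf|
  const int flag = reader.ReadBit();
  const int64_t literal = reader.ReadLiteral(3);  // three raw bits
  return symbol >= 0 && flag >= 0 && literal >= 0;
}
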
diff --git a/src/utils/entropy_decoder.h b/src/utils/entropy_decoder.h
new file mode 100644
index 0000000..c066b98
--- /dev/null
+++ b/src/utils/entropy_decoder.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_ENTROPY_DECODER_H_
+#define LIBGAV1_SRC_UTILS_ENTROPY_DECODER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "src/utils/bit_reader.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+class DaalaBitReader : public BitReader {
+ public:
+ // WindowSize must be an unsigned integer type with at least 32 bits. Use the
+ // largest type with fast arithmetic. size_t should meet these requirements.
+ using WindowSize = size_t;
+
+ DaalaBitReader(const uint8_t* data, size_t size, bool allow_update_cdf);
+ ~DaalaBitReader() override = default;
+
+ // Move only.
+ DaalaBitReader(DaalaBitReader&& rhs) noexcept;
+ DaalaBitReader& operator=(DaalaBitReader&& rhs) noexcept;
+
+ int ReadBit() final;
+ int64_t ReadLiteral(int num_bits) override;
+ // ReadSymbol() calls for which the |symbol_count| is only known at runtime
+ // will use this variant.
+ int ReadSymbol(uint16_t* cdf, int symbol_count);
+ // ReadSymbol() calls for which the |symbol_count| is equal to 2 (boolean
+ // symbols) will use this variant.
+ bool ReadSymbol(uint16_t* cdf);
+ bool ReadSymbolWithoutCdfUpdate(uint16_t cdf);
+ // Use either linear search or binary search for decoding the symbol depending
+ // on |symbol_count|. ReadSymbol calls for which the |symbol_count| is known
+ // at compile time will use this variant.
+ template <int symbol_count>
+ int ReadSymbol(uint16_t* cdf);
+
+ private:
+ static constexpr int kWindowSize = static_cast<int>(sizeof(WindowSize)) * 8;
+ static_assert(kWindowSize >= 32, "");
+
+ // Reads a symbol using the |cdf| table which contains the probabilities of
+ // each symbol. On a high level, this function does the following:
+ // 1) Scale the |cdf| values.
+ // 2) Find the index in the |cdf| array where the scaled CDF value crosses
+ // the modified |window_diff_| threshold.
+ // 3) That index is the symbol that has been decoded.
+ // 4) Update |window_diff_| and |values_in_range_| based on the symbol that
+ // has been decoded.
+ inline int ReadSymbolImpl(const uint16_t* cdf, int symbol_count);
+ // Similar to ReadSymbolImpl but it uses binary search to perform step 2 in
+ // the comment above. As of now, this function is called when |symbol_count|
+ // is greater than or equal to 14.
+ inline int ReadSymbolImplBinarySearch(const uint16_t* cdf, int symbol_count);
+ // Specialized implementation of ReadSymbolImpl based on the fact that
+ // symbol_count == 2.
+ inline int ReadSymbolImpl(uint16_t cdf);
+ // ReadSymbolN is a specialization of ReadSymbol for symbol_count == N.
+ LIBGAV1_ALWAYS_INLINE int ReadSymbol3Or4(uint16_t* cdf, int symbol_count);
+ // ReadSymbolImplN is a specialization of ReadSymbolImpl for
+ // symbol_count == N.
+ LIBGAV1_ALWAYS_INLINE int ReadSymbolImpl8(const uint16_t* cdf);
+ inline void PopulateBits();
+ // Normalizes the range so that 32768 <= |values_in_range_| < 65536. Also
+ // calls PopulateBits() if necessary.
+ inline void NormalizeRange();
+
+ const uint8_t* data_;
+ const uint8_t* const data_end_;
+ // If |data_| < |data_memcpy_end_|, then we can read sizeof(WindowSize) bytes
+ // from |data_|. Note with sizeof(WindowSize) == 4 this is only used in the
+ // constructor, not PopulateBits().
+ const uint8_t* const data_memcpy_end_;
+ const bool allow_update_cdf_;
+ // Number of cached bits of data in the current value.
+ int bits_;
+ // Number of values in the current range. Declared as uint32_t for better
+ // performance but only the lower 16 bits are used.
+ uint32_t values_in_range_;
+ // The difference between the high end of the current range and the coded
+ // value minus 1. The 16 bits above |bits_| of this variable are used to
+ // decode the next symbol. It is filled in whenever |bits_| is less than 0.
+ // Note this implementation differs from the spec as it trades the need to
+ // shift in 1s in NormalizeRange() with an extra shift in PopulateBits(),
+ // which occurs less frequently.
+ WindowSize window_diff_;
+};
+
+extern template int DaalaBitReader::ReadSymbol<3>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<4>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<5>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<6>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<7>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<8>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<9>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<10>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<11>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<12>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<13>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<14>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<16>(uint16_t* cdf);
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_ENTROPY_DECODER_H_
diff --git a/src/utils/executor.cc b/src/utils/executor.cc
new file mode 100644
index 0000000..6934057
--- /dev/null
+++ b/src/utils/executor.cc
@@ -0,0 +1,21 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/executor.h"
+
+namespace libgav1 {
+
+Executor::~Executor() = default;
+
+} // namespace libgav1
diff --git a/src/utils/executor.h b/src/utils/executor.h
new file mode 100644
index 0000000..21abdf8
--- /dev/null
+++ b/src/utils/executor.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_EXECUTOR_H_
+#define LIBGAV1_SRC_UTILS_EXECUTOR_H_
+
+#include <functional>
+
+namespace libgav1 {
+
+class Executor {
+ public:
+ virtual ~Executor();
+
+ // Schedules the specified "callback" for execution in this executor.
+ // Depending on the subclass implementation, this may block in some
+ // situations.
+ virtual void Schedule(std::function<void()> callback) = 0;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_EXECUTOR_H_
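
The interface is small enough that a trivial subclass shows the contract. This InlineExecutor is a hypothetical example rather than a class shipped by the library; it simply runs each callback synchronously on the calling thread:

#include <functional>

#include "src/utils/executor.h"

namespace {

class InlineExecutor : public libgav1::Executor {
 public:
  // Runs the callback immediately; a real executor would typically hand it
  // off to a worker thread instead.
  void Schedule(std::function<void()> callback) override { callback(); }
};

}  // namespace
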
diff --git a/src/utils/libgav1_utils.cmake b/src/utils/libgav1_utils.cmake
new file mode 100644
index 0000000..8b6ec4b
--- /dev/null
+++ b/src/utils/libgav1_utils.cmake
@@ -0,0 +1,72 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_UTILS_LIBGAV1_UTILS_CMAKE_)
+ return()
+endif() # LIBGAV1_UTILS_LIBGAV1_UTILS_CMAKE_
+set(LIBGAV1_UTILS_LIBGAV1_UTILS_CMAKE_ 1)
+
+list(APPEND libgav1_utils_sources
+ "${libgav1_source}/utils/array_2d.h"
+ "${libgav1_source}/utils/bit_mask_set.h"
+ "${libgav1_source}/utils/bit_reader.cc"
+ "${libgav1_source}/utils/bit_reader.h"
+ "${libgav1_source}/utils/block_parameters_holder.cc"
+ "${libgav1_source}/utils/block_parameters_holder.h"
+ "${libgav1_source}/utils/blocking_counter.h"
+ "${libgav1_source}/utils/common.h"
+ "${libgav1_source}/utils/compiler_attributes.h"
+ "${libgav1_source}/utils/constants.cc"
+ "${libgav1_source}/utils/constants.h"
+ "${libgav1_source}/utils/cpu.cc"
+ "${libgav1_source}/utils/cpu.h"
+ "${libgav1_source}/utils/dynamic_buffer.h"
+ "${libgav1_source}/utils/entropy_decoder.cc"
+ "${libgav1_source}/utils/entropy_decoder.h"
+ "${libgav1_source}/utils/executor.cc"
+ "${libgav1_source}/utils/executor.h"
+ "${libgav1_source}/utils/logging.cc"
+ "${libgav1_source}/utils/logging.h"
+ "${libgav1_source}/utils/memory.h"
+ "${libgav1_source}/utils/parameter_tree.cc"
+ "${libgav1_source}/utils/parameter_tree.h"
+ "${libgav1_source}/utils/queue.h"
+ "${libgav1_source}/utils/raw_bit_reader.cc"
+ "${libgav1_source}/utils/raw_bit_reader.h"
+ "${libgav1_source}/utils/reference_info.h"
+ "${libgav1_source}/utils/segmentation.cc"
+ "${libgav1_source}/utils/segmentation.h"
+ "${libgav1_source}/utils/segmentation_map.cc"
+ "${libgav1_source}/utils/segmentation_map.h"
+ "${libgav1_source}/utils/stack.h"
+ "${libgav1_source}/utils/threadpool.cc"
+ "${libgav1_source}/utils/threadpool.h"
+ "${libgav1_source}/utils/types.h"
+ "${libgav1_source}/utils/unbounded_queue.h"
+ "${libgav1_source}/utils/vector.h")
+
+macro(libgav1_add_utils_targets)
+ libgav1_add_library(NAME
+ libgav1_utils
+ TYPE
+ OBJECT
+ SOURCES
+ ${libgav1_utils_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_include_paths}
+ ${libgav1_gtest_include_paths})
+
+endmacro()
diff --git a/src/utils/logging.cc b/src/utils/logging.cc
new file mode 100644
index 0000000..9a43c22
--- /dev/null
+++ b/src/utils/logging.cc
@@ -0,0 +1,65 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/logging.h"
+
+#include <cstdarg>
+#include <cstdio>
+#include <sstream>
+#include <thread> // NOLINT (unapproved c++11 header)
+
+#if !defined(LIBGAV1_LOG_LEVEL)
+#define LIBGAV1_LOG_LEVEL (1 << 30)
+#endif
+
+namespace libgav1 {
+namespace internal {
+#if LIBGAV1_ENABLE_LOGGING
+namespace {
+
+const char* LogSeverityName(LogSeverity severity) {
+ switch (severity) {
+ case LogSeverity::kInfo:
+ return "INFO";
+ case LogSeverity::kError:
+ return "ERROR";
+ case LogSeverity::kWarning:
+ return "WARNING";
+ }
+ return "UNKNOWN";
+}
+
+} // namespace
+
+void Log(LogSeverity severity, const char* file, int line, const char* format,
+ ...) {
+ if (LIBGAV1_LOG_LEVEL < static_cast<int>(severity)) return;
+ std::ostringstream ss;
+ ss << std::hex << std::this_thread::get_id();
+ fprintf(stderr, "%s %s %s:%d] ", LogSeverityName(severity), ss.str().c_str(),
+ file, line);
+
+ va_list ap;
+ va_start(ap, format);
+ vfprintf(stderr, format, ap);
+ va_end(ap);
+ fprintf(stderr, "\n");
+}
+#else // !LIBGAV1_ENABLE_LOGGING
+void Log(LogSeverity /*severity*/, const char* /*file*/, int /*line*/,
+ const char* /*format*/, ...) {}
+#endif // LIBGAV1_ENABLE_LOGGING
+
+} // namespace internal
+} // namespace libgav1
diff --git a/src/utils/logging.h b/src/utils/logging.h
new file mode 100644
index 0000000..48928db
--- /dev/null
+++ b/src/utils/logging.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_LOGGING_H_
+#define LIBGAV1_SRC_UTILS_LOGGING_H_
+
+#include <cstddef>
+
+#include "src/utils/compiler_attributes.h"
+
+#if !defined(LIBGAV1_ENABLE_LOGGING)
+#if defined(NDEBUG) || defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#define LIBGAV1_ENABLE_LOGGING 0
+#else
+#define LIBGAV1_ENABLE_LOGGING 1
+#endif
+#endif
+
+#if LIBGAV1_ENABLE_LOGGING
+// LIBGAV1_DLOG(severity, printf-format-string)
+// Debug logging that can optionally be enabled in release builds by explicitly
+// setting LIBGAV1_ENABLE_LOGGING.
+// Severity is given as an all-caps version of enum LogSeverity with the
+// leading 'k' removed: LIBGAV1_DLOG(INFO, "...");
+#define LIBGAV1_DLOG(severity, ...) \
+ do { \
+ constexpr const char* libgav1_logging_internal_basename = \
+ ::libgav1::internal::Basename(__FILE__, sizeof(__FILE__) - 1); \
+ ::libgav1::internal::Log(LIBGAV1_LOGGING_INTERNAL_##severity, \
+ libgav1_logging_internal_basename, __LINE__, \
+ __VA_ARGS__); \
+ } while (0)
+#else
+#define LIBGAV1_DLOG(severity, ...) \
+ do { \
+ } while (0)
+#endif // LIBGAV1_ENABLE_LOGGING
+
+#define LIBGAV1_LOGGING_INTERNAL_ERROR ::libgav1::internal::LogSeverity::kError
+#define LIBGAV1_LOGGING_INTERNAL_WARNING \
+ ::libgav1::internal::LogSeverity::kWarning
+#define LIBGAV1_LOGGING_INTERNAL_INFO ::libgav1::internal::LogSeverity::kInfo
+
+namespace libgav1 {
+namespace internal {
+
+enum class LogSeverity : int {
+ kError,
+ kWarning,
+ kInfo,
+};
+
+// Helper function to implement LIBGAV1_DLOG
+// Logs |format, ...| at |severity| level, reporting it as called from
+// |file|:|line|.
+void Log(libgav1::internal::LogSeverity severity, const char* file, int line,
+ const char* format, ...) LIBGAV1_PRINTF_ATTRIBUTE(4, 5);
+
+// Compile-time function to get the 'base' file_name, that is, the part of
+// a file_name after the last '/' or '\' path separator. The search starts at
+// the end of the string; the second parameter is the length of the string.
+constexpr const char* Basename(const char* file_name, size_t offset) {
+ return (offset == 0 || file_name[offset - 1] == '/' ||
+ file_name[offset - 1] == '\\')
+ ? file_name + offset
+ : Basename(file_name, offset - 1);
+}
+
+} // namespace internal
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_LOGGING_H_
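
A short usage sketch of the macro described above; AllocateExample and its |size| parameter are hypothetical and only illustrate the severity spelling and the printf-style arguments. When LIBGAV1_ENABLE_LOGGING is 0 the macro expands to an empty statement, so the call has no release-build cost.

#include <cstddef>
#include <cstdlib>

#include "src/utils/logging.h"

namespace libgav1 {

bool AllocateExample(size_t size) {
  void* const ptr = malloc(size);
  if (ptr == nullptr) {
    // Severity is written without the LogSeverity::k prefix, as noted in the
    // macro comment above.
    LIBGAV1_DLOG(ERROR, "Allocation of %zu bytes failed.", size);
    return false;
  }
  free(ptr);
  return true;
}

}  // namespace libgav1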
diff --git a/src/utils/memory.h b/src/utils/memory.h
new file mode 100644
index 0000000..219a83f
--- /dev/null
+++ b/src/utils/memory.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_MEMORY_H_
+#define LIBGAV1_SRC_UTILS_MEMORY_H_
+
+#if defined(__ANDROID__) || defined(_MSC_VER)
+#include <malloc.h>
+#endif
+
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <new>
+
+namespace libgav1 {
+
+enum {
+// The byte alignment required for buffers used with SIMD code to be read or
+// written with aligned operations.
+#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \
+ defined(_M_X64)
+ kMaxAlignment = 32, // extended alignment is safe on x86.
+#else
+ kMaxAlignment = alignof(max_align_t),
+#endif
+};
+
+// AlignedAlloc, AlignedFree
+//
+// void* AlignedAlloc(size_t alignment, size_t size);
+// Allocate aligned memory.
+// |alignment| must be a power of 2.
+// Unlike posix_memalign(), |alignment| may be smaller than sizeof(void*).
+// Unlike aligned_alloc(), |size| does not need to be a multiple of
+// |alignment|.
+// The returned pointer should be freed by AlignedFree().
+//
+// void AlignedFree(void* aligned_memory);
+// Free aligned memory.
+
+#if defined(_MSC_VER) // MSVC
+
+inline void* AlignedAlloc(size_t alignment, size_t size) {
+ return _aligned_malloc(size, alignment);
+}
+
+inline void AlignedFree(void* aligned_memory) { _aligned_free(aligned_memory); }
+
+#else // !defined(_MSC_VER)
+
+inline void* AlignedAlloc(size_t alignment, size_t size) {
+#if defined(__ANDROID__)
+ // Although posix_memalign() was introduced in Android API level 17, it is
+ // more convenient to use memalign(). Unlike glibc, Android does not consider
+ // memalign() an obsolete function.
+ return memalign(alignment, size);
+#else // !defined(__ANDROID__)
+ void* ptr = nullptr;
+ // posix_memalign requires that the requested alignment be at least
+ // sizeof(void*). In this case, fall back on malloc which should return
+ // memory aligned to at least the size of a pointer.
+ const size_t required_alignment = sizeof(void*);
+ if (alignment < required_alignment) return malloc(size);
+ const int error = posix_memalign(&ptr, alignment, size);
+ if (error != 0) {
+ errno = error;
+ return nullptr;
+ }
+ return ptr;
+#endif // defined(__ANDROID__)
+}
+
+inline void AlignedFree(void* aligned_memory) { free(aligned_memory); }
+
+#endif // defined(_MSC_VER)
+
+inline void Memset(uint8_t* const dst, int value, size_t count) {
+ memset(dst, value, count);
+}
+
+inline void Memset(uint16_t* const dst, int value, size_t count) {
+ for (size_t i = 0; i < count; ++i) {
+ dst[i] = static_cast<uint16_t>(value);
+ }
+}
+
+struct MallocDeleter {
+ void operator()(void* ptr) const { free(ptr); }
+};
+
+struct AlignedDeleter {
+ void operator()(void* ptr) const { AlignedFree(ptr); }
+};
+
+template <typename T>
+using AlignedUniquePtr = std::unique_ptr<T, AlignedDeleter>;
+
+// Allocates aligned memory for an array of |count| elements of type T.
+template <typename T>
+inline AlignedUniquePtr<T> MakeAlignedUniquePtr(size_t alignment,
+ size_t count) {
+ return AlignedUniquePtr<T>(
+ static_cast<T*>(AlignedAlloc(alignment, count * sizeof(T))));
+}
+
+// A base class with custom new and delete operators. The exception-throwing
+// new operators are deleted. The "new (std::nothrow)" form must be used.
+//
+// The new operators return nullptr if the requested size is greater than
+// 0x40000000 bytes (1 GB). TODO(wtc): Make the maximum allocable memory size
+// a compile-time configuration macro.
+//
+// See https://en.cppreference.com/w/cpp/memory/new/operator_new and
+// https://en.cppreference.com/w/cpp/memory/new/operator_delete.
+//
+// NOTE: The allocation and deallocation functions are static member functions
+// whether the keyword 'static' is used or not.
+struct Allocable {
+ // Class-specific allocation functions.
+ static void* operator new(size_t size) = delete;
+ static void* operator new[](size_t size) = delete;
+
+ // Class-specific non-throwing allocation functions
+ static void* operator new(size_t size, const std::nothrow_t& tag) noexcept {
+ if (size > 0x40000000) return nullptr;
+ return ::operator new(size, tag);
+ }
+ static void* operator new[](size_t size, const std::nothrow_t& tag) noexcept {
+ if (size > 0x40000000) return nullptr;
+ return ::operator new[](size, tag);
+ }
+
+ // Class-specific deallocation functions.
+ static void operator delete(void* ptr) noexcept { ::operator delete(ptr); }
+ static void operator delete[](void* ptr) noexcept {
+ ::operator delete[](ptr);
+ }
+
+ // Only called if new (std::nothrow) is used and the constructor throws an
+ // exception.
+ static void operator delete(void* ptr, const std::nothrow_t& tag) noexcept {
+ ::operator delete(ptr, tag);
+ }
+ // Only called if new[] (std::nothrow) is used and the constructor throws an
+ // exception.
+ static void operator delete[](void* ptr, const std::nothrow_t& tag) noexcept {
+ ::operator delete[](ptr, tag);
+ }
+};
+
+// A variant of Allocable that forces allocations to be aligned to
+// kMaxAlignment bytes. This is intended for use with classes that use
+// alignas() with this value. C++17 aligned new/delete are used if available,
+// otherwise we use AlignedAlloc/Free.
+struct MaxAlignedAllocable {
+ // Class-specific allocation functions.
+ static void* operator new(size_t size) = delete;
+ static void* operator new[](size_t size) = delete;
+
+ // Class-specific non-throwing allocation functions
+ static void* operator new(size_t size, const std::nothrow_t& tag) noexcept {
+ if (size > 0x40000000) return nullptr;
+#ifdef __cpp_aligned_new
+ return ::operator new(size, std::align_val_t(kMaxAlignment), tag);
+#else
+ static_cast<void>(tag);
+ return AlignedAlloc(kMaxAlignment, size);
+#endif
+ }
+ static void* operator new[](size_t size, const std::nothrow_t& tag) noexcept {
+ if (size > 0x40000000) return nullptr;
+#ifdef __cpp_aligned_new
+ return ::operator new[](size, std::align_val_t(kMaxAlignment), tag);
+#else
+ static_cast<void>(tag);
+ return AlignedAlloc(kMaxAlignment, size);
+#endif
+ }
+
+ // Class-specific deallocation functions.
+ static void operator delete(void* ptr) noexcept {
+#ifdef __cpp_aligned_new
+ ::operator delete(ptr, std::align_val_t(kMaxAlignment));
+#else
+ AlignedFree(ptr);
+#endif
+ }
+ static void operator delete[](void* ptr) noexcept {
+#ifdef __cpp_aligned_new
+ ::operator delete[](ptr, std::align_val_t(kMaxAlignment));
+#else
+ AlignedFree(ptr);
+#endif
+ }
+
+ // Only called if new (std::nothrow) is used and the constructor throws an
+ // exception.
+ static void operator delete(void* ptr, const std::nothrow_t& tag) noexcept {
+#ifdef __cpp_aligned_new
+ ::operator delete(ptr, std::align_val_t(kMaxAlignment), tag);
+#else
+ static_cast<void>(tag);
+ AlignedFree(ptr);
+#endif
+ }
+ // Only called if new[] (std::nothrow) is used and the constructor throws an
+ // exception.
+ static void operator delete[](void* ptr, const std::nothrow_t& tag) noexcept {
+#ifdef __cpp_aligned_new
+ ::operator delete[](ptr, std::align_val_t(kMaxAlignment), tag);
+#else
+ static_cast<void>(tag);
+ AlignedFree(ptr);
+#endif
+ }
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_MEMORY_H_
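
A sketch of how these helpers compose; ScratchBuffer and UseScratch are hypothetical names. Deriving from Allocable deletes the throwing operator new, so only the nothrow form compiles, and MakeAlignedUniquePtr pairs AlignedAlloc with AlignedFree via AlignedDeleter.

#include <cstdint>
#include <memory>
#include <new>

#include "src/utils/memory.h"

namespace libgav1 {

// A hypothetical frame-level scratch buffer.
struct ScratchBuffer : public Allocable {
  uint16_t data[1024];
};

bool UseScratch() {
  // The plain "new ScratchBuffer" form would not compile; only the nothrow
  // form is available, and it returns nullptr on failure.
  std::unique_ptr<ScratchBuffer> scratch(new (std::nothrow) ScratchBuffer());
  if (scratch == nullptr) return false;

  // 64 uint16_t values aligned to kMaxAlignment bytes, released through
  // AlignedFree() by AlignedDeleter.
  AlignedUniquePtr<uint16_t> row =
      MakeAlignedUniquePtr<uint16_t>(kMaxAlignment, /*count=*/64);
  if (row == nullptr) return false;
  Memset(row.get(), 0, 64);
  return true;
}

}  // namespace libgav1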
diff --git a/src/utils/parameter_tree.cc b/src/utils/parameter_tree.cc
new file mode 100644
index 0000000..9426ce6
--- /dev/null
+++ b/src/utils/parameter_tree.cc
@@ -0,0 +1,133 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/parameter_tree.h"
+
+#include <cassert>
+#include <memory>
+#include <new>
+
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// static
+std::unique_ptr<ParameterTree> ParameterTree::Create(int row4x4, int column4x4,
+ BlockSize block_size,
+ bool is_leaf) {
+ std::unique_ptr<ParameterTree> tree(
+ new (std::nothrow) ParameterTree(row4x4, column4x4, block_size));
+ if (tree != nullptr && is_leaf && !tree->SetPartitionType(kPartitionNone)) {
+ tree = nullptr;
+ }
+ return tree;
+}
+
+bool ParameterTree::SetPartitionType(Partition partition) {
+ assert(!partition_type_set_);
+ partition_ = partition;
+ partition_type_set_ = true;
+ const int block_width4x4 = kNum4x4BlocksWide[block_size_];
+ const int half_block4x4 = block_width4x4 >> 1;
+ const int quarter_block4x4 = half_block4x4 >> 1;
+ const BlockSize sub_size = kSubSize[partition][block_size_];
+ const BlockSize split_size = kSubSize[kPartitionSplit][block_size_];
+ assert(partition == kPartitionNone || sub_size != kBlockInvalid);
+ switch (partition) {
+ case kPartitionNone:
+ parameters_.reset(new (std::nothrow) BlockParameters());
+ return parameters_ != nullptr;
+ case kPartitionHorizontal:
+ children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true);
+ children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
+ sub_size, true);
+ return children_[0] != nullptr && children_[1] != nullptr;
+ case kPartitionVertical:
+ children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true);
+ children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
+ sub_size, true);
+ return children_[0] != nullptr && children_[1] != nullptr;
+ case kPartitionSplit:
+ children_[0] =
+ ParameterTree::Create(row4x4_, column4x4_, sub_size, false);
+ children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
+ sub_size, false);
+ children_[2] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
+ sub_size, false);
+ children_[3] = ParameterTree::Create(
+ row4x4_ + half_block4x4, column4x4_ + half_block4x4, sub_size, false);
+ return children_[0] != nullptr && children_[1] != nullptr &&
+ children_[2] != nullptr && children_[3] != nullptr;
+ case kPartitionHorizontalWithTopSplit:
+ assert(split_size != kBlockInvalid);
+ children_[0] =
+ ParameterTree::Create(row4x4_, column4x4_, split_size, true);
+ children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
+ split_size, true);
+ children_[2] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
+ sub_size, true);
+ return children_[0] != nullptr && children_[1] != nullptr &&
+ children_[2] != nullptr;
+ case kPartitionHorizontalWithBottomSplit:
+ assert(split_size != kBlockInvalid);
+ children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true);
+ children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
+ split_size, true);
+ children_[2] =
+ ParameterTree::Create(row4x4_ + half_block4x4,
+ column4x4_ + half_block4x4, split_size, true);
+ return children_[0] != nullptr && children_[1] != nullptr &&
+ children_[2] != nullptr;
+ case kPartitionVerticalWithLeftSplit:
+ assert(split_size != kBlockInvalid);
+ children_[0] =
+ ParameterTree::Create(row4x4_, column4x4_, split_size, true);
+ children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
+ split_size, true);
+ children_[2] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
+ sub_size, true);
+ return children_[0] != nullptr && children_[1] != nullptr &&
+ children_[2] != nullptr;
+ case kPartitionVerticalWithRightSplit:
+ assert(split_size != kBlockInvalid);
+ children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true);
+ children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
+ split_size, true);
+ children_[2] =
+ ParameterTree::Create(row4x4_ + half_block4x4,
+ column4x4_ + half_block4x4, split_size, true);
+ return children_[0] != nullptr && children_[1] != nullptr &&
+ children_[2] != nullptr;
+ case kPartitionHorizontal4:
+ for (int i = 0; i < 4; ++i) {
+ children_[i] = ParameterTree::Create(row4x4_ + i * quarter_block4x4,
+ column4x4_, sub_size, true);
+ if (children_[i] == nullptr) return false;
+ }
+ return true;
+ default:
+ assert(partition == kPartitionVertical4);
+ for (int i = 0; i < 4; ++i) {
+ children_[i] = ParameterTree::Create(
+ row4x4_, column4x4_ + i * quarter_block4x4, sub_size, true);
+ if (children_[i] == nullptr) return false;
+ }
+ return true;
+ }
+}
+
+} // namespace libgav1
diff --git a/src/utils/parameter_tree.h b/src/utils/parameter_tree.h
new file mode 100644
index 0000000..935f3eb
--- /dev/null
+++ b/src/utils/parameter_tree.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_
+#define LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_
+
+#include <cassert>
+#include <memory>
+
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+class ParameterTree : public Allocable {
+ public:
+ // Creates a parameter tree to store the parameters of a block of size
+ // |block_size| starting at coordinates |row4x4| and |column4x4|. If |is_leaf|
+ // is set to true, the memory will be allocated for the BlockParameters for
+ // this node. Otherwise, no memory will be allocated. If |is_leaf| is set to
+ // false, |block_size| must be a square block, i.e.,
+ // kBlockWidthPixels[block_size] must be equal to
+ // kBlockHeightPixels[block_size].
+ static std::unique_ptr<ParameterTree> Create(int row4x4, int column4x4,
+ BlockSize block_size,
+ bool is_leaf = false);
+
+ // Move only (not Copyable).
+ ParameterTree(ParameterTree&& other) = default;
+ ParameterTree& operator=(ParameterTree&& other) = default;
+ ParameterTree(const ParameterTree&) = delete;
+ ParameterTree& operator=(const ParameterTree&) = delete;
+
+ // Set the partition type of the current node to |partition|.
+ // if (partition == kPartitionNone) {
+ // Memory will be allocated for the BlockParameters for this node.
+ // } else if (partition != kPartitionSplit) {
+ // The appropriate child nodes will be populated and memory will be
+ // allocated for the BlockParameters of the children.
+ // } else {
+ // The appropriate child nodes will be populated but they are considered to
+ // be hanging, i.e., future calls to SetPartitionType() on the child nodes
+ // will have to set them or their descendants to a terminal type.
+ // }
+ // This function must be called only once per node.
+ LIBGAV1_MUST_USE_RESULT bool SetPartitionType(Partition partition);
+
+ // Basic getters.
+ int row4x4() const { return row4x4_; }
+ int column4x4() const { return column4x4_; }
+ BlockSize block_size() const { return block_size_; }
+ Partition partition() const { return partition_; }
+ ParameterTree* children(int index) const {
+ assert(index < 4);
+ return children_[index].get();
+ }
+ // Returns the BlockParameters object of the current node if one exists.
+ // Otherwise returns nullptr. This function will return a valid
+ // BlockParameters object only for leaf nodes.
+ BlockParameters* parameters() const { return parameters_.get(); }
+
+ private:
+ ParameterTree(int row4x4, int column4x4, BlockSize block_size)
+ : row4x4_(row4x4), column4x4_(column4x4), block_size_(block_size) {}
+
+ Partition partition_ = kPartitionNone;
+ std::unique_ptr<BlockParameters> parameters_ = nullptr;
+ int row4x4_ = -1;
+ int column4x4_ = -1;
+ BlockSize block_size_ = kBlockInvalid;
+ bool partition_type_set_ = false;
+
+ // Child values are defined as follows for various partition types:
+ // * Horizontal: 0 top partition; 1 bottom partition; 2 nullptr; 3 nullptr;
+ // * Vertical: 0 left partition; 1 right partition; 2 nullptr; 3 nullptr;
+  //  * Split: 0 top-left partition; 1 top-right partition; 2 bottom-left
+  //    partition; 3 bottom-right partition;
+ // * HorizontalWithTopSplit: 0 top-left partition; 1 top-right partition; 2
+ // bottom partition; 3 nullptr;
+ // * HorizontalWithBottomSplit: 0 top partition; 1 bottom-left partition; 2
+ // bottom-right partition; 3 nullptr;
+ // * VerticalWithLeftSplit: 0 top-left partition; 1 bottom-left partition; 2
+ // right partition; 3 nullptr;
+  //  * VerticalWithRightSplit: 0 left partition; 1 top-right partition; 2
+ // bottom-right partition; 3 nullptr;
+ // * Horizontal4: 0 top partition; 1 second top partition; 2 third top
+ // partition; 3 bottom partition;
+ // * Vertical4: 0 left partition; 1 second left partition; 2 third left
+ // partition; 3 right partition;
+ std::unique_ptr<ParameterTree> children_[4] = {};
+
+ friend class ParameterTreeTest;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_
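
A sketch of the intended call pattern, assuming kBlock64x64 is the 64x64 entry of the BlockSize enum in constants.h (not shown in this hunk); BuildExampleTree is a hypothetical helper. A split root leaves its children hanging, so each child still needs its own terminal partition type.

#include <memory>

#include "src/utils/parameter_tree.h"

namespace libgav1 {

bool BuildExampleTree() {
  // Root node for a 64x64 superblock at block position (0, 0).
  std::unique_ptr<ParameterTree> root = ParameterTree::Create(
      /*row4x4=*/0, /*column4x4=*/0, kBlock64x64, /*is_leaf=*/false);
  if (root == nullptr) return false;
  // kPartitionSplit populates four hanging 32x32 children.
  if (!root->SetPartitionType(kPartitionSplit)) return false;
  for (int i = 0; i < 4; ++i) {
    // Terminate each child; this allocates its BlockParameters.
    if (!root->children(i)->SetPartitionType(kPartitionNone)) return false;
  }
  return true;
}

}  // namespace libgav1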
diff --git a/src/utils/queue.h b/src/utils/queue.h
new file mode 100644
index 0000000..cffb9ca
--- /dev/null
+++ b/src/utils/queue.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_QUEUE_H_
+#define LIBGAV1_SRC_UTILS_QUEUE_H_
+
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <new>
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// A FIFO queue of a fixed capacity.
+//
+// WARNING: No error checking is performed.
+template <typename T>
+class Queue {
+ public:
+ LIBGAV1_MUST_USE_RESULT bool Init(size_t capacity) {
+ elements_.reset(new (std::nothrow) T[capacity]);
+ if (elements_ == nullptr) return false;
+ capacity_ = capacity;
+ return true;
+ }
+
+ // Pushes the element |value| to the end of the queue. It is an error to call
+ // Push() when the queue is full.
+ void Push(T&& value) {
+ assert(size_ < capacity_);
+ elements_[end_++] = std::move(value);
+ if (end_ == capacity_) end_ = 0;
+ ++size_;
+ }
+
+ // Removes the element at the front of the queue. It is an error to call Pop()
+ // when the queue is empty.
+ void Pop() {
+ assert(size_ != 0);
+ const T element = std::move(elements_[begin_++]);
+ static_cast<void>(element);
+ if (begin_ == capacity_) begin_ = 0;
+ --size_;
+ }
+
+ // Returns a reference to the element at the front of the queue. It is an
+ // error to call Front() when the queue is empty.
+ T& Front() {
+ assert(size_ != 0);
+ return elements_[begin_];
+ }
+
+ // Returns a reference to the element at the back of the queue. It is an error
+ // to call Back() when the queue is empty.
+ T& Back() {
+ assert(size_ != 0);
+ const size_t back = ((end_ == 0) ? capacity_ : end_) - 1;
+ return elements_[back];
+ }
+
+ // Clears the queue.
+ void Clear() {
+ while (!Empty()) {
+ Pop();
+ }
+ }
+
+ // Returns true if the queue is empty.
+ bool Empty() const { return size_ == 0; }
+
+ // Returns true if the queue is full.
+ bool Full() const { return size_ >= capacity_; }
+
+ // Returns the number of elements in the queue.
+ size_t Size() const { return size_; }
+
+ private:
+ // An array of |capacity| elements. Used as a circular array.
+ std::unique_ptr<T[]> elements_;
+ size_t capacity_ = 0;
+ // The index of the element to be removed by Pop().
+ size_t begin_ = 0;
+ // The index where the new element is inserted by Push().
+ size_t end_ = 0;
+ size_t size_ = 0;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_QUEUE_H_
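
A small usage sketch with an arbitrary capacity and values; because the class performs no error checking, the caller stays within the capacity and checks Empty() before reading.

#include <cstdio>

#include "src/utils/queue.h"

namespace libgav1 {

bool QueueExample() {
  Queue<int> queue;
  if (!queue.Init(/*capacity=*/4)) return false;
  for (int i = 0; i < 3; ++i) {
    queue.Push(i + 1);  // 1, 2, 3; never more than |capacity| elements.
  }
  while (!queue.Empty()) {
    printf("front=%d back=%d size=%zu\n", queue.Front(), queue.Back(),
           queue.Size());
    queue.Pop();
  }
  return true;
}

}  // namespace libgav1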
diff --git a/src/utils/raw_bit_reader.cc b/src/utils/raw_bit_reader.cc
new file mode 100644
index 0000000..15e980d
--- /dev/null
+++ b/src/utils/raw_bit_reader.cc
@@ -0,0 +1,224 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/raw_bit_reader.h"
+
+#include <cassert>
+#include <limits>
+
+#include "src/utils/common.h"
+#include "src/utils/logging.h"
+
+// Note <cinttypes> is only needed when logging is enabled (for the PRI*
+// macros). It depends on the definition of LIBGAV1_ENABLE_LOGGING from
+// logging.h, thus the non-standard header ordering.
+#if LIBGAV1_ENABLE_LOGGING
+#include <cinttypes>
+#endif
+
+namespace libgav1 {
+namespace {
+
+constexpr int kMaximumLeb128Size = 8;
+constexpr uint8_t kLeb128ValueByteMask = 0x7f;
+constexpr uint8_t kLeb128TerminationByteMask = 0x80;
+
+uint8_t Mod8(size_t n) {
+  // The last 3 bits give the value of n mod 8.
+ return n & 0x07;
+}
+
+size_t DivideBy8(size_t n, bool ceil) { return (n + (ceil ? 7 : 0)) >> 3; }
+
+} // namespace
+
+RawBitReader::RawBitReader(const uint8_t* data, size_t size)
+ : data_(data), bit_offset_(0), size_(size) {
+ assert(data_ != nullptr || size_ == 0);
+}
+
+int RawBitReader::ReadBitImpl() {
+ const size_t byte_offset = DivideBy8(bit_offset_, false);
+ const uint8_t byte = data_[byte_offset];
+ const uint8_t shift = 7 - Mod8(bit_offset_);
+ ++bit_offset_;
+ return static_cast<int>((byte >> shift) & 0x01);
+}
+
+int RawBitReader::ReadBit() {
+ if (Finished()) return -1;
+ return ReadBitImpl();
+}
+
+int64_t RawBitReader::ReadLiteral(int num_bits) {
+ assert(num_bits <= 32);
+ if (!CanReadLiteral(num_bits)) return -1;
+ assert(num_bits > 0);
+ uint32_t literal = 0;
+ int bit = num_bits - 1;
+ do {
+ // ARM can combine a shift operation with a constant number of bits with
+ // some other operations, such as the OR operation.
+ // Here is an ARM disassembly example:
+ // orr w1, w0, w1, lsl #1
+ // which left shifts register w1 by 1 bit and OR the shift result with
+ // register w0.
+ // The next 2 lines are equivalent to:
+ // literal |= static_cast<uint32_t>(ReadBitImpl()) << bit;
+ literal <<= 1;
+ literal |= static_cast<uint32_t>(ReadBitImpl());
+ } while (--bit >= 0);
+ return literal;
+}
+
+bool RawBitReader::ReadInverseSignedLiteral(int num_bits, int* const value) {
+ assert(num_bits + 1 < 32);
+ *value = static_cast<int>(ReadLiteral(num_bits + 1));
+ if (*value == -1) return false;
+ const int sign_bit = 1 << num_bits;
+ if ((*value & sign_bit) != 0) {
+ *value -= 2 * sign_bit;
+ }
+ return true;
+}
+
+bool RawBitReader::ReadLittleEndian(int num_bytes, size_t* const value) {
+ // We must be at a byte boundary.
+ assert(Mod8(bit_offset_) == 0);
+ assert(num_bytes <= 4);
+ static_assert(sizeof(size_t) >= 4, "");
+ if (value == nullptr) return false;
+ size_t byte_offset = DivideBy8(bit_offset_, false);
+ if (Finished() || byte_offset + num_bytes > size_) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits to read Little Endian value.");
+ return false;
+ }
+ *value = 0;
+ for (int i = 0; i < num_bytes; ++i) {
+ const size_t byte = data_[byte_offset];
+ *value |= (byte << (i * 8));
+ ++byte_offset;
+ }
+ bit_offset_ = byte_offset * 8;
+ return true;
+}
+
+bool RawBitReader::ReadUnsignedLeb128(size_t* const value) {
+ // We must be at a byte boundary.
+ assert(Mod8(bit_offset_) == 0);
+ if (value == nullptr) return false;
+ uint64_t value64 = 0;
+ for (int i = 0; i < kMaximumLeb128Size; ++i) {
+ if (Finished()) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits to read LEB128 value.");
+ return false;
+ }
+ const size_t byte_offset = DivideBy8(bit_offset_, false);
+ const uint8_t byte = data_[byte_offset];
+ bit_offset_ += 8;
+ value64 |= static_cast<uint64_t>(byte & kLeb128ValueByteMask) << (i * 7);
+ if ((byte & kLeb128TerminationByteMask) == 0) {
+ if (value64 != static_cast<size_t>(value64) ||
+ value64 > std::numeric_limits<uint32_t>::max()) {
+ LIBGAV1_DLOG(
+ ERROR, "LEB128 value (%" PRIu64 ") exceeded uint32_t maximum (%u).",
+ value64, std::numeric_limits<uint32_t>::max());
+ return false;
+ }
+ *value = static_cast<size_t>(value64);
+ return true;
+ }
+ }
+ LIBGAV1_DLOG(
+ ERROR,
+ "Exceeded kMaximumLeb128Size (%d) when trying to read LEB128 value",
+ kMaximumLeb128Size);
+ return false;
+}
+
+bool RawBitReader::ReadUvlc(uint32_t* const value) {
+ if (value == nullptr) return false;
+ int leading_zeros = 0;
+ while (true) {
+ const int bit = ReadBit();
+ if (bit == -1) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits to read uvlc value.");
+ return false;
+ }
+ if (bit == 1) break;
+ ++leading_zeros;
+ if (leading_zeros == 32) {
+ LIBGAV1_DLOG(ERROR,
+ "Exceeded maximum size (32) when trying to read uvlc value");
+ return false;
+ }
+ }
+ int literal;
+ if (leading_zeros != 0) {
+ literal = static_cast<int>(ReadLiteral(leading_zeros));
+ if (literal == -1) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits to read uvlc value.");
+ return false;
+ }
+ literal += (1U << leading_zeros) - 1;
+ } else {
+ literal = 0;
+ }
+ *value = literal;
+ return true;
+}
+
+bool RawBitReader::AlignToNextByte() {
+ while ((bit_offset_ & 7) != 0) {
+ if (ReadBit() != 0) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool RawBitReader::VerifyAndSkipTrailingBits(size_t num_bits) {
+ if (ReadBit() != 1) return false;
+ for (size_t i = 0; i < num_bits - 1; ++i) {
+ if (ReadBit() != 0) return false;
+ }
+ return true;
+}
+
+bool RawBitReader::SkipBytes(size_t num_bytes) {
+ // If we are not at a byte boundary, return false.
+ return ((bit_offset_ & 7) != 0) ? false : SkipBits(num_bytes * 8);
+}
+
+bool RawBitReader::SkipBits(size_t num_bits) {
+ // If the reader is already finished, return false.
+ if (Finished()) return false;
+ // If skipping |num_bits| runs out of buffer, return false.
+ const size_t bit_offset = bit_offset_ + num_bits - 1;
+ if (DivideBy8(bit_offset, false) >= size_) return false;
+ bit_offset_ += num_bits;
+ return true;
+}
+
+bool RawBitReader::CanReadLiteral(size_t num_bits) const {
+ if (Finished()) return false;
+ const size_t bit_offset = bit_offset_ + num_bits - 1;
+ return DivideBy8(bit_offset, false) < size_;
+}
+
+bool RawBitReader::Finished() const {
+ return DivideBy8(bit_offset_, false) >= size_;
+}
+
+} // namespace libgav1
diff --git a/src/utils/raw_bit_reader.h b/src/utils/raw_bit_reader.h
new file mode 100644
index 0000000..76e7bfa
--- /dev/null
+++ b/src/utils/raw_bit_reader.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_RAW_BIT_READER_H_
+#define LIBGAV1_SRC_UTILS_RAW_BIT_READER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "src/utils/bit_reader.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+class RawBitReader : public BitReader, public Allocable {
+ public:
+ RawBitReader(const uint8_t* data, size_t size);
+ ~RawBitReader() override = default;
+
+ int ReadBit() override;
+ int64_t ReadLiteral(int num_bits) override; // f(n) in the spec.
+ bool ReadInverseSignedLiteral(int num_bits,
+ int* value); // su(1+num_bits) in the spec.
+ bool ReadLittleEndian(int num_bytes,
+ size_t* value); // le(n) in the spec.
+ bool ReadUnsignedLeb128(size_t* value); // leb128() in the spec.
+ // Reads a variable length unsigned number and stores it in |*value|. On a
+ // successful return, |*value| is in the range of 0 to UINT32_MAX − 1,
+ // inclusive.
+ bool ReadUvlc(uint32_t* value); // uvlc() in the spec.
+ bool Finished() const;
+ size_t bit_offset() const { return bit_offset_; }
+ // Return the bytes consumed so far (rounded up).
+ size_t byte_offset() const { return (bit_offset() + 7) >> 3; }
+ size_t size() const { return size_; }
+ // Move to the next byte boundary if not already at one. Return false if any
+ // of the bits being skipped over is non-zero. Return true otherwise. If this
+ // function returns false, the reader is left in an undefined state and must
+ // not be used further. section 5.3.5.
+ bool AlignToNextByte();
+ // Make sure that the trailing bits structure is as expected and skip over it.
+ // section 5.3.4.
+ bool VerifyAndSkipTrailingBits(size_t num_bits);
+ // Skip |num_bytes| bytes. This only works if the current position is at a
+ // byte boundary. The function returns false if the current position is not at
+ // a byte boundary or if skipping |num_bytes| causes the reader to run out of
+ // buffer. Returns true otherwise.
+ bool SkipBytes(size_t num_bytes);
+ // Skip |num_bits| bits. The function returns false if skipping |num_bits|
+ // causes the reader to run out of buffer. Returns true otherwise.
+ bool SkipBits(size_t num_bits);
+
+ private:
+ // Returns true if it is safe to read a literal of size |num_bits|.
+ bool CanReadLiteral(size_t num_bits) const;
+ int ReadBitImpl();
+
+ const uint8_t* const data_;
+ size_t bit_offset_;
+ const size_t size_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_RAW_BIT_READER_H_
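
A sketch that exercises ReadLiteral(), AlignToNextByte() and ReadLittleEndian() on a hand-made three-byte buffer; the byte values are arbitrary. The alignment step succeeds only because the skipped bits are zero, as the function's contract requires.

#include <cstdint>

#include "src/utils/raw_bit_reader.h"

namespace libgav1 {

bool RawBitReaderExample() {
  const uint8_t data[] = {0xA0, 0x34, 0x12};
  RawBitReader reader(data, sizeof(data));
  // Reads the top 4 bits of 0xA0: 0xA.
  const int64_t literal = reader.ReadLiteral(/*num_bits=*/4);
  if (literal != 0xA) return false;
  // Skips the remaining 4 bits of the first byte; they are all zero, so this
  // succeeds and leaves the reader at a byte boundary.
  if (!reader.AlignToNextByte()) return false;
  size_t value = 0;
  // Reads the next two bytes little-endian: 0x34, 0x12 -> 0x1234.
  if (!reader.ReadLittleEndian(/*num_bytes=*/2, &value)) return false;
  return value == 0x1234;
}

}  // namespace libgav1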
diff --git a/src/utils/reference_info.h b/src/utils/reference_info.h
new file mode 100644
index 0000000..a660791
--- /dev/null
+++ b/src/utils/reference_info.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_
+#define LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_
+
+#include <array>
+#include <cstdint>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// This struct collects some members related to reference frames in one place to
+// make it easier to pass them as parameters to some dsp functions.
+struct ReferenceInfo {
+ // Initialize |motion_field_reference_frame| so that
+ // Tile::StoreMotionFieldMvsIntoCurrentFrame() can skip some updates when
+ // the updates are the same as the initialized value.
+ // Set to kReferenceFrameIntra instead of kReferenceFrameNone to simplify
+ // branch conditions in motion field projection.
+  // The following initialization of contiguous memory is very fast. It is not
+  // recommended to make the initialization multi-threaded, unless the memory
+  // which needs to be initialized in each thread is still contiguous.
+ LIBGAV1_MUST_USE_RESULT bool Reset(int rows, int columns) {
+ return motion_field_reference_frame.Reset(rows, columns,
+ /*zero_initialize=*/true) &&
+ motion_field_mv.Reset(
+ rows, columns,
+#if LIBGAV1_MSAN
+ // It is set in Tile::StoreMotionFieldMvsIntoCurrentFrame() only
+ // for qualified blocks. In MotionFieldProjectionKernel() dsp
+            // optimizations, it is read whether or not it was set.
+ /*zero_initialize=*/true
+#else
+ /*zero_initialize=*/false
+#endif
+ );
+ }
+
+ // All members are used by inter frames only.
+ // For intra frames, they are not initialized.
+
+ std::array<uint8_t, kNumReferenceFrameTypes> order_hint;
+
+ // An example when |relative_distance_from| does not equal
+ // -|relative_distance_to|:
+ // |relative_distance_from| = GetRelativeDistance(7, 71, 25) = -64
+ // -|relative_distance_to| = -GetRelativeDistance(71, 7, 25) = 64
+ // This is why we need both |relative_distance_from| and
+ // |relative_distance_to|.
+ // |relative_distance_from|: Relative distances from reference frames to this
+ // frame.
+ std::array<int8_t, kNumReferenceFrameTypes> relative_distance_from;
+ // |relative_distance_to|: Relative distances to reference frames.
+ std::array<int8_t, kNumReferenceFrameTypes> relative_distance_to;
+
+ // Skip motion field projection of specific types of frames if their
+ // |relative_distance_to| is negative or too large.
+ std::array<bool, kNumReferenceFrameTypes> skip_references;
+ // Lookup table to get motion field projection division multiplier of specific
+ // types of frames. Derived from kProjectionMvDivisionLookup.
+ std::array<int16_t, kNumReferenceFrameTypes> projection_divisions;
+
+ // The current frame's |motion_field_reference_frame| and |motion_field_mv_|
+ // are guaranteed to be allocated only when refresh_frame_flags is not 0.
+ // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds
+ // to MfRefFrames[i * 2 + 1][j * 2 + 1] in the spec.
+ Array2D<ReferenceFrameType> motion_field_reference_frame;
+ // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds
+ // to MfMvs[i * 2 + 1][j * 2 + 1] in the spec.
+ Array2D<MotionVector> motion_field_mv;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_
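
A sketch of the Reset() call, assuming the caller passes the halved frame dimensions so that the arrays end up (rows4x4 / 2) x (columns4x4 / 2) as described in the member comments above; ResetReferenceInfo is a hypothetical wrapper.

#include "src/utils/reference_info.h"

namespace libgav1 {

bool ResetReferenceInfo(ReferenceInfo* const info, int rows4x4,
                        int columns4x4) {
  // The motion field is stored at 8x8 granularity, hence the halving.
  return info->Reset(rows4x4 >> 1, columns4x4 >> 1);
}

}  // namespace libgav1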
diff --git a/src/utils/segmentation.cc b/src/utils/segmentation.cc
new file mode 100644
index 0000000..75fa776
--- /dev/null
+++ b/src/utils/segmentation.cc
@@ -0,0 +1,31 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/segmentation.h"
+
+namespace libgav1 {
+
+const int8_t kSegmentationFeatureBits[kSegmentFeatureMax] = {8, 6, 6, 6,
+ 6, 3, 0, 0};
+const int kSegmentationFeatureMaxValues[kSegmentFeatureMax] = {
+ 255,
+ kMaxLoopFilterValue,
+ kMaxLoopFilterValue,
+ kMaxLoopFilterValue,
+ kMaxLoopFilterValue,
+ 7,
+ 0,
+ 0};
+
+} // namespace libgav1
diff --git a/src/utils/segmentation.h b/src/utils/segmentation.h
new file mode 100644
index 0000000..67ff74c
--- /dev/null
+++ b/src/utils/segmentation.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_SEGMENTATION_H_
+#define LIBGAV1_SRC_UTILS_SEGMENTATION_H_
+
+#include <cstdint>
+
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+extern const int8_t kSegmentationFeatureBits[kSegmentFeatureMax];
+extern const int kSegmentationFeatureMaxValues[kSegmentFeatureMax];
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_SEGMENTATION_H_
diff --git a/src/utils/segmentation_map.cc b/src/utils/segmentation_map.cc
new file mode 100644
index 0000000..4284ca2
--- /dev/null
+++ b/src/utils/segmentation_map.cc
@@ -0,0 +1,49 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/segmentation_map.h"
+
+#include <cassert>
+#include <cstring>
+#include <new>
+
+namespace libgav1 {
+
+bool SegmentationMap::Allocate(int32_t rows4x4, int32_t columns4x4) {
+ rows4x4_ = rows4x4;
+ columns4x4_ = columns4x4;
+ segment_id_buffer_.reset(new (std::nothrow) int8_t[rows4x4_ * columns4x4_]);
+ if (segment_id_buffer_ == nullptr) return false;
+ segment_id_.Reset(rows4x4_, columns4x4_, segment_id_buffer_.get());
+ return true;
+}
+
+void SegmentationMap::Clear() {
+ memset(segment_id_buffer_.get(), 0, rows4x4_ * columns4x4_);
+}
+
+void SegmentationMap::CopyFrom(const SegmentationMap& from) {
+ assert(rows4x4_ == from.rows4x4_ && columns4x4_ == from.columns4x4_);
+ memcpy(segment_id_buffer_.get(), from.segment_id_buffer_.get(),
+ rows4x4_ * columns4x4_);
+}
+
+void SegmentationMap::FillBlock(int row4x4, int column4x4, int block_width4x4,
+ int block_height4x4, int8_t segment_id) {
+ for (int y = 0; y < block_height4x4; ++y) {
+ memset(&segment_id_[row4x4 + y][column4x4], segment_id, block_width4x4);
+ }
+}
+
+} // namespace libgav1
diff --git a/src/utils/segmentation_map.h b/src/utils/segmentation_map.h
new file mode 100644
index 0000000..499be24
--- /dev/null
+++ b/src/utils/segmentation_map.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_
+#define LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_
+
+#include <cstdint>
+#include <memory>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// SegmentationMap stores the segment id associated with each 4x4 block in the
+// frame.
+class SegmentationMap {
+ public:
+ SegmentationMap() = default;
+
+ // Not copyable or movable
+ SegmentationMap(const SegmentationMap&) = delete;
+ SegmentationMap& operator=(const SegmentationMap&) = delete;
+
+ // Allocates an internal buffer of the given dimensions to hold the
+ // segmentation map. The memory in the buffer is not initialized. Returns
+ // true on success, false on failure (for example, out of memory).
+ LIBGAV1_MUST_USE_RESULT bool Allocate(int32_t rows4x4, int32_t columns4x4);
+
+ int8_t segment_id(int row4x4, int column4x4) const {
+ return segment_id_[row4x4][column4x4];
+ }
+
+ // Sets every element in the segmentation map to 0.
+ void Clear();
+
+ // Copies the entire segmentation map. |from| must be of the same dimensions.
+ void CopyFrom(const SegmentationMap& from);
+
+ // Sets the region of segmentation map covered by the block to |segment_id|.
+ // The block is located at |row4x4|, |column4x4| and has dimensions
+ // |block_width4x4| and |block_height4x4|.
+ void FillBlock(int row4x4, int column4x4, int block_width4x4,
+ int block_height4x4, int8_t segment_id);
+
+ private:
+ int32_t rows4x4_ = 0;
+ int32_t columns4x4_ = 0;
+
+ // segment_id_ is a rows4x4_ by columns4x4_ 2D array. The underlying data
+ // buffer is dynamically allocated and owned by segment_id_buffer_.
+ std::unique_ptr<int8_t[]> segment_id_buffer_;
+ Array2DView<int8_t> segment_id_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_
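
A usage sketch with hypothetical dimensions: a 16x16 grid of 4x4 blocks (a 64x64-pixel frame), one 16x16-pixel block of which is tagged with segment id 2.

#include "src/utils/segmentation_map.h"

namespace libgav1 {

bool SegmentationMapExample() {
  SegmentationMap map;
  if (!map.Allocate(/*rows4x4=*/16, /*columns4x4=*/16)) return false;
  map.Clear();
  // Tag the 4x4-unit block whose top-left 4x4 index is (4, 8).
  map.FillBlock(/*row4x4=*/4, /*column4x4=*/8, /*block_width4x4=*/4,
                /*block_height4x4=*/4, /*segment_id=*/2);
  return map.segment_id(5, 9) == 2;
}

}  // namespace libgav1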
diff --git a/src/utils/stack.h b/src/utils/stack.h
new file mode 100644
index 0000000..39133b9
--- /dev/null
+++ b/src/utils/stack.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_STACK_H_
+#define LIBGAV1_SRC_UTILS_STACK_H_
+
+#include <cassert>
+#include <utility>
+
+namespace libgav1 {
+
+// A LIFO stack of a fixed capacity. The elements are moved using std::move, so
+// the element type T has to be movable.
+//
+// WARNING: No error checking is performed.
+template <typename T, int capacity>
+class Stack {
+ public:
+ // Pushes the element |value| to the top of the stack. It is an error to call
+ // Push() when the stack is full.
+ void Push(T value) {
+ ++top_;
+ assert(top_ < capacity);
+ elements_[top_] = std::move(value);
+ }
+
+ // Returns the element at the top of the stack and removes it from the stack.
+ // It is an error to call Pop() when the stack is empty.
+ T Pop() {
+ assert(top_ >= 0);
+ return std::move(elements_[top_--]);
+ }
+
+ // Returns true if the stack is empty.
+ bool Empty() const { return top_ < 0; }
+
+ private:
+ static_assert(capacity > 0, "");
+ T elements_[capacity];
+ // The array index of the top of the stack. The stack is empty if top_ is -1.
+ int top_ = -1;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_STACK_H_
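
A small sketch with an arbitrary capacity of 8; since the class performs no error checking, the caller must stay within the capacity.

#include "src/utils/stack.h"

namespace libgav1 {

int StackExample() {
  Stack<int, 8> stack;
  stack.Push(3);
  stack.Push(7);
  int sum = 0;
  while (!stack.Empty()) {
    sum += stack.Pop();  // Pops 7, then 3.
  }
  return sum;  // 10
}

}  // namespace libgav1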
diff --git a/src/utils/threadpool.cc b/src/utils/threadpool.cc
new file mode 100644
index 0000000..8c8f4fe
--- /dev/null
+++ b/src/utils/threadpool.cc
@@ -0,0 +1,323 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/threadpool.h"
+
+#if defined(_MSC_VER)
+#include <process.h>
+#include <windows.h>
+#else // defined(_MSC_VER)
+#include <pthread.h>
+#endif // defined(_MSC_VER)
+#if defined(__ANDROID__) || defined(__GLIBC__)
+#include <sys/types.h>
+#include <unistd.h>
+#endif
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <new>
+#include <utility>
+
+#if defined(__ANDROID__)
+#include <chrono> // NOLINT (unapproved c++11 header)
+#endif
+
+// The glibc wrapper for the gettid() system call was added in glibc 2.30.
+// Emulate it for older versions of glibc.
+#if defined(__GLIBC_PREREQ)
+#if !__GLIBC_PREREQ(2, 30)
+
+#include <sys/syscall.h>
+
+static pid_t gettid() { return static_cast<pid_t>(syscall(SYS_gettid)); }
+
+#endif
+#endif // defined(__GLIBC_PREREQ)
+
+namespace libgav1 {
+
+#if defined(__ANDROID__)
+namespace {
+
+using Clock = std::chrono::steady_clock;
+using Duration = Clock::duration;
+constexpr Duration kBusyWaitDuration =
+ std::chrono::duration_cast<Duration>(std::chrono::duration<double>(2e-3));
+
+} // namespace
+#endif // defined(__ANDROID__)
+
+// static
+std::unique_ptr<ThreadPool> ThreadPool::Create(int num_threads) {
+ return Create(/*name_prefix=*/"", num_threads);
+}
+
+// static
+std::unique_ptr<ThreadPool> ThreadPool::Create(const char name_prefix[],
+ int num_threads) {
+ if (name_prefix == nullptr || num_threads <= 0) return nullptr;
+ std::unique_ptr<WorkerThread*[]> threads(new (std::nothrow)
+ WorkerThread*[num_threads]);
+ if (threads == nullptr) return nullptr;
+ std::unique_ptr<ThreadPool> pool(new (std::nothrow) ThreadPool(
+ name_prefix, std::move(threads), num_threads));
+ if (pool != nullptr && !pool->StartWorkers()) {
+ pool = nullptr;
+ }
+ return pool;
+}
+
+ThreadPool::ThreadPool(const char name_prefix[],
+ std::unique_ptr<WorkerThread*[]> threads,
+ int num_threads)
+ : threads_(std::move(threads)), num_threads_(num_threads) {
+ threads_[0] = nullptr;
+ assert(name_prefix != nullptr);
+ const size_t name_prefix_len =
+ std::min(strlen(name_prefix), sizeof(name_prefix_) - 1);
+ memcpy(name_prefix_, name_prefix, name_prefix_len);
+ name_prefix_[name_prefix_len] = '\0';
+}
+
+ThreadPool::~ThreadPool() { Shutdown(); }
+
+void ThreadPool::Schedule(std::function<void()> closure) {
+ LockMutex();
+ if (!queue_.GrowIfNeeded()) {
+ // queue_ is full and we can't grow it. Run |closure| directly.
+ UnlockMutex();
+ closure();
+ return;
+ }
+ queue_.Push(std::move(closure));
+ UnlockMutex();
+ SignalOne();
+}
+
+int ThreadPool::num_threads() const { return num_threads_; }
+
+// A simple implementation that mirrors the non-portable Thread. We may
+// choose to expand this in the future as a portable implementation of
+// Thread, or replace it at such a time as one is implemented.
+class ThreadPool::WorkerThread : public Allocable {
+ public:
+ // Creates and starts a thread that runs pool->WorkerFunction().
+ explicit WorkerThread(ThreadPool* pool);
+
+ // Not copyable or movable.
+ WorkerThread(const WorkerThread&) = delete;
+ WorkerThread& operator=(const WorkerThread&) = delete;
+
+ // REQUIRES: Join() must have been called if Start() was called and
+ // succeeded.
+ ~WorkerThread() = default;
+
+ LIBGAV1_MUST_USE_RESULT bool Start();
+
+ // Joins with the running thread.
+ void Join();
+
+ private:
+#if defined(_MSC_VER)
+ static unsigned int __stdcall ThreadBody(void* arg);
+#else
+ static void* ThreadBody(void* arg);
+#endif
+
+ void SetupName();
+ void Run();
+
+ ThreadPool* pool_;
+#if defined(_MSC_VER)
+ HANDLE handle_;
+#else
+ pthread_t thread_;
+#endif
+};
+
+ThreadPool::WorkerThread::WorkerThread(ThreadPool* pool) : pool_(pool) {}
+
+#if defined(_MSC_VER)
+
+bool ThreadPool::WorkerThread::Start() {
+ // Since our code calls the C run-time library (CRT), use _beginthreadex
+ // rather than CreateThread. Microsoft documentation says "If a thread
+ // created using CreateThread calls the CRT, the CRT may terminate the
+ // process in low-memory conditions."
+ uintptr_t handle = _beginthreadex(
+ /*security=*/nullptr, /*stack_size=*/0, ThreadBody, this,
+ /*initflag=*/CREATE_SUSPENDED, /*thrdaddr=*/nullptr);
+ if (handle == 0) return false;
+ handle_ = reinterpret_cast<HANDLE>(handle);
+ ResumeThread(handle_);
+ return true;
+}
+
+void ThreadPool::WorkerThread::Join() {
+ WaitForSingleObject(handle_, INFINITE);
+ CloseHandle(handle_);
+}
+
+unsigned int ThreadPool::WorkerThread::ThreadBody(void* arg) {
+ auto* thread = static_cast<WorkerThread*>(arg);
+ thread->Run();
+ return 0;
+}
+
+void ThreadPool::WorkerThread::SetupName() {
+ // Not currently supported on Windows.
+}
+
+#else // defined(_MSC_VER)
+
+bool ThreadPool::WorkerThread::Start() {
+ return pthread_create(&thread_, nullptr, ThreadBody, this) == 0;
+}
+
+void ThreadPool::WorkerThread::Join() { pthread_join(thread_, nullptr); }
+
+void* ThreadPool::WorkerThread::ThreadBody(void* arg) {
+ auto* thread = static_cast<WorkerThread*>(arg);
+ thread->Run();
+ return nullptr;
+}
+
+void ThreadPool::WorkerThread::SetupName() {
+ if (pool_->name_prefix_[0] != '\0') {
+#if defined(__APPLE__)
+ // Apple's version of pthread_setname_np takes one argument and operates on
+ // the current thread only. Also, pthread_mach_thread_np is Apple-specific.
+ // The maximum size of the |name| buffer was noted in the Chromium source
+ // code and was confirmed by experiments.
+ char name[64];
+ mach_port_t id = pthread_mach_thread_np(pthread_self());
+ int rv = snprintf(name, sizeof(name), "%s/%" PRId64, pool_->name_prefix_,
+ static_cast<int64_t>(id));
+ assert(rv >= 0);
+ rv = pthread_setname_np(name);
+ assert(rv == 0);
+ static_cast<void>(rv);
+#elif defined(__ANDROID__) || defined(__GLIBC__)
+ // If the |name| buffer is longer than 16 bytes, pthread_setname_np fails
+ // with error 34 (ERANGE) on Android.
+ char name[16];
+ pid_t id = gettid();
+ int rv = snprintf(name, sizeof(name), "%s/%" PRId64, pool_->name_prefix_,
+ static_cast<int64_t>(id));
+ assert(rv >= 0);
+ rv = pthread_setname_np(pthread_self(), name);
+ assert(rv == 0);
+ static_cast<void>(rv);
+#endif
+ }
+}
+
+#endif // defined(_MSC_VER)
+
+void ThreadPool::WorkerThread::Run() {
+ SetupName();
+ pool_->WorkerFunction();
+}
+
+bool ThreadPool::StartWorkers() {
+ if (!queue_.Init()) return false;
+ for (int i = 0; i < num_threads_; ++i) {
+ threads_[i] = new (std::nothrow) WorkerThread(this);
+ if (threads_[i] == nullptr) return false;
+ if (!threads_[i]->Start()) {
+ delete threads_[i];
+ threads_[i] = nullptr;
+ return false;
+ }
+ }
+ return true;
+}
+
+void ThreadPool::WorkerFunction() {
+ LockMutex();
+ while (true) {
+ if (queue_.Empty()) {
+ if (exit_threads_) {
+ break; // Queue is empty and exit was requested.
+ }
+#if defined(__ANDROID__)
+      // On Android, if we go to a condition variable wait right away, the
+      // CPU governor kicks in and starts shutting the cores down. So we do a
+      // very short busy wait to see if we get our next job within that
+      // period. This significantly improves the performance of the common
+      // cases of tile parallel decoding. If we don't receive a job within the
+      // busy wait window, we then fall back to a condition wait as usual.
+ UnlockMutex();
+ bool found_job = false;
+ const auto wait_start = Clock::now();
+ while (Clock::now() - wait_start < kBusyWaitDuration) {
+ LockMutex();
+ if (!queue_.Empty()) {
+ found_job = true;
+ break;
+ }
+ UnlockMutex();
+ }
+ // If |found_job| is true, we simply continue since we already hold the
+ // mutex and we know for sure that the |queue_| is not empty.
+ if (found_job) continue;
+      // Since |found_job| was false, the mutex is not being held at this
+      // point.
+ LockMutex();
+ // Ensure that the queue is still empty.
+ if (!queue_.Empty()) continue;
+ if (exit_threads_) {
+ break; // Queue is empty and exit was requested.
+ }
+#endif // defined(__ANDROID__)
+ // Queue is still empty, wait for signal or broadcast.
+ Wait();
+ } else {
+ // Take a job from the queue.
+ std::function<void()> job = std::move(queue_.Front());
+ queue_.Pop();
+
+ UnlockMutex();
+      // Note that it is good practice to surround this with a try/catch so
+      // the thread pool is not left in a bad state if the job throws an
+      // exception. This is omitted here because Google3 does not allow
+      // exceptions.
+ std::move(job)();
+ job = nullptr;
+
+ LockMutex();
+ }
+ }
+ UnlockMutex();
+}
+
+void ThreadPool::Shutdown() {
+  // Tell the worker threads to exit.
+ LockMutex();
+ exit_threads_ = true;
+ UnlockMutex();
+ SignalAll();
+
+ // Join all workers. This will block.
+ for (int i = 0; i < num_threads_; ++i) {
+ if (threads_[i] == nullptr) break;
+ threads_[i]->Join();
+ delete threads_[i];
+ }
+}
+
+} // namespace libgav1
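The Android busy wait in WorkerFunction() above is easier to see in isolation. Below is a minimal, self-contained sketch of the same idea (poll the queue briefly before falling back to a condition-variable wait); the names and the 2 ms window are illustrative assumptions, standing in for the pool's own members and its kBusyWaitDuration constant.

#include <chrono>
#include <condition_variable>
#include <deque>
#include <functional>
#include <mutex>

namespace busy_wait_sketch {

std::mutex mutex;
std::condition_variable condition;
std::deque<std::function<void()>> queue;  // Guarded by |mutex|.

// Blocks until a job is available and returns it. |lock| must hold |mutex|.
std::function<void()> NextJob(std::unique_lock<std::mutex>* lock) {
  // Assumed value; the real pool uses kBusyWaitDuration defined elsewhere.
  constexpr auto kBusyWaitWindow = std::chrono::milliseconds(2);
  const auto start = std::chrono::steady_clock::now();
  // Busy wait: re-check the queue for a short period so that the CPU governor
  // does not throttle the core between short-lived jobs.
  while (queue.empty() &&
         std::chrono::steady_clock::now() - start < kBusyWaitWindow) {
    lock->unlock();
    lock->lock();
  }
  // Fall back to a real condition-variable wait once the window expires.
  condition.wait(*lock, [] { return !queue.empty(); });
  std::function<void()> job = std::move(queue.front());
  queue.pop_front();
  return job;
}

}  // namespace busy_wait_sketch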
diff --git a/src/utils/threadpool.h b/src/utils/threadpool.h
new file mode 100644
index 0000000..fac875e
--- /dev/null
+++ b/src/utils/threadpool.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_THREADPOOL_H_
+#define LIBGAV1_SRC_UTILS_THREADPOOL_H_
+
+#include <functional>
+#include <memory>
+
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
+#endif
+
+#if !defined(LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+#if defined(__ANDROID__) || (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE)
+#define LIBGAV1_THREADPOOL_USE_STD_MUTEX 1
+#else
+#define LIBGAV1_THREADPOOL_USE_STD_MUTEX 0
+#endif
+#endif
+
+#if LIBGAV1_THREADPOOL_USE_STD_MUTEX
+#include <condition_variable> // NOLINT (unapproved c++11 header)
+#include <mutex> // NOLINT (unapproved c++11 header)
+#else
+// absl::Mutex & absl::CondVar are significantly faster than the pthread
+// variants on platforms other than Android. iOS may deadlock on Shutdown()
+// using absl, see b/142251739.
+#include "absl/base/thread_annotations.h"
+#include "absl/synchronization/mutex.h"
+#endif
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/executor.h"
+#include "src/utils/memory.h"
+#include "src/utils/unbounded_queue.h"
+
+namespace libgav1 {
+
+// An implementation of ThreadPool using POSIX threads (pthreads) or Windows
+// threads.
+//
+// - The pool allocates a fixed number of worker threads on instantiation.
+// - The worker threads will pick up work jobs as they arrive.
+// - If all workers are busy, work jobs are queued for later execution.
+//
+// The thread pool is shut down when the pool is destroyed.
+//
+// Example usage of the thread pool:
+// {
+// std::unique_ptr<ThreadPool> pool = ThreadPool::Create(4);
+// for (int i = 0; i < 100; ++i) { // Dispatch 100 jobs.
+// pool->Schedule([&my_data]() { MyFunction(&my_data); });
+// }
+// } // ThreadPool gets destroyed only when all jobs are done.
+class ThreadPool : public Executor, public Allocable {
+ public:
+ // Creates the thread pool with the specified number of worker threads.
+ // If num_threads is 1, the closures are run in FIFO order.
+ static std::unique_ptr<ThreadPool> Create(int num_threads);
+
+ // Like the above factory method, but also sets the name prefix for threads.
+ static std::unique_ptr<ThreadPool> Create(const char name_prefix[],
+ int num_threads);
+
+  // The destructor shuts down the thread pool; all queued jobs are executed
+  // before it returns. Note that after shutdown, the thread pool does not
+  // accept further jobs.
+ ~ThreadPool() override;
+
+ // Adds the specified "closure" to the queue for processing. If worker threads
+ // are available, "closure" will run immediately. Otherwise "closure" is
+ // queued for later execution.
+ //
+ // NOTE: If the internal queue is full and cannot be resized because of an
+ // out-of-memory error, the current thread runs "closure" before returning
+ // from Schedule(). For our use cases, this seems better than the
+ // alternatives:
+ // 1. Return a failure status.
+ // 2. Have the current thread wait until the queue is not full.
+ void Schedule(std::function<void()> closure) override;
+
+ int num_threads() const;
+
+ private:
+ class WorkerThread;
+
+ // Creates the thread pool with the specified number of worker threads.
+ // If num_threads is 1, the closures are run in FIFO order.
+ ThreadPool(const char name_prefix[], std::unique_ptr<WorkerThread*[]> threads,
+ int num_threads);
+
+ // Starts the worker pool.
+ LIBGAV1_MUST_USE_RESULT bool StartWorkers();
+
+ void WorkerFunction();
+
+ // Shuts down the thread pool, i.e. worker threads finish their work and
+ // pick up new jobs until the queue is empty. This call will block until
+ // the shutdown is complete.
+ //
+ // Note: If a worker encounters an empty queue after this call, it will exit.
+ // Other workers might still be running, and if the queue fills up again, the
+ // thread pool will continue to operate with a decreased number of workers.
+ // It is up to the caller to prevent adding new jobs.
+ void Shutdown();
+
+#if LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+ void LockMutex() { queue_mutex_.lock(); }
+ void UnlockMutex() { queue_mutex_.unlock(); }
+
+ void Wait() {
+ std::unique_lock<std::mutex> queue_lock(queue_mutex_, std::adopt_lock);
+ condition_.wait(queue_lock);
+ queue_lock.release();
+ }
+
+ void SignalOne() { condition_.notify_one(); }
+ void SignalAll() { condition_.notify_all(); }
+
+ std::condition_variable condition_;
+ std::mutex queue_mutex_;
+
+#else // !LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+ void LockMutex() ABSL_EXCLUSIVE_LOCK_FUNCTION() { queue_mutex_.Lock(); }
+ void UnlockMutex() ABSL_UNLOCK_FUNCTION() { queue_mutex_.Unlock(); }
+ void Wait() { condition_.Wait(&queue_mutex_); }
+ void SignalOne() { condition_.Signal(); }
+ void SignalAll() { condition_.SignalAll(); }
+
+ absl::CondVar condition_;
+ absl::Mutex queue_mutex_;
+
+#endif // LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+ UnboundedQueue<std::function<void()>> queue_ LIBGAV1_GUARDED_BY(queue_mutex_);
+ // If not all the worker threads are created, the first entry after the
+ // created worker threads is a null pointer.
+ const std::unique_ptr<WorkerThread*[]> threads_;
+
+ bool exit_threads_ LIBGAV1_GUARDED_BY(queue_mutex_) = false;
+ const int num_threads_ = 0;
+ // name_prefix_ is a C string, whose length is restricted to 16 characters,
+ // including the terminating null byte ('\0'). This restriction comes from
+ // the Linux pthread_setname_np() function.
+ char name_prefix_[16];
+};
+
+} // namespace libgav1
+
+#undef LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+#endif // LIBGAV1_SRC_UTILS_THREADPOOL_H_
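A minimal usage sketch against the interface declared above (Create() with a name prefix, Schedule(), and destruction waiting for outstanding jobs). The nullptr check on Create() is an assumption that creation can fail, in line with the fallible StartWorkers(); the function and variable names are illustrative.

#include <atomic>
#include <memory>

#include "src/utils/threadpool.h"

// Sketch: sums |size| ints on a 2-thread pool. Returns false if the pool
// cannot be created (assumed to be reported as a null pointer from Create()).
bool SumOnPool(const int* data, int size, int* result) {
  std::unique_ptr<libgav1::ThreadPool> pool =
      libgav1::ThreadPool::Create("sum", /*num_threads=*/2);
  if (pool == nullptr) return false;
  std::atomic<int> sum(0);
  for (int i = 0; i < size; ++i) {
    const int value = data[i];
    pool->Schedule([&sum, value]() { sum.fetch_add(value); });
  }
  pool.reset();  // The destructor blocks until all scheduled jobs are done.
  *result = sum.load();
  return true;
}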
diff --git a/src/utils/types.h b/src/utils/types.h
new file mode 100644
index 0000000..374f06b
--- /dev/null
+++ b/src/utils/types.h
@@ -0,0 +1,525 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_TYPES_H_
+#define LIBGAV1_SRC_UTILS_TYPES_H_
+
+#include <array>
+#include <cstdint>
+#include <memory>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+struct MotionVector : public Allocable {
+ static constexpr int kRow = 0;
+ static constexpr int kColumn = 1;
+
+ MotionVector() = default;
+ MotionVector(const MotionVector& mv) = default;
+
+ MotionVector& operator=(const MotionVector& rhs) {
+ mv32 = rhs.mv32;
+ return *this;
+ }
+
+ bool operator==(const MotionVector& rhs) const { return mv32 == rhs.mv32; }
+
+ union {
+    // Motion vectors will always fit in int16_t and using int16_t here
+    // instead of int saves significant memory since some of the frame-sized
+    // structures store motion vectors.
+ int16_t mv[2];
+    // A uint32_t view into the |mv| array. Useful for cases where both
+    // components of the motion vector have to be copied or compared with a
+    // single 32-bit instruction.
+ uint32_t mv32;
+ };
+};
+
+union CompoundMotionVector {
+ CompoundMotionVector() = default;
+ CompoundMotionVector(const CompoundMotionVector& mv) = default;
+
+ CompoundMotionVector& operator=(const CompoundMotionVector& rhs) {
+ mv64 = rhs.mv64;
+ return *this;
+ }
+
+ bool operator==(const CompoundMotionVector& rhs) const {
+ return mv64 == rhs.mv64;
+ }
+
+ MotionVector mv[2];
+  // A uint64_t view into the |mv| array. Useful for cases where both motion
+  // vectors have to be copied or compared with a single 64-bit instruction.
+ uint64_t mv64;
+};
+
+// Stores the motion information used for motion field estimation.
+struct TemporalMotionField : public Allocable {
+ Array2D<MotionVector> mv;
+ Array2D<int8_t> reference_offset;
+};
+
+// MvContexts contains the contexts used to decode the portions of an inter
+// block's mode info that determine the y_mode field in BlockParameters.
+//
+// The contexts in the struct correspond to the ZeroMvContext, RefMvContext,
+// and NewMvContext variables in the spec.
+struct MvContexts {
+ int zero_mv;
+ int reference_mv;
+ int new_mv;
+};
+
+struct PaletteModeInfo {
+ uint8_t size[kNumPlaneTypes];
+ uint16_t color[kMaxPlanes][kMaxPaletteSize];
+};
+
+// Stores the parameters used by the prediction process. The members of the
+// struct are filled in when parsing the bitstream and used when the prediction
+// is computed. The information in this struct is associated with a single
+// block.
+// While both BlockParameters and PredictionParameters store information
+// pertaining to a Block, the only difference is that BlockParameters outlives
+// the block itself (for example, some of the variables in BlockParameters are
+// used to compute the context for reading elements in the subsequent blocks).
+struct PredictionParameters : public Allocable {
+  // Restores the index in the unsorted mv stack from the least significant 3
+  // bits of the sorted |weight_index_stack|.
+ const MotionVector& reference_mv(int stack_index) const {
+ return ref_mv_stack[7 - (weight_index_stack[stack_index] & 7)];
+ }
+ const MotionVector& reference_mv(int stack_index, int mv_index) const {
+ return compound_ref_mv_stack[7 - (weight_index_stack[stack_index] & 7)]
+ .mv[mv_index];
+ }
+
+ void IncreaseWeight(ptrdiff_t index, int weight) {
+ weight_index_stack[index] += weight << 3;
+ }
+
+ void SetWeightIndexStackEntry(int index, int weight) {
+ weight_index_stack[index] = (weight << 3) + 7 - index;
+ }
+
+ bool use_filter_intra;
+ FilterIntraPredictor filter_intra_mode;
+ int angle_delta[kNumPlaneTypes];
+ int8_t cfl_alpha_u;
+ int8_t cfl_alpha_v;
+ int max_luma_width;
+ int max_luma_height;
+ Array2D<uint8_t> color_index_map[kNumPlaneTypes];
+ bool use_intra_block_copy;
+ InterIntraMode inter_intra_mode;
+ bool is_wedge_inter_intra;
+ int wedge_index;
+ int wedge_sign;
+ bool mask_is_inverse;
+ MotionMode motion_mode;
+ CompoundPredictionType compound_prediction_type;
+ union {
+ // |ref_mv_stack| and |compound_ref_mv_stack| are not sorted after
+ // construction. reference_mv() must be called to get the correct element.
+ MotionVector ref_mv_stack[kMaxRefMvStackSize];
+ CompoundMotionVector compound_ref_mv_stack[kMaxRefMvStackSize];
+ };
+  // The least significant 3 bits of |weight_index_stack| store the index
+  // information, and the other bits store the weight. The index information
+  // is actually 7 - index to make the descending order sort stable (preserves
+  // the original order for elements with the same weight). Sorting an int16_t
+  // array is much faster than sorting a struct array with weight and index
+  // stored separately.
+ int16_t weight_index_stack[kMaxRefMvStackSize];
+ // In the spec, the weights of all the nearest mvs are incremented by a bonus
+ // weight which is larger than any natural weight, and later the weights of
+ // the mvs are compared with this bonus weight to determine their contexts. We
+ // replace this procedure by introducing |nearest_mv_count|, which records the
+ // count of the nearest mvs. Since all the nearest mvs are in the beginning of
+ // the mv stack, the index of a mv in the mv stack can be compared with
+ // |nearest_mv_count| to get that mv's context.
+ int nearest_mv_count;
+ int ref_mv_count;
+ int ref_mv_index;
+ MotionVector global_mv[2];
+ int num_warp_samples;
+ int warp_estimate_candidates[kMaxLeastSquaresSamples][4];
+};
+
+// A lot of BlockParameters objects are created, so the smallest type is used
+// for each field. The ranges of some fields are documented to justify why
+// their types are large enough.
+struct BlockParameters : public Allocable {
+ BlockSize size;
+ bool skip;
+ // True means that this block will use some default settings (that
+ // correspond to compound prediction) and so most of the mode info is
+ // skipped. False means that the mode info is not skipped.
+ bool skip_mode;
+ bool is_inter;
+ bool is_explicit_compound_type; // comp_group_idx in the spec.
+ bool is_compound_type_average; // compound_idx in the spec.
+ bool is_global_mv_block;
+ bool use_predicted_segment_id; // only valid with temporal update enabled.
+ int8_t segment_id; // segment_id is in the range [0, 7].
+ PredictionMode y_mode;
+ PredictionMode uv_mode;
+ TransformSize transform_size;
+ TransformSize uv_transform_size;
+ InterpolationFilter interpolation_filter[2];
+ ReferenceFrameType reference_frame[2];
+ // The index of this array is as follows:
+ // 0 - Y plane vertical filtering.
+ // 1 - Y plane horizontal filtering.
+ // 2 - U plane (both directions).
+ // 3 - V plane (both directions).
+ uint8_t deblock_filter_level[kFrameLfCount];
+ CompoundMotionVector mv;
+ PaletteModeInfo palette_mode_info;
+ // When |Tile::split_parse_and_decode_| is true, each block gets its own
+ // instance of |prediction_parameters|. When it is false, all the blocks point
+ // to |Tile::prediction_parameters_|. This field is valid only as long as the
+ // block is *being* decoded. The lifetime and usage of this field can be
+ // better understood by following its flow in tile.cc.
+ std::unique_ptr<PredictionParameters> prediction_parameters;
+};
+
+// A five dimensional array used to store the wedge masks. The dimensions are:
+// - block_size_index (returned by GetWedgeBlockSizeIndex() in prediction.cc).
+// - flip_sign (0 or 1).
+// - wedge_index (0 to 15).
+// - the remaining two dimensions form a 2d array of block_width by
+//   block_height for each combination of the first three indices.
+using WedgeMaskArray =
+ std::array<std::array<std::array<Array2D<uint8_t>, 16>, 2>, 9>;
+
+enum GlobalMotionTransformationType : uint8_t {
+ kGlobalMotionTransformationTypeIdentity,
+ kGlobalMotionTransformationTypeTranslation,
+ kGlobalMotionTransformationTypeRotZoom,
+ kGlobalMotionTransformationTypeAffine,
+ kNumGlobalMotionTransformationTypes
+};
+
+// Global motion and warped motion parameters. See the paper for more info:
+// S. Parker, Y. Chen, D. Barker, P. de Rivaz, D. Mukherjee, "Global and locally
+// adaptive warped motion compensation in video compression", Proc. IEEE
+// International Conference on Image Processing (ICIP), pp. 275-279, Sep. 2017.
+struct GlobalMotion {
+ GlobalMotionTransformationType type;
+ int32_t params[6];
+
+ // Represent two shearing operations. Computed from |params| by SetupShear().
+ //
+ // The least significant six (= kWarpParamRoundingBits) bits are all zeros.
+ // (This means alpha, beta, gamma, and delta could be represented by a 10-bit
+ // signed integer.) The minimum value is INT16_MIN (= -32768) and the maximum
+ // value is 32704 = 0x7fc0, the largest int16_t value whose least significant
+ // six bits are all zeros.
+ //
+ // Valid warp parameters (as validated by SetupShear()) have smaller ranges.
+ // Their absolute values are less than 2^14 (= 16384). (This follows from
+ // the warpValid check at the end of Section 7.11.3.6.)
+ //
+ // NOTE: Section 7.11.3.6 of the spec allows a maximum value of 32768, which
+ // is outside the range of int16_t. When cast to int16_t, 32768 becomes
+ // -32768. This potential int16_t overflow does not matter because either
+  // 32768 or -32768 causes SetupShear() to return false.
+ int16_t alpha;
+ int16_t beta;
+ int16_t gamma;
+ int16_t delta;
+};
+
+// Loop filter parameters:
+//
+// If level[0] and level[1] are both equal to 0, the loop filter process is
+// not invoked.
+//
+// |sharpness| and |delta_enabled| are only used by the loop filter process.
+//
+// The |ref_deltas| and |mode_deltas| arrays are used not only by the loop
+// filter process but also by the reference frame update and loading
+// processes. The loop filter process uses |ref_deltas| and |mode_deltas| only
+// when |delta_enabled| is true.
+struct LoopFilter {
+ // Contains loop filter strength values in the range of [0, 63].
+ std::array<int8_t, kFrameLfCount> level;
+ // Indicates the sharpness level in the range of [0, 7].
+ int8_t sharpness;
+ // Whether the filter level depends on the mode and reference frame used to
+ // predict a block.
+ bool delta_enabled;
+ // Whether additional syntax elements were read that specify which mode and
+ // reference frame deltas are to be updated. loop_filter_delta_update field in
+ // Section 5.9.11 of the spec.
+ bool delta_update;
+ // Contains the adjustment needed for the filter level based on the chosen
+ // reference frame, in the range of [-64, 63].
+ std::array<int8_t, kNumReferenceFrameTypes> ref_deltas;
+ // Contains the adjustment needed for the filter level based on the chosen
+ // mode, in the range of [-64, 63].
+ std::array<int8_t, kLoopFilterMaxModeDeltas> mode_deltas;
+};
+
+struct Delta {
+ bool present;
+ uint8_t scale;
+ bool multi;
+};
+
+struct Cdef {
+ uint8_t damping; // damping value from the spec + (bitdepth - 8).
+ uint8_t bits;
+ // All the strength values are the values from the spec and left shifted by
+ // (bitdepth - 8).
+ uint8_t y_primary_strength[kMaxCdefStrengths];
+ uint8_t y_secondary_strength[kMaxCdefStrengths];
+ uint8_t uv_primary_strength[kMaxCdefStrengths];
+ uint8_t uv_secondary_strength[kMaxCdefStrengths];
+};
+
+struct TileInfo {
+ bool uniform_spacing;
+ int sb_rows;
+ int sb_columns;
+ int tile_count;
+ int tile_columns_log2;
+ int tile_columns;
+ int tile_column_start[kMaxTileColumns + 1];
+ // This field is not used by libgav1, but is populated for use by some
+ // hardware decoders. So it must not be removed.
+ int tile_column_width_in_superblocks[kMaxTileColumns + 1];
+ int tile_rows_log2;
+ int tile_rows;
+ int tile_row_start[kMaxTileRows + 1];
+ // This field is not used by libgav1, but is populated for use by some
+ // hardware decoders. So it must not be removed.
+ int tile_row_height_in_superblocks[kMaxTileRows + 1];
+ int16_t context_update_id;
+ uint8_t tile_size_bytes;
+};
+
+struct LoopRestoration {
+ LoopRestorationType type[kMaxPlanes];
+ int unit_size_log2[kMaxPlanes];
+};
+
+// Stores the quantization parameters of Section 5.9.12.
+struct QuantizerParameters {
+ // base_index is in the range [0, 255].
+ uint8_t base_index;
+ int8_t delta_dc[kMaxPlanes];
+ // delta_ac[kPlaneY] is always 0.
+ int8_t delta_ac[kMaxPlanes];
+ bool use_matrix;
+ // The |matrix_level| array is used only when |use_matrix| is true.
+ // matrix_level[plane] specifies the level in the quantizer matrix that
+ // should be used for decoding |plane|. The quantizer matrix has 15 levels,
+ // from 0 to 14. The range of matrix_level[plane] is [0, 15]. If
+ // matrix_level[plane] is 15, the quantizer matrix is not used.
+ int8_t matrix_level[kMaxPlanes];
+};
+
+// The corresponding segment feature constants in the AV1 spec are named
+// SEG_LVL_xxx.
+enum SegmentFeature : uint8_t {
+ kSegmentFeatureQuantizer,
+ kSegmentFeatureLoopFilterYVertical,
+ kSegmentFeatureLoopFilterYHorizontal,
+ kSegmentFeatureLoopFilterU,
+ kSegmentFeatureLoopFilterV,
+ kSegmentFeatureReferenceFrame,
+ kSegmentFeatureSkip,
+ kSegmentFeatureGlobalMv,
+ kSegmentFeatureMax
+};
+
+struct Segmentation {
+ // 5.11.14.
+ // Returns true if the feature is enabled in the segment.
+ bool FeatureActive(int segment_id, SegmentFeature feature) const {
+ return enabled && segment_id < kMaxSegments &&
+ feature_enabled[segment_id][feature];
+ }
+
+ // Returns true if the feature is signed.
+ static bool FeatureSigned(SegmentFeature feature) {
+ // Only the first five segment features are signed, so this comparison
+ // suffices.
+ return feature <= kSegmentFeatureLoopFilterV;
+ }
+
+ bool enabled;
+ bool update_map;
+ bool update_data;
+ bool temporal_update;
+ // True if the segment id will be read before the skip syntax element. False
+ // if the skip syntax element will be read first.
+ bool segment_id_pre_skip;
+ // The highest numbered segment id that has some enabled feature. Used as
+ // the upper bound for decoding segment ids.
+ int8_t last_active_segment_id;
+
+ bool feature_enabled[kMaxSegments][kSegmentFeatureMax];
+ int16_t feature_data[kMaxSegments][kSegmentFeatureMax];
+ bool lossless[kMaxSegments];
+ // Cached values of get_qindex(1, segmentId), to be consumed by
+ // Tile::ReadTransformType(). The values are in the range [0, 255].
+ uint8_t qindex[kMaxSegments];
+};
+
+// Section 6.8.20.
+// Note: In the spec, the film grain section uses YCbCr to denote variable
+// names, such as num_cb_points, num_cr_points. To keep it consistent with
+// other parts of the code, we use YUV, i.e., num_u_points, num_v_points, etc.
+struct FilmGrainParams {
+ bool apply_grain;
+ bool update_grain;
+ bool chroma_scaling_from_luma;
+ bool overlap_flag;
+ bool clip_to_restricted_range;
+
+ uint8_t num_y_points; // [0, 14].
+ uint8_t num_u_points; // [0, 10].
+ uint8_t num_v_points; // [0, 10].
+  // Must be in [0, 255] (10/12 bit: /= 4 or 16). Must be in increasing order.
+ uint8_t point_y_value[14];
+ uint8_t point_y_scaling[14];
+ uint8_t point_u_value[10];
+ uint8_t point_u_scaling[10];
+ uint8_t point_v_value[10];
+ uint8_t point_v_scaling[10];
+
+ uint8_t chroma_scaling; // [8, 11].
+ uint8_t auto_regression_coeff_lag; // [0, 3].
+ int8_t auto_regression_coeff_y[24]; // [-128, 127]
+ int8_t auto_regression_coeff_u[25]; // [-128, 127]
+ int8_t auto_regression_coeff_v[25]; // [-128, 127]
+ // Shift value: auto regression coeffs range
+ // 6: [-2, 2)
+ // 7: [-1, 1)
+ // 8: [-0.5, 0.5)
+ // 9: [-0.25, 0.25)
+ uint8_t auto_regression_shift;
+
+ uint16_t grain_seed;
+ int reference_index;
+ int grain_scale_shift;
+ // These multipliers are encoded as nonnegative values by adding 128 first.
+ // The 128 is subtracted during parsing.
+ int8_t u_multiplier; // [-128, 127]
+ int8_t u_luma_multiplier; // [-128, 127]
+ // These offsets are encoded as nonnegative values by adding 256 first. The
+ // 256 is subtracted during parsing.
+ int16_t u_offset; // [-256, 255]
+ int8_t v_multiplier; // [-128, 127]
+ int8_t v_luma_multiplier; // [-128, 127]
+ int16_t v_offset; // [-256, 255]
+};
+
+struct ObuFrameHeader {
+ uint16_t display_frame_id;
+ uint16_t current_frame_id;
+ int64_t frame_offset;
+ uint16_t expected_frame_id[kNumInterReferenceFrameTypes];
+ int32_t width;
+ int32_t height;
+ int32_t columns4x4;
+ int32_t rows4x4;
+ // The render size (render_width and render_height) is a hint to the
+ // application about the desired display size. It has no effect on the
+ // decoding process.
+ int32_t render_width;
+ int32_t render_height;
+ int32_t upscaled_width;
+ LoopRestoration loop_restoration;
+ uint32_t buffer_removal_time[kMaxOperatingPoints];
+ uint32_t frame_presentation_time;
+ // Note: global_motion[0] (for kReferenceFrameIntra) is not used.
+ std::array<GlobalMotion, kNumReferenceFrameTypes> global_motion;
+ TileInfo tile_info;
+ QuantizerParameters quantizer;
+ Segmentation segmentation;
+ bool show_existing_frame;
+ // frame_to_show is in the range [0, 7]. Only used if show_existing_frame is
+ // true.
+ int8_t frame_to_show;
+ FrameType frame_type;
+ bool show_frame;
+ bool showable_frame;
+ bool error_resilient_mode;
+ bool enable_cdf_update;
+ bool frame_size_override_flag;
+ // The order_hint syntax element in the uncompressed header. If
+ // show_existing_frame is false, the OrderHint variable in the spec is equal
+ // to this field, and so this field can be used in place of OrderHint when
+ // show_existing_frame is known to be false, such as during tile decoding.
+ uint8_t order_hint;
+ int8_t primary_reference_frame;
+ bool render_and_frame_size_different;
+ bool use_superres;
+ uint8_t superres_scale_denominator;
+ bool allow_screen_content_tools;
+ bool allow_intrabc;
+ bool frame_refs_short_signaling;
+ // A bitmask that specifies which reference frame slots will be updated with
+ // the current frame after it is decoded.
+ uint8_t refresh_frame_flags;
+ static_assert(sizeof(ObuFrameHeader::refresh_frame_flags) * 8 ==
+ kNumReferenceFrameTypes,
+ "");
+ bool found_reference;
+ int8_t force_integer_mv;
+ bool allow_high_precision_mv;
+ InterpolationFilter interpolation_filter;
+ bool is_motion_mode_switchable;
+ bool use_ref_frame_mvs;
+ bool enable_frame_end_update_cdf;
+ // True if all segments are losslessly encoded at the coded resolution.
+ bool coded_lossless;
+ // True if all segments are losslessly encoded at the upscaled resolution.
+ bool upscaled_lossless;
+ TxMode tx_mode;
+ // True means that the mode info for inter blocks contains the syntax
+ // element comp_mode that indicates whether to use single or compound
+ // prediction. False means that all inter blocks will use single prediction.
+ bool reference_mode_select;
+ // The frames to use for compound prediction when skip_mode is true.
+ ReferenceFrameType skip_mode_frame[2];
+ bool skip_mode_present;
+ bool reduced_tx_set;
+ bool allow_warped_motion;
+ Delta delta_q;
+ Delta delta_lf;
+ // A valid value of reference_frame_index[i] is in the range [0, 7]. -1
+ // indicates an invalid value.
+ int8_t reference_frame_index[kNumInterReferenceFrameTypes];
+ // The ref_order_hint[ i ] syntax element in the uncompressed header.
+ // Specifies the expected output order hint for each reference frame.
+ uint8_t reference_order_hint[kNumReferenceFrameTypes];
+ LoopFilter loop_filter;
+ Cdef cdef;
+ FilmGrainParams film_grain_params;
+};
+
+} // namespace libgav1
+#endif // LIBGAV1_SRC_UTILS_TYPES_H_
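The |weight_index_stack| packing documented in PredictionParameters above can be checked with a few lines of standalone code. This is a sketch with plain ints outside the real struct; the stack size of 8 is assumed only because the index must fit in the least significant 3 bits.

#include <cassert>
#include <cstdint>

// entry = (weight << 3) + 7 - index, as in SetWeightIndexStackEntry() above.
// The original index is recovered as 7 - (entry & 7), as in reference_mv(),
// and the weight as entry >> 3.
int main() {
  int16_t entries[8];
  for (int index = 0; index < 8; ++index) {
    const int weight = 2 * index;  // Any nonnegative weight.
    entries[index] = static_cast<int16_t>((weight << 3) + 7 - index);
  }
  for (int index = 0; index < 8; ++index) {
    const int recovered_index = 7 - (entries[index] & 7);
    const int recovered_weight = entries[index] >> 3;
    assert(recovered_index == index);
    assert(recovered_weight == 2 * index);
    static_cast<void>(recovered_index);
    static_cast<void>(recovered_weight);
  }
  return 0;
}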
diff --git a/src/utils/unbounded_queue.h b/src/utils/unbounded_queue.h
new file mode 100644
index 0000000..fa0d303
--- /dev/null
+++ b/src/utils/unbounded_queue.h
@@ -0,0 +1,245 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_UNBOUNDED_QUEUE_H_
+#define LIBGAV1_SRC_UTILS_UNBOUNDED_QUEUE_H_
+
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+// A FIFO queue of an unbounded capacity.
+//
+// This implementation uses the general approach used in std::deque
+// implementations. See, for example,
+// https://stackoverflow.com/questions/6292332/what-really-is-a-deque-in-stl
+//
+// It is much simpler because it just needs to support the queue interface.
+// The blocks are chained into a circular list, not managed by a "map". It
+// does not shrink the internal buffer.
+//
+// An alternative implementation approach is a resizable circular array. See,
+// for example, ResizingArrayQueue.java in https://algs4.cs.princeton.edu/code/
+// and base::circular_deque in Chromium's base/containers library.
+template <typename T>
+class UnboundedQueue {
+ public:
+ UnboundedQueue() = default;
+
+ // Move only.
+ UnboundedQueue(UnboundedQueue&& other)
+ : first_block_(other.first_block_),
+ front_(other.front_),
+ last_block_(other.last_block_),
+ back_(other.back_) {
+ other.first_block_ = nullptr;
+ other.front_ = 0;
+ other.last_block_ = nullptr;
+ other.back_ = 0;
+ }
+ UnboundedQueue& operator=(UnboundedQueue&& other) {
+ if (this != &other) {
+ Destroy();
+ first_block_ = other.first_block_;
+ front_ = other.front_;
+ last_block_ = other.last_block_;
+ back_ = other.back_;
+ other.first_block_ = nullptr;
+ other.front_ = 0;
+ other.last_block_ = nullptr;
+ other.back_ = 0;
+ }
+ return *this;
+ }
+
+ ~UnboundedQueue() { Destroy(); }
+
+ // Allocates two Blocks upfront because most access patterns require at
+ // least two Blocks. Returns false if the allocation of the Blocks failed.
+ LIBGAV1_MUST_USE_RESULT bool Init() {
+ std::unique_ptr<Block> new_block0(new (std::nothrow) Block);
+ std::unique_ptr<Block> new_block1(new (std::nothrow) Block);
+ if (new_block0 == nullptr || new_block1 == nullptr) return false;
+ first_block_ = last_block_ = new_block0.release();
+ new_block1->next = first_block_;
+ last_block_->next = new_block1.release();
+ return true;
+ }
+
+ // Checks if the queue has room for a new element. If the queue is full,
+ // tries to grow it. Returns false if the queue is full and the attempt to
+ // grow it failed.
+ //
+ // NOTE: GrowIfNeeded() must be called before each call to Push(). This
+ // inconvenient design is necessary to guarantee a successful Push() call.
+ //
+ // Push(T&& value) is often called with the argument std::move(value). The
+ // moved-from object |value| won't be usable afterwards, so it would be
+ // problematic if Push(T&& value) failed and we lost access to the original
+ // |value| object.
+ LIBGAV1_MUST_USE_RESULT bool GrowIfNeeded() {
+ assert(last_block_ != nullptr);
+ if (back_ == kBlockCapacity) {
+ if (last_block_->next == first_block_) {
+ // All Blocks are in use.
+ std::unique_ptr<Block> new_block(new (std::nothrow) Block);
+ if (new_block == nullptr) return false;
+ new_block->next = first_block_;
+ last_block_->next = new_block.release();
+ }
+ last_block_ = last_block_->next;
+ back_ = 0;
+ }
+ return true;
+ }
+
+ // Pushes the element |value| to the end of the queue. It is an error to call
+ // Push() when the queue is full.
+ void Push(const T& value) {
+ assert(last_block_ != nullptr);
+ assert(back_ < kBlockCapacity);
+ T* elements = reinterpret_cast<T*>(last_block_->buffer);
+ new (&elements[back_++]) T(value);
+ }
+
+ void Push(T&& value) {
+ assert(last_block_ != nullptr);
+ assert(back_ < kBlockCapacity);
+ T* elements = reinterpret_cast<T*>(last_block_->buffer);
+ new (&elements[back_++]) T(std::move(value));
+ }
+
+ // Returns the element at the front of the queue. It is an error to call
+ // Front() when the queue is empty.
+ T& Front() {
+ assert(!Empty());
+ T* elements = reinterpret_cast<T*>(first_block_->buffer);
+ return elements[front_];
+ }
+
+ const T& Front() const {
+ assert(!Empty());
+ T* elements = reinterpret_cast<T*>(first_block_->buffer);
+ return elements[front_];
+ }
+
+ // Removes the element at the front of the queue from the queue. It is an
+ // error to call Pop() when the queue is empty.
+ void Pop() {
+ assert(!Empty());
+ T* elements = reinterpret_cast<T*>(first_block_->buffer);
+ elements[front_++].~T();
+ if (front_ == kBlockCapacity) {
+ // The first block has become empty.
+ front_ = 0;
+ if (first_block_ == last_block_) {
+ // Only one Block is in use. Simply reset back_.
+ back_ = 0;
+ } else {
+ first_block_ = first_block_->next;
+ }
+ }
+ }
+
+ // Returns true if the queue is empty.
+ bool Empty() const { return first_block_ == last_block_ && front_ == back_; }
+
+ private:
+ // kBlockCapacity is the maximum number of elements each Block can hold.
+ // sizeof(void*) is subtracted from 2048 to account for the |next| pointer in
+ // the Block struct.
+ //
+ // In Linux x86_64, sizeof(std::function<void()>) is 32, so each Block can
+ // hold 63 std::function<void()> objects.
+ //
+ // NOTE: The corresponding value in <deque> in libc++ revision
+ // 245b5ba3448b9d3f6de5962066557e253a6bc9a4 is:
+ // template <class _ValueType, class _DiffType>
+ // struct __deque_block_size {
+ // static const _DiffType value =
+ // sizeof(_ValueType) < 256 ? 4096 / sizeof(_ValueType) : 16;
+ // };
+ //
+ // Note that 4096 / 256 = 16, so apparently this expression is intended to
+ // ensure the block size is at least 4096 bytes and each block can hold at
+ // least 16 elements.
+ static constexpr size_t kBlockCapacity =
+ (sizeof(T) < 128) ? (2048 - sizeof(void*)) / sizeof(T) : 16;
+
+ struct Block : public Allocable {
+ alignas(T) char buffer[kBlockCapacity * sizeof(T)];
+ Block* next;
+ };
+
+ void Destroy() {
+ if (first_block_ == nullptr) return; // An uninitialized queue.
+
+    // First free the unused blocks, which are located after last_block_ and
+    // before first_block_.
+ Block* block = last_block_->next;
+ // Cut the circular list open after last_block_.
+ last_block_->next = nullptr;
+ while (block != first_block_) {
+ Block* next = block->next;
+ delete block;
+ block = next;
+ }
+
+    // Then free the used blocks, destructing the elements they contain first.
+ while (block != nullptr) {
+ const size_t begin = (block == first_block_) ? front_ : 0;
+ const size_t end = (block == last_block_) ? back_ : kBlockCapacity;
+ T* elements = reinterpret_cast<T*>(block->buffer);
+ for (size_t i = begin; i < end; ++i) {
+ elements[i].~T();
+ }
+ Block* next = block->next;
+ delete block;
+ block = next;
+ }
+ }
+
+ // Blocks are chained in a circular singly-linked list. If the list of Blocks
+ // is empty, both first_block_ and last_block_ are null pointers. If the list
+ // is nonempty, first_block_ points to the first used Block and last_block_
+ // points to the last used Block.
+ //
+  // Invariant: If Init() is called and succeeds, the list of Blocks is always
+  // nonempty. This allows all methods (except the destructor) to avoid null
+  // pointer checks for first_block_ and last_block_.
+ Block* first_block_ = nullptr;
+ // The index of the element in first_block_ to be removed by Pop().
+ size_t front_ = 0;
+ Block* last_block_ = nullptr;
+ // The index in last_block_ where the new element is inserted by Push().
+ size_t back_ = 0;
+};
+
+#if !LIBGAV1_CXX17
+template <typename T>
+constexpr size_t UnboundedQueue<T>::kBlockCapacity;
+#endif
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_UNBOUNDED_QUEUE_H_
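A minimal usage sketch of the contract documented above: Init() once, GrowIfNeeded() before every Push(), and Front()/Pop() to drain. (With sizeof(std::function<void()>) == 32 on Linux x86_64, kBlockCapacity works out to (2048 - 8) / 32 = 63, matching the comment above.) The function name is illustrative.

#include <functional>
#include <utility>

#include "src/utils/unbounded_queue.h"

// Sketch: queues 100 closures and runs them in FIFO order. Returns false on
// allocation failure, mirroring the queue's own error reporting.
bool RunQueuedClosures() {
  libgav1::UnboundedQueue<std::function<void()>> queue;
  if (!queue.Init()) return false;
  for (int i = 0; i < 100; ++i) {
    if (!queue.GrowIfNeeded()) return false;  // Must precede every Push().
    queue.Push([i]() { static_cast<void>(i); /* job i */ });
  }
  while (!queue.Empty()) {
    std::function<void()> job = std::move(queue.Front());
    queue.Pop();
    job();
  }
  return true;
}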
diff --git a/src/utils/vector.h b/src/utils/vector.h
new file mode 100644
index 0000000..e211240
--- /dev/null
+++ b/src/utils/vector.h
@@ -0,0 +1,352 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// libgav1::Vector implementation
+
+#ifndef LIBGAV1_SRC_UTILS_VECTOR_H_
+#define LIBGAV1_SRC_UTILS_VECTOR_H_
+
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <iterator>
+#include <type_traits>
+#include <utility>
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace internal {
+
+static constexpr size_t kMinVectorAllocation = 16;
+
+// Returns the smallest power of two greater than or equal to 'value'.
+inline size_t NextPow2(size_t value) {
+ if (value == 0) return 0;
+ --value;
+ for (size_t i = 1; i < sizeof(size_t) * 8; i *= 2) value |= value >> i;
+ return value + 1;
+}
+
+// Returns the smallest capacity greater than or equal to 'value'.
+inline size_t NextCapacity(size_t value) {
+ if (value == 0) return 0;
+ if (value <= kMinVectorAllocation) return kMinVectorAllocation;
+ return NextPow2(value);
+}
+
+//------------------------------------------------------------------------------
+// Data structure equivalent to std::vector, but it returns false and reverts
+// to its last valid state on memory allocation failure.
+// std::vector with a custom allocator does not fill this need without
+// exceptions.
+
+template <typename T>
+class VectorBase {
+ public:
+ using iterator = T*;
+ using const_iterator = const T*;
+
+ VectorBase() noexcept = default;
+ // Move only.
+ VectorBase(const VectorBase&) = delete;
+ VectorBase& operator=(const VectorBase&) = delete;
+ VectorBase(VectorBase&& other) noexcept
+ : items_(other.items_),
+ capacity_(other.capacity_),
+ num_items_(other.num_items_) {
+ other.items_ = nullptr;
+ other.capacity_ = 0;
+ other.num_items_ = 0;
+ }
+ VectorBase& operator=(VectorBase&& other) noexcept {
+ if (this != &other) {
+ clear();
+ free(items_);
+ items_ = other.items_;
+ capacity_ = other.capacity_;
+ num_items_ = other.num_items_;
+ other.items_ = nullptr;
+ other.capacity_ = 0;
+ other.num_items_ = 0;
+ }
+ return *this;
+ }
+ ~VectorBase() {
+ clear();
+ free(items_);
+ }
+
+ // Reallocates just enough memory if needed so that 'new_cap' items can fit.
+ LIBGAV1_MUST_USE_RESULT bool reserve(size_t new_cap) {
+ if (capacity_ < new_cap) {
+ T* const new_items = static_cast<T*>(malloc(new_cap * sizeof(T)));
+ if (new_items == nullptr) return false;
+ if (num_items_ > 0) {
+ if (std::is_trivial<T>::value) {
+ // Cast |new_items| and |items_| to void* to avoid the GCC
+ // -Wclass-memaccess warning and additionally the
+ // bugprone-undefined-memory-manipulation clang-tidy warning. The
+ // memcpy is safe because T is a trivial type.
+ memcpy(static_cast<void*>(new_items),
+ static_cast<const void*>(items_), num_items_ * sizeof(T));
+ } else {
+ for (size_t i = 0; i < num_items_; ++i) {
+ new (&new_items[i]) T(std::move(items_[i]));
+ items_[i].~T();
+ }
+ }
+ }
+ free(items_);
+ items_ = new_items;
+ capacity_ = new_cap;
+ }
+ return true;
+ }
+
+ // Reallocates less memory so that only the existing items can fit.
+ bool shrink_to_fit() {
+ if (capacity_ == num_items_) return true;
+ if (num_items_ == 0) {
+ free(items_);
+ items_ = nullptr;
+ capacity_ = 0;
+ return true;
+ }
+ const size_t previous_capacity = capacity_;
+ capacity_ = 0; // Force reserve() to allocate and copy.
+ if (reserve(num_items_)) return true;
+ capacity_ = previous_capacity;
+ return false;
+ }
+
+ // Constructs a new item by copy constructor. May reallocate if
+ // 'resize_if_needed'.
+ LIBGAV1_MUST_USE_RESULT bool push_back(const T& value,
+ bool resize_if_needed = true) {
+ if (num_items_ >= capacity_ &&
+ (!resize_if_needed ||
+ !reserve(internal::NextCapacity(num_items_ + 1)))) {
+ return false;
+ }
+ new (&items_[num_items_]) T(value);
+ ++num_items_;
+ return true;
+ }
+
+ // Constructs a new item by copy constructor. reserve() must have been called
+ // with a sufficient capacity.
+ //
+ // WARNING: No error checking is performed.
+ void push_back_unchecked(const T& value) {
+ assert(num_items_ < capacity_);
+ new (&items_[num_items_]) T(value);
+ ++num_items_;
+ }
+
+ // Constructs a new item by move constructor. May reallocate if
+ // 'resize_if_needed'.
+ LIBGAV1_MUST_USE_RESULT bool push_back(T&& value,
+ bool resize_if_needed = true) {
+ if (num_items_ >= capacity_ &&
+ (!resize_if_needed ||
+ !reserve(internal::NextCapacity(num_items_ + 1)))) {
+ return false;
+ }
+ new (&items_[num_items_]) T(std::move(value));
+ ++num_items_;
+ return true;
+ }
+
+ // Constructs a new item by move constructor. reserve() must have been called
+ // with a sufficient capacity.
+ //
+ // WARNING: No error checking is performed.
+ void push_back_unchecked(T&& value) {
+ assert(num_items_ < capacity_);
+ new (&items_[num_items_]) T(std::move(value));
+ ++num_items_;
+ }
+
+ // Constructs a new item in place by forwarding the arguments args... to the
+ // constructor. May reallocate.
+ template <typename... Args>
+ LIBGAV1_MUST_USE_RESULT bool emplace_back(Args&&... args) {
+ if (num_items_ >= capacity_ &&
+ !reserve(internal::NextCapacity(num_items_ + 1))) {
+ return false;
+ }
+ new (&items_[num_items_]) T(std::forward<Args>(args)...);
+ ++num_items_;
+ return true;
+ }
+
+ // Destructs the last item.
+ void pop_back() {
+ --num_items_;
+ items_[num_items_].~T();
+ }
+
+ // Destructs the item at 'pos'.
+ void erase(iterator pos) { erase(pos, pos + 1); }
+
+ // Destructs the items in [first,last).
+ void erase(iterator first, iterator last) {
+ for (iterator it = first; it != last; ++it) it->~T();
+ if (last != end()) {
+ if (std::is_trivial<T>::value) {
+ // Cast |first| and |last| to void* to avoid the GCC
+ // -Wclass-memaccess warning and additionally the
+ // bugprone-undefined-memory-manipulation clang-tidy warning. The
+ // memmove is safe because T is a trivial type.
+ memmove(static_cast<void*>(first), static_cast<const void*>(last),
+ (end() - last) * sizeof(T));
+ } else {
+ for (iterator it_src = last, it_dst = first; it_src != end();
+ ++it_src, ++it_dst) {
+ new (it_dst) T(std::move(*it_src));
+ it_src->~T();
+ }
+ }
+ }
+ num_items_ -= std::distance(first, last);
+ }
+
+ // Destructs all the items.
+ void clear() { erase(begin(), end()); }
+
+ // Destroys (including deallocating) all the items.
+ void reset() {
+ clear();
+ if (!shrink_to_fit()) assert(false);
+ }
+
+ // Accessors
+ bool empty() const { return (num_items_ == 0); }
+ size_t size() const { return num_items_; }
+ size_t capacity() const { return capacity_; }
+
+ T* data() { return items_; }
+ T& front() { return items_[0]; }
+ T& back() { return items_[num_items_ - 1]; }
+ T& operator[](size_t i) { return items_[i]; }
+ T& at(size_t i) { return items_[i]; }
+ const T* data() const { return items_; }
+ const T& front() const { return items_[0]; }
+ const T& back() const { return items_[num_items_ - 1]; }
+ const T& operator[](size_t i) const { return items_[i]; }
+ const T& at(size_t i) const { return items_[i]; }
+
+ iterator begin() { return &items_[0]; }
+ const_iterator begin() const { return &items_[0]; }
+ iterator end() { return &items_[num_items_]; }
+ const_iterator end() const { return &items_[num_items_]; }
+
+ void swap(VectorBase& b) {
+ // Although not necessary here, adding "using std::swap;" and then calling
+ // swap() without namespace qualification is recommended. See Effective
+ // C++, Item 25.
+ using std::swap;
+ swap(items_, b.items_);
+ swap(capacity_, b.capacity_);
+ swap(num_items_, b.num_items_);
+ }
+
+ protected:
+ T* items_ = nullptr;
+ size_t capacity_ = 0;
+ size_t num_items_ = 0;
+};
+
+} // namespace internal
+
+//------------------------------------------------------------------------------
+
+// Vector class that does *NOT* construct the content on resize().
+// Should be reserved for plain old data.
+template <typename T>
+class VectorNoCtor : public internal::VectorBase<T> {
+ public:
+ // Creates or destructs items so that 'new_num_items' exist.
+  // Allocated memory grows to the next power-of-two number of items as needed.
+ LIBGAV1_MUST_USE_RESULT bool resize(size_t new_num_items) {
+ using super = internal::VectorBase<T>;
+ if (super::num_items_ < new_num_items) {
+ if (super::capacity_ < new_num_items) {
+ if (!super::reserve(internal::NextCapacity(new_num_items))) {
+ return false;
+ }
+ }
+ super::num_items_ = new_num_items;
+ } else {
+ while (super::num_items_ > new_num_items) {
+ --super::num_items_;
+ super::items_[super::num_items_].~T();
+ }
+ }
+ return true;
+ }
+};
+
+// This generic vector class will call the constructors.
+template <typename T>
+class Vector : public internal::VectorBase<T> {
+ public:
+ // Constructs or destructs items so that 'new_num_items' exist.
+  // Allocated memory grows to the next power-of-two number of items as needed.
+ LIBGAV1_MUST_USE_RESULT bool resize(size_t new_num_items) {
+ using super = internal::VectorBase<T>;
+ if (super::num_items_ < new_num_items) {
+ if (super::capacity_ < new_num_items) {
+ if (!super::reserve(internal::NextCapacity(new_num_items))) {
+ return false;
+ }
+ }
+ while (super::num_items_ < new_num_items) {
+ new (&super::items_[super::num_items_]) T();
+ ++super::num_items_;
+ }
+ } else {
+ while (super::num_items_ > new_num_items) {
+ --super::num_items_;
+ super::items_[super::num_items_].~T();
+ }
+ }
+ return true;
+ }
+};
+
+//------------------------------------------------------------------------------
+
+// Define non-member swap() functions in the namespace in which VectorNoCtor
+// and Vector are implemented. See Effective C++, Item 25.
+
+template <typename T>
+void swap(VectorNoCtor<T>& a, VectorNoCtor<T>& b) {
+ a.swap(b);
+}
+
+template <typename T>
+void swap(Vector<T>& a, Vector<T>& b) {
+ a.swap(b);
+}
+
+//------------------------------------------------------------------------------
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_VECTOR_H_
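A minimal usage sketch of the failure-returning interface above: reserve() followed by push_back_unchecked() on the pre-reserved portion, and push_back() where growth may be needed. The function name is illustrative.

#include <utility>

#include "src/utils/vector.h"

// Sketch: fills |out| with the first |count| squares plus one extra element.
// Every allocation is checked instead of throwing.
bool FillSquares(int count, libgav1::Vector<int>* out) {
  libgav1::Vector<int> values;
  if (!values.reserve(count)) return false;
  for (int i = 0; i < count; ++i) {
    values.push_back_unchecked(i * i);  // Capacity was reserved above.
  }
  // Growth path: capacity is extended (to the next power of two) as needed.
  if (!values.push_back(count * count)) return false;
  *out = std::move(values);
  return true;
}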
diff --git a/src/version.cc b/src/version.cc
new file mode 100644
index 0000000..8d1e5a9
--- /dev/null
+++ b/src/version.cc
@@ -0,0 +1,39 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/version.h"
+
+#define LIBGAV1_TOSTRING(x) #x
+#define LIBGAV1_STRINGIFY(x) LIBGAV1_TOSTRING(x)
+#define LIBGAV1_DOT_SEPARATED(M, m, p) M##.##m##.##p
+#define LIBGAV1_DOT_SEPARATED_VERSION(M, m, p) LIBGAV1_DOT_SEPARATED(M, m, p)
+#define LIBGAV1_DOT_VERSION \
+ LIBGAV1_DOT_SEPARATED_VERSION(LIBGAV1_MAJOR_VERSION, LIBGAV1_MINOR_VERSION, \
+ LIBGAV1_PATCH_VERSION)
+
+#define LIBGAV1_VERSION_STRING LIBGAV1_STRINGIFY(LIBGAV1_DOT_VERSION)
+
+extern "C" {
+
+int Libgav1GetVersion() { return LIBGAV1_VERSION; }
+const char* Libgav1GetVersionString() { return LIBGAV1_VERSION_STRING; }
+
+const char* Libgav1GetBuildConfiguration() {
+ // TODO(jzern): cmake can generate the detail or in other cases we could
+ // produce one based on the known defines along with the defaults based on
+ // the toolchain, e.g., LIBGAV1_ENABLE_NEON from cpu.h.
+ return "Not available.";
+}
+
+} // extern "C"
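A worked trace of the macros above, assuming hypothetical version components 0, 16 and 0 (the real values come from src/gav1/version.h), followed by a trivial caller of the exported functions:

// LIBGAV1_DOT_SEPARATED(0, 16, 0)         -> 0.16.0 (token pasting)
// LIBGAV1_DOT_VERSION                     -> 0.16.0
// LIBGAV1_STRINGIFY(LIBGAV1_DOT_VERSION)  -> "0.16.0"
// The two-level TOSTRING/STRINGIFY pair forces the version macros to be
// expanded before the '#' operator turns them into a string literal.
#include <cstdio>

#include "src/gav1/version.h"

int main() {
  std::printf("libgav1 %s (numeric %d)\n", Libgav1GetVersionString(),
              Libgav1GetVersion());
  return 0;
}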
diff --git a/src/warp_prediction.cc b/src/warp_prediction.cc
new file mode 100644
index 0000000..dd06317
--- /dev/null
+++ b/src/warp_prediction.cc
@@ -0,0 +1,244 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/warp_prediction.h"
+
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/tile.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kWarpModelTranslationClamp = 1 << 23;
+constexpr int kWarpModelAffineClamp = 1 << 13;
+constexpr int kLargestMotionVectorDiff = 256;
+
+constexpr uint16_t kDivisorLookup[257] = {
+ 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
+ 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
+ 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
+ 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
+ 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
+ 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
+ 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
+ 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
+ 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
+ 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
+ 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
+ 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
+ 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
+ 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
+ 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
+ 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732,
+ 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489,
+ 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259,
+ 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039,
+ 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830,
+ 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630,
+ 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439,
+ 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257,
+ 8240, 8224, 8208, 8192};
+
+// Number of fractional bits used to index the divisor lookup table.
+constexpr int kDivisorLookupBits = 8;
+// Number of fractional bits of entries in divisor lookup table.
+constexpr int kDivisorLookupPrecisionBits = 14;
+
+// 7.11.3.7.
+template <typename T>
+void GenerateApproximateDivisor(T value, int16_t* division_factor,
+ int16_t* division_shift) {
+ const int n = FloorLog2(std::abs(value));
+ const T e = std::abs(value) - (static_cast<T>(1) << n);
+ const int entry = (n > kDivisorLookupBits)
+ ? RightShiftWithRounding(e, n - kDivisorLookupBits)
+ : static_cast<int>(e << (kDivisorLookupBits - n));
+ *division_shift = n + kDivisorLookupPrecisionBits;
+ *division_factor =
+ (value < 0) ? -kDivisorLookup[entry] : kDivisorLookup[entry];
+}
+
+// 7.11.3.8.
+int LeastSquareProduct(int a, int b) { return ((a * b) >> 2) + a + b; }
+
+// 7.11.3.8.
+int DiagonalClamp(int32_t value) {
+ return Clip3(value,
+ (1 << kWarpedModelPrecisionBits) - kWarpModelAffineClamp + 1,
+ (1 << kWarpedModelPrecisionBits) + kWarpModelAffineClamp - 1);
+}
+
+// 7.11.3.8.
+int NonDiagonalClamp(int32_t value) {
+ return Clip3(value, -kWarpModelAffineClamp + 1, kWarpModelAffineClamp - 1);
+}
+
+int16_t GetShearParameter(int value) {
+ return static_cast<int16_t>(
+ LeftShift(RightShiftWithRoundingSigned(Clip3(value, INT16_MIN, INT16_MAX),
+ kWarpParamRoundingBits),
+ kWarpParamRoundingBits));
+}
+
+} // namespace
+
+bool SetupShear(GlobalMotion* const warp_params) {
+ int16_t division_shift;
+ int16_t division_factor;
+ const auto* const params = warp_params->params;
+ GenerateApproximateDivisor<int32_t>(params[2], &division_factor,
+ &division_shift);
+ const int alpha = params[2] - (1 << kWarpedModelPrecisionBits);
+ const int beta = params[3];
+ const int64_t v = LeftShift(params[4], kWarpedModelPrecisionBits);
+ const int gamma =
+ RightShiftWithRoundingSigned(v * division_factor, division_shift);
+ const int64_t w = static_cast<int64_t>(params[3]) * params[4];
+ const int delta =
+ params[5] -
+ RightShiftWithRoundingSigned(w * division_factor, division_shift) -
+ (1 << kWarpedModelPrecisionBits);
+
+ warp_params->alpha = GetShearParameter(alpha);
+ warp_params->beta = GetShearParameter(beta);
+ warp_params->gamma = GetShearParameter(gamma);
+ warp_params->delta = GetShearParameter(delta);
+ if ((4 * std::abs(warp_params->alpha) + 7 * std::abs(warp_params->beta) >=
+ (1 << kWarpedModelPrecisionBits)) ||
+ (4 * std::abs(warp_params->gamma) + 4 * std::abs(warp_params->delta) >=
+ (1 << kWarpedModelPrecisionBits))) {
+ return false; // NOLINT (easier condition to understand).
+ }
+
+ return true;
+}
+
+bool WarpEstimation(const int num_samples, const int block_width4x4,
+ const int block_height4x4, const int row4x4,
+ const int column4x4, const MotionVector& mv,
+ const int candidates[kMaxLeastSquaresSamples][4],
+ GlobalMotion* const warp_params) {
+  // Each entry of |a| fits into int32_t, but |a| is declared as int64_t so
+  // that the products in the computation below are evaluated in 64 bits
+  // without explicit casts.
+ int64_t a[2][2] = {};
+ int bx[2] = {};
+ int by[2] = {};
+
+  // Note: for simplicity, the spec always uses absolute coordinates in the
+  // warp estimation process: subpixel_mid_x, subpixel_mid_y, and candidates
+  // are relative to the top left of the frame. In contrast, libaom uses a
+  // mixture of coordinate systems; in av1/common/warped_motion.c,
+  // find_affine_int() uses coordinates relative to the top left of the block.
+  // mid_y/mid_x: the row/column coordinate of the center of the block.
+ const int mid_y = MultiplyBy4(row4x4) + MultiplyBy2(block_height4x4) - 1;
+ const int mid_x = MultiplyBy4(column4x4) + MultiplyBy2(block_width4x4) - 1;
+ const int subpixel_mid_y = MultiplyBy8(mid_y);
+ const int subpixel_mid_x = MultiplyBy8(mid_x);
+ const int reference_subpixel_mid_y =
+ subpixel_mid_y + mv.mv[MotionVector::kRow];
+ const int reference_subpixel_mid_x =
+ subpixel_mid_x + mv.mv[MotionVector::kColumn];
+
+ for (int i = 0; i < num_samples; ++i) {
+    // candidates[][0] and candidates[][1] are the row/column coordinates of
+    // the sample point in this block, relative to the top left of the frame.
+    // candidates[][2] and candidates[][3] are the row/column coordinates of
+    // the sample point in the reference block, relative to the top left of
+    // the frame.
+ // sy/sx: the row/column coordinates of the sample point, with center of
+ // the block as origin.
+ const int sy = candidates[i][0] - subpixel_mid_y;
+ const int sx = candidates[i][1] - subpixel_mid_x;
+ // dy/dx: the row/column coordinates of the sample point in the reference
+ // block, with center of the reference block as origin.
+ const int dy = candidates[i][2] - reference_subpixel_mid_y;
+ const int dx = candidates[i][3] - reference_subpixel_mid_x;
+ if (std::abs(sx - dx) < kLargestMotionVectorDiff &&
+ std::abs(sy - dy) < kLargestMotionVectorDiff) {
+ a[0][0] += LeastSquareProduct(sx, sx) + 8;
+ a[0][1] += LeastSquareProduct(sx, sy) + 4;
+ a[1][1] += LeastSquareProduct(sy, sy) + 8;
+ bx[0] += LeastSquareProduct(sx, dx) + 8;
+ bx[1] += LeastSquareProduct(sy, dx) + 4;
+ by[0] += LeastSquareProduct(sx, dy) + 4;
+ by[1] += LeastSquareProduct(sy, dy) + 8;
+ }
+ }
+
+ // a[0][1] == a[1][0], because the matrix is symmetric. We don't have to
+ // compute a[1][0].
+ const int64_t determinant = a[0][0] * a[1][1] - a[0][1] * a[0][1];
+ if (determinant == 0) return false;
+
+ int16_t division_shift;
+ int16_t division_factor;
+ GenerateApproximateDivisor<int64_t>(determinant, &division_factor,
+ &division_shift);
+
+ division_shift -= kWarpedModelPrecisionBits;
+
+ const int64_t params_2 = a[1][1] * bx[0] - a[0][1] * bx[1];
+ const int64_t params_3 = -a[0][1] * bx[0] + a[0][0] * bx[1];
+ const int64_t params_4 = a[1][1] * by[0] - a[0][1] * by[1];
+ const int64_t params_5 = -a[0][1] * by[0] + a[0][0] * by[1];
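+  // params_2/params_3 and params_4/params_5 are the entries of adj(A) * bx and
+  // adj(A) * by, where A is the symmetric matrix |a|. Dividing by the
+  // determinant below (via the approximate divisor) solves the 2x2 normal
+  // equations, scaled to kWarpedModelPrecisionBits precision.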
+ auto* const params = warp_params->params;
+
+ if (division_shift <= 0) {
+ division_factor <<= -division_shift;
+ params[2] = static_cast<int32_t>(params_2) * division_factor;
+ params[3] = static_cast<int32_t>(params_3) * division_factor;
+ params[4] = static_cast<int32_t>(params_4) * division_factor;
+ params[5] = static_cast<int32_t>(params_5) * division_factor;
+ } else {
+ params[2] = RightShiftWithRoundingSigned(params_2 * division_factor,
+ division_shift);
+ params[3] = RightShiftWithRoundingSigned(params_3 * division_factor,
+ division_shift);
+ params[4] = RightShiftWithRoundingSigned(params_4 * division_factor,
+ division_shift);
+ params[5] = RightShiftWithRoundingSigned(params_5 * division_factor,
+ division_shift);
+ }
+
+ params[2] = DiagonalClamp(params[2]);
+ params[3] = NonDiagonalClamp(params[3]);
+ params[4] = NonDiagonalClamp(params[4]);
+ params[5] = DiagonalClamp(params[5]);
+
+ const int vx =
+ mv.mv[MotionVector::kColumn] * (1 << (kWarpedModelPrecisionBits - 3)) -
+ (mid_x * (params[2] - (1 << kWarpedModelPrecisionBits)) +
+ mid_y * params[3]);
+ const int vy =
+ mv.mv[MotionVector::kRow] * (1 << (kWarpedModelPrecisionBits - 3)) -
+ (mid_x * params[4] +
+ mid_y * (params[5] - (1 << kWarpedModelPrecisionBits)));
+ params[0] =
+ Clip3(vx, -kWarpModelTranslationClamp, kWarpModelTranslationClamp - 1);
+ params[1] =
+ Clip3(vy, -kWarpModelTranslationClamp, kWarpModelTranslationClamp - 1);
+
+ params[6] = 0;
+ params[7] = 0;
+ return true;
+}
+
+} // namespace libgav1
diff --git a/src/warp_prediction.h b/src/warp_prediction.h
new file mode 100644
index 0000000..6c86df3
--- /dev/null
+++ b/src/warp_prediction.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_WARP_PREDICTION_H_
+#define LIBGAV1_SRC_WARP_PREDICTION_H_
+
+#include "src/obu_parser.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// Sets the alpha, beta, gamma, delta fields in warp_params using the
+// warp_params->params array as input (only array entries at indexes 2, 3, 4,
+// 5 are used). Returns whether alpha, beta, gamma, delta are valid.
+bool SetupShear(GlobalMotion* warp_params); // 7.11.3.6.
+
+// Computes local warp parameters by performing a least squares fit.
+// Returns whether the computed parameters are valid.
+bool WarpEstimation(int num_samples, int block_width4x4, int block_height4x4,
+ int row4x4, int column4x4, const MotionVector& mv,
+ const int candidates[kMaxLeastSquaresSamples][4],
+ GlobalMotion* warp_params); // 7.11.3.8.
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_WARP_PREDICTION_H_
diff --git a/src/yuv_buffer.cc b/src/yuv_buffer.cc
new file mode 100644
index 0000000..c74e140
--- /dev/null
+++ b/src/yuv_buffer.cc
@@ -0,0 +1,201 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/yuv_buffer.h"
+
+#include <cassert>
+#include <cstddef>
+#include <new>
+
+#include "src/frame_buffer_utils.h"
+#include "src/utils/common.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+
+// Size conventions:
+// * Widths, heights, and border sizes are in pixels.
+// * Strides and plane sizes are in bytes.
+//
+// YuvBuffer objects may be reused through the BufferPool. Realloc() must
+// assume that data members (except buffer_alloc_ and buffer_alloc_size_) may
+// contain stale values from the previous use, and must set all data members
+// from scratch. In particular, Realloc() must not rely on the initial values
+// of data members set by the YuvBuffer constructor.
+bool YuvBuffer::Realloc(int bitdepth, bool is_monochrome, int width, int height,
+ int8_t subsampling_x, int8_t subsampling_y,
+ int left_border, int right_border, int top_border,
+ int bottom_border,
+ GetFrameBufferCallback get_frame_buffer,
+ void* callback_private_data,
+ void** buffer_private_data) {
+  // Only support border sizes that are a multiple of 2. The restriction is
+  // required because the borders may be subsampled in the chroma planes.
+ if (((left_border | right_border | top_border | bottom_border) & 1) != 0) {
+ LIBGAV1_DLOG(ERROR,
+ "Borders must be a multiple of 2: left_border = %d, "
+ "right_border = %d, top_border = %d, bottom_border = %d.",
+ left_border, right_border, top_border, bottom_border);
+ return false;
+ }
+
+ // Every row in the plane buffers needs to be kFrameBufferRowAlignment-byte
+ // aligned. Since the strides are multiples of kFrameBufferRowAlignment bytes,
+ // it suffices to just make the plane buffers kFrameBufferRowAlignment-byte
+ // aligned.
+ const int plane_align = kFrameBufferRowAlignment;
+ const int uv_width =
+ is_monochrome ? 0 : SubsampledValue(width, subsampling_x);
+ const int uv_height =
+ is_monochrome ? 0 : SubsampledValue(height, subsampling_y);
+ const int uv_left_border = is_monochrome ? 0 : left_border >> subsampling_x;
+ const int uv_right_border = is_monochrome ? 0 : right_border >> subsampling_x;
+ const int uv_top_border = is_monochrome ? 0 : top_border >> subsampling_y;
+ const int uv_bottom_border =
+ is_monochrome ? 0 : bottom_border >> subsampling_y;
+
+ if (get_frame_buffer != nullptr) {
+ assert(buffer_private_data != nullptr);
+
+ const Libgav1ImageFormat image_format =
+ ComposeImageFormat(is_monochrome, subsampling_x, subsampling_y);
+ FrameBuffer frame_buffer;
+ if (get_frame_buffer(callback_private_data, bitdepth, image_format, width,
+ height, left_border, right_border, top_border,
+ bottom_border, kFrameBufferRowAlignment,
+ &frame_buffer) != kStatusOk) {
+ return false;
+ }
+
+ if (frame_buffer.plane[0] == nullptr ||
+ (!is_monochrome && frame_buffer.plane[1] == nullptr) ||
+ (!is_monochrome && frame_buffer.plane[2] == nullptr)) {
+ assert(false && "The get_frame_buffer callback malfunctioned.");
+ LIBGAV1_DLOG(ERROR, "The get_frame_buffer callback malfunctioned.");
+ return false;
+ }
+
+ stride_[kPlaneY] = frame_buffer.stride[0];
+ stride_[kPlaneU] = frame_buffer.stride[1];
+ stride_[kPlaneV] = frame_buffer.stride[2];
+ buffer_[kPlaneY] = frame_buffer.plane[0];
+ buffer_[kPlaneU] = frame_buffer.plane[1];
+ buffer_[kPlaneV] = frame_buffer.plane[2];
+ *buffer_private_data = frame_buffer.private_data;
+ } else {
+ assert(callback_private_data == nullptr);
+ assert(buffer_private_data == nullptr);
+
+ // Calculate y_stride (in bytes). It is padded to a multiple of
+ // kFrameBufferRowAlignment bytes.
+ int y_stride = width + left_border + right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) y_stride *= sizeof(uint16_t);
+#endif
+ y_stride = Align(y_stride, kFrameBufferRowAlignment);
+ // Size of the Y plane in bytes.
+ const uint64_t y_plane_size = (height + top_border + bottom_border) *
+ static_cast<uint64_t>(y_stride) +
+ (plane_align - 1);
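+    // For example, with bitdepth 8, width 20, height 6, and all four borders
+    // equal to 2 (the case drawn in yuv_buffer.h), y_stride is
+    // Align(20 + 2 + 2, 16) == 32 and y_plane_size is
+    // (6 + 2 + 2) * 32 + 15 == 335 bytes.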
+
+ // Calculate uv_stride (in bytes). It is padded to a multiple of
+ // kFrameBufferRowAlignment bytes.
+ int uv_stride = uv_width + uv_left_border + uv_right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) uv_stride *= sizeof(uint16_t);
+#endif
+ uv_stride = Align(uv_stride, kFrameBufferRowAlignment);
+ // Size of the U or V plane in bytes.
+ const uint64_t uv_plane_size =
+ is_monochrome ? 0
+ : (uv_height + uv_top_border + uv_bottom_border) *
+ static_cast<uint64_t>(uv_stride) +
+ (plane_align - 1);
+
+    // Carve the (unaligned) y_buffer, u_buffer, and v_buffer pointers out of a
+    // single allocation (buffer_alloc_).
+ uint8_t* y_buffer = nullptr;
+ uint8_t* u_buffer = nullptr;
+ uint8_t* v_buffer = nullptr;
+
+ const uint64_t frame_size = y_plane_size + 2 * uv_plane_size;
+ if (frame_size > buffer_alloc_size_) {
+      // Allocation to hold a larger frame, or the first allocation.
+      // Reject sizes that do not fit in size_t (possible on 32-bit targets).
+      if (frame_size != static_cast<size_t>(frame_size)) return false;
+
+ buffer_alloc_.reset(new (std::nothrow)
+ uint8_t[static_cast<size_t>(frame_size)]);
+ if (buffer_alloc_ == nullptr) {
+ buffer_alloc_size_ = 0;
+ return false;
+ }
+
+ buffer_alloc_size_ = static_cast<size_t>(frame_size);
+ }
+
+ y_buffer = buffer_alloc_.get();
+ if (!is_monochrome) {
+ u_buffer = y_buffer + y_plane_size;
+ v_buffer = u_buffer + uv_plane_size;
+ }
+
+ stride_[kPlaneY] = y_stride;
+ stride_[kPlaneU] = stride_[kPlaneV] = uv_stride;
+
+ int left_border_bytes = left_border;
+ int uv_left_border_bytes = uv_left_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) {
+ left_border_bytes *= sizeof(uint16_t);
+ uv_left_border_bytes *= sizeof(uint16_t);
+ }
+#endif
+ buffer_[kPlaneY] = AlignAddr(
+ y_buffer + (top_border * y_stride) + left_border_bytes, plane_align);
+ buffer_[kPlaneU] =
+ AlignAddr(u_buffer + (uv_top_border * uv_stride) + uv_left_border_bytes,
+ plane_align);
+ buffer_[kPlaneV] =
+ AlignAddr(v_buffer + (uv_top_border * uv_stride) + uv_left_border_bytes,
+ plane_align);
+ }
+
+ y_width_ = width;
+ y_height_ = height;
+ left_border_[kPlaneY] = left_border;
+ right_border_[kPlaneY] = right_border;
+ top_border_[kPlaneY] = top_border;
+ bottom_border_[kPlaneY] = bottom_border;
+
+ uv_width_ = uv_width;
+ uv_height_ = uv_height;
+ left_border_[kPlaneU] = left_border_[kPlaneV] = uv_left_border;
+ right_border_[kPlaneU] = right_border_[kPlaneV] = uv_right_border;
+ top_border_[kPlaneU] = top_border_[kPlaneV] = uv_top_border;
+ bottom_border_[kPlaneU] = bottom_border_[kPlaneV] = uv_bottom_border;
+
+ subsampling_x_ = subsampling_x;
+ subsampling_y_ = subsampling_y;
+
+ bitdepth_ = bitdepth;
+ is_monochrome_ = is_monochrome;
+ assert(!is_monochrome || stride_[kPlaneU] == 0);
+ assert(!is_monochrome || stride_[kPlaneV] == 0);
+ assert(!is_monochrome || buffer_[kPlaneU] == nullptr);
+ assert(!is_monochrome || buffer_[kPlaneV] == nullptr);
+
+ return true;
+}
+
+} // namespace libgav1
diff --git a/src/yuv_buffer.h b/src/yuv_buffer.h
new file mode 100644
index 0000000..b9e8cd3
--- /dev/null
+++ b/src/yuv_buffer.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_YUV_BUFFER_H_
+#define LIBGAV1_SRC_YUV_BUFFER_H_
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "src/gav1/frame_buffer.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+class YuvBuffer {
+ public:
+ // Allocates the buffer. Returns true on success. Returns false on failure.
+ //
+ // * |width| and |height| are the image dimensions in pixels.
+ // * |subsampling_x| and |subsampling_y| (either 0 or 1) specify the
+ // subsampling of the width and height of the chroma planes, respectively.
+ // * |left_border|, |right_border|, |top_border|, and |bottom_border| are
+ // the sizes (in pixels) of the borders on the left, right, top, and
+ // bottom sides, respectively. The four border sizes must all be a
+ // multiple of 2.
+ // * If |get_frame_buffer| is not null, it is invoked to allocate the memory.
+ // If |get_frame_buffer| is null, YuvBuffer allocates the memory directly
+ // and ignores the |callback_private_data| and |buffer_private_data|
+ // parameters, which should be null.
+ //
+ // NOTE: The strides are a multiple of 16. Since the first row in each plane
+ // is 16-byte aligned, subsequent rows are also 16-byte aligned.
+ //
+ // Example: bitdepth=8 width=20 height=6 left/right/top/bottom_border=2. The
+ // diagram below shows how Realloc() allocates the data buffer for the Y
+ // plane.
+ //
+ // 16-byte aligned
+ // |
+ // v
+ // ++++++++++++++++++++++++pppppppp
+ // ++++++++++++++++++++++++pppppppp
+ // ++01234567890123456789++pppppppp
+ // ++11234567890123456789++pppppppp
+ // ++21234567890123456789++pppppppp
+ // ++31234567890123456789++pppppppp
+ // ++41234567890123456789++pppppppp
+ // ++51234567890123456789++pppppppp
+ // ++++++++++++++++++++++++pppppppp
+ // ++++++++++++++++++++++++pppppppp
+ // | |
+ // |<-- stride (multiple of 16) ->|
+ //
+ // The video frame has 6 rows of 20 pixels each. Each row is shown as the
+ // pattern r1234567890123456789, where |r| is 0, 1, 2, 3, 4, 5.
+ //
+ // Realloc() first adds a border of 2 pixels around the video frame. The
+ // border pixels are shown as '+'.
+ //
+ // Each row is then padded to a multiple of the default alignment in bytes,
+ // which is 16. The padding bytes are shown as lowercase 'p'. (Since
+ // |bitdepth| is 8 in this example, each pixel is one byte.) The padded size
+ // in bytes is the stride. In this example, the stride is 32 bytes.
+ //
+ // Finally, Realloc() aligns the first byte of frame data, which is the '0'
+ // pixel/byte in the upper left corner of the frame, to the default (16-byte)
+ // alignment boundary.
+ //
+ // TODO(wtc): Add a check for width and height limits to defend against
+ // invalid bitstreams.
+ bool Realloc(int bitdepth, bool is_monochrome, int width, int height,
+ int8_t subsampling_x, int8_t subsampling_y, int left_border,
+ int right_border, int top_border, int bottom_border,
+ GetFrameBufferCallback get_frame_buffer,
+ void* callback_private_data, void** buffer_private_data);
+
+ int bitdepth() const { return bitdepth_; }
+
+ bool is_monochrome() const { return is_monochrome_; }
+
+ int8_t subsampling_x() const { return subsampling_x_; }
+ int8_t subsampling_y() const { return subsampling_y_; }
+
+ int width(int plane) const {
+ return (plane == kPlaneY) ? y_width_ : uv_width_;
+ }
+ int height(int plane) const {
+ return (plane == kPlaneY) ? y_height_ : uv_height_;
+ }
+
+ // Returns border sizes in pixels.
+ int left_border(int plane) const { return left_border_[plane]; }
+ int right_border(int plane) const { return right_border_[plane]; }
+ int top_border(int plane) const { return top_border_[plane]; }
+ int bottom_border(int plane) const { return bottom_border_[plane]; }
+
+ // Returns the alignment of frame buffer row in bytes.
+ int alignment() const { return kFrameBufferRowAlignment; }
+
+  // Back up the current set of warnings and disable -Warray-bounds for the
+  // following three functions, as the compiler cannot, in all cases, determine
+  // whether |plane| is within [0, kMaxPlanes), e.g., with a variable-based for
+  // loop.
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
+ // Returns the data buffer for |plane|.
+ uint8_t* data(int plane) {
+ assert(plane >= 0);
+ assert(static_cast<size_t>(plane) < std::extent<decltype(buffer_)>::value);
+ return buffer_[plane];
+ }
+ const uint8_t* data(int plane) const {
+ assert(plane >= 0);
+ assert(static_cast<size_t>(plane) < std::extent<decltype(buffer_)>::value);
+ return buffer_[plane];
+ }
+
+ // Returns the stride in bytes for |plane|.
+ int stride(int plane) const {
+ assert(plane >= 0);
+ assert(static_cast<size_t>(plane) < std::extent<decltype(stride_)>::value);
+ return stride_[plane];
+ }
+ // Restore the previous set of compiler warnings.
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+ private:
+ static constexpr int kFrameBufferRowAlignment = 16;
+ int bitdepth_ = 0;
+ bool is_monochrome_ = false;
+
+ // y_width_ and y_height_ are the |width| and |height| arguments passed to the
+ // Realloc() method.
+ //
+ // uv_width_ and uv_height_ are computed from y_width_ and y_height_ as
+ // follows:
+ // uv_width_ = (y_width_ + subsampling_x_) >> subsampling_x_
+ // uv_height_ = (y_height_ + subsampling_y_) >> subsampling_y_
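+  // For example, with 4:2:0 subsampling (subsampling_x_ == subsampling_y_ ==
+  // 1), y_width_ == 21 gives uv_width_ == (21 + 1) >> 1 == 11.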
+ int y_width_ = 0;
+ int uv_width_ = 0;
+ int y_height_ = 0;
+ int uv_height_ = 0;
+
+ int left_border_[kMaxPlanes] = {};
+ int right_border_[kMaxPlanes] = {};
+ int top_border_[kMaxPlanes] = {};
+ int bottom_border_[kMaxPlanes] = {};
+
+ int stride_[kMaxPlanes] = {};
+ uint8_t* buffer_[kMaxPlanes] = {};
+
+ // buffer_alloc_ and buffer_alloc_size_ are only used if the
+ // get_frame_buffer callback is null and we allocate the buffer ourselves.
+ std::unique_ptr<uint8_t[]> buffer_alloc_;
+ size_t buffer_alloc_size_ = 0;
+
+ int8_t subsampling_x_ = 0; // 0 or 1.
+ int8_t subsampling_y_ = 0; // 0 or 1.
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_YUV_BUFFER_H_
diff --git a/tests/fuzzer/decoder_fuzzer.cc b/tests/fuzzer/decoder_fuzzer.cc
new file mode 100644
index 0000000..236fd3c
--- /dev/null
+++ b/tests/fuzzer/decoder_fuzzer.cc
@@ -0,0 +1,87 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "examples/file_reader.h"
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_interface.h"
+#include "src/gav1/decoder.h"
+#include "tests/fuzzer/fuzzer_temp_file.h"
+
+namespace {
+
+#if defined(LIBGAV1_EXHAUSTIVE_FUZZING)
+// Set a large upper bound to give more coverage of a single input; this value
+// should be larger than most of the frame counts in the corpus.
+constexpr int kMaxFrames = 100;
+constexpr size_t kMaxDataSize = 400 * 1024;
+#else
+// Restrict the number of frames to improve fuzzer throughput.
+constexpr int kMaxFrames = 5;
+constexpr size_t kMaxDataSize = 200 * 1024;
+#endif
+
+void Decode(const uint8_t* const data, const size_t size,
+ libgav1::Decoder* const decoder) {
+ decoder->EnqueueFrame(data, size, /*user_private_data=*/0,
+ /*buffer_private_data=*/nullptr);
+ const libgav1::DecoderBuffer* buffer;
+ decoder->DequeueFrame(&buffer);
+}
+
+} // namespace
+
+// Always returns 0. Nonzero return values are reserved by libFuzzer for future
+// use.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+ // Reject large chunks of data to improve fuzzer throughput.
+ if (size > kMaxDataSize) return 0;
+
+ libgav1::Decoder decoder;
+ libgav1::DecoderSettings settings = {};
+ // Use the low byte of the width to seed the number of threads.
+ // We use both nibbles of the lower byte as this results in values != 1 much
+ // more quickly than using the lower nibble alone.
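+  // For example, data[12] == 0x2A yields ((0x02 | 0x2A) & 0xF) + 1 == 11
+  // threads.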
+ settings.threads = (size >= 13) ? ((data[12] >> 4 | data[12]) & 0xF) + 1 : 1;
+ if (decoder.Init(&settings) != libgav1::kStatusOk) return 0;
+
+ // Treat the input as a raw OBU stream.
+ Decode(data, size, &decoder);
+
+ // Use the first frame from an IVF to bypass any read errors from the parser.
+ static constexpr size_t kIvfHeaderSize =
+ libgav1::kIvfFileHeaderSize + libgav1::kIvfFrameHeaderSize;
+ if (size >= kIvfHeaderSize) {
+ Decode(data + kIvfHeaderSize, size - kIvfHeaderSize, &decoder);
+ }
+
+ FuzzerTemporaryFile tempfile(data, size);
+ auto file_reader =
+ libgav1::FileReader::Open(tempfile.filename(), /*error_tolerant=*/true);
+ if (file_reader == nullptr) return 0;
+
+ std::vector<uint8_t> buffer;
+ int decoded_frames = 0;
+ do {
+ if (!file_reader->ReadTemporalUnit(&buffer, nullptr)) break;
+ Decode(buffer.data(), buffer.size(), &decoder);
+ if (++decoded_frames >= kMaxFrames) break;
+ } while (!file_reader->IsEndOfFile());
+
+ return 0;
+}
diff --git a/tests/fuzzer/decoder_fuzzer_frame_parallel.cc b/tests/fuzzer/decoder_fuzzer_frame_parallel.cc
new file mode 100644
index 0000000..d1b1c54
--- /dev/null
+++ b/tests/fuzzer/decoder_fuzzer_frame_parallel.cc
@@ -0,0 +1,139 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstddef>
+#include <cstdint>
+#include <deque>
+#include <memory>
+#include <vector>
+
+#include "examples/file_reader.h"
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_interface.h"
+#include "src/gav1/decoder.h"
+#include "src/gav1/status_code.h"
+#include "tests/fuzzer/fuzzer_temp_file.h"
+
+namespace {
+
+#if defined(LIBGAV1_EXHAUSTIVE_FUZZING)
+// Set a large upper bound on the input size to give more coverage of a single
+// input.
+constexpr size_t kMaxDataSize = 400 * 1024;
+#else
+constexpr size_t kMaxDataSize = 200 * 1024;
+#endif
+
+using InputBuffer = std::vector<uint8_t>;
+
+struct InputBuffers {
+ ~InputBuffers() {
+ for (auto& buffer : free_buffers) {
+ delete buffer;
+ }
+ }
+ std::deque<InputBuffer*> free_buffers;
+};
+
+void ReleaseInputBuffer(void* callback_private_data,
+ void* buffer_private_data) {
+ auto* const test = static_cast<InputBuffers*>(callback_private_data);
+ test->free_buffers.push_back(static_cast<InputBuffer*>(buffer_private_data));
+}
+
+} // namespace
+
+// Always returns 0. Nonzero return values are reserved by libFuzzer for future
+// use.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+ // Reject large chunks of data to improve fuzzer throughput.
+ if (size > kMaxDataSize) return 0;
+
+  // Note that |input_buffers| has to outlive the |decoder| object since the
+  // |release_input_buffer| callback could be invoked from the |decoder|'s
+  // destructor.
+ InputBuffers input_buffers;
+
+ libgav1::Decoder decoder;
+ libgav1::DecoderSettings settings = {};
+  // Use 33 + the low byte of the width to seed the number of threads. This
+ // ensures that we will trigger the frame parallel path in most cases.
+ // We use both nibbles of the lower byte as this results in values != 1 much
+ // more quickly than using the lower nibble alone.
+ settings.threads =
+ 33 + ((size >= 13) ? ((data[12] >> 4 | data[12]) & 0xF) + 1 : 1);
+
+ settings.frame_parallel = true;
+ settings.blocking_dequeue = true;
+ settings.callback_private_data = &input_buffers;
+ settings.release_input_buffer = ReleaseInputBuffer;
+ if (decoder.Init(&settings) != libgav1::kStatusOk) return 0;
+
+ FuzzerTemporaryFile tempfile(data, size);
+ auto file_reader =
+ libgav1::FileReader::Open(tempfile.filename(), /*error_tolerant=*/true);
+ if (file_reader == nullptr) return 0;
+
+ InputBuffer* input_buffer = nullptr;
+ bool dequeue_finished = false;
+
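+  // Enqueue/dequeue loop: keep enqueuing temporal units until the decoder
+  // returns kStatusTryAgain, then dequeue decoded frames until it returns
+  // kStatusNothingToDequeue, and repeat until the file is exhausted and the
+  // decoder is fully drained.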
+ do {
+ if (input_buffer == nullptr && !file_reader->IsEndOfFile()) {
+ if (input_buffers.free_buffers.empty()) {
+ auto* const buffer = new (std::nothrow) InputBuffer();
+ if (buffer == nullptr) {
+ break;
+ }
+ input_buffers.free_buffers.push_back(buffer);
+ }
+ input_buffer = input_buffers.free_buffers.front();
+ input_buffers.free_buffers.pop_front();
+ if (!file_reader->ReadTemporalUnit(input_buffer, nullptr)) {
+ break;
+ }
+ }
+
+ if (input_buffer != nullptr) {
+ libgav1::StatusCode status =
+ decoder.EnqueueFrame(input_buffer->data(), input_buffer->size(),
+ /*user_private_data=*/0,
+ /*buffer_private_data=*/input_buffer);
+ if (status == libgav1::kStatusOk) {
+ input_buffer = nullptr;
+ // Continue to enqueue frames until we get a kStatusTryAgain status.
+ continue;
+ }
+ if (status != libgav1::kStatusTryAgain) {
+ break;
+ }
+ }
+
+ const libgav1::DecoderBuffer* buffer;
+ libgav1::StatusCode status = decoder.DequeueFrame(&buffer);
+ if (status == libgav1::kStatusNothingToDequeue) {
+ dequeue_finished = true;
+ } else if (status == libgav1::kStatusOk) {
+ dequeue_finished = false;
+ } else {
+ break;
+ }
+ } while (input_buffer != nullptr || !file_reader->IsEndOfFile() ||
+ !dequeue_finished);
+
+ if (input_buffer != nullptr) {
+ input_buffers.free_buffers.push_back(input_buffer);
+ }
+
+ return 0;
+}
diff --git a/tests/fuzzer/fuzzer_temp_file.h b/tests/fuzzer/fuzzer_temp_file.h
new file mode 100644
index 0000000..5d12bbe
--- /dev/null
+++ b/tests/fuzzer/fuzzer_temp_file.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2020 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_TESTS_FUZZER_FUZZER_TEMP_FILE_H_
+#define LIBGAV1_TESTS_FUZZER_FUZZER_TEMP_FILE_H_
+
+// Adapter utility from fuzzer input to a temporary file, for fuzzing APIs that
+// require a file instead of an input buffer.
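+//
+// Typical usage from a fuzz target (a sketch; ConsumeFile() stands in for
+// whatever file-based API is being fuzzed):
+//
+//   FuzzerTemporaryFile tempfile(data, size);
+//   ConsumeFile(tempfile.filename());
+//   // The temporary file is deleted when |tempfile| goes out of scope.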
+
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+// Pure-C interface for creating and cleaning up temporary files.
+
+static char* fuzzer_get_tmpfile_with_suffix(const uint8_t* data, size_t size,
+ const char* suffix) {
+ if (suffix == NULL) { // NOLINT (this could be a C compilation unit)
+ suffix = "";
+ }
+ const size_t suffix_len = strlen(suffix);
+ if (suffix_len > INT_MAX) { // mkstemps takes int for suffixlen param
+ perror("Suffix too long");
+ abort();
+ }
+
+#ifdef __ANDROID__
+ const char* leading_temp_path =
+ "/data/local/tmp/generate_temporary_file.XXXXXX";
+#else
+ const char* leading_temp_path = "/tmp/generate_temporary_file.XXXXXX";
+#endif
+ const size_t buffer_sz = strlen(leading_temp_path) + suffix_len + 1;
+ char* filename_buffer =
+ (char*)malloc(buffer_sz); // NOLINT (this could be a C compilation unit)
+ if (!filename_buffer) {
+ perror("Failed to allocate file name buffer.");
+ abort();
+ }
+
+ if (snprintf(filename_buffer, buffer_sz, "%s%s", leading_temp_path, suffix) >=
+ buffer_sz) {
+ perror("File name buffer too short.");
+ abort();
+ }
+
+ const int file_descriptor = mkstemps(filename_buffer, suffix_len);
+ if (file_descriptor < 0) {
+ perror("Failed to make temporary file.");
+ abort();
+ }
+ FILE* file = fdopen(file_descriptor, "wb");
+ if (!file) {
+ perror("Failed to open file descriptor.");
+ close(file_descriptor);
+ abort();
+ }
+ const size_t bytes_written = fwrite(data, sizeof(uint8_t), size, file);
+ if (bytes_written < size) {
+ close(file_descriptor);
+ fprintf(stderr, "Failed to write all bytes to file (%zu out of %zu)",
+ bytes_written, size);
+ abort();
+ }
+ fclose(file);
+ return filename_buffer;
+}
+
+static char* fuzzer_get_tmpfile(
+ const uint8_t* data,
+ size_t size) { // NOLINT (people include this .inc file directly)
+ return fuzzer_get_tmpfile_with_suffix(data, size, NULL); // NOLINT
+}
+
+static void fuzzer_release_tmpfile(char* filename) {
+ if (unlink(filename) != 0) {
+ perror("WARNING: Failed to delete temporary file.");
+ }
+ free(filename);
+}
+
+// C++ RAII object for creating temporary files.
+
+#ifdef __cplusplus
+class FuzzerTemporaryFile {
+ public:
+ FuzzerTemporaryFile(const uint8_t* data, size_t size)
+ : original_filename_(fuzzer_get_tmpfile(data, size)) {
+ filename_ = strdup(original_filename_);
+ if (!filename_) {
+ perror("Failed to allocate file name copy.");
+ abort();
+ }
+ }
+
+ FuzzerTemporaryFile(const uint8_t* data, size_t size, const char* suffix)
+ : original_filename_(fuzzer_get_tmpfile_with_suffix(data, size, suffix)) {
+ filename_ = strdup(original_filename_);
+ if (!filename_) {
+ perror("Failed to allocate file name copy.");
+ abort();
+ }
+ }
+
+ ~FuzzerTemporaryFile() {
+ free(filename_);
+ fuzzer_release_tmpfile(original_filename_);
+ }
+
+  FuzzerTemporaryFile(const FuzzerTemporaryFile& other) = delete;
+  FuzzerTemporaryFile& operator=(const FuzzerTemporaryFile& other) = delete;
+
+  FuzzerTemporaryFile(FuzzerTemporaryFile&& other) = delete;
+  FuzzerTemporaryFile& operator=(FuzzerTemporaryFile&& other) = delete;
+
+ const char* filename() const { return filename_; }
+
+ // Returns a mutable pointer to the file name. Should be used sparingly, only
+ // in case the fuzzed API demands it or when making a mutable copy is
+ // inconvenient (e.g., in auto-generated code).
+ char* mutable_filename() const { return filename_; }
+
+ private:
+ char* original_filename_;
+
+ // A mutable copy of the original filename, returned by the accessor. This
+ // guarantees that the original filename can always be used to release the
+ // temporary path.
+ char* filename_;
+};
+#endif // __cplusplus
+#endif // LIBGAV1_TESTS_FUZZER_FUZZER_TEMP_FILE_H_
diff --git a/tests/fuzzer/obu_parser_fuzzer.cc b/tests/fuzzer/obu_parser_fuzzer.cc
new file mode 100644
index 0000000..634a802
--- /dev/null
+++ b/tests/fuzzer/obu_parser_fuzzer.cc
@@ -0,0 +1,89 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "examples/file_reader.h"
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_interface.h"
+#include "src/buffer_pool.h"
+#include "src/decoder_impl.h"
+#include "src/decoder_state.h"
+#include "src/internal_frame_buffer_list.h"
+#include "src/obu_parser.h"
+#include "tests/fuzzer/fuzzer_temp_file.h"
+
+namespace {
+
+#if defined(LIBGAV1_EXHAUSTIVE_FUZZING)
+// Set a large upper bound to give more coverage of a single input; this value
+// should be larger than most of the frame counts in the corpus.
+constexpr int kMaxFrames = 100;
+constexpr size_t kMaxDataSize = 400 * 1024;
+#else
+// Restrict the number of frames and OBUs to improve fuzzer throughput.
+constexpr int kMaxFrames = 5;
+constexpr size_t kMaxDataSize = 200 * 1024;
+#endif
+
+inline void ParseObu(const uint8_t* const data, size_t size) {
+ libgav1::InternalFrameBufferList buffer_list;
+ libgav1::BufferPool buffer_pool(libgav1::OnInternalFrameBufferSizeChanged,
+ libgav1::GetInternalFrameBuffer,
+ libgav1::ReleaseInternalFrameBuffer,
+ &buffer_list);
+ libgav1::DecoderState decoder_state;
+ libgav1::ObuParser parser(data, size, 0, &buffer_pool, &decoder_state);
+ libgav1::RefCountedBufferPtr current_frame;
+ int parsed_frames = 0;
+ while (parser.HasData()) {
+ if (parser.ParseOneFrame(&current_frame) != libgav1::kStatusOk) break;
+ if (++parsed_frames >= kMaxFrames) break;
+ }
+}
+
+} // namespace
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+ // Reject large chunks of data to improve fuzzer throughput.
+ if (size > kMaxDataSize) return 0;
+
+ // Treat the input as a raw OBU stream.
+ ParseObu(data, size);
+
+ // Use the first frame from an IVF to bypass any read errors from the parser.
+ static constexpr size_t kIvfHeaderSize =
+ libgav1::kIvfFileHeaderSize + libgav1::kIvfFrameHeaderSize;
+ if (size >= kIvfHeaderSize) {
+ ParseObu(data + kIvfHeaderSize, size - kIvfHeaderSize);
+ }
+
+ FuzzerTemporaryFile tempfile(data, size);
+ auto file_reader =
+ libgav1::FileReader::Open(tempfile.filename(), /*error_tolerant=*/true);
+ if (file_reader == nullptr) return 0;
+
+ std::vector<uint8_t> buffer;
+ int parsed_frames = 0;
+ do {
+ if (!file_reader->ReadTemporalUnit(&buffer, nullptr)) break;
+ ParseObu(buffer.data(), buffer.size());
+ if (++parsed_frames >= kMaxFrames) break;
+ } while (!file_reader->IsEndOfFile());
+
+ return 0;
+}